# -*- coding: utf-8 -*-
"""Scrape the full text of the web novel "剑来" from www.23us.so.

Workflow: fetch the novel's chapter index page, extract every chapter id
with a regex, then download each chapter and append its title and body
text to a local UTF-8 .txt file.
"""
import re

import requests
from lxml import etree

# Site root for novel pages and this novel's index page (novel id 10/10839).
# NOTE(review): the scraped source showed ".HTML"; lowercase ".html" is the
# conventional 23us path — confirm against the live site.
START_URL = 'https://www.23us.so/files/article/html'
INDEX_URL = START_URL + '/10/10839/index.html'

# Output file the whole novel is appended to.
NOVEL_NAME = '剑来.txt'


def parse(url, xpath_expr):
    """GET *url* and return the nodes matched by *xpath_expr*.

    The site does not reliably declare its charset, so the response is
    decoded with requests' detected (apparent) encoding before parsing.
    Returns whatever the XPath yields — for the expressions used here,
    a list of text strings.
    """
    resp = requests.get(url)
    # Original wrote `m.enCoding = m.apparent_enCoding`, which only set a
    # meaningless attribute; the real attribute is lowercase `encoding`.
    resp.encoding = resp.apparent_encoding
    tree = etree.HTML(resp.text)
    return tree.xpath(xpath_expr)


def write_chapter(title, paragraphs):
    """Append one chapter (title line, then its body text) to NOVEL_NAME.

    Fixes two defects of the original: the title write opened the file
    without 'a' mode (read-only handle → io.UnsupportedOperation), and the
    body loop reopened the file once per paragraph.
    """
    with open(NOVEL_NAME, 'a', encoding='utf-8') as f:
        f.write('\n\n' + title + '\n')
        for para in paragraphs:
            f.write(str(para))
    print(title)


def main():
    """Download the chapter list, then every chapter, appending to disk."""
    index_html = requests.get(INDEX_URL).text
    # Chapter ids appear as href="<site>/10/10839/<id>.html" on the index.
    chapter_ids = re.findall(
        r'\w\shref="' + START_URL + r'/10/10839/(\d+)\.html',
        index_html, re.S)

    # Chapter titles scraped from the same index page, positionally aligned
    # with chapter_ids.  The original XPath was garbled to '//*[@]//text()'
    # in the scraped source — TODO confirm the real container id on the site.
    titles = parse(INDEX_URL, '//*[@id="at"]//text()')

    # Original looped `while e < 10000` and indexed past the end of the
    # list; iterate the ids we actually have, capped at the same 10000.
    for i, chap_id in enumerate(chapter_ids[:10000]):
        chapter_url = START_URL + '/10/10839/{}.html'.format(chap_id)
        # XPath attribute names are case-sensitive: the page uses id=,
        # not the original's @ID=.
        body = parse(chapter_url, '//*[@id="contents"]/text()')
        # Fall back to the chapter id if the title list is shorter than
        # the id list (original crashed with IndexError here).
        title = titles[i] if i < len(titles) else str(chap_id)
        write_chapter(title, body)


if __name__ == '__main__':
    main()
以上是内存溢出为你收集整理的“Python requests 爬取顶点小说”的全部内容，希望本文能够帮你解决使用 requests 爬取顶点小说时遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)