import requests
import os
import time
from lxml import html


def get_Title_url(tree):
    '''Level 1: get the book titles'''
    # "史书典籍" (history books) section
    # href format: /book/sanguoyanyi.html
    History_book_url_List = tree.xpath("//div[@class='index-li'][3]/ul/li/a/@href")
    # text format: 三国演义
    History_book_name_List = tree.xpath("//div[@class='index-li'][3]/ul/li/a/text()")
    return History_book_url_List, History_book_name_List


def get_article_url(tree):
    '''Level 2: get the chapter titles of one book'''
    # href format: /book/sanguoyanyi/1.html
    book_url_List = tree.xpath("//div[@class='book-mulu']/ul/li/a/@href")
    # text format: 第一回·宴桃园豪杰三结义 斩黄巾英雄首立功
    book_name_List = tree.xpath("//div[@class='book-mulu']/ul/li/a/text()")
    return book_url_List, book_name_List


def get_article(tree):
    '''Level 3: get the chapter text'''
    article_List = tree.xpath("//div[@class='chapter_content']/p/text()")
    return ''.join(article_List)


def get_request(url, headers):
    '''Fetch a page and return the parsed lxml tree'''
    response = requests.get(url=url, headers=headers)
    return html.fromstring(response.text)


def save_mkdir(two):
    '''Create the output folders'''
    # level-1 folder
    if not os.path.exists('史书典籍'):
        os.mkdir('史书典籍')
    # level-2 folder
    if not os.path.exists('史书典籍/' + two):
        os.mkdir('史书典籍/' + two)


def police_2(a):
    '''Level-2 resume check: skip books finished before an interruption'''
    b = None
    if os.path.exists('史书典籍/police_2.txt'):
        with open('史书典籍/police_2.txt', 'r') as f:
            b = f.read()
    if b and a < int(b):
        return False
    # record progress and return True
    with open('史书典籍/police_2.txt', 'w') as f:
        f.write(str(a))
    return True


def police_3(a):
    '''Level-3 resume check: skip chapters finished before an interruption'''
    b = None
    if os.path.exists('史书典籍/police_3.txt'):
        with open('史书典籍/police_3.txt', 'r') as f:
            b = f.read()
    if b and a < int(b):
        return False
    # record progress and return True
    with open('史书典籍/police_3.txt', 'w') as f:
        f.write(str(a))
    return True


def main():
    '''Entry point'''
    # site root
    root = 'http://www.shicimingju.com'
    # request headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    # fetch the root page
    tree1 = get_request(root, headers)
    # level-1 names and routes
    History_book_url_List, History_book_name_List = get_Title_url(tree1)
    # walk the level-2 pages (one per book)
    for i in range(len(History_book_url_List)):
        if police_2(i) is False:
            continue
        # level-2 route
        url2 = root + History_book_url_List[i]
        print('Crawling >>> ' + History_book_name_List[i] + ' started')
        tree2 = get_request(url2, headers)
        # level-2 names and routes
        book_url_List, book_name_List = get_article_url(tree2)
        # create the book's folder
        save_mkdir(History_book_name_List[i])
        # download the chapters
        for j in range(len(book_url_List)):
            if police_3(j) is False:
                continue
            time.sleep(1)
            # level-3 route
            url3 = root + book_url_List[j]
            print('Crawling: ' + book_name_List[j])
            tree3 = get_request(url3, headers)
            txt = get_article(tree3)
            # chapter title, with spaces and '·' stripped for the file name
            txt_name = book_name_List[j].replace(' ', '').replace('·', '')
            # save the chapter
            file_path = '史书典籍/{}/{}.txt'.format(History_book_name_List[i], txt_name)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(txt)
        print('Crawling >>> ' + History_book_name_List[i] + ' finished')


if __name__ == '__main__':
    main()
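police_2 and police_3 differ only in the checkpoint file they touch. A minimal sketch of a single parameterized helper that could replace both (the checkpoint name and the path parameter are illustrative assumptions, not part of the original script):

import os

def checkpoint(index, path):
    '''Return False if `index` is below the last index recorded in the
    checkpoint file at `path` (already crawled); otherwise record it
    and return True so the caller proceeds.'''
    last = None
    if os.path.exists(path):
        with open(path, 'r') as f:
            last = f.read()
    if last and index < int(last):
        return False
    with open(path, 'w') as f:
        f.write(str(index))
    return True

With this in place, police_2(i) becomes checkpoint(i, '史书典籍/police_2.txt') and the two near-identical functions collapse into one. Note that the chapter checkpoint is never reset when a new book starts, so a restart mid-book would skip the same number of chapters in every later book; resetting the file at the top of the outer loop would be one way to address that.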
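get_request also performs no error handling, so a single timeout or 5xx response aborts the whole crawl. A hedged variant under the same requests/lxml stack (the timeout value and the raise_for_status/apparent_encoding calls are additions for illustration, not in the original):

import requests
from lxml import html

def get_request(url, headers, timeout=10):
    '''Fetch `url` and return the parsed lxml tree; raise on HTTP errors.'''
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()  # surface 4xx/5xx instead of parsing an error page
    response.encoding = response.apparent_encoding  # guard against a mis-detected charset
    return html.fromstring(response.text)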