Python Crawler: 史书典籍 (History Classics)


Overview

The crawler walks shicimingju.com in three levels: the book index of the 史书典籍 (History Classics) section, each book's chapter list, and each chapter's text. It writes one .txt file per chapter and records its progress in two checkpoint files so an interrupted run can be resumed. The full script:
import requests
import os
from lxml import html
import time


def get_title_url(tree):
    '''Level 1: get the book titles and links.'''
    # 史书典籍 section of the site index
    # href format: /book/sanguoyanyi.html
    History_book_url_list = tree.xpath("//div[@class='index-li'][3]/ul/li/a/@href")
    # text format: 三国演义
    History_book_name_list = tree.xpath("//div[@class='index-li'][3]/ul/li/a/text()")
    return History_book_url_list, History_book_name_list


def get_article_url(tree):
    '''Level 2: get the chapter titles and links of one book.'''
    # href format: /book/sanguoyanyi/1.html
    book_url_list = tree.xpath("//div[@class='book-mulu']/ul/li/a/@href")
    # text format: 第一回·宴桃园豪杰三结义 斩黄巾英雄首立功
    book_name_list = tree.xpath("//div[@class='book-mulu']/ul/li/a/text()")
    return book_url_list, book_name_list


def get_article(tree):
    '''Level 3: get the body text of one chapter.'''
    article_list = tree.xpath("//div[@class='chapter_content']/p/text()")
    return ''.join(article_list)


def get_request(url, headers):
    '''Fetch a page and parse it into an lxml element tree.'''
    response = requests.get(url=url, headers=headers)
    tree = html.fromstring(response.text)
    return tree


def save_mkdir(two):
    '''Create the level-1 and level-2 output folders.'''
    if not os.path.exists('史书典籍'):
        os.mkdir('史书典籍')
    if not os.path.exists('史书典籍/' + two):
        os.mkdir('史书典籍/' + two)


def police_2(a):
    '''Level-2 interruption check: skip books already crawled.'''
    if os.path.exists('史书典籍/police_2.txt'):
        with open('史书典籍/police_2.txt', 'r') as f:
            b = f.read()
        if b == '':
            return True
        if a < int(b):
            return False
    # record the current position and continue
    with open('史书典籍/police_2.txt', 'w') as f:
        f.write(str(a))
    return True


def police_3(a):
    '''Level-3 interruption check: skip chapters already crawled.

    Note: this file is never reset between books, so after a resume,
    chapters of later books with an index below the saved value are
    skipped as well.
    '''
    if os.path.exists('史书典籍/police_3.txt'):
        with open('史书典籍/police_3.txt', 'r') as f:
            b = f.read()
        if b == '':
            return True
        if a < int(b):
            return False
    # record the current position and continue
    with open('史书典籍/police_3.txt', 'w') as f:
        f.write(str(a))
    return True


def main():
    '''Main entry point.'''
    # site root
    root = 'http://www.shicimingju.com'
    # request headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    # fetch the site index
    tree1 = get_request(root, headers)
    # level 1: book names and links
    History_book_url_list, History_book_name_list = get_title_url(tree1)
    for i in range(len(History_book_url_list)):
        if police_2(i) is False:
            continue
        # level-2 URL (one book)
        url2 = root + History_book_url_list[i]
        print('爬取>>>' + History_book_name_list[i] + '开始')  # "crawling <book> started"
        tree2 = get_request(url2, headers)
        # level 2: chapter names and links
        book_url_list, book_name_list = get_article_url(tree2)
        # create the output folders
        save_mkdir(History_book_name_list[i])
        # download the chapters
        for j in range(len(book_url_list)):
            if police_3(j) is False:
                continue
            time.sleep(1)  # be polite to the server
            # level-3 URL (one chapter)
            url3 = root + book_url_list[j]
            print('爬取:' + book_name_list[j])  # "crawling: <chapter>"
            tree3 = get_request(url3, headers)
            txt = get_article(tree3)
            # chapter title; strip spaces and '·' so it is a valid file name
            txt_name = book_name_list[j]
            file_path = '史书典籍/{}/{}.txt'.format(
                History_book_name_list[i],
                txt_name.replace(' ', '').replace('·', ''))
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(txt)
        print('爬取>>>' + History_book_name_list[i] + '结束')  # "crawling <book> finished"


if __name__ == '__main__':
    main()
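
The two police_* functions repeat the same checkpoint pattern: read the last saved index, skip anything below it, otherwise record the current index and continue. A minimal consolidated sketch of that pattern (the name checkpoint_ok is hypothetical, not part of the original script, and it records the position even when the file is empty):

import os

def checkpoint_ok(path, index):
    # Skip indices below the position saved by a previous run.
    if os.path.exists(path):
        with open(path, 'r') as f:
            saved = f.read()
        if saved != '' and index < int(saved):
            return False
    # Record the current position and allow the crawl to proceed.
    with open(path, 'w') as f:
        f.write(str(index))
    return True

With this helper, police_2(i) would become checkpoint_ok('史书典籍/police_2.txt', i), and police_3(j) likewise.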
Summary

The script above crawls the 史书典籍 (History Classics) section of shicimingju.com in three levels (book index, chapter list, chapter text), saves each chapter as a text file under 史书典籍/<book name>/, and keeps two checkpoint files (police_2.txt and police_3.txt) so that an interrupted run can be resumed.
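
One refinement worth noting, not present in the original script: requests guesses the page encoding from the HTTP headers, which can garble Chinese text, and get_request has no timeout, so a stalled connection hangs the crawl. A hedged variant of get_request (assuming shicimingju.com serves UTF-8 pages):

import requests
from lxml import html

def get_request(url, headers, timeout=10):
    # As in the script above, but with a request timeout and an
    # explicit UTF-8 decode (assumption: the site serves UTF-8).
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.encoding = 'utf-8'
    return html.fromstring(response.text)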
