import requests
import os
import time
from lxml import html


def get_Title_url(tree):
    '''Level 1: get the book titles'''
    # "史书典籍" (history books) section
    # href format: /book/sanguoyanyi.html
    History_book_url_List = tree.xpath("//div[@class='index-li'][3]/ul/li/a/@href")
    # text format: 三国演义
    History_book_name_List = tree.xpath("//div[@class='index-li'][3]/ul/li/a/text()")
    return History_book_url_List, History_book_name_List


def get_article_url(tree):
    '''Level 2: get the chapter titles of one book'''
    # href format: /book/sanguoyanyi/1.html
    book_url_List = tree.xpath("//div[@class='book-mulu']/ul/li/a/@href")
    # text format: 第一回·宴桃园豪杰三结义 斩黄巾英雄首立功
    book_name_List = tree.xpath("//div[@class='book-mulu']/ul/li/a/text()")
    return book_url_List, book_name_List


def get_article(tree):
    '''Level 3: get the chapter text'''
    article_List = tree.xpath("//div[@class='chapter_content']/p/text()")
    return ''.join(article_List)


def get_request(url, headers):
    '''Fetch a page and return the parsed lxml tree'''
    response = requests.get(url=url, headers=headers)
    return html.fromstring(response.text)


def save_mkdir(two):
    '''Create the output folders'''
    # level-1 folder
    if not os.path.exists('史书典籍'):
        os.mkdir('史书典籍')
    # level-2 folder
    if not os.path.exists('史书典籍/' + two):
        os.mkdir('史书典籍/' + two)


def police_2(a):
    '''Level-2 resume check: skip books finished before an interruption'''
    b = None
    if os.path.exists('史书典籍/police_2.txt'):
        with open('史书典籍/police_2.txt', 'r') as f:
            b = f.read()
    if b and a < int(b):
        return False
    # record progress and return True
    with open('史书典籍/police_2.txt', 'w') as f:
        f.write(str(a))
    return True


def police_3(a):
    '''Level-3 resume check: skip chapters finished before an interruption'''
    b = None
    if os.path.exists('史书典籍/police_3.txt'):
        with open('史书典籍/police_3.txt', 'r') as f:
            b = f.read()
    if b and a < int(b):
        return False
    # record progress and return True
    with open('史书典籍/police_3.txt', 'w') as f:
        f.write(str(a))
    return True


def main():
    '''Entry point'''
    # site root
    root = 'http://www.shicimingju.com'
    # request headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    # fetch the root page
    tree1 = get_request(root, headers)
    # level-1 names and routes
    History_book_url_List, History_book_name_List = get_Title_url(tree1)
    # walk the level-2 pages (one per book)
    for i in range(len(History_book_url_List)):
        if police_2(i) is False:
            continue
        # level-2 route
        url2 = root + History_book_url_List[i]
        print('Crawling >>> ' + History_book_name_List[i] + ' started')
        tree2 = get_request(url2, headers)
        # level-2 names and routes
        book_url_List, book_name_List = get_article_url(tree2)
        # create the book's folder
        save_mkdir(History_book_name_List[i])
        # download the chapters
        for j in range(len(book_url_List)):
            if police_3(j) is False:
                continue
            time.sleep(1)
            # level-3 route
            url3 = root + book_url_List[j]
            print('Crawling: ' + book_name_List[j])
            tree3 = get_request(url3, headers)
            txt = get_article(tree3)
            # chapter title, with spaces and '·' stripped for the file name
            txt_name = book_name_List[j].replace(' ', '').replace('·', '')
            # save the chapter
            file_path = '史书典籍/{}/{}.txt'.format(History_book_name_List[i], txt_name)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(txt)
        print('Crawling >>> ' + History_book_name_List[i] + ' finished')


if __name__ == '__main__':
    main()
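police_2 and police_3 differ only in the checkpoint file they touch. A minimal sketch of a single parameterized helper that could replace both (the checkpoint name and the path parameter are illustrative assumptions, not part of the original script):

import os

def checkpoint(index, path):
    '''Return False if `index` is below the last index recorded in the
    checkpoint file at `path` (already crawled); otherwise record it
    and return True so the caller proceeds.'''
    last = None
    if os.path.exists(path):
        with open(path, 'r') as f:
            last = f.read()
    if last and index < int(last):
        return False
    with open(path, 'w') as f:
        f.write(str(index))
    return True

With this in place, police_2(i) becomes checkpoint(i, '史书典籍/police_2.txt') and the two near-identical functions collapse into one. Note that the chapter checkpoint is never reset when a new book starts, so a restart mid-book would skip the same number of chapters in every later book; resetting the file at the top of the outer loop would be one way to address that.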
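get_request also performs no error handling, so a single timeout or 5xx response aborts the whole crawl. A hedged variant under the same requests/lxml stack (the timeout value and the raise_for_status/apparent_encoding calls are additions for illustration, not in the original):

import requests
from lxml import html

def get_request(url, headers, timeout=10):
    '''Fetch `url` and return the parsed lxml tree; raise on HTTP errors.'''
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()  # surface 4xx/5xx instead of parsing an error page
    response.encoding = response.apparent_encoding  # guard against a mis-detected charset
    return html.fromstring(response.text)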