A Python Crawler Tutorial for shicimingju.com (诗词名句网)


Basic steps of the crawl

 

1. Define the requirement

Scrape a novel hosted on the site:

its chapter titles and the text of every chapter.

2. Walking through the code

Below we go through the code step by step, from the simple parts to the more involved ones.

The # -*- coding: utf-8 -*- declaration sets the source-file encoding to UTF-8. It goes at the very top of the script and prevents garbled Chinese text.
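
Written out, that first line of the script looks like this:

# -*- coding: utf-8 -*-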

The import statements below bring in the libraries we need; this is just preparation.

import os
import jieba                          # Chinese word segmentation (used by the word cloud)
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import SQL                            # the author's own helper module for running SQL statements
import pinyin                         # converts the Chinese book title into pinyin for the URL
from bs4 import BeautifulSoup
from pymysql import *                 # provides connect() for the MySQL connection

First we disguise ourselves as a browser before visiting the site we want to scrape.

From Baidu Baike:
The User Agent (UA for short) is a special string header that lets the server identify the client's operating system and version, CPU type, browser and version, rendering engine, browser language, plugins and so on.

Taking Chrome as an example, you can look the value up in the browser itself; the User-Agent turns out to be something like Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36
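
By default, requests does not look like a browser at all: it announces itself as python-requests, which some sites treat differently. A quick check of what it would send without our header:

# What requests sends when no User-Agent header is supplied explicitly.
import requests
print(requests.utils.default_user_agent())   # e.g. 'python-requests/2.26.0'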

Here is the corresponding code:

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
}
url = 'https://www.shicimingju.com/book/%s.html'   # %s will be filled with the pinyin of the title
name_p = pinyin.pinyin(name)                       # romanize the Chinese book title entered by the user
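
On shicimingju.com every book lives under /book/<pinyin>.html, which is why the title has to be romanized before it is substituted into the %s placeholder. What exactly pinyin.pinyin() returns depends on which pinyin module is installed, so the sketch below simply hard-codes a plausible value:

# Illustration only: assume the romanization of '三国演义' comes out as 'sanguoyanyi'.
name_p = 'sanguoyanyi'
url = 'https://www.shicimingju.com/book/%s.html'
print(url % name_p)        # https://www.shicimingju.com/book/sanguoyanyi.html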

Next we create a table to hold the data we are about to scrape.

sql = "CREATE TABLE `novel`.`{}` (`section` TEXT(100), `article` TEXT(10000));".format(name)
SQL.doSql(sql)
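
SQL here is the author's own helper module, which is not shown in this post. A minimal sketch of what its doSql() function might look like, assuming the same local MySQL settings used in the complete code further down (localhost, user root, empty password, database novel):

# Hypothetical sketch of the SQL helper module used above.
from pymysql import connect

def doSql(sql):
    conn = connect(host='localhost', user='root', password='', db='novel', charset='utf8')
    try:
        with conn.cursor() as cur:
            cur.execute(sql)
        conn.commit()
    finally:
        conn.close()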

Now we crawl the pages we need and store what we scrape into the table we just created. (li_list below is the list of chapter links taken from the book's table-of-contents page; in the complete code it is obtained with soup.select('.book-mulu > ul > li').)

for li in li_list:
    title = li.a.string                                        # chapter title
    detail_url = 'https://www.shicimingju.com' + li.a['href']  # link to the chapter page
    detail_page_text = requests.get(url=detail_url, headers=headers)

    detail_page_text.encoding = 'utf-8'
    detail_page_text = detail_page_text.text

    detail_soup = BeautifulSoup(detail_page_text, 'lxml')
    div_tag = detail_soup.find('div', class_='chapter_content')
    content = div_tag.text                                     # chapter body text
    sql = "INSERT INTO `novel`.`{}`(`section`,`article`) VALUES ('{}','{}');".format(
        name, title, content
    )
    SQL.doSql(sql)

    print(title, '爬取成功!!!!')
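
One thing to watch out for: chapter text frequently contains single quotes, and those will break the string-formatted INSERT above. If you talk to pymysql directly instead of going through the SQL helper, a safer variant is parameter binding; here is a sketch that reuses name, title and content from the loop and the conn/cs1 connection from the complete code below:

# Sketch: parameter binding keeps quotes in the text from breaking the SQL.
# The table name still has to be formatted in, because identifiers cannot be bound.
insert_sql = 'INSERT INTO `novel`.`{}`(`section`,`article`) VALUES (%s, %s);'.format(name)
cs1.execute(insert_sql, (title, content))
conn.commit()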

Finally, here is the complete code:

import os
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import SQL
import pinyin
from bs4 import BeautifulSoup
from pymysql import *

# connect to the local MySQL database 'novel'
conn = connect(host='localhost', user='root', password='', db='novel', charset='utf8')
cs1 = conn.cursor()


# write the novel out as one text file per chapter
def book(name):
    novel_path = os.path.join('./text', name)
    os.makedirs(novel_path, exist_ok=True)      # also creates ./text on the first run
    count = cs1.execute('SELECT section,article FROM `{}`;'.format(name))
    for i in range(count):
        result = cs1.fetchone()
        title_path = './text/' + name + '/' + str(i) + '_' + result[0] + '.txt'
        with open(title_path, 'w', encoding='utf-8') as fp:
            fp.write(result[1])

# list the novels already saved under ./text
def path():
    dir_path = './text'
    for root, dirs, files in os.walk(dir_path):
        return dirs                             # first level of sub-directories = saved books
    return []
# word cloud: show which chapters mention the keyword most often
def wc(word, name):
    count = cs1.execute('SELECT section,article FROM `{}`;'.format(name))
    content_num = []
    for i in range(count):
        result = cs1.fetchone()
        words = jieba.lcut(result[1])           # segment the chapter text
        for w in words:
            if w == word:
                content_num.append(result[0])   # record the chapter title for every hit
    cut_text = " ".join(content_num)
    if not cut_text:                            # keyword never appears: ask for another one
        word = input('请重新输入关键词:')
        return wc(word, name)
    wordcloud = WordCloud(
        font_path="C:/Windows/Fonts/simfang.ttf",
        background_color="white",
        width=1000,
        height=800
    ).generate(cut_text)
    plt.imshow(wordcloud)
    plt.show()


# open the novel's folder
def sel(name_r):
    path_r = os.path.join('./text', name_r)
    os.startfile(path_r)

if __name__ == "__main__":
    name = input('请输入小说名字:')
    count = cs1.execute('SHOW TABLES FROM novel;')
    content_list = []

    # collect the names of the novels already stored (one table per novel)
    for i in range(count):
        result = str(cs1.fetchone())
        result = result[2:-3]                   # strip the tuple decoration: "('book',)" -> "book"
        content_list.append(result)
    # if the novel is already stored, open it; otherwise crawl it
    if (name in content_list):
        for i in content_list:
            print(i)
        while True:
            name_r = input("选择您要读的书籍:")
            if name_r in content_list:
                break
            else:
                print("输入有误请重新输入")
        word = input("关键词:")
        wc(word,name_r)


        if name_r in path():
            sel(name_r)
        else:
            book(name_r)
            sel(name_r)


    else:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
        }
        url = 'https://www.shicimingju.com/book/%s.html'
        name_p = pinyin.pinyin(name)

        url = url % name_p                      # fill in the pinyin of the title
        page_text = requests.get(url=url, headers=headers)
        page_text.encoding = 'utf-8'
        page_text = page_text.text

        soup = BeautifulSoup(page_text, 'lxml')
        li_list = soup.select('.book-mulu > ul > li')   # chapter entries in the table of contents


        sql = "CREATE TABLE `novel`.`{}` (`section` TEXT(100), `article` TEXT(10000));".format(name)
        SQL.doSql(sql)
        for li in li_list:
            title = li.a.string
            detail_url = 'https://www.shicimingju.com'+li.a['href']
            detail_page_text = requests.get(url=detail_url,headers=headers)

            detail_page_text.encoding = 'utf-8'
            detail_page_text = detail_page_text.text

            detail_soup = BeautifulSoup(detail_page_text,'lxml')
            div_tag = detail_soup.find('div',class_='chapter_content')
            content = div_tag.text
            sql = "INSERT INTO `novel`.`{}`(`section`,`article`) VALUES ( '{}','{}');".format(
                name,title,content
            )
            SQL.doSql(sql)

            print(title,'爬取成功!!!! ')
        print("$$$$$$爬取结束!!!$$$$$$")
        word = input("关键词:")
        wc(word, name)
        if name not in path():
            book(name)
        sel(name)



cs1.close()
conn.close()
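
Note that the script assumes a MySQL database called novel already exists on localhost (root user, empty password, matching the connect() call above). A one-time setup sketch:

# One-time setup: create the 'novel' database the script expects.
# Assumes the same local MySQL server, root user and empty password as above.
from pymysql import connect

conn = connect(host='localhost', user='root', password='', charset='utf8')
with conn.cursor() as cur:
    cur.execute('CREATE DATABASE IF NOT EXISTS novel DEFAULT CHARACTER SET utf8;')
conn.commit()
conn.close()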

The complete code also includes a few extra features; if you are interested, dig into them yourself!
