1. Define the Requirements
- Scrape the novel titles from the site
- Scrape the novel content
2. Code Walkthrough
Below we work through the code step by step, from the basics up.
`# -*- coding: utf-8 -*-` — this declaration at the very top of the file sets the encoding to UTF-8, preventing garbled characters.
The import statements below bring in the libraries we need; this is just preparation.
```python
import os
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import SQL
import pinyin
from bs4 import BeautifulSoup
from pymysql import *
```
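If you are following along, the third-party packages (jieba, wordcloud, matplotlib, requests, pymysql, pinyin, beautifulsoup4, plus lxml for the parser) can be installed with pip. Note that SQL is not a PyPI package: it appears to be a small local helper module of the author's; a possible sketch of it is given later in this article.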
First, we need to disguise ourselves as a browser before visiting the site we want to scrape.
From Baidu Baike:
User Agent (UA for short) is a special string header that allows the server to identify the client's operating system and version, CPU type, browser and version, browser rendering engine, browser language, browser plugins, and so on.
Taking Chrome as an example, type chrome://version into the browser's address bar.
You can see that the browser's User-Agent is Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36.
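To see why this matters, compare what a server receives with and without the header. The sketch below uses httpbin.org, a public echo service, purely for illustration; by default requests identifies itself as python-requests, which many sites refuse to serve.

```python
import requests

# Default: the server sees something like "python-requests/2.x"
print(requests.get('https://httpbin.org/user-agent').json())

# With a spoofed header, the server sees a regular Chrome browser instead
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
}
print(requests.get('https://httpbin.org/user-agent', headers=headers).json())
```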
Here is the corresponding code:
```python
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
}
url = 'https://www.shicimingju.com/book/%s.html'
name_p = pinyin.pinyin(name)
```
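The site addresses each book by the pinyin romanization of its title, so pinyin.pinyin(name) is assumed here to return a toneless romanization (the exact API depends on which pinyin module is installed). A hypothetical example of the URL this produces:

```python
# Hypothetical example: for the novel 三国演义, the pinyin library is assumed
# to romanize the title to 'sanguoyanyi', which yields the chapter-list URL.
url_template = 'https://www.shicimingju.com/book/%s.html'
name_p = 'sanguoyanyi'  # assumed output of pinyin.pinyin('三国演义')
print(url_template % name_p)  # https://www.shicimingju.com/book/sanguoyanyi.html
```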
We first create a table to hold the data we are about to scrape:
sql = "CREATE TABLE `novel`.`{}`( `section` TEXT(100) , `article` TEXT(10000) );".format( name ) SQL.doSql(sql)
Now we can scrape the data we need from each chapter page and write it into the table we just created:
```python
for li in li_list:
    title = li.a.string
    detail_url = 'https://www.shicimingju.com' + li.a['href']
    detail_page_text = requests.get(url=detail_url, headers=headers)
    detail_page_text.encoding = 'utf-8'
    detail_page_text = detail_page_text.text
    detail_soup = BeautifulSoup(detail_page_text, 'lxml')
    div_tag = detail_soup.find('div', class_='chapter_content')
    content = div_tag.text
    sql = "INSERT INTO `novel`.`{}`(`section`,`article`) VALUES ( '{}','{}');".format(name, title, content)
    SQL.doSql(sql)
    print(title, 'scraped successfully!')
```
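One caveat worth noting: building the INSERT with str.format() breaks as soon as a chapter contains a single quote, and it is open to SQL injection. A safer variant, assuming an open pymysql cursor cur, passes the values as query parameters (the table name still has to be interpolated, since placeholders only work for values):

```python
# Sketch: parameterized INSERT via pymysql, assuming `cur` is an open cursor.
sql = "INSERT INTO `novel`.`{}`(`section`,`article`) VALUES (%s, %s);".format(name)
cur.execute(sql, (title, content))
```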
Finally, here is the complete code:
```python
import os
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import SQL
import pinyin
from bs4 import BeautifulSoup
from pymysql import *

conn = connect(host='localhost', user='root', password='', db='novel', charset='utf8')
cs1 = conn.cursor()


# Export a novel from the database into per-chapter .txt files
def book(name):
    novel_path = os.path.join('./text', name)
    os.mkdir(novel_path)
    count = cs1.execute('SELECT section,article FROM `{}`;'.format(name))
    for i in range(count):
        result = cs1.fetchone()
        title_path = './text/' + name + '/' + str(i) + '_' + result[0] + '.txt'
        with open(title_path, 'w', encoding='utf-8') as fp:
            fp.write(result[1])


# List which novels already exist under ./text
def path():
    dir_path = './text'
    for root, dirs, files in os.walk(dir_path):
        return dirs  # only the top-level directories are needed
    return []


# Word cloud of the chapter titles in which the keyword appears
def wc(word, name):
    count = cs1.execute('SELECT section,article FROM `{}`;'.format(name))
    content_num = []
    for i in range(count):
        result = cs1.fetchone()
        words = jieba.lcut(result[1])
        # Collect the chapter title once per occurrence of the keyword
        for w in words:
            if w == word:
                content_num.append(result[0])
    cut_text = "".join(content_num)
    if not cut_text:
        word = input('Please enter a new keyword: ')
        wc(word, name)
        return
    wordcloud = WordCloud(
        font_path="C:/Windows/Fonts/simfang.ttf",
        background_color="white",
        width=1000,
        height=800
    ).generate(cut_text)
    plt.imshow(wordcloud)
    plt.show()


# Open the novel's folder in the file explorer
def sel(name_r):
    path_r = os.path.join('./text', name_r)
    os.startfile(path_r)


if __name__ == "__main__":
    name = input('Enter the novel title: ')
    count = cs1.execute('SHOW TABLES FROM novel;')
    content_list = []
    # Collect the existing table (novel) names into a list
    for i in range(count):
        result = str(cs1.fetchone())
        result = result[2:-3]
        content_list.append(result)
    # If the novel already exists, open it; otherwise scrape it
    if name in content_list:
        for i in content_list:
            print(i)
        while True:
            name_r = input("Choose the book you want to read: ")
            if name_r in content_list:
                break
            else:
                print("Invalid input, please try again")
        word = input("Keyword: ")
        wc(word, name_r)
        if name_r in path():
            sel(name_r)
        else:
            book(name_r)
            sel(name_r)
    else:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
        }
        url = 'https://www.shicimingju.com/book/%s.html'
        name_p = pinyin.pinyin(name)
        url = url % name_p
        page_text = requests.get(url=url, headers=headers)
        page_text.encoding = 'utf-8'
        page_text = page_text.text
        soup = BeautifulSoup(page_text, 'lxml')
        li_list = soup.select('.book-mulu > ul > li')
        sql = "CREATE TABLE `novel`.`{}`( `section` TEXT(100) , `article` TEXT(10000) );".format(name)
        SQL.doSql(sql)
        for li in li_list:
            title = li.a.string
            detail_url = 'https://www.shicimingju.com' + li.a['href']
            detail_page_text = requests.get(url=detail_url, headers=headers)
            detail_page_text.encoding = 'utf-8'
            detail_page_text = detail_page_text.text
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')
            div_tag = detail_soup.find('div', class_='chapter_content')
            content = div_tag.text
            sql = "INSERT INTO `novel`.`{}`(`section`,`article`) VALUES ( '{}','{}');".format(name, title, content)
            SQL.doSql(sql)
            print(title, 'scraped successfully!')
        print("$$$$$$ Scraping finished! $$$$$$")
        word = input("Keyword: ")
        wc(word, name)
        if name in path():
            sel(name)
        else:
            book(name)
            sel(name)
    cs1.close()
    conn.close()
```
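A few assumptions are baked into this script if you want to run it yourself: a MySQL server must be reachable on localhost with a passwordless root account and an existing database named novel; the ./text directory must exist before book() is called; os.startfile() only exists on Windows; and the word-cloud font path points at a Windows font (simfang.ttf). Adjust these for your own environment.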
The complete code also includes a few other features; interested readers are encouraged to dig into them on their own!