(Due to Douban's anti-scraping measures, the code below can crawl at most about 210 comments.)
# Import packages
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

# =============================================================================
# Scrape a single page
# =============================================================================
# Target URL
url = 'https://movie.douban.com/subject/25845392/comments?start=20&limit=20&status=P&sort=new_score'
# Send the request
html = requests.get(url, headers=headers)
# Parse the response
data = html.text
soup = BeautifulSoup(data, 'html.parser')

# Extract the fields
# Users
names = soup.select('#comments > div > div.comment > h3 > span.comment-info > a')
# Ratings
pingjis = soup.select('#comments > div > div.comment > h3 > span.comment-info')
# Dates
riqis = soup.select('#comments > div > div.comment > h3 > span.comment-info > span.comment-time')
# Comment text
neirongs = soup.select('#comments > div > div.comment > p > span')

# Collect the rows into a list
lis = []
for name, pingji, riqi, neirong in zip(names, pingjis, riqis, neirongs):
    # Note: indexing span [1] assumes every comment carries a rating span;
    # comments without a rating may mis-align here
    pingji_re = pingji.find_all('span')
    lis.append([name.get_text(), pingji_re[1]['class'], pingji_re[1]['title'],
                riqi.get_text().strip(), neirong.get_text()])
result1 = pd.DataFrame(lis, columns=['用户', '评级', '等级', '日期', '内容'])
# print(result1)

# =============================================================================
# Scrape multiple pages with a loop
# =============================================================================
urls = ['https://movie.douban.com/subject/25845392/comments?start={}&limit=20&status=P&sort=new_score'.format(i)
        for i in range(0, 500, 20)]
lis2 = []
for urli in urls:
    # Send the request
    html = requests.get(urli, headers=headers)
    # Parse the response
    data = html.text
    soup = BeautifulSoup(data, 'html.parser')
    # Users
    names = soup.select('#comments > div > div.comment > h3 > span.comment-info > a')
    # Ratings
    pingjis = soup.select('#comments > div > div.comment > h3 > span.comment-info')
    # Dates
    riqis = soup.select('#comments > div > div.comment > h3 > span.comment-info > span.comment-time')
    # Comment text
    neirongs = soup.select('#comments > div > div.comment > p > span')
    for name, pingji, riqi, neirong in zip(names, pingjis, riqis, neirongs):
        pingji_re = pingji.find_all('span')
        lis2.append([name.get_text(), pingji_re[1]['class'], pingji_re[1]['title'],
                     riqi.get_text().strip(), neirong.get_text()])
    print('Done:', urli)
    # Pause a random few seconds between requests to be polite to the server
    time.sleep(np.random.randint(5, 10))

result2 = pd.DataFrame(lis2, columns=['用户', '评级', '等级', '日期', '内容'])
# Write the results to a CSV file (result2 is already a DataFrame)
result2.to_csv('movie.csv')
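The ~210-comment cap exists because Douban only exposes a limited slice of the short comments to anonymous visitors. A commonly reported workaround, not used in this post, is to send a logged-in session's Cookie along with each request. A minimal sketch, where the cookie names and values are placeholders you would copy from your own browser after logging in:

# Sketch: reuse a logged-in browser session by sending its Cookie header.
# The 'bid'/'dbcl2' values below are placeholders - copy your own from the
# browser's developer tools after logging in to douban.com.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
    'Cookie': 'bid=...; dbcl2=...',  # placeholders, not real values
}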
II. Text Analysis

# -*- coding: utf-8 -*-
import pandas
import jieba
import re

# Load the data (use a raw string so the backslashes in the Windows path
# are not treated as escape sequences)
data = pandas.read_excel(r"C:\Users\Lenovo\documents\comments.xlsx")

# 1. Clean the text: strip special symbols with a regular expression
pattern = r"[!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~—!,。?、¥…():【】《》‘’“”\s]+"
re_obj = re.compile(pattern)

def clear(text):
    return re_obj.sub("", text)

data['comment'] = data['comment'].apply(clear)
print(data.head())

# 2. Tokenize with jieba
def cut_word(text):
    # jieba.cut returns a generator
    return jieba.cut(text)

data['comment'] = data['comment'].apply(cut_word)

# 3. Remove stopwords. I used a Chinese stopword list (attached at the end of the post).
def get_stopword():
    # Use a set for fast membership tests
    s = set()
    with open(r'C:\Users\Lenovo\Desktop\cn_stopwords.txt', encoding='UTF-8') as f:
        for line in f:
            s.add(line.strip())
    return s

def remove_stopword(words):
    return [word for word in words if word not in stopword]

stopword = get_stopword()
data['comment'] = data['comment'].apply(remove_stopword)

# 4. Word frequency statistics
from itertools import chain
from collections import Counter

li_2d = data['comment'].tolist()
# Flatten the 2-D list into a 1-D list
li_1d = list(chain.from_iterable(li_2d))
print(f'Total tokens: {len(li_1d)}')
c = Counter(li_1d)
print(f'Unique tokens: {len(c)}')
common = c.most_common(50)
# print(common)

import pandas as pd
frame = pd.DataFrame(common)
frame.to_csv('common11.csv')

# Number of tokens used in each comment
num = [len(li) for li in li_2d]

import matplotlib.pyplot as plt
# Histogram of tokens per comment across all users
# n, bins, patches = plt.hist(num, bins=20, alpha=0.5)
# plt.yscale('log')
# plt.show()

# Generate the word cloud
from wordcloud import WordCloud
# Image handling
import PIL.Image as image
import numpy as np
import matplotlib.colors as colors

# Custom colors (a color reference chart is attached at the end of the post)
colormaps = colors.ListedColormap(['#FF4500', '#FF7F50', '#FFD700'])
# The mask can be any shape you draw, e.g. in PowerPoint
# (here, stylized "长津湖" lettering)
mask1 = np.array(image.open(r'C:\Users\Lenovo\Desktop\aa.png'))
wc = WordCloud(font_path="simsun.ttc", background_color="white",
               mask=mask1, colormap=colormaps)
img = wc.generate_from_frequencies(c)
plt.figure(figsize=(15, 10))
plt.imshow(img)
plt.axis('off')
plt.show()
[Figure: the word cloud generated by the code above]
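Before running the pipeline over every comment, it can help to sanity-check steps 1-3 on a single string. A minimal sketch, reusing the clear, cut_word, remove_stopword, and stopword definitions from the code above (the sample sentence is invented):

# Quick sanity check of the cleaning pipeline on one made-up comment
sample = "这部电影太震撼了!!!向最可爱的人致敬~~"
tokens = remove_stopword(cut_word(clear(sample)))
print(tokens)  # e.g. ['电影', '震撼', '可爱', '致敬'] - exact output depends on the stopword list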
III. Custom Dictionaries

(After all, the computer's default segmentation won't always meet our needs.)
# Load a user dictionary from a file
jieba.load_userdict('my_dictionary.txt')
# Individual entries can also be added directly
jieba.add_word('易烊千玺')
jieba.add_word('长津湖')
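To see what the custom entries change, segment the same sentence before and after registering them. A minimal sketch (the sample sentence is made up, and the exact default output depends on the jieba version and dictionary):

import jieba

sent = '易烊千玺在长津湖里的表现很出色'
print(jieba.lcut(sent))  # default dictionary: in a fresh session the names are usually split into pieces
jieba.add_word('易烊千玺')
jieba.add_word('长津湖')
print(jieba.lcut(sent))  # after registration: '易烊千玺' and '长津湖' come out as single tokens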
IV. Text Sentiment Analysis

SnowNLP scores the sentiment of a comment as a value in [0, 1]; the higher the score, the more positive the comment.
from snownlp import SnowNLP

# Example:
# a = SnowNLP(u'确实很满意的一次购物。做工很好很精致。内外都很特别。这几天穿着很暖和的。而且轻薄。包装都很好。').sentiments
# print(a)

# Open the comment file (one comment per line, exported from the scraped data)
fp = open(r"movie.txt", "r", encoding='utf-8')
lines = fp.readlines()
k = 0  # running total of the sentiment scores
m = 0  # number of comments scored
# Process the comments line by line
for line in lines:
    try:
        s = SnowNLP(line)
        # Accumulate the sentiment score of each comment
        k = k + s.sentiments
        # Count the comments
        m = m + 1
    except:
        print("")
print(round(k / m, 3))  # average score
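A single average can mask the spread of opinions. As a complement, a histogram of the per-comment scores shows the shape of the distribution. A minimal sketch, reusing the same one-comment-per-line movie.txt and matplotlib, which earlier sections already use:

import matplotlib.pyplot as plt
from snownlp import SnowNLP

# Score every non-empty line and plot the distribution of sentiments
scores = []
with open('movie.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            scores.append(SnowNLP(line).sentiments)

plt.hist(scores, bins=20, alpha=0.5)
plt.xlabel('sentiment score')
plt.ylabel('number of comments')
plt.show()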
The output is 0.827, from which we can conclude that 《长津湖》 was very well received.
V. Appendix

1. Stopwords:
2. Color codes (a very complete and handy reference!):
Baidu Wenku link