

概述一、选题背景:2019年年初,《流浪地球》全国上映。在豆瓣上,首日开分站稳8分以上,延续了之前点映的高口碑。微博上跟着出现吴京客串31天与6000万的热搜。知乎上关于“如何评价刘慈欣小说改编的同名电影《流浪地球》”的热门话题,包括导演郭帆的最高赞回答。二、数据说 一、选题背景:




 1 # 爬取电影《流浪地球》的影评 2 import requests 3 from lxml import etree 4 from tqdm import tqdm 5 import time 6 import random 7 import pandas as pd 8 import re 9 10 name_List, content_List, date_List, score_List, city_List = [], [], [], [], []11 movIE_name = ""12 13 def get_city(url, i):14     time.sleep(round(random.uniform(2, 3), 2))15     headers = {16         'User-Agent': 'Mozilla/5.0 (windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}17     cookies = {'cookie': 'bID=Ge7txCUP3v4; ll="108303"; _vwo_uuID_v2=DB48689393ACB497681C7C540C832B546|f3d53bcb0314c9a34c861e9c724fcdec; ap_v=0,6.0; dbcl2="159607750:sijMjNWV7ek"; ck=kgmP; push_doumail_num=0; push_noty_num=0;; _pk_ses.100001.8cb4=*; __lnkrntdmcvrd=-1; __yadk_uID=KqejvPo3L0HIkc2Zx7UXOJF6Vt9PpoJU; _pk_ID.100001.8cb4=91514e1ada30bfa5.1549433417.1.1549433694.1549433417'}  # 2018.7.25修改,18     res = requests.get(url, cookies=cookies, headers=headers)19     if (res.status_code == 200):20         print("\n成功获取第{}个用户城市信息!".format(i))21     else:22         print("\n第{}个用户城市信息获取失败".format(i))23     pattern = re.compile('<div >.*?<a href=".*?">(.*?)</a>', re.S)24     item = re.findall(pattern, res.text)  # List类型25     return (item[0])  # 只有一个元素,所以直接返回


















 1 def get_content(ID, page): 2     headers = { 3         'User-Agent': 'Mozilla/5.0 (windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'} 4     cookies = {'cookie': '                  此处填入自己的cookies,否则不能正常爬取                  '} 5     url = "" + str(ID) + "/comments?start=" + str(page * 10) + "&limit=20&sort=new_score&status=P" 6     res = requests.get(url, headers=headers, cookies=cookies) 7  8     pattern = re.compile('<div ID="wrapper">.*?<div ID="content">.*?<h1>(.*?) 短评</h1>', re.S) 9     global movIE_name10     movIE_name = re.findall(pattern, res.text)[0]  # List类型11 12     res.enCoding = "utf-8"13     if (res.status_code == 200):14         print("\n第{}页短评爬取成功!".format(page + 1))15         print(url)16     else:17         print("\n第{}页爬取失败!".format(page + 1))18 19     with open('HTML.HTML', 'w', enCoding='utf-8') as f:20         f.write(res.text)21         f.close()22     x = etree.HTML(res.text)



 1     for i in range(1, 21):   # 每页20个评论用户 2         name = x.xpath('//*[@ID="comments"]/div[{}]/div[2]/h3/span[2]/a/text()'.format(i)) 3         # 下面是个大BUG,如果有的人没有评分,但是评论了,那么score解析出来是日期,而日期所在位置spen[3]为空 4         score = x.xpath('//*[@ID="comments"]/div[{}]/div[2]/h3/span[2]/span[2]/@Title'.format(i)) 5         date = x.xpath('//*[@ID="comments"]/div[{}]/div[2]/h3/span[2]/span[3]/@Title'.format(i)) 6         m = '\d{4}-\d{2}-\d{2}' 7         try: 8             match = re.compile(m).match(score[0]) 9         except IndexError:10             break11         if match is not None:12             date = score13             score = ["null"]14         else:15             pass16         content = x.xpath('//*[@ID="comments"]/div[{}]/div[2]/p/span/text()'.format(i))17         ID = x.xpath('//*[@ID="comments"]/div[{}]/div[2]/h3/span[2]/a/@href'.format(i))18         try:19             city = get_city(ID[0], i)  # 调用评论用户的ID城市信息获取20         except IndexError:21             city = " "22         name_List.append(str(name[0]))23         score_List.append(str(score[0]).strip('[]\''))  # BUG 有些人评论了文字,但是没有给出评分24         date_List.append(str(date[0]).strip('[\'').split(' ')[0])25         content_List.append(str(content[0]).strip())26         city_List.append(city)


# Excerpt from get_content(): extract the film title from the comments page.
# `res` is the response object fetched there. The id attributes are lower-case
# in the page markup, so the pattern must use lower-case `id`.
pattern = re.compile('<div id="wrapper">.*?<div id="content">.*?<h1>(.*?) 短评</h1>', re.S)
global movIE_name
titles = re.findall(pattern, res.text)  # list of captured titles
if titles:
    movIE_name = titles[0]


def main(ID, pages):
    """Crawl ``pages`` pages of short reviews for one film and save them as CSV.

    Args:
        ID: Douban subject id of the film (first argument of get_content).
        pages: Number of review pages to fetch. The original author notes
            that Douban only exposes about 500 reviews.

    The CSV is named after the scraped film title; if no title was parsed
    the subject id is used instead, so the file is never called just ".csv".
    """
    for i in tqdm(range(pages)):
        get_content(ID, i)
        # Random 3-5 s pause between pages.
        time.sleep(round(random.uniform(3, 5), 2))
    infos = {'name': name_List, 'city': city_List, 'content': content_List, 'score': score_List, 'date': date_List}
    data = pd.DataFrame(infos, columns=['name', 'city', 'content', 'score', 'date'])
    data.to_csv((movIE_name or str(ID)) + ".csv", encoding='utf-8')



# Data analysis and visualisation
import os
import re

import jieba
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
from pyecharts import Bar, Geo, Line, Page, Pie, ThemeRiver
from snownlp import SnowNLP
from wordcloud import STOPWORDS, ImageColorGenerator, WordCloud

# List of cities supported by pyecharts, read once as a single string.
with open('pyecharts_citys_supported.txt', 'r', encoding='utf-8') as _fh:
    fth = _fh.read()


# Runs of characters in the CJK range U+4E00..U+9FA5; compiled once.
_CJK_RUN = re.compile('[\u4e00-\u9fa5]+')


# Filter a string so that only Chinese characters remain.
def translate(str):
    """Return ``str`` with every character outside U+4E00..U+9FA5 removed.

    The surviving runs are concatenated with no separator. This matches the
    original listing, whose final substitution stripped the ',' it had just
    used to join the runs.

    The parameter name shadows the builtin; it is kept so existing callers
    are unaffected.
    """
    return "".join(_CJK_RUN.findall(str))


 1 # 下面是按照列属性读取的 2 def count_sentiment(csv_file): 3     path = os.path.abspath(os.curdir) 4     csv_file = path+ "\" + csv_file + ".csv" 5     csv_file = csv_file.replace('\', '\\') 6     d = pd.read_csv(csv_file, engine='python', enCoding='utf-8') 7     motion_List = [] 8     for i in d['content']: 9         try:10             s = round(SNowNLP(i).sentiments, 2)11             motion_List.append(s)12         except TypeError:13             continue14     result = {}15     for i in set(motion_List):16         result[i] = motion_List.count(i)17     return result


  SnowNLP主要可以进行中文分词(算法是Character-Based Generative Model)、词性标注(原理是TnT、3-gram 隐马)、情感分析(官网没有介绍原理,但指明购物类评论的准确率较高,其实是因为它的语料库主要是购物方面的,可以自己构建相关领域语料库,替换原来的,准确率也相当不错的)、文本分类(原理是朴素贝叶斯)、转换拼音、繁体转简体、提取文本关键词(原理是TextRank)、提取摘要(原理是TextRank)、分割句子、文本相似(原理是BM25)【摘自CSDN】。在此之前,可以先看一下官网,里面有最基础的一些命令的介绍。官网链接:


 1 def draw_sentiment_pic(csv_file): 2     attr, val = [], [] 3     info = count_sentiment(csv_file) 4     info = sorted(info.items(), key=lambda x: x[0], reverse=False)  # dict的排序方法 5     for each in info[:-1]: 6         attr.append(each[0]) 7         val.append(each[1]) 8     line = line(csv_file+":影评情感分析") 9     line.add("", attr, val, is_smooth=True, is_more_utils=True)10     line.render(csv_file+"_情感分析曲线图.HTML")



 1 def draw_citys_pic(csv_file): 2     page = Page(csv_file+":评论城市分析") 3     info = count_city(csv_file) 4     geo = Geo("","小本聪原创",Title_pos="center", wIDth=1200,height=600, background_color='#404a59', Title_color="#fff") 5     while True:   # 二次筛选,和pyecharts支持的城市库进行匹配,如果报错则删除该城市对应的统计 6         try: 7             attr, val = geo.cast(info) 8             geo.add("", attr, val, visual_range=[0, 300], visual_text_color="#fff", is_geo_effect_show=False, 9                     is_pIEcewise=True, visual_split_number=6, symbol_size=15, is_visualmap=True)10         except ValueError as e:11             e = str(e)12             e = e.split("No coordinate is specifIEd for ")[1]  # 获取不支持的城市名称13             info.pop(e)14         else:15             break16     info = sorted(info.items(), key=lambda x: x[1], reverse=False)  # List排序17     print(info)18     info = dict(info)   # List转dict19     print(info)20     attr, val = [], []21     for key in info:22         attr.append(key)23         val.append(info[key])24 25 26     geo1 = Geo("", "评论城市分布", Title_pos="center", wIDth=1200, height=600,27               background_color='#404a59', Title_color="#fff")28     geo1.add("", attr, val, visual_range=[0, 300], visual_text_color="#fff", is_geo_effect_show=False,29             is_pIEcewise=True, visual_split_number=10, symbol_size=15, is_visualmap=True, is_more_utils=True)30     # geo1.render(csv_file + "_城市dotmap.HTML")31     page.add_chart(geo1)32     geo2 = Geo("", "评论来源热力图",Title_pos="center", wIDth=1200,height=600, background_color='#404a59', Title_color="#fff",)33     geo2.add("", attr, val, type="heatmap", is_visualmap=True, visual_range=[0, 50],visual_text_color='#fff', is_more_utils=True)34     # geo2.render(csv_file+"_城市heatmap.HTML")  # 取CSV文件名的前8位数35     page.add_chart(geo2)36     bar = bar("", "评论来源排行", Title_pos="center", wIDth=1200, height=600 )37     bar.add("", attr, val, is_visualmap=True, visual_range=[0, 100], 
visual_text_color='#fff',mark_point=["average"],mark_line=["average"],38             is_more_utils=True, is_label_show=True, is_datazoom_show=True, xaxis_rotate=45)39     bar.render(csv_file+"_城市评论bar.HTML")  # 取CSV文件名的前8位数40     page.add_chart(bar)41     pIE = PIE("", "评论来源饼图", Title_pos="right", wIDth=1200, height=600)42     pIE.add("", attr, val, radius=[20, 50], label_text_color=None, is_label_show=True, legend_orIEnt='vertical', is_more_utils=True, legend_pos='left')43     pIE.render(csv_file + "_城市评论PIE.HTML")  # 取CSV文件名的前8位数44     page.add_chart(pIE)45     page.render(csv_file + "_城市评论分析汇总.HTML")






统计相同日期、相同评分的个数







 1 def score_draw(csv_file): 2     page = Page(csv_file+":评论等级分析") 3     score, date, val, score_List = [], [], [], [] 4     result = {} 5     path = os.path.abspath(os.curdir) 6     csv_file = path + "\" + csv_file + ".csv" 7     csv_file = csv_file.replace('\', '\\') 8     d = pd.read_csv(csv_file, engine='python', enCoding='utf-8')[['score', 'date']].dropna()  # 读取CSV转为dataframe格式,并丢弃评论为空的记录 9     for indexs in d.index:  # 一种遍历df行的方法(下面还有第二种,iterrows)10         score_List.append(tuple(d.loc[indexs].values[:])) # 目前只找到转换为tuple然后统计相同元素个数的方法11     print("有效评分总数量为:",len(score_List), " 条")12     for i in set(List(score_List)):13         result[i] = score_List.count(i)  # dict类型14     info = []15     for key in result:16         score= key[0]17         date = key[1]18         val = result[key]19         info.append([score, date, val])20     info_new = DataFrame(info)  # 将字典转换成为数据框21     info_new.columns = ['score', 'date', 'Votes']22     info_new.sort_values('date', inplace=True)    # 按日期升序排列df,便于找最早date和最晚data,方便后面插值23     print("first df", info_new)24     # 以下代码用于插入空缺的数据,每个日期的评分类型应该有5中,依次遍历判断是否存在,若不存在则往新的df中插入新数值25     mark = 026     creat_df = pd.DataFrame(columns = ['score', 'date', 'Votes']) # 创建空的dataframe27     for i in List(info_new['date']):28         location = info_new[("力荐")].index.toList()29         if location == []:30             creat_df.loc[mark] = ["力荐", i, 0]31             mark += 132         location = info_new[("推荐")].index.toList()33         if location == []:34             creat_df.loc[mark] = ["推荐", i, 0]35             mark += 136         location = info_new[("还行")].index.toList()37         if location == []:38             creat_df.loc[mark] = ["还行", i, 0]39             mark += 140         location = info_new[("较差")].index.toList()41         if location == []:42             creat_df.loc[mark] = ["较差", i, 0]43             mark += 144         location = info_new[("很差")].index.toList()45         if location == []:46             creat_df.loc[mark] 
= ["很差", i, 0]47             mark += 148     info_new = info_new.append(creat_df.drop_duplicates(), ignore_index=True)49     score_List = []50     info_new.sort_values('date', inplace=True)    # 按日期升序排列df,便于找最早date和最晚data,方便后面插值51     print(info_new)52     for index, row in info_new.iterrows():   # 第二种遍历df的方法53         score_List.append([row['date'], row['Votes'], row['score']])54     tr = themeRiver()55     tr.add(['力荐', '推荐', '还行', '较差', '很差'], score_List, is_label_show=True, is_more_utils=True)56     page.add_chart(tr)57 58     attr, v1, v2, v3, v4, v5 = [], [], [], [], [], []59     attr = List(sorted(set(info_new['date'])))60     bar = bar()61     for i in attr:62         v1.append(int(info_new[(info_new['date']==i)&(info_new['score']=="力荐")]['Votes']))63         v2.append(int(info_new[(info_new['date']==i)&(info_new['score']=="推荐")]['Votes']))64         v3.append(int(info_new[(info_new['date']==i)&(info_new['score']=="还行")]['Votes']))65         v4.append(int(info_new[(info_new['date']==i)&(info_new['score']=="较差")]['Votes']))66         v5.append(int(info_new[(info_new['date']==i)&(info_new['score']=="很差")]['Votes']))67     bar.add("力荐", attr, v1, is_stack=True)68     bar.add("推荐", attr, v2, is_stack=True)69     bar.add("还行", attr, v3, is_stack=True)70     bar.add("较差", attr, v4, is_stack=True)71     bar.add("很差", attr, v5, is_stack=True, is_convert=True, mark_line=["average"], is_more_utils=True)72     page.add_chart(bar)73 74     line = line()75     line.add("力荐", attr, v1, is_stack=True)76     line.add("推荐", attr, v2, is_stack=True)77     line.add("还行", attr, v3, is_stack=True)78     line.add("较差", attr, v4, is_stack=True)79     line.add("很差", attr, v5, is_stack=True, is_convert=False, mark_line=["average"], is_more_utils=True)80     page.add_chart(line)





def main(csv_file, stopwords_path, pic_path):
    """Run every analysis step on one scraped CSV.

    Args:
        csv_file: CSV base name without suffix.
        stopwords_path: Stop-word file passed through to word_cloud.
        pic_path: Image file passed through to word_cloud.
    """
    draw_sentiment_pic(csv_file)
    draw_citys_pic(csv_file)
    score_draw(csv_file)
    word_cloud(csv_file, stopwords_path, pic_path)


if __name__ == '__main__':
    main("流浪地球", "stopwords.txt", "胡歌.jpg")




