Python crawler: scraping Sohu Video movies and storing them in a MySQL database


Code:
import time
import traceback

import requests
from bs4 import BeautifulSoup
import pymysql


# Open the database connection and create a cursor
def get_conn():
    """
    :return: connection, cursor
    """
    # Create the connection
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="000429",
                           db="movierankings",
                           charset="utf8")
    # Create the cursor (result sets are returned as tuples by default)
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("Database connected, cursor created!")
    else:
        print("Database connection failed!")
    return conn, cursor


# Close the cursor and the database connection
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1


def get_souhu():
    # Top rated
    url = 'https://film.sohu.com/list_0_0_0_2_2_1_60.html?channeled=1200100000'
    # Newly added
    new_url = 'https://film.sohu.com/list_0_0_0_2_1_1_60.html?channeled=1200100000'
    # Hot this week
    week_url = 'https://film.sohu.com/list_0_0_0_2_0_1_60.html?channeled=1200100000'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }

    # Initialise the result lists
    tempList = []
    dataRes = []

    # "Top rated" pages 1-30
    for i in range(1, 31):
        url_1 = 'https://film.sohu.com/list_0_0_0_2_2_'
        url_2 = '_60.html?channeled=1200100000'
        url = url_1 + str(i) + url_2
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        page_text = response.text
        # Grab every <li> of the movie list
        soup = BeautifulSoup(page_text, 'lxml')
        li_list = soup.select('.movie-list>li')
        print(len(li_list))
        if len(li_list) == 0:
            print("Finished crawling the top-rated list!")
            if len(dataRes) != 0:
                return dataRes
        for li in li_list:
            li_soup = BeautifulSoup(str(li), 'lxml')
            # Movie name
            name = li_soup.find('div', class_="v_name_info").text
            tempList.append(name)
            # Score: keep only the numeric part of the text
            score = li_soup.find('span', class_='v_score').text
            score = score[-4:-1]
            tempList.append(score)
            # Detail-page path
            path = li_soup.find('a', target="_blank")['href']
            tempList.append(path)
            # Playback state
            state = "VIP"
            tempList.append(state)
            print(tempList)
            dataRes.append(tempList)
            tempList = []
        print("-------------------------------------------")

    # The original script also contained two commented-out loops for the
    # "newly added" (new_url) and "hot this week" (week_url) lists; they are
    # identical to the loop above except for the URL prefix and the message
    # printed when the last page is reached.

    # De-duplicate the result list (commented out in the original)
    # old_list = dataRes
    # new_list = []
    # for i in old_list:
    #     if i not in new_list:
    #         new_list.append(i)
    # print(new_list)
    return dataRes


# Insert the scraped rows into the database
def insert_souhu():
    cursor = None
    conn = None
    try:
        count = 0
        movie_list = get_souhu()
        print(f"{time.asctime()} start inserting Sohu movie data")
        conn, cursor = get_conn()
        sql = "insert into moviesohu (ID,name,score,path,state) values(%s,%s,%s,%s,%s)"
        for item in movie_list:
            print(item)
            count = count + 1
            # Catch the exception so a duplicate primary key does not abort the run
            try:
                cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
            except pymysql.err.IntegrityError:
                print("Duplicate row, skipped!")
        conn.commit()  # commit the transaction (update/delete/insert)
        print(f"{time.asctime()} finished inserting Sohu movie data")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return


if __name__ == '__main__':
    # get_iqy()
    # get_souhu()
    insert_souhu()
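The insert loop above skips movies whose name already exists by catching pymysql.err.IntegrityError row by row. An alternative is to let MySQL resolve the conflict itself with INSERT ... ON DUPLICATE KEY UPDATE and send all rows in a single executemany() call. The sketch below is illustrative, not part of the original script: the function name insert_souhu_upsert and the choice to refresh the score on conflict are assumptions, and it reuses the get_conn()/close_conn() helpers defined above.

# A minimal sketch of an upsert-style insert (assumes the same moviesohu
# table and the get_conn()/close_conn() helpers from the script above).
def insert_souhu_upsert(rows):
    conn, cursor = get_conn()
    try:
        # On a duplicate `name` (the primary key) refresh the score instead
        # of raising IntegrityError; VALUES() refers to the incoming value.
        sql = ("insert into moviesohu (ID, name, score, path, state) "
               "values (0, %s, %s, %s, %s) "
               "on duplicate key update score = values(score)")
        cursor.executemany(sql, [(r[0], r[1], r[2], r[3]) for r in rows])
        conn.commit()
    finally:
        close_conn(conn, cursor)

It could be driven the same way as the original, e.g. insert_souhu_upsert(get_souhu()).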
Run screenshot

Database screenshot
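To inspect what actually landed in the table without a GUI client, a quick query is enough. The snippet below is a minimal sketch that assumes the same local MySQL instance and credentials used by get_conn() above.

import pymysql

# Minimal check: print the first few rows of the moviesohu table
# (assumes the same host/user/password/db as get_conn()).
conn = pymysql.connect(host="127.0.0.1", user="root", password="000429",
                       db="movierankings", charset="utf8")
try:
    with conn.cursor() as cursor:
        cursor.execute("select ID, name, score, path, state from moviesohu limit 5")
        for row in cursor.fetchall():
            print(row)
finally:
    conn.close()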

 

Table creation statement
CREATE TABLE `moviesohu` (
  `ID` INT(11) NOT NULL AUTO_INCREMENT,
  `name` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  `score` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  `path` VARCHAR(100) COLLATE utf8_bin NOT NULL,
  `state` VARCHAR(10) COLLATE utf8_bin NOT NULL,
  PRIMARY KEY (`name`),
  KEY `ID` (`ID`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
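The script connects to a database named movierankings, so that database must exist before the CREATE TABLE statement above is run. The following one-time setup is a sketch only; it assumes the same local root credentials as get_conn(), and the IF NOT EXISTS guard is an added convenience rather than part of the original post.

import pymysql

# One-time setup: create the movierankings database before running the crawler
# (assumes the local root credentials used in get_conn()).
conn = pymysql.connect(host="127.0.0.1", user="root", password="000429", charset="utf8")
try:
    with conn.cursor() as cursor:
        cursor.execute("CREATE DATABASE IF NOT EXISTS movierankings "
                       "DEFAULT CHARACTER SET utf8 COLLATE utf8_bin")
        cursor.execute("USE movierankings")
        # the CREATE TABLE statement above can then be executed the same way
finally:
    conn.close()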
Summary

The script crawls the Sohu Film list pages with requests and BeautifulSoup, collects each movie's name, score, detail-page link and playback state, and writes the rows into the moviesohu table through pymysql, skipping entries whose name already exists.
