import time
import traceback

import requests
import re
from bs4 import BeautifulSoup
from lxml import etree
import pymysql


# Connect to the database and create a cursor
def get_conn():
    """
    :return: connection, cursor
    """
    # Create the connection
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="000429",
                           db="movierankings",
                           charset="utf8")
    # Create the cursor (result sets come back as tuples by default)
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("Database connected and cursor created!")
    else:
        print("Database connection failed!")
    return conn, cursor


# Close the database connection and the cursor
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1


def get_souhu():
    # Top rated (最受好评)
    url = 'https://film.sohu.com/list_0_0_0_2_2_1_60.html?channeled=1200100000'
    # Newly added (最新上架)
    new_url = 'https://film.sohu.com/list_0_0_0_2_1_1_60.html?channeled=1200100000'
    # Trending this week (本周热播)
    week_url = 'https://film.sohu.com/list_0_0_0_2_0_1_60.html?channeled=1200100000'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }

    # Initialize the lists
    tempList = []
    dataRes = []

    # Top rated: walk the paginated list until an empty page comes back
    for i in range(1, 31):
        url_1 = 'https://film.sohu.com/list_0_0_0_2_2_'
        auto = str(i)
        url_2 = '_60.html?channeled=1200100000'
        url = url_1 + auto + url_2
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        page_text = response.text
        # etree_ = etree.HTML(page_text)
        # Grab every <li> of the movie list
        soup = BeautifulSoup(page_text, 'lxml')
        # Select by tag hierarchy
        li_list = soup.select('.movie-list>li')
        print(len(li_list))
        if len(li_list) == 0:
            print("Finished crawling the top-rated list!")
            if len(dataRes) != 0:
                return dataRes
        for li in li_list:
            li_text = str(li)
            # print(li_text)
            li_soup = BeautifulSoup(li_text, 'lxml')
            name = li_soup.find('div', class_="v_name_info").text
            # Append the title
            tempList.append(name)
            # print(name)
            # Append the score
            score = li_soup.find('span', class_='v_score').text
            # Trim the score text
            score = score[-4:-1]
            tempList.append(score)
            # print(score)
            # Append the detail-page path
            path = li_soup.find('a', target="_blank")['href']
            tempList.append(path)
            # print(path)
            # Append the playback state
            state = "VIP"
            tempList.append(state)
            print(tempList)
            dataRes.append(tempList)
            tempList = []
        print("-------------------------------------------")
    # print(len(dataRes))

    # # Newly added (最新上架)
    #
    # tempList = []
    # for i in range(1, 31):
    #     url_1 = 'https://film.sohu.com/list_0_0_0_2_1_'
    #     auto = str(i)
    #     url_2 = '_60.html?channeled=1200100000'
    #     url = url_1 + auto + url_2
    #     response = requests.get(url, headers=headers)
    #     response.encoding = 'utf-8'
    #     page_text = response.text
    #     # etree_ = etree.HTML(page_text)
    #     # Grab every <li> of the movie list
    #     soup = BeautifulSoup(page_text, 'lxml')
    #     # Select by tag hierarchy
    #     li_list = soup.select('.movie-list>li')
    #     print(len(li_list))
    #     if len(li_list) == 0:
    #         print("Finished crawling the newly-added list!")
    #         if len(dataRes) != 0:
    #             return dataRes
    #     for li in li_list:
    #         li_text = str(li)
    #         # print(li_text)
    #         li_soup = BeautifulSoup(li_text, 'lxml')
    #         name = li_soup.find('div', class_="v_name_info").text
    #         # Append the title
    #         tempList.append(name)
    #         # Append the score
    #         score = li_soup.find('span', class_='v_score').text
    #         score = score[-4:-1]
    #         tempList.append(score)
    #         # Append the detail-page path
    #         path = li_soup.find('a', target="_blank")['href']
    #         tempList.append(path)
    #         # Append the playback state
    #         state = "VIP"
    #         tempList.append(state)
    #         print(tempList)
    #         dataRes.append(tempList)
    #         tempList = []
    #     print("-------------------------------------------")
    # # print(len(dataRes))

    # # Trending this week (本周热播)
    # tempList = []
    # for i in range(1, 31):
    #     url_1 = 'https://film.sohu.com/list_0_0_0_2_0_'
    #     auto = str(i)
    #     url_2 = '_60.html?channeled=1200100000'
    #     url = url_1 + auto + url_2
    #     response = requests.get(url, headers=headers)
    #     response.encoding = 'utf-8'
    #     page_text = response.text
    #     # etree_ = etree.HTML(page_text)
    #     # Grab every <li> of the movie list
    #     soup = BeautifulSoup(page_text, 'lxml')
    #     # Select by tag hierarchy
    #     li_list = soup.select('.movie-list>li')
    #     print(len(li_list))
    #     if len(li_list) == 0:
    #         print("Finished crawling the trending-this-week list!")
    #         if len(dataRes) != 0:
    #             return dataRes
    #     for li in li_list:
    #         li_text = str(li)
    #         # print(li_text)
    #         li_soup = BeautifulSoup(li_text, 'lxml')
    #         name = li_soup.find('div', class_="v_name_info").text
    #         # Append the title
    #         tempList.append(name)
    #         # Append the score
    #         score = li_soup.find('span', class_='v_score').text
    #         score = score[-4:-1]
    #         tempList.append(score)
    #         # Append the detail-page path
    #         path = li_soup.find('a', target="_blank")['href']
    #         tempList.append(path)
    #         # Append the playback state
    #         state = "VIP"
    #         tempList.append(state)
    #         print(tempList)
    #         dataRes.append(tempList)
    #         tempList = []
    #     print("-------------------------------------------")
    # print(len(dataRes))

    # Deduplicate the result list
    # old_list = dataRes
    # new_list = []
    # for i in old_list:
    #     if i not in new_list:
    #         new_list.append(i)
    # print(new_list)  # [2, 3, 4, 5, 1]
    return dataRes


# Insert into the database
def insert_souhu():
    cursor = None
    conn = None
    try:
        count = 0
        movie_list = get_souhu()
        print(f"{time.asctime()} Start inserting Sohu movie data")
        conn, cursor = get_conn()
        sql = "insert into moviesohu (id,name,score,path,state) values(%s,%s,%s,%s,%s)"
        for item in movie_list:
            print(item)
            count = count + 1
            # Catch the exception so a duplicate primary key does not abort the run
            try:
                cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
            except pymysql.err.IntegrityError:
                print("Duplicate! Skipping!")
        conn.commit()  # Commit the transaction (needed for update/delete/insert)
        print(f"{time.asctime()} Finished inserting Sohu movie data")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return


if __name__ == '__main__':
    # get_iqy()
    # get_souhu()
    insert_souhu()

Run screenshot

Database screenshot
Table creation statement
CREATE TABLE `moviesohu` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `name` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  `score` VARCHAR(45) COLLATE utf8_bin NOT NULL,
  `path` VARCHAR(100) COLLATE utf8_bin NOT NULL,
  `state` VARCHAR(10) COLLATE utf8_bin NOT NULL,
  PRIMARY KEY (`name`),
  KEY `id` (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
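Because name is the primary key, running the crawler a second time raises a duplicate-key error for every movie that is already stored, which insert_souhu() catches one row at a time. The snippet below is a small sketch, not part of the original post: it notes an INSERT IGNORE statement that lets MySQL skip duplicates on its own, and a quick check query to confirm how many rows actually landed. It assumes the moviesohu table above and reuses get_conn() / close_conn() from the script.

def check_moviesohu():
    # Verification sketch: count the stored rows and print a small sample.
    conn, cursor = get_conn()
    try:
        # Alternative to catching pymysql.err.IntegrityError: let MySQL skip
        # rows whose primary key (name) already exists.
        # sql = "insert ignore into moviesohu (id,name,score,path,state) values(%s,%s,%s,%s,%s)"

        cursor.execute("select count(*) from moviesohu")
        print("rows:", cursor.fetchone()[0])

        cursor.execute("select name, score, state from moviesohu limit 5")
        for row in cursor.fetchall():
            print(row)
    finally:
        close_conn(conn, cursor)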
Summary

This post walked through a Python crawler that scrapes movie listings from Sohu Video and stores them in a MySQL database: get_souhu() collects each movie's title, score, detail-page path and playback state, and insert_souhu() writes the rows into the moviesohu table.