1. 先使用抓包工具进行抓包分析,得到对应的数据接口;2. 直接用 requests 库请求该接口,获取响应数据;3. 将解析后的数据存入数据库。代码如下:
"""Scrape the 360kan comedy movie ranking API and store the rows in MySQL."""
from functools import reduce
import json

# NOTE: the third-party packages (pymysql, requests) are imported inside the
# functions that use them, so the pure parsing helpers below stay importable
# even when the database driver or the HTTP client is not installed.

# Pages fetched by main(); the original loop was range(1, 20), i.e. 1..19.
FIRST_PAGE = 1
LAST_PAGE = 19

# Seconds to wait for the API before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def connectdatabase():
    """Open and return a new pymysql connection to the local ``site`` database."""
    import pymysql

    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration or environment variables.
    conn = pymysql.connect(
        host='127.0.0.1',
        user='root',
        password='root',
        db='site',
        port=3308,
        charset='utf8'
    )
    return conn


def handle_request(pageno):
    """Request one page of the movie list and return the raw response text.

    Args:
        pageno: Page number substituted into the ``pageno`` query parameter.

    Returns:
        The response body as text. The URL asks for ``callback=__jp2``, so
        the body is expected to be JSONP rather than plain JSON.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            non-2xx HTTP status.
    """
    import requests

    url = 'https://api.web.360kan.com/v1/filter/list?catid=1&rank=rankhot&cat=%E5%96%9C%E5%89%A7&year=&area=&act=&size=35&pageno={}&callback=__jp2'.format(pageno)
    resp = requests.get(
        url=url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0'
        },
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    resp.raise_for_status()
    return resp.text


def merge(x, y):
    """Return ``x`` and ``y`` joined with a ``/`` separator."""
    return x + '/' + y


def _join_names(names):
    """Fold a list of names into ``a/b/c``; an empty or missing list gives ''."""
    # reduce() without an initial value raises TypeError on an empty sequence,
    # which would abort the whole run for a movie with no listed actors.
    return reduce(merge, names) if names else ''


def _strip_jsonp(text):
    """Return the JSON text found between the outermost parentheses of a JSONP body."""
    start = text.find('(')
    end = text.rfind(')')
    if start == -1 or end <= start:
        raise ValueError('response is not a JSONP payload')
    return text[start + 1:end]


def parse_data(movielist, sql, cusor):
    """Insert every movie in ``movielist`` through ``cusor`` and echo each row.

    One row is written per (area, category) combination of a movie, so a
    movie with two areas and three categories produces six rows.

    Args:
        movielist: List of movie dicts as found under ``data.movies`` in the
            API response.
        sql: Parameterized INSERT statement taking eleven values.
        cusor: DB-API cursor (any object with ``execute(sql, params)``).
    """
    for move in movielist:
        # Computed once per movie instead of once per inserted row.
        actors = _join_names(move['actor'])
        directors = _join_names(move['director'])
        description = str(move['description'])
        doubanscore = move['doubanscore']
        movename = move['title']
        pubdate = move['pubdate']
        movecomment = move['comment']
        # The cover is prefixed with the scheme to form a full URL.
        movecover = 'https:' + move['cover']

        # "site:link;" pairs; sites listed without a matching link are skipped
        # instead of raising KeyError.
        playlinks = move['playlinks']
        play_address = ''.join(
            site + ':' + playlinks[site] + ';'
            for site in move['playlink_sites']
            if site in playlinks
        )

        for area in move['area']:
            for movetype in move['moviecategory']:
                row = (
                    actors, area, description, directors, doubanscore,
                    movetype, movename, pubdate, play_address,
                    movecomment, movecover,
                )
                cusor.execute(sql, row)
                print(*row)


def main():
    """Fetch pages FIRST_PAGE..LAST_PAGE and store them, committing per page."""
    sql = "insert into movie values(NULL,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    conn = connectdatabase()
    try:
        cusor = conn.cursor()
        try:
            for pageno in range(FIRST_PAGE, LAST_PAGE + 1):
                payload = json.loads(_strip_jsonp(handle_request(pageno)))
                movielist = payload['data']['movies']
                parse_data(movielist, sql, cusor)
                conn.commit()
        finally:
            cusor.close()
    finally:
        # Always release the connection, even when a page fails midway.
        conn.close()


if __name__ == '__main__':
    main()
总结:遇到页面数据无法直接获取(无法回显)时,先用抓包工具抓包、分析数据接口;接口走不通再使用 selenium;最后才去考虑 JS 逆向的问题。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)