准备工作:
- 打开pycharm安装好所需的包
- 在 Terminal 运行窗口 输入命令 :
- pip install requests
- pip install lxml
- pip install selenium
- 也可以在 菜单栏:File - Settings - Python Interpreter - 下方 + 搜索对应的包名称进行安装
在IDE里面创建新项目把下载好的谷歌浏览器驱动(chromedriver.exe) 导入到文件夹中.
chromedriver驱动下载地址:https://chromedriver.chromium.org/downloads(版本需与本机 Chrome 浏览器版本一致)
执行以下代码:
import requests
import time
import os
import re
import random
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class GetVideo(object):
    """Download the walkthrough video embedded in a suumo.jp listing page.

    Pipeline:
      1. Fetch the listing page and locate the video player iframe.
      2. Extract the packed ``eval(...)`` JavaScript from the player page.
      3. Patch the packer so it dumps its unpacked output into a
         ``<textarea>``, save that as a local HTML page, and execute it
         in headless Chrome.
      4. Read the unpacked source, pull out the ``encmoviepath`` token,
         resolve it to the HLS segment list, and concatenate the .ts
         segments into one local .mp4 file.
    """

    def __init__(self, url):
        """url: a listing page such as
        https://suumo.jp/ikkodate/chiba/sc_chibashiwakaba/nc_91095290/
        """
        self.url = url
        # Path pieces of the URL; the last non-empty segment (e.g.
        # "nc_91095290") names the saved file.  pop() drops the empty
        # piece produced by the trailing "/".
        self.url_list = url.split("/")
        self.url_list.pop(-1)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
                          'Core/1.70.3872.400 QQBrowser/10.8.4455.400'
        }

    def get_eval_data(self):
        """Steps 2-4: fetch the listing, find the player iframe, extract
        the packed eval script and write it into ./test.html."""
        # Hard-coded session cookie from the original article.
        # NOTE(review): this will expire — confirm whether it is required.
        self.headers["cookie"] = "PHPSESSID=1n9u3kih72g3f4tlqvobv42gf4"
        page_text = requests.get(url=self.url, headers=self.headers).text
        tree = etree.HTML(page_text)
        self.headers["Host"] = "suumo.smbb.jp"
        video_url = tree.xpath("//div[@class='block-inqfree2 clr']/iframe/@src")
        self.headers["Referer"] = video_url[0]

        # Grab the packed "eval(function(p,a,c,k,e,d)...)" payload.
        player_html = requests.get(url=video_url[0], headers=self.headers).text
        eval_str = re.search("eval.*", player_html).group()

        # Patch the unpacker: right before it returns the unpacked source,
        # also dump it into <textarea id="textareaID"> for Selenium to read.
        parts = eval_str.split("}return p")
        eval_str = ("%sdocument.getElementById('textareaID').innerText=p;"
                    "}return p%s" % (parts[0], parts[1]))

        # BUGFIX: the original wrote a fixed string and never embedded
        # eval_str, so the generated page produced no output.  (The HTML
        # template was lost when the article was scraped; this minimal
        # page restores the intended behaviour — TODO confirm against a
        # working copy of the original script.)
        with open('test.html', 'w', encoding='utf-8') as fp:
            fp.write("<html><head><meta charset='utf-8'></head><body>"
                     "<textarea id='textareaID'></textarea>"
                     "<script>%s</script></body></html>" % eval_str)

    def get_encmoviepath(self):
        """Steps 5-6: execute test.html in headless Chrome, read the
        unpacked JS, extract the encmoviepath token and resolve it.

        Returns the response body of the encmoviepath endpoint: text
        containing the https URLs of the video .ts segments.
        """
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # NOTE(review): executable_path / chrome_options /
        # find_element_by_id are the Selenium 3 API; Selenium 4 removed
        # them — pin selenium<4 or migrate.
        bro = webdriver.Chrome(executable_path='./chromedriver.exe',
                               chrome_options=chrome_options)
        try:
            # BUGFIX: the original used '... + "\test.html"', where "\t" is
            # a TAB escape, and read from the script directory although the
            # file was written to the CWD.  Load the file actually written
            # by get_eval_data().
            bro.get("file:///" + os.path.abspath("test.html").replace("\\", "/"))
            unpacked = bro.find_element_by_id("textareaID").text
        finally:
            bro.quit()  # BUGFIX: the original leaked the browser process

        token = re.search("encmoviepath..*');hls?", unpacked).group().split("');hls")
        res = requests.get(url=f"https://suumo.smbb.jp/dvic/{token[0]}",
                           headers=self.headers)
        return res.text

    def save_video(self):
        """Step 7: download every .ts segment and append them, in order,
        into one local .mp4 file next to this script."""
        # BUGFIX: the original built the path with "...\{...}" inside an
        # f-string (a stray backslash / invalid escape); use os.path.join.
        file_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            f"{self.url_list[-1]}_{int(time.time())}.mp4",
        )
        res_text = self.get_encmoviepath()
        if "https" in res_text:
            ts_urls = re.findall("https.*", res_text)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
                              'Core/1.70.3872.400 QQBrowser/10.8.4455.400'
            }
            if len(ts_urls) >= 2:
                # Only create the output file once we know there are
                # segments to write (the original left an empty file here).
                with open(file_path, 'wb+') as fp:
                    for ts_url in ts_urls:
                        print("正在下载:", ts_url)
                        segment = requests.get(url=ts_url, headers=headers)
                        fp.write(segment.content)
                        # Random 0.1-0.5 s pause between segment requests.
                        time.sleep(random.randint(100, 500) / 1000)
                    print("视频下载成功!")
        else:
            print("没有可下载的视频连接")

    def run(self):
        """Entry point: run the full pipeline, reporting any failure."""
        try:
            self.get_eval_data()
            # BUGFIX: the original also called get_encmoviepath() here,
            # then again inside save_video(), fetching everything twice.
            self.save_video()
        except Exception as e:
            print("异常信息", e)


if __name__ == '__main__':
    # 1. Ask the user for a listing URL, e.g.
    # https://suumo.jp/ikkodate/chiba/sc_chibashiwakaba/nc_91095290/
    url = input("请输入网页链接:").strip()
    if len(url) < 5:
        print("链接错误!")
    else:
        video = GetVideo(url)
        video.run()
执行结果图片:
学习更多的爬虫示例可以关注:Aitlo学习站http://www.iaitlo.com
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)