[Python Learning] Web Scraper Source Code


Overview

1. Put together by standing on the shoulders of giants, combining resources found online.

2. It uses a few common packages, such as requests and re.

3. Note that creating nested directories requires os.makedirs, while a single-level directory can be created with os.mkdir (see the short sketch below).
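
As a quick illustration of points 2 and 3, here is a minimal, self-contained sketch. The URL and folder names are placeholders chosen only for illustration, not taken from the scraper below; it shows requests.get, re.findall, and the difference between os.mkdir and os.makedirs:

import os
import re
import requests

# os.mkdir creates exactly one directory level and fails if the parent is missing;
# os.makedirs creates every missing level along the path.
if not os.path.exists("photos"):
    os.mkdir("photos")                               # single level only
os.makedirs("photos/2022/june", exist_ok=True)       # creates photos, 2022 and june as needed

# Fetch a page and pull the <h1> title out of it with a regular expression.
# "https://example.com" is only a placeholder URL for this sketch.
resp = requests.get("https://example.com", headers={"user-agent": "Mozilla/5.0"})
resp.encoding = "utf-8"
titles = re.findall(r"<h1>(.*?)</h1>", resp.text, re.S)
print(resp.status_code, titles)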

# Import the required packages
import requests   # fetch web pages
import re         # parse pages with regular expressions
import time       # pause between downloads
import os         # create folders on disk

# Request headers that identify the client to the server
headers = {
    "user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68'
}

############################################  Program start  ############################################
# Number of index pages to poll
polling_page_num = int(input("Enter the number of index pages to poll: "))

############################################  Functions  ############################################

# Download the HTML of a page
def download_page(url, html_encode='utf-8', *args, **kwargs):
    """
    Download the HTML of a page.
    :param url: link of the page to download
    :param html_encode: decoding of the page, "utf-8" by default
    :return: a tuple of (page HTML, HTTP status code)
    """
    r = requests.get(url, headers=headers)
    r.encoding = html_encode
    response_status = r.status_code
    return r.text, response_status


# Parse the main index page and return the processed list of album links
def anasy_main_page(html, *args, **kwargs):
    ex = '<a href="(.*?)" title=".*?"><img alt=".*?" src=".*?"><i>.*?</i></a>'
    test_src_list = re.findall(ex, html, re.S)
    new_src_list = test_src_list[1:31]
    li_piclink = []
    for pic_link in new_src_list:
        # '链接' is a placeholder for the site's base URL, redacted in the source
        new_pic_link = '链接' + pic_link
        li_piclink.append(new_pic_link)
    return li_piclink


# Parse a secondary page and return the image download address
def anasy_secondary_page(secondary_html):
    """
    :param secondary_html: HTML of the secondary page
    :return: a tuple of
            dir_name -- folder name
            pic_link -- image link
    """
    ex_link = '<img  alt=".*?"  src="(.*?)"  />'
    ex_name = '<h1>(.*?)</h1>'  # adjust this pattern to the target site
    pic_link = re.findall(ex_link, secondary_html, re.S)[0]
    dir_name = re.findall(ex_name, secondary_html, re.S)[0]
    return dir_name, pic_link


# Create the folder for an album
def create_folder(dir_name):
    dir_name = "".join(dir_name.split())   # strip whitespace from the name
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)                 # single level, so os.mkdir is enough
    return dir_name


# Download one image
def down_pic(dir_name, pic_link):
    img_data = requests.get(url=pic_link, headers=headers).content
    img_name = pic_link.split('/')[-1]
    img_path = dir_name + '/' + img_name
    with open(img_path, 'wb') as f:
        f.write(img_data)
    return


# Generate the list of main index page URLs
def create_main_url(url_num):
    url_ys = '子链接'   # placeholder for the site's index base URL, redacted in the source
    main_url_list = []
    if url_num > 1:
        start_num = 2
    else:
        start_num = 1

    for url_n in range(start_num, url_num + 1):
        if url_n != 1:
            url = url_ys + 'index_%d.html'
            new_url = format(url % url_n)
        else:
            new_url = url_ys
        main_url_list.append(new_url)
    return main_url_list


# Generate the list of sub-page URLs for one album
def create_sec_url(url, url_num, *args, **kwargs):
    sec_url_list = []
    for url_n in range(1, url_num + 1):
        if url_n != 1:
            begin = url.find("h")
            end = url.rfind(".")
            find_url = url[begin:end]
            new_url = find_url + '_' + str(url_n) + '.html'
        else:
            new_url = url
        sec_url_list.append(new_url)
    return sec_url_list


# Append an entry to the download log
def create_log(log_content):
    """
    Download log writer.
    :param log_content: text to append to the log
    :return: none
    """
    with open("log.txt", "a") as file:
        file.write(log_content)
    return


# Record the last finished index page
def page_record(page_num=0, *args, **kwargs):
    with open("page_record.txt", "w+") as file:
        file.write(str(page_num))
    return


# Read the recorded page number
def page_read():
    with open("page_record.txt", "r") as file:
        r_page_num = file.readline()
    return r_page_num


############################################  Crawler main loop  ############################################

n_yema = int(page_read())   # index page counter, resumed from page_record.txt
if polling_page_num > 361:
    print("The number you entered is outside the polling range, please try again!")
elif polling_page_num > n_yema:
    end_page_num = polling_page_num
    print("Main program starting")

    # Generate the main index page URLs, skipping pages that were already downloaded
    main_url_list_ys = create_main_url(end_page_num)
    main_url_list = main_url_list_ys[int(n_yema) - 1:int(end_page_num) + 1]

    for url in main_url_list:
        n_yema = n_yema + 1
        sec_url_li = anasy_main_page(download_page(url)[0])   # parse the index page, get the album list
        print(len(sec_url_li), sec_url_li)
        log_main_start = "*" * 15 + "Page " + str(n_yema) + ", starting download --> " + url + "*" * 15
        print(log_main_start)   # announce that this index page has started downloading
        n_tao = 0
        for url_sec in sec_url_li[0:31]:
            n_tao = n_tao + 1
            # Parse the album page, get the album name
            dir_name = anasy_secondary_page(download_page(url_sec, html_encode="utf-8")[0])[0]
            print("*" * 15 + "Page " + str(n_yema) + ", album " + str(n_tao) + " -- " + dir_name + " -- download started" + "*" * 15)
            dir_name_sj = create_folder(dir_name)
            sec_url_list = create_sec_url(url_sec, 60)
            m = 0
            for pic_link in sec_url_list:
                m = m + 1
                page_text, response_status_pic = download_page(pic_link)
                if response_status_pic == 200:
                    down_pic_link = anasy_secondary_page(page_text)[1]   # parse the sub-page, get the image link
                    down_pic(dir_name_sj, down_pic_link)
                    print("Image " + str(m) + " downloaded successfully", down_pic_link)
                    time.sleep(1)
                else:
                    continue

            print("Page " + str(n_yema) + ", album " + str(n_tao) + " -- " + dir_name + " -- all images downloaded" + "\n")
            log_text = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + " --- " + "Page " + str(
                n_yema) + ", album " + str(n_tao) + " -- " + dir_name + " -- finished" + "\n"
            create_log(log_content=log_text)
        log_main_end = "*" * 10 + "Page " + str(n_yema) + ", download finished --> " + url + "*" * 10 + "\n\n"
        print(log_main_end)
        # Record the page number of the index page that has just been downloaded
        page_record(str(n_yema))

 

Summary

The above is the full content of [Python Learning] Web Scraper Source Code as collected and organized by 内存溢出; hopefully it helps you solve the programming problems you run into with this scraper.



Original source: http://outofmemory.cn/langs/1186217.html
