1. Put together by standing on the shoulders of giants, drawing on resources found online.
2. It uses a few common packages such as requests, re, os, and time.
3. Note that creating nested directories requires os.makedirs, while os.mkdir only creates a single directory (see the short example right after this list).
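A quick illustration of the difference (the directory names below are only examples, not paths used by the crawler):

import os

# os.makedirs creates every missing directory along the path
os.makedirs("downloads/2021/april", exist_ok=True)

# os.mkdir creates exactly one directory and raises FileNotFoundError
# if the parent ("downloads" here) does not already exist
os.mkdir("downloads/single_album")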
The full crawler source:

# Imports
import requests  # fetch web pages
import re        # parse pages with regular expressions
import time      # pause between requests
import os        # create folders, check files

# Request headers: identify the client to the server
headers = {
    "user-agent": 'Mozilla/5.0 (windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68'
}

############################################ Program start ############################################
# Number of index pages to poll
polling_page_num = int(input("Enter the number of index pages to poll: "))

############################################ Functions ############################################

# Download the HTML of a page
def download_page(url, html_encode='utf-8', *args, **kwargs):
    """
    Download the HTML of a page.
    :param url: link of the page to download
    :param html_encode: decoding of the page, defaults to "utf-8"
    :param args:
    :param kwargs:
    :return: the page's HTML text and the HTTP status code
    """
    r = requests.get(url, headers=headers)
    r.encoding = html_encode
    response_status = r.status_code
    return r.text, response_status


# Parse the index page and return the cleaned-up list of album links
def anasy_main_page(html, *args, **kwargs):
    ex = '<a href="(.*?)" title=".*?"><img alt=".*?" src=".*?"><i>.*?</i></a>'
    # ex = '<a href="(.*?)" title=.*?><img alt=.*? src=.*?><i>.*?</i></a>'
    test_src_list = re.findall(ex, html, re.S)
    new_src_list = test_src_list[1:31]
    li_piclink = []
    for pic_link in new_src_list:
        # '链接' is the author's placeholder for the site's base URL (redacted in the original post)
        new_pic_link = '链接' + pic_link
        li_piclink.append(new_pic_link)
    return li_piclink


# Parse an album page and return the folder name and image download address
def anasy_Secondary_page(secondary_html):
    """
    :param secondary_html: HTML of the album page
    :return: a tuple of
        dir_name -- folder name
        pic_link -- image link
    """
    ex_link = '<img alt=".*?" src="(.*?)" />'
    ex_name = '<h1>(.*?)</h1>'  # adjust this pattern if the site layout changes
    pic_link = re.findall(ex_link, secondary_html, re.S)[0]
    dir_name = re.findall(ex_name, secondary_html, re.S)[0]
    return dir_name, pic_link


# Create the folder for one album
def create_folder(dir_name):
    dir_name = "".join(dir_name.split())  # strip whitespace from the folder name
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    return dir_name


# Download one image
def down_pic(dir_name, pic_link):
    """
    :param dir_name: target folder
    :param pic_link: direct link to the image
    """
    img_data = requests.get(url=pic_link, headers=headers).content
    img_name = pic_link.split('/')[-1]
    img_path = dir_name + '/' + img_name
    with open(img_path, 'wb') as f:
        f.write(img_data)
    return


# Generate the list of index-page URLs
def create_main_url(url_num):
    # '子链接' is the author's placeholder for the index base URL (redacted in the original post)
    url_ys = '子链接'
    mian_url_list = []
    if url_num > 1:
        start_num = 2
    else:
        start_num = 1

    for url_n in range(start_num, url_num + 1):
        if url_n != 1:
            new_url = url_ys + 'index_%d.html' % url_n
        else:
            new_url = url_ys
        mian_url_list.append(new_url)
    return mian_url_list


# Generate the list of page URLs inside one album
def create_sec_url(url, url_num, *args, **kwargs):
    """
    :param url: first page of the album
    :param url_num: number of pages to generate
    :return: list of page URLs
    """
    sec_url_list = []
    for url_n in range(1, url_num + 1):
        if url_n != 1:
            # page n is the first page's URL with "_n" inserted before ".html"
            begin = url.find("h")
            end = url.rfind(".")
            find_url = url[begin:end]
            new_url = find_url + '_' + str(url_n) + '.html'
        else:
            new_url = url
        sec_url_list.append(new_url)
    return sec_url_list


# Append one line to the download log
def create_log(log_content):
    """
    :param log_content: text to append to the log
    :return: None
    """
    with open("log.txt", "a") as file:
        file.write(log_content)
    return


# Record the last finished index page
def page_record(page_num=0, *args, **kwargs):
    with open("page_record.txt", "w+") as file:
        file.write(str(page_num))
    return


# Read back the recorded page number
def page_read():
    if not os.path.exists("page_record.txt"):
        return "0"  # first run: start from page 0
    with open("page_record.txt", "r") as file:
        r_page_num = file.readline()
    return r_page_num


############################################ Crawler main loop ############################################

n_yema = int(page_read())
# print(n_yema)
if polling_page_num > 361:
    print("Input exceeds the polling range, please try again!")
elif polling_page_num > n_yema:
    end_page_num = polling_page_num
    print("Main program about to start")

    # Generate the index-page URLs
    mian_url_list_ys = create_main_url(end_page_num)
    mian_url_list = mian_url_list_ys[int(n_yema) - 1:int(end_page_num) + 1]

    for url in mian_url_list:
        n_yema = n_yema + 1
        sec_url_li = anasy_main_page(download_page(url)[0])  # parse the index page, get the album list
        print(len(sec_url_li), sec_url_li)
        log_mian_start = "*" * 15 + "Page " + str(n_yema) + ", download started --> " + url + "*" * 15
        print(log_mian_start)  # announce that this index page has started
        n_tao = 0
        for url_sec in sec_url_li[0:31]:
            n_tao = n_tao + 1
            # parse the album page, get the album title
            dir_name = anasy_Secondary_page(download_page(url_sec, html_encode="utf-8")[0])[0]

            print("*" * 15 + "Page " + str(n_yema) + ", album " + str(n_tao) + " -- " + dir_name + " -- download started" + "*" * 15)
            dir_name_sj = create_folder(dir_name)
            sec_url_list = create_sec_url(url_sec, 60)
            m = 0
            for pic_link in sec_url_list:
                m = m + 1
                page_text, response_status_pic = download_page(pic_link)
                if response_status_pic == 200:
                    donw_pic_link = anasy_Secondary_page(page_text)[1]  # parse the album page, get the image link
                    down_pic(dir_name_sj, donw_pic_link)
                    print("Image " + str(m) + " downloaded", donw_pic_link)
                    time.sleep(1)
                else:
                    continue

            print("Page " + str(n_yema) + ", album " + str(n_tao) + " -- " + dir_name + " -- all images downloaded" + "\n")
            log_text = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + " --- " + "Page " + str(
                n_yema) + ", album " + str(n_tao) + " -- " + dir_name + " finished" + "\n"
            create_log(log_content=log_text)
        log_main_end = "*" * 10 + "Page " + str(n_yema) + ", download finished --> " + url + "*" * 10 + "\n\n"
        print(log_main_end)
        # record the index page that has just been finished (n_yema)
        page_record(str(n_yema))
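As a quick sanity check of the URL generator above, this is what create_sec_url produces for a made-up album URL; the real site address was redacted by the author as '链接' / '子链接', so https://example.com/album/123.html below is purely hypothetical:

sec_urls = create_sec_url("https://example.com/album/123.html", 3)
print(sec_urls)
# ['https://example.com/album/123.html',
#  'https://example.com/album/123_2.html',
#  'https://example.com/album/123_3.html']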