Writing down the process so I don't forget it. Copy the script and it should run as-is.
```python
# coding=utf-8
import os
import platform
from multiprocessing.pool import ThreadPool
import lxml
import requests
from lxml import etree
import time
from apscheduler.schedulers.blocking import BlockingScheduler
import logging
import random
import bs4
import sys
from random import randint
from clint.textui import progress

# Target URL: https://wallhaven.cc/toplist (top list), /latest (newest), /hot (popular), /random (random)
url = "https://wallhaven.cc/latest"
# Download directory
path = "D:\\Download\\wallhaven\\latest"
# HTTP User-Agent string
headers = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
# Request headers that mimic a browser
Hostreferer = {
    'User-Agent': headers,
    # 'Connection': 'keep-alive',
    'Referer': 'https://wallhaven.cc/toplist',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}


# Create a directory and switch into it
def createfile(file_path):
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    # Change the working directory to the folder created above
    os.chdir(file_path)


times = time.strftime("%Y-%m-%d_%H:%M:%S")
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename="log.txt",
                    filemode='a')


def imgs(lv):
    # Page / image counters
    global s, t, k, page
    pich = ""
    pagenumber = {}   # pages that could not be fetched
    pictures = {}     # galleries that could not be fetched
    piececode = {}    # single images that could not be fetched
    urls = []
    try:
        html = requests.get(url, headers=Hostreferer, timeout=5)
        html.encoding = "utf-8"
        createfile(path)
        # Parse the page content
        text = etree.HTML(html.text)
        # Get the maximum page number
        kom = requests.get("https://wallhaven.cc/toplist?page=2", headers=Hostreferer, timeout=5)
        ls = etree.HTML(kom.text)
        pagenum = ls.xpath("//div[@id='thumbs']/section/header/h2/text()")[1]
        pagenum = pagenum[3:]
        print("Total pages: " + pagenum)
        # Number of pages to crawl (fixed at 3 here)
        for i in range(1, int(3) + 1):
            page = url + "?page=" + str(i)
            print("Page URL: " + page)
            file = path + "\\" + str(i)
            createfile(file)
            try:
                two = requests.get(page, headers=Hostreferer, timeout=5)
                two.encoding = "utf-8"
                two_j = etree.HTML(two.text)
                pic = two_j.xpath("//div[@id='thumbs']/section/ul/li/figure/a/@href")
                for j in range(1, len(pic)):
                    go = []
                    echo("success", "Page " + str(i) + ", image " + str(j))
                    pich = pic[j]
                    t = j
                    try:
                        imgurl = requests.get(pic[j], headers=Hostreferer, timeout=5)
                        imgurl.encoding = "utf-8"
                        img = etree.HTML(imgurl.text)
                        image = img.xpath("//section[@id='showcase']/div/img/@src")
                        echo("success", "Image download URL: " + image[0])
                        arry = image[0].split('/')
                        file_name = arry[len(arry) - 1]
                        fileimg = file + "\\" + str(file_name)
                        k = fileimg
                        # Record the local path and the remote URL of the image
                        go.append(fileimg)
                        go.append(image[0])
                        urls.append(go)
                        # First download method (disabled):
                        # imageurl = requests.get(image[0], headers=Hostreferer, timeout=5, stream=True)
                        # f = open(fileimg, 'ab')
                        # f.write(imageurl.content)
                        # imageurl.close()
                        imgurl.close()
                    except Exception as e:
                        print("A download problem occurred")
                        print(e)
                    # time.sleep(1)
                two.close()
            except Exception as e:
                # pictures["Page " + str(s) + ", gallery " + str(t)] = pich
                print(e)
                # time.sleep(1)
            s = i
        kom.close()
        print(urls)
        # Save the image URLs to a file
        imgurl = "imgurl.txt"
        ts = []
        n = 0
        print("Starting download...")
        print("Number of images: " + str(len(urls)))
        # Second download method
        for x in urls:
            n = n + 1
            g = open(imgurl, "a+")
            b = str(x[1]) + "\n"
            g.write(b)
            url_response(x[1], x[0], n)
        print("Image crawl finished")
        if pagenumber:
            print("Pages that could not be fetched:")
            print(pagenumber)
        else:
            print("Pages that could not be fetched: 0")
        if pictures:
            print("Galleries that could not be fetched:")
            print(pictures)
        else:
            print("Galleries that could not be fetched: 0")
        if piececode:
            print("Images that could not be fetched:")
            print(piececode)
        else:
            print("Images that could not be fetched: 0")
    except Exception as e:
        timetyr = 3  # number of retries
        if lv < timetyr:
            lv += 1
            imgs(lv)
        print(e)
        pagenumber["Page " + str(s)] = page


# Download function: url is the image URL, imglen the local file path, n the image index
def url_response(url, imglen, n):
    r = requests.get(url, headers=Hostreferer, timeout=5, stream=True)
    # Download in 1024-byte chunks
    chunk_size = 1024
    size = 0
    content_size = int(r.headers['content-length'])
    with open(imglen, 'ab') as f:
        # Stream to disk; chunk_size (in bytes) can be adjusted
        for chunk in r.iter_content(chunk_size=chunk_size):
            # Write the chunk to the file
            f.write(chunk)
            size += len(chunk)  # bytes downloaded so far
            # '\r' jumps back to the start of the line; together with end='' this redraws the progress bar in place
            print('\r' + '[Progress]: %s%.2f%%' % ('>' * int(size * 50 / content_size), float(size / content_size * 100)), end='')
        print(' [Size of image %d]: %0.2f MB' % (n, content_size / chunk_size / 1024))


def echo(color, *args):
    colors = {'error': '\033[91m', 'success': '\033[94m', 'info': '\033[93m'}
    if color not in colors or platform.system() == 'Windows':
        print(' '.join(args))
    else:
        print(colors[color], ' '.join(args), '\033[0m')


if __name__ == '__main__':
    imgurl = "imgurl.txt"
    open(imgurl, "a+")
    start = time.time()
    imgs(1)
    end = time.time()
    print('\n' + "All downloads finished! Took %s seconds" % (end - start))
    scheduler = BlockingScheduler()
    scheduler._logger = logging
    scheduler.start()
```
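Two things in the script are imported or set up but never actually used, so here are two hedged sketches of how they could be wired in. First, `ThreadPool` is imported but every image is downloaded one by one. The sketch below shows one way the collected `[local_path, image_url]` pairs could be downloaded in parallel; the `download_one` / `download_all` helpers, the `HEADERS` dict, and the pool size of 8 are my own illustration, not part of the original script.

```python
from multiprocessing.pool import ThreadPool

import requests

# Illustrative headers; the script above uses its own Hostreferer dict instead.
HEADERS = {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://wallhaven.cc/toplist'}


def download_one(task):
    """Download a single [local_path, image_url] pair collected by imgs()."""
    local_path, image_url = task
    try:
        r = requests.get(image_url, headers=HEADERS, timeout=5, stream=True)
        with open(local_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)
        return local_path, True
    except Exception:
        return local_path, False


def download_all(urls, workers=8):
    """urls is the list of [local_path, image_url] pairs; 8 threads is an arbitrary choice."""
    with ThreadPool(workers) as pool:
        results = pool.map(download_one, urls)
    failed = [p for p, ok in results if not ok]
    print("Done, %d of %d downloads failed" % (len(failed), len(results)))
```

Swapping the `for x in urls: ... url_response(...)` loop for `download_all(urls)` would parallelize the slow part; the sketch deliberately drops the per-image progress bar, since several threads printing `\r` updates at once would garble the console.

Second, the `__main__` block creates a `BlockingScheduler` but never registers a job, so `scheduler.start()` just blocks forever. If the intent was to re-run the crawl on a schedule, a minimal APScheduler setup could look like this; the daily 03:00 cron trigger is only an example time, not something from the original post.

```python
from apscheduler.schedulers.blocking import BlockingScheduler


def crawl_job():
    # imgs() is the crawl function defined in the script above.
    imgs(1)


scheduler = BlockingScheduler()
# Run the crawl once a day at 03:00 (an arbitrary example schedule).
scheduler.add_job(crawl_job, 'cron', hour=3, minute=0)
scheduler.start()  # blocks here until interrupted
```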