Python抓取图片

Python抓取图片,第1张

概述:Python 抓取图片(记录)。记录过程,怕忘了。复制就能用。代码开头为:# coding=utf-8 / import os / import platform / from multiprocessing.pool import ThreadPool / import lxml / import requests / from lxml import etree / import time / from apscheduler.schedulers.blocking import BlockingSc…(摘要在此处被截断)

Python 抓取图片(记录)

记录过程,怕忘了。复制就能用。

# coding=utf-8
"""Download wallpapers from a wallhaven.cc listing page.

The script walks the first ``MAX_PAGES`` pages of ``url``, opens every
wallpaper detail page linked from them, collects the full-size image URL and
then streams each image into ``path``/<page number>/<file name>.
"""
import logging
import os
import platform
import random
import sys
import time
from multiprocessing.pool import ThreadPool
from random import randint

import bs4
import lxml
import requests
from apscheduler.schedulers.blocking import BlockingScheduler
from clint.textui import progress
from lxml import etree

# Listing to scrape: https://wallhaven.cc/topList (ranking), latest, hot, random.
url = "https://wallhaven.cc/latest"
# Download directory.
path = "D:\\Download\\wallhaven\\latest"
# User-Agent sent with every request.
headers = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
# Request headers imitating a browser.
Hostreferer = {
    'User-Agent': headers,
    # 'Connection': 'keep-alive',
    'Referer': 'https://wallhaven.cc/topList',
    'Accept': 'text/HTML,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}

# Number of listing pages to walk (the original loop was hard-coded to 3).
MAX_PAGES = 3
# Total number of attempts imgs() makes before giving up.
MAX_ATTEMPTS = 3
# Per-request timeout in seconds.
REQUEST_TIMEOUT = 5


def createfile(file_path):
    """Create *file_path* (including parents) if missing and chdir into it.

    Note that the working directory changes as a side effect, so relative
    paths opened afterwards resolve inside *file_path*.
    """
    os.makedirs(file_path, exist_ok=True)
    # Switch to the directory created above.
    os.chdir(file_path)


# "%S" is seconds; the lowercase "%s" is not a portable strftime directive.
times = time.strftime("%Y-%m-%d_%H:%M:%S")
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename="log.txt",
                    filemode='a')


def _fetch_tree(page_url):
    """GET *page_url* and return its body parsed as an lxml HTML tree."""
    response = requests.get(page_url, headers=Hostreferer, timeout=REQUEST_TIMEOUT)
    try:
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        response.encoding = "utf-8"
        return etree.HTML(response.text)
    finally:
        response.close()


def _print_page_count():
    """Print the total page count shown on the ranking listing (best effort)."""
    try:
        tree = _fetch_tree("https://wallhaven.cc/topList?page=2")
        headings = tree.xpath("//div[@id='thumbs']/section/header/h2/text()")
        # The second text node carries the count after a 3-character prefix.
        print("总页数:" + headings[1][3:])
    except Exception:
        # The count is informational only; never let it abort the download.
        logging.warning("could not determine the total page count", exc_info=True)


def _report_failures(label, failures):
    """Print *failures* under *label*, or "<label>:0" when there are none."""
    if failures:
        print(label + ":")
        print(failures)
    else:
        print(label + ":0")


def imgs(lv):
    """Scrape the listing pages and download every wallpaper found.

    Args:
        lv: 1-based attempt counter. When an unexpected error escapes the
            per-page / per-image handling, the whole run is retried with
            ``lv + 1`` until ``MAX_ATTEMPTS`` attempts have been made.
    """
    failed_pages = {}    # listing pages that could not be fetched
    failed_details = {}  # wallpaper detail pages that could not be resolved
    failed_images = {}   # image files that could not be downloaded
    urls = []            # [target file path, image URL] pairs
    try:
        createfile(path)
        _print_page_count()

        for i in range(1, MAX_PAGES + 1):
            page = url + "?page=" + str(i)
            print("图片页码:" + page)
            file = os.path.join(path, str(i))
            createfile(file)
            try:
                listing = _fetch_tree(page)
                pic = listing.xpath("//div[@id='thumbs']/section/ul/li/figure/a/@href")
            except Exception as e:
                print(e)
                failed_pages["第" + str(i) + "页"] = page
                continue

            # enumerate from 1 so the first thumbnail is no longer skipped.
            for j, detail_url in enumerate(pic, start=1):
                echo("success", "第" + str(i) + "页第" + str(j) + "个图片")
                try:
                    detail = _fetch_tree(detail_url)
                    image = detail.xpath("//section[@id='showcase']/div/img/@src")
                    echo("success", "图片下载路径:" + image[0])
                    file_name = image[0].split('/')[-1]
                    urls.append([os.path.join(file, str(file_name)), image[0]])
                except Exception as e:
                    print("下载出了个问题")
                    print(e)
                    failed_details["第" + str(i) + "页第" + str(j) + "个套图"] = detail_url

        print(urls)
        # Record every image URL. The path is relative, so after the
        # createfile() calls above it lands in the last page directory.
        imgurl = "imgurl.txt"
        print("开始下载...")
        print("图片数量:" + str(len(urls)))
        with open(imgurl, "a+", encoding="utf-8") as url_log:
            for n, (target, source) in enumerate(urls, start=1):
                url_log.write(str(source) + "\n")
                try:
                    url_response(source, target, n)
                except Exception as e:
                    # One broken image must not abort the remaining downloads.
                    print(e)
                    failed_images["第" + str(n) + "个图片"] = source
        print("图片抓取完成")

        _report_failures("无法抓取页面", failed_pages)
        _report_failures("无法抓取套图", failed_details)
        _report_failures("无法抓取张码", failed_images)
    except Exception as e:
        print(e)
        logging.exception("attempt %s failed", lv)
        if lv < MAX_ATTEMPTS:
            imgs(lv + 1)


def url_response(url, imglen, n):
    """Stream the image at *url* into the file *imglen*, printing progress.

    Args:
        url: Direct URL of the image.
        imglen: Target file path; an existing file is overwritten.
        n: 1-based index of the image, used only in the summary line.

    Raises:
        requests.RequestException: on connection errors or an HTTP error status.
        OSError: if the target file cannot be written.
    """
    # Download in 1024-byte chunks.
    chunk_size = 1024
    size = 0
    with requests.get(url, headers=Hostreferer, timeout=REQUEST_TIMEOUT, stream=True) as r:
        r.raise_for_status()
        # Content-Length is optional; 0 means "unknown" and disables the bar.
        content_size = int(r.headers.get('content-length', 0))
        # 'wb' rather than 'ab': a retry must not append to a previous download.
        with open(imglen, 'wb') as f:
            for chunl in r.iter_content(chunk_size=chunk_size):
                f.write(chunl)
                size += len(chunl)  # bytes downloaded so far
                if content_size:
                    # '\r' returns to column 0 so the bar overwrites itself.
                    print('\r' + '[下载进度]: %s%.2f%%' % ('>' * int(size * 50 / content_size), float(size / content_size * 100)),
                          end='')
    print('[第' + str(n) + '个图片大小]: %0.2f MB' % ((content_size or size) / chunk_size / 1024))


def echo(color, *args):
    """Print *args* joined by spaces, ANSI-coloured where supported.

    Args:
        color: One of 'error', 'success', 'info'. Unknown names, and any call
            on Windows, fall back to plain uncoloured output.
        *args: String fragments to print.
    """
    colors = {'error': '\033[91m', 'success': '\033[94m', 'info': '\033[93m'}
    message = ' '.join(args)
    if color not in colors or platform.system() == 'Windows':
        print(message)
        return
    print(colors[color], message, '\033[0m')


if __name__ == '__main__':
    imgurl = "imgurl.txt"
    # Make sure the URL log exists without leaking the file handle.
    with open(imgurl, "a+"):
        pass
    start = time.time()
    imgs(1)
    end = time.time()
    print('\n' + "全部下载完成!用时%s秒" % (end - start))
    # NOTE(review): no job is ever added to this scheduler, so start() only
    # blocks until the process is interrupted — confirm whether imgs() was
    # meant to be scheduled periodically.
    scheduler = BlockingScheduler()
    scheduler._logger = logging
    scheduler.start()
总结

以上是内存溢出为你收集整理的Python抓取图片全部内容,希望文章能够帮你解决Python抓取图片所遇到的程序开发问题。

如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/1188492.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-06-03
下一篇 2022-06-03

发表评论

登录后才能评论

评论列表(0条)

保存