Python async coroutines in practice: the wallhaven wallpaper site


Development environment: Python 3.9, IDEA. Compared with a thread-pool version this is roughly 50% faster: on the campus network's 100 Mbps connection, one page of 24 images takes about 60 s. By default the code downloads only the first page; to grab more, just add a loop (the second version below does exactly that).

url = "https://wallhaven.cc/toplist?page=1"
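
For reference, the thread-pool version used in that speed comparison is not included in the original post. Below is a rough sketch of what such a baseline might look like; the function names, worker count, and the .jpg/.png guessing are my own assumptions, borrowed from the optimized coroutine version further down.

# Hypothetical thread-pool baseline (not the author's original code): fetch the
# toplist page once, then download each wallpaper in a worker thread, guessing
# the full-image URL with .jpg first and .png as a fallback.
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from lxml import etree

HEADERS = {"User-Agent": "Mozilla/5.0"}


def download_one(sub_url):
    wall_id = sub_url.split("/")[-1]  # e.g. the <id> part of https://wallhaven.cc/w/<id>
    for ext in (".jpg", ".png"):
        img_url = f"https://w.wallhaven.cc/full/{wall_id[:2]}/wallhaven-{wall_id}{ext}"
        resp = requests.get(img_url, headers=HEADERS)
        if resp.ok:
            with open("img/" + img_url.split("/")[-1], mode="wb") as f:
                f.write(resp.content)
            print("downloaded", img_url.split("/")[-1])
            return


def main():
    os.makedirs("img", exist_ok=True)
    resp = requests.get("https://wallhaven.cc/toplist?page=1", headers=HEADERS)
    html = etree.HTML(resp.text)
    sub_urls = html.xpath('/html/body/main/div[1]/section[1]/ul/li/figure/a/@href')
    with ThreadPoolExecutor(max_workers=8) as pool:
        list(pool.map(download_one, sub_urls))  # iterate so worker exceptions surface


if __name__ == '__main__':
    main()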

The complete code is below, for reference only.

import os
import time
import aiohttp
import asyncio
import requests
import aiofiles
from lxml import etree


async def aiodownload(li):
    # Send the download request,
    # read the image bytes,
    # and save them to a file.
    # aiohttp's resp.content.read()  ==>  requests' resp.content

    async with aiohttp.ClientSession() as session:
        sub_url = "".join(li.xpath('./figure/a/@href'))
        sub_resp = requests.get(sub_url, headers=headers)  # blocking GET for the wallpaper's sub-page
        if len(sub_resp.text) < 1000:  # crude handling for HTTP 429 rate limiting :(
            await asyncio.sleep(3)
            sub_resp = requests.get(sub_url, headers=headers)  # retry once after backing off
        sub_html = etree.HTML(sub_resp.text)
        img_url = "".join(sub_html.xpath('/html/body/main/section/div[1]/img/@src'))  # the image's download link

        async with session.get(img_url, headers=headers) as img_resp:
            img_name = img_url.split('/')[-1]
            async with aiofiles.open("img/" + img_name, mode="wb") as f:
                await f.write(await img_resp.content.read())
                print("下载完成", img_name)
        await asyncio.sleep(0)


async def main():
    url = "https://wallhaven.cc/toplist?page=1"
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    # grab the list items for each wallpaper
    lis = html.xpath('/html/body/main/div[1]/section[1]/ul/li')
    tasks = []
    for li in lis:
        tasks.append(asyncio.create_task(aiodownload(li)))  # wrap the coroutine in a task and add it to the task list
    await asyncio.wait(tasks)


if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44"
    }
    os.makedirs("img", exist_ok=True)  # make sure the output folder exists
    t1 = time.time()
    print("Starting download...")
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print("下载完成...")
    t2 = time.time()
    print('Elapsed time:', t2 - t1)
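
One caveat with this first version: each wallpaper's sub-page is still fetched with the blocking requests.get(), so the coroutines spend most of their time waiting on that call instead of running concurrently. Below is a minimal sketch of how the same fetch could go through the aiohttp session instead; the function name fetch_sub_page and the retry-once-on-429 policy are my own assumptions, not part of the original code.

import asyncio
import aiohttp


# Sketch: fetch a wallpaper sub-page with aiohttp instead of blocking requests,
# so the whole coroutine stays non-blocking while waiting on the network.
async def fetch_sub_page(session: aiohttp.ClientSession, sub_url: str, headers: dict) -> str:
    async with session.get(sub_url, headers=headers) as sub_resp:
        if sub_resp.status == 429:  # rate limited: back off once, then retry
            await asyncio.sleep(3)
            async with session.get(sub_url, headers=headers) as retry_resp:
                return await retry_resp.text()
        return await sub_resp.text()

Inside aiodownload() the parsing line would then read sub_html = etree.HTML(await fetch_sub_page(session, sub_url, headers)).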

The optimized version below drops that per-image sub-page request entirely and instead predicts each image's download URL by string concatenation, again targeting the wallhaven wallpaper site:

import os
import time
import aiohttp
import asyncio
import requests
import aiofiles
from lxml import etree


# Optimized async-coroutine version: skip fetching the sub-page and build each image's
# download URL by string concatenation instead; from about 100 s single-threaded down to
# about 20 s with async coroutines, a 500% improvement
async def aiodownload(li):
    # Send the download request,
    # read the image bytes,
    # and save them to a file.
    # aiohttp's resp.content.read()  ==>  requests' resp.content

    async with aiohttp.ClientSession() as session:
        sub_url = "".join(li.xpath('./figure/a/@href'))
        url_1 = sub_url.split('/')[-1]  # wallpaper id: the last path segment of https://wallhaven.cc/w/<id>
        url_2 = url_1[:2]  # the first two characters of the id name the folder on the image server
        img_url = 'https://w.wallhaven.cc/full/' + url_2 + '/wallhaven-' + url_1 + '.jpg'  # guessed image download link, .jpg first
        async with session.get(img_url, headers=headers) as img_resp:
            if not img_resp.ok:  # the .jpg guess failed (most likely a 404)
                img_url = 'https://w.wallhaven.cc/full/' + url_2 + '/wallhaven-' + url_1 + '.png'  # fall back to the .png extension
                async with session.get(img_url, headers=headers) as img_resp:
                    img_name = img_url.split('/')[-1]
                    async with aiofiles.open("img/" + img_name, mode="wb") as f:
                        await f.write(await img_resp.content.read())
                        print("下载完成", img_name)
            else:
                img_name = img_url.split('/')[-1]
                async with aiofiles.open("img/" + img_name, mode="wb") as f:
                    await f.write(await img_resp.content.read())
                    print("下载完成", img_name)
            await asyncio.sleep(0)


async def main(page):
    tasks = []
    url = f"https://wallhaven.cc/toplist?page={page}"
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    # grab the list items for each wallpaper
    lis = html.xpath('/html/body/main/div[1]/section[1]/ul/li')
    for li in lis:
        tasks.append(asyncio.create_task(aiodownload(li)))  # wrap the coroutine in a task and add it to the task list
    await asyncio.wait(tasks)


if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44"
    }
    os.makedirs("img", exist_ok=True)  # make sure the output folder exists
    t1 = time.time()
    page = int(input("Enter the number of pages to download, starting from page 1: "))
    loop = asyncio.get_event_loop()
    for i in range(1, page + 1):
        print(f"Starting page {i}")
        loop.run_until_complete(main(i))  # download page i (each page gets its own batch of tasks)
    print("Download finished...")
    t2 = time.time()
    print('Elapsed time:', t2 - t1)
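
The whole speed-up hinges on that URL trick: the wallpaper id taken from the /w/<id> sub-page link is enough to predict the full-resolution image URL, so no sub-page needs to be fetched at all. Pulled out as a standalone helper for clarity (the example id below is made up purely for illustration):

# The last path segment of the sub-page URL is the wallpaper id; its first two
# characters name the folder on the image server. The file extension still has to
# be guessed, .jpg first and .png as a fallback, exactly as the code above does.
def build_full_url(sub_url: str, ext: str = ".jpg") -> str:
    wall_id = sub_url.split("/")[-1]
    return f"https://w.wallhaven.cc/full/{wall_id[:2]}/wallhaven-{wall_id}{ext}"


# Hypothetical example:
# build_full_url("https://wallhaven.cc/w/abc123")
# returns "https://w.wallhaven.cc/full/ab/wallhaven-abc123.jpg"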

 
