爬虫--多线程,线程池,异步协程

爬虫--多线程,线程池,异步协程,第1张

# 1 , 多线程
from threading import Thread


def func():
    """Worker routine for thread 1: print a counter 1000 times."""
    count = 0
    while count < 1000:
        print("线程1", count)
        count += 1


def func2():
    """Worker routine for thread 2: print a counter 1000 times."""
    count = 0
    while count < 1000:
        print("线程2", count)
        count += 1


if __name__ == '__main__':
    # BUG FIX: the original passed target=func(), which CALLS func in the main
    # thread and hands Thread the returned None — no concurrency happened at
    # all. Pass the callable itself, without parentheses.
    t = Thread(target=func)
    t.start()  # start worker thread 1; scheduling is up to the OS/CPU
    # BUG FIX: the second thread should run func2, not func again.
    t2 = Thread(target=func2)
    t2.start()  # start worker thread 2
    # Main thread does its own printing concurrently with the two workers.
    for i in range(1000):
        print("主线程", i)
# 线程池,一次性开辟多个线程

from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor

def func(name):
    """Print *name* alongside each number from 0 to 999."""
    for idx in range(1000):
        print(name, idx)
if __name__ == '__main__':
    # Create a pool of 50 worker threads and submit 100 jobs to it.
    with ThreadPoolExecutor(50) as pool:
        for idx in range(100):
            pool.submit(func, name=f"线程{idx}")
    # Leaving the with-block waits for every submitted task to finish,
    # so "ok" prints only after the whole pool has drained.
    print("ok")

异步协程

 

import asyncio
import time
# 4 爬虫应用 , 相当于一个模板
# 4 爬虫应用 , 相当于一个模板
async def downLoad(url):
    """Template coroutine simulating a download of *url*.

    NOTE(review): *url* is currently unused — the body only sleeps to
    stand in for real network I/O.
    """
    print("开始下载")
    await asyncio.sleep(2)  # simulate a 2-second download without blocking the loop
    print("下载完成")

async def main():
    """Kick off one download task per URL and wait for all to complete."""
    urls = [
        "https://www.baidu.com",
        "https://www.baida.com",
        "https://www.aidu.com"
    ]
    # BUG FIX: asyncio.wait() no longer accepts bare coroutine objects
    # (deprecated in 3.8, removed in 3.11) — each coroutine must be wrapped
    # in a Task so the event loop can schedule it.
    tasks = [asyncio.create_task(downLoad(url)) for url in urls]
    await asyncio.wait(tasks)


if __name__ == '__main__':
    # Time the whole async run: concurrent sleeps should finish in ~2 s,
    # not 3 x 2 s.
    start = time.time()
    asyncio.run(main())
    finish = time.time()
    print(finish - start)  # elapsed seconds
import asyncio
import aiohttp


# 5 案例

async def urlDownload(url):
    """Fetch *url* and save the response body under the URL's own filename.

    The last path component of the URL already carries its extension
    (e.g. "...-com.jpeg"), so it is used as the output filename directly.
    """
    name = url.rsplit('/', 1)[1]
    # aiohttp.ClientSession plays the role requests plays in sync code;
    # "async with" closes the session and response automatically.
    async with aiohttp.ClientSession() as session:
        # session.get/post mirror requests.get/post, but are awaitable:
        # resp.content.read() == resp.content, resp.text() == resp.text,
        # resp.json() == resp.json()
        async with session.get(url) as resp:
            # BUG FIX: the original wrote f"{name}.jepg" — "jepg" is a typo,
            # and name already ends in ".jpeg", so files came out as
            # "x.jpeg.jepg". Use the name as-is.
            with open(name, mode="wb") as f:
                f.write(await resp.content.read())  # body read is async; must await


async def main():
    """Download all image URLs concurrently."""
    urls = [
        "http://www.kaotop.com/file/tupian/20220706/sitapix-photo-2290543-via-sitapix-com.jpeg",
        "http://www.kaotop.com/file/tupian/20220706/bloom-blooming-blossom-130168-via-sitapix-com.jpeg",
        "http://www.kaotop.com/file/tupian/20220706/albums-antique-audio-1181789-via-sitapix-com.jpeg"
    ]
    # BUG FIX: asyncio.wait() requires Tasks, not bare coroutines
    # (raises TypeError on Python 3.11+) — wrap with create_task.
    tasks = [asyncio.create_task(urlDownload(url)) for url in urls]
    await asyncio.wait(tasks)


if __name__ == '__main__':
    # Entry point: drive the async downloader to completion.
    asyncio.run(main())
这个是文件的异步操作
async with aiofiles.open(f"./novel/{n}.txt", mode="w", encoding="utf-8") as f:
await f.write(message)
# with open(f"./novel/{n}.txt", mode="w", encoding="utf-8") as f:
#     f.write(message)
import asyncio
import aiohttp
import requests
from lxml import etree
import aiofiles
from concurrent.futures import ThreadPoolExecutor
import time
import random

requests.packages.urllib3.disable_warnings()  # 关闭:不安全请求警告:正在发出未验证的HTTPS请求。


强烈建议添加证书验证。


# 1, sync part: fetch the chapter index.  2, async part: download chapters.
def getBookid(url_1):
    """Scrape the chapter index page at *url_1* and return absolute chapter URLs."""
    url = "http://quanxiaoshuo.com"
    # verify=False skips SSL certificate validation (the warning is silenced
    # above); adding proper certificate verification is strongly recommended.
    resp = requests.get(url_1, verify=False)
    et = etree.HTML(resp.text)
    list_1 = et.xpath("//div[@class='chapter']")
    data = []
    for item in list_1:
        # Each chapter link is relative; prefix the site root.
        getData = url + item.xpath("./a/@href")[0]
        data.append(getData)
    return data


async def downLoad(url, n):
    """Fetch chapter *url*, extract its text, and write it to ./novel/{n}.txt."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            txt = await resp.text()
            et = etree.HTML(txt)
            # Join all text nodes of the content div, then strip whitespace runs.
            message = ''.join(et.xpath('//*[@id="content"]/text()'))
            message = ''.join(message.split())
            # aiofiles gives an awaitable file API for use inside coroutines.
            async with aiofiles.open(f"./novel/{n}.txt", mode="w", encoding="utf-8") as f:
                await f.write(message)


async def main(urls):
    """Schedule one download task per chapter URL and await them all."""
    n = 1
    tasks = []
    for url in urls:
        # BUG FIX: asyncio.wait() rejects bare coroutines on Python 3.11+;
        # wrap each one in a Task.
        tasks.append(asyncio.create_task(downLoad(url, n)))
        print("第%d回" % n)
        n += 1
    await asyncio.wait(tasks)


if __name__ == '__main__':
    a = 0
    b = 100
    url_1 = "http://quanxiaoshuo.com/179092/"
    data = getBookid(url_1)
    asyncio.run(main(data[:5]))  # only the first 5 chapters for this demo
    print("ok")

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/571618.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-04-09
下一篇 2022-04-09

发表评论

登录后才能评论

评论列表(0条)

保存