requests爬取免费代理2

requests爬取免费代理2（原文此处为配图，略）

1、明确需求,找到url

通过在network中搜索数据发现旗云代理目前没有隐藏数据,直接根据网址请求即可

2、进行数据的爬取
import requests
import parsel
import time
import os

start = time.time()   # script start time; used to report total elapsed seconds at the end
proxy_list = [] # accumulates scraped proxy dicts, e.g. {'HTTP': '1.2.3.4:80'}
def check_ip(proxies_list):
    """Probe each scraped proxy and return the ones that actually work.

    Parameters
    ----------
    proxies_list : list[dict]
        Proxy dicts as scraped from the listing page, e.g.
        {'HTTP': '1.2.3.4:80'}.  Keys may be upper-case ('HTTP'/'HTTPS')
        exactly as the source site displays them.

    Returns
    -------
    list[dict]
        The subset of ``proxies_list`` (original dicts, untouched) that
        answered the probe request with HTTP 200 within the timeout.
    """
    can_use = []
    for proxy in proxies_list:
        # requests selects a proxy by the *lower-case* URL scheme
        # ('http'/'https').  The scraped keys are upper-case, so without
        # normalization the proxy would be silently ignored and the probe
        # would test a direct connection instead of the proxy itself.
        normalized = {scheme.lower(): addr for scheme, addr in proxy.items()}
        try:
            response = requests.get(url='https://www.baidu.com',
                                    proxies=normalized,
                                    timeout=2)  # seconds before giving up
            if response.status_code == 200:
                can_use.append(proxy)
                print('当前代理:%s,---检测通过---' % proxy)
        # Narrowed from a bare `except:`: connection failures and timeouts
        # both mean "proxy unusable"; anything else (e.g. KeyboardInterrupt)
        # should propagate.
        except requests.RequestException:
            print('当前代理:%s响应超时,不合格' % proxy)
    return can_use
count = 0  # running total of proxies scraped across all pages

# Ensure the output directory exists before results are written later.
# os.makedirs(..., exist_ok=True) replaces the check-then-mkdir pair: it
# also creates missing parent directories and cannot race between the
# existence test and the creation.
os.makedirs('D:/studySpider/proxys', exist_ok=True)

# Page-parameterized listing URL; {0} is filled with the page number (1-3 below).
url = 'https://proxy.ip3366.net/free/?action=china&page={0}'
# Browser-like headers copied from a real session so the site serves the
# normal HTML page.  NOTE(review): the Cookie value is session-specific and
# will likely need refreshing if the site starts rejecting requests.
headers = {
    'Cookie':'Hm_lvt_c4dd741ab3585e047d56cf99ebbbe102=1646291435,1646575373; https_waf_cookie=3c83bd9d-a843-4a11513523c874b52e0d69c4abd0024e8abd; Hm_lvt_96901db7af1741c2fd2d52f310d78eaa=1649315342; Hm_lpvt_96901db7af1741c2fd2d52f310d78eaa=1649315432',
    'Host':'proxy.ip3366.net',
    'Pragma':'no-cache',
    'sec-ch-ua':'" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
    'sec-ch-ua-mobile':'?0',
    'sec-ch-ua-platform':'"Windows"',
    'Sec-Fetch-Dest':'document',
    'Sec-Fetch-Mode':'navigate',
    'Sec-Fetch-Site':'none',
    'Sec-Fetch-User':'?1',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}

# Scrape pages 1-3 of the listing, collect proxies, verify them, and save
# the usable ones to disk.
for i in range(1, 4):        # pages 1 through 3
    response = requests.get(url=url.format(i), headers=headers).text    # fetch page HTML
    sel = parsel.Selector(response)     # parse
    trs = sel.css('#content>section>div.container>table>tbody>tr')  # one <tr> per proxy
    for tr in trs:
        ip = tr.css('td[data-title="IP"]::text').get()      # secondary locate & extract
        port = tr.css('td[data-title="PORT"]::text').get()
        # renamed from `type` to avoid shadowing the builtin
        proxy_type = tr.css('td[data-title="类型"]::text').get()
        # .get() returns None for a malformed row; skip it instead of
        # crashing on `None + str`.
        if not ip or not port or not proxy_type:
            continue
        count += 1
        proxy_dict = {proxy_type: ip + ':' + port}  # assemble scheme -> "ip:port"
        proxy_list.append(proxy_dict)  # accumulate for the check pass below
        print('爬取ip代理 {0} 成功,第 {1} 个'.format(proxy_dict, count))
print(proxy_list)       # dump everything scraped
can_use = check_ip(proxy_list)  # keep only proxies that answer a probe request
with open('D:/studySpider/proxys/proxy2.txt', "w") as f:    # persist usable proxies
    f.write('{0}'.format(can_use))
    # no explicit f.close(): the `with` statement closes the file on exit
print('保存高质量代理成功:{0}'.format(can_use))
print('ip代理爬取成功,一共爬取 {0} 个,高质量代理 {1} 个 爬取时间 {2}秒'.format(count, len(can_use), time.time()-start))

爬取到的高质量代理

[{'HTTP': '121.232.148.216:9000'}, {'HTTP': '103.140.126.61:8888'}, {'HTTP': '47.56.69.11:8000'}, {'HTTP': '121.232.148.115:9000'}, {'HTTPS': '58.215.201.98:56566'}, {'HTTP': '101.200.49.180:8118'}, {'HTTP': '103.148.72.192:80'}, {'HTTP': '39.108.88.42:80'}, {'HTTP': '27.214.50.223:9000'}, {'HTTP': '218.6.173.81:443'}, {'HTTPS': '218.106.60.94:21080'}, {'HTTP': '183.247.211.156:30001'}, {'HTTP': '47.117.2.33:8080'}, {'HTTP': '121.232.148.85:9000'}, {'HTTP': '122.9.101.6:8888'}, {'HTTP': '152.136.62.181:9999'}, {'HTTP': '115.218.2.149:9000'}, {'HTTPS': '58.37.233.45:8118'}, {'HTTP': '14.215.212.37:9168'}, {'HTTP': '121.41.167.77:8877'}, {'HTTP': '39.108.71.54:8088'}, {'HTTP': '121.232.148.151:9000'}, {'HTTP': '202.109.157.61:9000'}, {'HTTP': '47.99.122.96:82'}, {'HTTP': '221.226.75.86:55443'}, {'HTTP': '124.205.155.152:9090'}, {'HTTP': '39.175.75.53:30001'}, {'HTTP': '223.18.222.105:80'}, {'HTTP': '118.193.40.245:80'}, {'HTTP': '182.92.110.245:8118'}]

打卡第68天,对python大数据感兴趣的朋友欢迎一起讨论、交流,请多指教!

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/569650.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-04-09
下一篇 2022-04-09

发表评论

登录后才能评论

评论列表(0条)

保存