通过在network中搜索数据发现旗云代理目前没有隐藏数据,直接根据网址请求即可
2、进行数据的爬取
import requests
import parsel
import time
import os
start = time.time() # program start timestamp; used to report total runtime at the end
proxy_list = [] # accumulates the proxy dicts scraped from each page
def check_ip(proxies_list):
    """Probe each proxy against https://www.baidu.com and keep the working ones.

    Each element of proxies_list is a dict like {'HTTP': 'ip:port'} or
    {'HTTPS': 'ip:port'} as built by the scraping loop below.

    Returns the sublist of proxies (original dicts, unmodified) that answered
    with HTTP 200 within the 2-second timeout.
    """
    can_use = []
    for proxy in proxies_list:
        # BUG FIX: `requests` matches proxies by the *lowercase* URL scheme
        # ('http'/'https'); the scraped dicts use uppercase keys ('HTTP'),
        # which requests silently ignores — so the original code tested a
        # direct connection and declared every proxy usable. Normalize the
        # keys before handing the mapping to requests.
        normalized = {scheme.lower(): addr for scheme, addr in proxy.items()}
        try:
            response = requests.get(url='https://www.baidu.com',
                                    proxies=normalized,
                                    timeout=2)  # give each proxy at most 2s
            if response.status_code == 200:
                can_use.append(proxy)
                print('当前代理:%s,---检测通过---' % proxy)
        # Catch only requests' own failures (timeouts, connection errors),
        # not every exception — a bare except also swallows KeyboardInterrupt.
        except requests.exceptions.RequestException:
            print('当前代理:%s响应超时,不合格' % proxy)
    return can_use
count = 0  # running total of scraped proxies, used in the final report
# Ensure the output directory exists. os.makedirs with exist_ok=True replaces
# the original exists()/mkdir() pair: it has no check-then-create race and it
# also creates missing parent directories.
os.makedirs('D:/studySpider/proxys', exist_ok=True)
url = 'https://proxy.ip3366.net/free/?action=china&page={0}'
headers = {
'Cookie':'Hm_lvt_c4dd741ab3585e047d56cf99ebbbe102=1646291435,1646575373; https_waf_cookie=3c83bd9d-a843-4a11513523c874b52e0d69c4abd0024e8abd; Hm_lvt_96901db7af1741c2fd2d52f310d78eaa=1649315342; Hm_lpvt_96901db7af1741c2fd2d52f310d78eaa=1649315432',
'Host':'proxy.ip3366.net',
'Pragma':'no-cache',
'sec-ch-ua':'" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'none',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
for page in range(1, 4):  # scrape pages 1 through 3
    response = requests.get(url=url.format(page), headers=headers).text  # fetch page HTML
    sel = parsel.Selector(response)  # parse
    trs = sel.css('#content>section>div.container>table>tbody>tr')  # one <tr> per proxy
    for tr in trs:
        count += 1
        ip = tr.css('td[data-title="IP"]::text').get()  # extract cell text
        port = tr.css('td[data-title="PORT"]::text').get()
        # Renamed from `type`: never shadow the builtin of the same name.
        proxy_type = tr.css('td[data-title="类型"]::text').get()
        proxy_dict = {proxy_type: ip + ':' + port}  # e.g. {'HTTP': '1.2.3.4:80'}
        proxy_list.append(proxy_dict)
        print('爬取ip代理 {0} 成功,第 {1} 个'.format(proxy_dict, count))
print(proxy_list)  # dump the full scraped list
can_use = check_ip(proxy_list)  # keep only proxies that actually respond
# The with-statement closes the file on exit; the original's explicit
# f.close() inside the block was redundant and has been removed.
with open('D:/studySpider/proxys/proxy2.txt', "w") as f:
    f.write('{0}'.format(can_use))
print('保存高质量代理成功:{0}'.format(can_use))
print('ip代理爬取成功,一共爬取 {0} 个,高质量代理 {1} 个 爬取时间 {2}秒'.format(count, len(can_use), time.time()-start))
爬取到的高质量代理
[{'HTTP': '121.232.148.216:9000'}, {'HTTP': '103.140.126.61:8888'}, {'HTTP': '47.56.69.11:8000'}, {'HTTP': '121.232.148.115:9000'}, {'HTTPS': '58.215.201.98:56566'}, {'HTTP': '101.200.49.180:8118'}, {'HTTP': '103.148.72.192:80'}, {'HTTP': '39.108.88.42:80'}, {'HTTP': '27.214.50.223:9000'}, {'HTTP': '218.6.173.81:443'}, {'HTTPS': '218.106.60.94:21080'}, {'HTTP': '183.247.211.156:30001'}, {'HTTP': '47.117.2.33:8080'}, {'HTTP': '121.232.148.85:9000'}, {'HTTP': '122.9.101.6:8888'}, {'HTTP': '152.136.62.181:9999'}, {'HTTP': '115.218.2.149:9000'}, {'HTTPS': '58.37.233.45:8118'}, {'HTTP': '14.215.212.37:9168'}, {'HTTP': '121.41.167.77:8877'}, {'HTTP': '39.108.71.54:8088'}, {'HTTP': '121.232.148.151:9000'}, {'HTTP': '202.109.157.61:9000'}, {'HTTP': '47.99.122.96:82'}, {'HTTP': '221.226.75.86:55443'}, {'HTTP': '124.205.155.152:9090'}, {'HTTP': '39.175.75.53:30001'}, {'HTTP': '223.18.222.105:80'}, {'HTTP': '118.193.40.245:80'}, {'HTTP': '182.92.110.245:8118'}]
打卡第68天,对python大数据感兴趣的朋友欢迎一起讨论、交流,请多指教!
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)