爬取指定 Pixiv 用户 id 的关注列表中指定数量用户的图片作品
gevent_download.py
# Download the original-size first page of every artwork published by the
# users that a given Pixiv account follows.
import multiprocessing
import os
import re

import requests

from get_pid import get_pid
from get_uid import get_uid
from statistical_data import html_tree
from url_mange import Urlmanage

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 30
# Artwork page URL; sent as the referer when fetching the image file.
ARTWORK_PAGE = 'https://www.pixiv.net/artworks/{}'
# Original-size image URL: dated path + file extension.
ORIGINAL_IMAGE = 'https://i.pximg.net/img-original/img/{}_p0.{}'
# Captures the dated path ("YYYY/MM/DD/hh/mm/ss/<pid>") inside a thumbnail URL.
IMAGE_PATH_RE = re.compile(r'img/(.*?)_p0')


class gevent(object):
    """Downloader for the artworks listed by one profile/illusts request."""

    def pro(self, url, uid, save_root='/Users'):
        """Download every work described by ``url`` into ``<save_root>/<uid>``.

        Args:
            url: Pixiv ``profile/illusts`` ajax URL listing the works to fetch.
            uid: Id of the artist; used as the name of the target directory.
            save_root: Directory under which the per-user folder is created.
                Defaults to the originally hard-coded ``/Users``.
        """
        target_dir = os.path.join(save_root, str(uid))
        # exist_ok so that re-running the crawler for the same user works.
        os.makedirs(target_dir, exist_ok=True)

        headers = Urlmanage().head1()
        headers2 = Urlmanage().head2()

        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        print("res===%s" % res)

        body = res.json().get('body')
        works = body.get('works') if isinstance(body, dict) else None
        if not isinstance(works, dict):
            # Nothing to download (error payload or a user without works).
            return

        # Iterate the works exactly once; url and title stay paired.
        for work in works.values():
            match = IMAGE_PATH_RE.search(work['url'])
            if match is None:
                continue
            image_path = match.group(1)
            # The artwork id is the last component of the dated path.
            pid = image_path.rsplit('/', 1)[-1]
            # NOTE(review): html_tree is defined in statistical_data;
            # presumably it records per-artwork statistics - confirm.
            html_tree(pid=pid)
            # The image host is requested with the artwork page as referer.
            headers2['referer'] = ARTWORK_PAGE.format(pid)
            self._download(image_path, pid, work['title'], target_dir, headers2)

    def _download(self, image_path, pid, title, target_dir, headers):
        """Fetch one original image, trying .jpg first and then .png.

        Returns True when a file was written, False when neither URL
        answered with HTTP 200.
        """
        for ext in ('jpg', 'png'):
            resp = requests.get(
                ORIGINAL_IMAGE.format(image_path, ext),
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            if resp.status_code == 200:
                # Keep the extension that matches the bytes actually received.
                file_name = os.path.join(target_dir, '%s.%s' % (pid, ext))
                with open(file_name, 'wb') as image_file:
                    image_file.write(resp.content)
                print("%s下载成功\n" % title)
                return True
        print("%s下载失败\n" % title)
        return False


def _report_failure(exc):
    """Print an exception raised inside a pool worker instead of losing it."""
    print("download task failed: %r" % (exc,))


def main():
    """Ask for the account id and user count, then download in a process pool."""
    with open('data.csv', 'w', encoding='utf-8') as f:
        f.write('pid,page_view,Thumb_up_quantity\n')
    date = input("请输入你的主页id: ")
    qua = input("请输入爬取的用户数: ")
    uids = get_uid().userid(data=date, qua=qua)

    pool = multiprocessing.Pool(processes=4)  # process pool
    for uid in uids:
        pids = get_pid().getpid(uid=uid)
        url = (
            'https://www.pixiv.net/ajax/user/{uid}/profile/illusts?{pid}'
            '&work_category=illustManga&is_first_page=1&lang=zh'
        ).format(uid=uid, pid=pids)
        pool.apply_async(gevent().pro, (url, uid), error_callback=_report_failure)
    pool.close()
    pool.join()


if __name__ == '__main__':
    main()
get_pid.py
import requests

from url_mange import Urlmanage
from get_uid import get_uid  # noqa: F401  (kept: imported by the original module)


class get_pid(object):
    """Looks up the illustration ids published by a Pixiv user."""

    def getpid(self, uid):
        """Return the user's illustration ids as an ``ids[]=..`` query fragment.

        Args:
            uid: Pixiv user id inserted into the ``profile/all`` ajax URL.

        Returns:
            A string of the form ``ids[]=<id>&ids[]=<id>...`` built from the
            entries of ``body.illusts`` in the response.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        url = 'https://www.pixiv.net/ajax/user/{uid}/profile/all?lang=zh'.format(uid=uid)
        response = requests.get(url, headers=Urlmanage().head1(), timeout=30)
        # Fail with a clear HTTP error instead of a confusing JSON/key error.
        response.raise_for_status()
        illusts = response.json()['body']['illusts']
        # Iterating the container yields the ids (presumably it is a mapping
        # keyed by illustration id - confirm against the API response).
        return 'ids[]=' + '&ids[]='.join(str(pid) for pid in illusts)
get_uid.py
from url_mange import Urlmanage
import requests


class get_uid(object):
    """Looks up the users that a Pixiv account follows."""

    def userid(self, data, qua):
        """Return the ids of the first ``qua`` users followed by ``data``.

        Args:
            data: Pixiv user id whose following list is requested.
            qua: Number of followed users to request (sent as ``limit``).

        Returns:
            A list with the ``userId`` value of every entry in ``body.users``.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        url = "https://www.pixiv.net/ajax/user/%s/following?offset=0&limit=%s&rest=show&tag=&lang=zh" % (data, qua)
        response = requests.get(url, headers=Urlmanage().head1(), timeout=30)
        # Fail with a clear HTTP error instead of a confusing JSON/key error.
        response.raise_for_status()
        users = response.json()['body']['users']
        list_id = [user['userId'] for user in users]
        print(list_id)
        return list_id
url_mange.py
from anti_useragent import UserAgent


class Urlmanage(object):
    """Builds the HTTP request headers used by the crawler."""

    def head1(self):
        """Return a new header dict with a random user-agent and the cookie.

        A fresh dict is built on every call, so callers may modify the
        result (for example set ``referer``) without affecting each other.
        """
        # NOTE(review): the session cookie is hard-coded below; it is a
        # credential and should be supplied from outside the source file.
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh-HK;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6',
            'referer': '',
            'sec-fetch-dest': 'image',
            'sec-fetch-mode': 'no-cors',
            'sec-fetch-site': 'cross-site',
            'user-agent': UserAgent(min_version=50, max_version=80).random,
            'cookie': (
                '29=b.6dd7c319e074b4f8; __utmc=235335808; tags_sended=1; '
                'categorized_tags=IVwLyT8B6k~RcahSSzeRf~_-agXPKuAQ~qiO14cZMBI; '
                '_gid=GA1.2.149545487.1641800553; '
                '__utma=235335808.1011429997.1641442221.1641800539.1641805707.10; '
                '__utmt=1; '
                '__cf_bm=IjpztpOQs3g87bPZvauxL7sWuWQHDZ4RQcR1btSMTrc-1641807660-0-AWvQ1Wgma2i2FQ9QE0bBFWUYYy5snGql/ghdkDEyKsJ7N6EE4yquIB7eXTyKo0fZFN98bgM0GY+oZbbq73jd3tmRH0dbO67Oxk6Gz9mzuSzNRpBksovwXZSkK4yxeKW3ALdaAts+aeV7Actqz0KySTbdZO2htBmpGis2m1lNq+fRzb5+Nqnx/4wTEhzEzMLjbQ==; '
                'tag_view_ranking='
                '4TDL3X7bV9~jpIZPQ502H~fqoLWl17eh~zj9gD1HFwG~PwDMGzD6xn~'
                'vNpvefPsAB~uusOs0ipBx~yOqOtdektt~0sVgHoAwbd~qVjnJnSnAY~'
                '0xsDLqCEW6~cjsAyvz-bf~7OHNI0Gq-U~TFDu872vC0~RcahSSzeRf~'
                'FqVQndhufZ~Lt-oEicbBr~skx_-I2o4Y~WTz8QdIkZx~3cT9FM3R6t~'
                'LTW5GJcQwW~ETjPkL0e6r~qiO14cZMBI~zKLqKSPEAG~PFZpGHvD7Z~'
                'w6DOLSTOSN~faHcYIP1U0~mLrrjwTHBm~85bv9GYk84~Nqn2kKfM8q~'
                'zLYdlsj4Z9~QKeXYK2oSR~RTJMXD26Ak~iqLNss09Jd~hLeD_GxVsq~'
                'TaUYlgH_jM~JN2fNJ_Ue2~oCR2Pbz1ly~qWFESUmfEs~FySY6ZVB78~'
                'im3usT8hyU~t2ErccCFR9~jhuUT0OJva~WVrsHleeCL~tgP8r-gOe_~'
                'r_Jjn6Ua2V~ZBoVMjk2oM~dCMKBh0255~QxZFRkLR1E~2PzZzrnP0p~'
                'dqqWNpq7ul~zqe8dqUBGC~eZvMiRfsU3~LVSDGaCAdn~2-q1CV6LVL~'
                'AKT2U2P4W6~xufWQ15ZA3~F0-f08C1cg~YThJ5b-nhQ~eVxus64GZU~'
                'Cp5keYns6b~g0_XDQP_lq~YopcpQgQvo~wnGN3ZYkde~f-c_0dUV8c~'
                'VycxboLmxz~HY55MqmzzQ~QaiOjmwQnI~CrFcrMFJzz~plqXT5B4--~'
                'CiSfl_AE0h~-StjcwdYwv~X7o4FygncP~Xs-7j6fVPs~LJo91uBPz4~'
                'Jxg8TkZQdK~1LN8nwTqf_~GxEzX1Shma~A3wgamEIOQ~UdsOa6tZrT~'
                'cFYMvUloX0~0Q_7F0H35Y~klRGBoZBqU~rNs-bh_gk3~MhuNMsFpmN~'
                'Ie2c51_4Sp~5oPIfUbtd6~BtvmzyZ2Eh~zZZn32I7eS~rOnsP2Q5UN~'
                '6Bbq5MJhFe~leIwAgTj8E~bMWjDZvVht~UZootLOo57~Sbp1gmMeRy~'
                'f89C7fWBcE~MM6RXH_rlN~WLDE0_23UO~WCXA6OxHJa~6n5sWl9nNm; '
                '__utmb=235335808.10.10.1641805707; '
                'QSI_S_ZN_5hF4My7Ad6VNNAi=r:10:34'
            ),
        }
        return headers

    def head2(self):
        """Return a separate header dict for image requests.

        NOTE(review): no distinct header set for head2 is defined in this
        module, so it currently mirrors head1(); confirm whether image
        requests need different values.
        """
        return self.head1()
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)