Pixiv Crawler


Crawls the illustration works of a specified number of users followed by a given Pixiv account id. The flow is: get_uid.py fetches the followed users' ids, get_pid.py collects each user's work ids, and gevent_download.py downloads the original images and records per-work statistics.

gevent_download.py

from statistical_data import html_tree
import os
import multiprocessing
from url_mange import Urlmanage
from get_uid import get_uid
from get_pid import get_pid
import requests
import re
class gevent(object):


    def pro(self, url, uid):
        # Create a folder for this user's downloads (default save path: /Users/<uid>)
        os.mkdir('/Users/' + uid)
        img_ids = []
        urls = []
        names = []
        headers2 = Urlmanage().head2()  # headers for the image host (referer is filled in per work)
        headers = Urlmanage().head1()   # headers for the ajax API request
        res = requests.get(url, headers=headers)
        print("res===%s" % res)
        data = res.json()
        works = data['body']['works']

        for work_id in works:
            urls.append(works[work_id]['url'])
            names.append(works[work_id]['title'])

        # Extract the "<date path>/<work id>" part from each thumbnail URL
        for a in urls:
            img_ids.append(re.findall('img/(.*?)_p0', a))

        url_img = 'https://i.pximg.net/img-original/img/{}_p0.jpg'
        url_img2 = 'https://i.pximg.net/img-original/img/{}_p0.png'
        i = 0
        string = 'https://www.pixiv.net/artworks/{}'
        pattern = re.compile(r'/(\d{4}.*)')
        for astr in img_ids:
            pid = pattern.findall(astr[0])[0]  # work id at the end of the path
            string2 = string.format(pid)       # artwork page URL, used as the referer
            html_tree(pid=pid)                 # record this work's statistics to data.csv
            headers2['referer'] = string2      # update the referer before fetching the original image
            rrr = requests.get(url_img.format(astr[0]), headers=headers2)
            rrr2 = requests.get(url_img2.format(astr[0]), headers=headers2)
            if rrr.status_code == 200:
                # the original image is a .jpg
                with open('/Users/%s/%s.jpg' % (uid, pid), 'wb+') as f:  # default save path
                    f.write(rrr.content)
                    print("%s downloaded successfully\n" % names[i])
            else:
                # otherwise fall back to the .png original
                with open('/Users/%s/%s.png' % (uid, pid), 'wb+') as p:
                    p.write(rrr2.content)
                    print("%s downloaded successfully\n" % names[i])
            i += 1
if __name__ == '__main__':
    with open('data.csv', 'w', encoding='utf-8') as f:
        f.write('pid,page_view,Thumb_up_quantity\n')
    date = input("Enter your Pixiv user id: ")
    qua = input("Enter the number of followed users to crawl: ")
    uids = get_uid().userid(data=date, qua=qua)
    pool = multiprocessing.Pool(processes=4)  # process pool
    for uid in uids:
        pids = get_pid().getpid(uid=uid)
        url = 'https://www.pixiv.net/ajax/user/{uid}/profile/illusts?{pid}&work_category=illustManga&is_first_page=1&lang=zh'.format(
            uid=uid, pid=pids)
        pool.apply_async(gevent().pro, (url, uid))

    pool.close()
    pool.join()

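gevent_download.py also imports html_tree from statistical_data, which is not listed in this post. Below is only a minimal sketch of what such a module could look like, assuming it reads each work's view and like counts from Pixiv's per-illust ajax endpoint and appends them to the data.csv created above; the endpoint and the viewCount/likeCount field names are assumptions, not the original code.

# statistical_data.py -- sketch only, not the original module
import csv
import requests
from url_mange import Urlmanage


def html_tree(pid):
    # Assumption: /ajax/illust/<pid> exposes per-work statistics such as
    # 'viewCount' and 'likeCount'; adjust the field names if they differ.
    url = 'https://www.pixiv.net/ajax/illust/{}?lang=zh'.format(pid)
    body = requests.get(url, headers=Urlmanage().head1()).json()['body']
    with open('data.csv', 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow([pid, body['viewCount'], body['likeCount']])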
get_pid.py

import requests
from url_mange import Urlmanage


class get_pid(object):
    def getpid(self, uid):
        # Fetch all of the user's work ids, then join them into an ids[] query string
        url = 'https://www.pixiv.net/ajax/user/{uid}/profile/all?lang=zh'.format(uid=uid)
        l = requests.get(url, headers=Urlmanage().head1()).json()
        r = l['body']['illusts']
        return 'ids[]=' + '&ids[]='.join(r)

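For reference, getpid() flattens the work-id list into a repeated ids[] query string, which the main block splices into the profile/illusts URL. A quick illustration with made-up work ids (not real data):

# Made-up work ids, purely for illustration
pids = 'ids[]=' + '&ids[]='.join(['95000001', '95000002'])
# pids == 'ids[]=95000001&ids[]=95000002'
url = ('https://www.pixiv.net/ajax/user/{uid}/profile/illusts?{pid}'
       '&work_category=illustManga&is_first_page=1&lang=zh').format(uid='123456', pid=pids)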

get_uid.py

from url_mange import Urlmanage
import requests


class get_uid(object):
    def userid(self, data, qua):
        # Query the first `qua` users followed by account `data` and collect their user ids
        list_id = []
        url = "https://www.pixiv.net/ajax/user/%s/following?offset=0&limit=%s&rest=show&tag=&lang=zh" % (data, qua)
        a = requests.get(url, headers=Urlmanage().head1())
        j = a.json()
        l = j['body']['users']
        for i in l:
            list_id.append(i['userId'])
        print(list_id)
        return list_id

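userid() only needs the userId field of each entry under body.users in the /following response; a stripped-down illustration of that parse (the response values here are made up):

# Made-up, trimmed /ajax/user/<id>/following response; userid() keeps only userId
j = {"body": {"users": [{"userId": "111111", "userName": "artist_a"},
                        {"userId": "222222", "userName": "artist_b"}]}}
uids = [u['userId'] for u in j['body']['users']]  # -> ['111111', '222222']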
url_mange.py

from anti_useragent import UserAgent
class Urlmanage(object):
    def head1(self):
        headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'accept-encoding': 'gzip, deflate, br',
                    'accept-language': 'zh-CN,zh-HK;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6',
                    'referer': '',
                    'sec-fetch-dest': 'image',
                    'sec-fetch-mode': 'no-cors',
                    'sec-fetch-site': 'cross-site',
                    'user-agent': UserAgent(min_version=50, max_version=80).random,
                    'cookie': '29=b.6dd7c319e074b4f8; __utmc=235335808; tags_sended=1; categorized_tags=IVwLyT8B6k~RcahSSzeRf~_-agXPKuAQ~qiO14cZMBI; _gid=GA1.2.149545487.1641800553; __utma=235335808.1011429997.1641442221.1641800539.1641805707.10; __utmt=1; __cf_bm=IjpztpOQs3g87bPZvauxL7sWuWQHDZ4RQcR1btSMTrc-1641807660-0-AWvQ1Wgma2i2FQ9QE0bBFWUYYy5snGql/ghdkDEyKsJ7N6EE4yquIB7eXTyKo0fZFN98bgM0GY+oZbbq73jd3tmRH0dbO67Oxk6Gz9mzuSzNRpBksovwXZSkK4yxeKW3ALdaAts+aeV7Actqz0KySTbdZO2htBmpGis2m1lNq+fRzb5+Nqnx/4wTEhzEzMLjbQ==; tag_view_ranking=4TDL3X7bV9~jpIZPQ502H~fqoLWl17eh~zj9gD1HFwG~PwDMGzD6xn~vNpvefPsAB~uusOs0ipBx~yOqOtdektt~0sVgHoAwbd~qVjnJnSnAY~0xsDLqCEW6~cjsAyvz-bf~7OHNI0Gq-U~TFDu872vC0~RcahSSzeRf~FqVQndhufZ~Lt-oEicbBr~skx_-I2o4Y~WTz8QdIkZx~3cT9FM3R6t~LTW5GJcQwW~ETjPkL0e6r~qiO14cZMBI~zKLqKSPEAG~PFZpGHvD7Z~w6DOLSTOSN~faHcYIP1U0~mLrrjwTHBm~85bv9GYk84~Nqn2kKfM8q~zLYdlsj4Z9~QKeXYK2oSR~RTJMXD26Ak~iqLNss09Jd~hLeD_GxVsq~TaUYlgH_jM~JN2fNJ_Ue2~oCR2Pbz1ly~qWFESUmfEs~FySY6ZVB78~im3usT8hyU~t2ErccCFR9~jhuUT0OJva~WVrsHleeCL~tgP8r-gOe_~r_Jjn6Ua2V~ZBoVMjk2oM~dCMKBh0255~QxZFRkLR1E~2PzZzrnP0p~dqqWNpq7ul~zqe8dqUBGC~eZvMiRfsU3~LVSDGaCAdn~2-q1CV6LVL~AKT2U2P4W6~xufWQ15ZA3~F0-f08C1cg~YThJ5b-nhQ~eVxus64GZU~Cp5keYns6b~g0_XDQP_lq~YopcpQgQvo~wnGN3ZYkde~f-c_0dUV8c~VycxboLmxz~HY55MqmzzQ~QaiOjmwQnI~CrFcrMFJzz~plqXT5B4--~CiSfl_AE0h~-StjcwdYwv~X7o4FygncP~Xs-7j6fVPs~LJo91uBPz4~Jxg8TkZQdK~1LN8nwTqf_~GxEzX1Shma~A3wgamEIOQ~UdsOa6tZrT~cFYMvUloX0~0Q_7F0H35Y~klRGBoZBqU~rNs-bh_gk3~MhuNMsFpmN~Ie2c51_4Sp~5oPIfUbtd6~BtvmzyZ2Eh~zZZn32I7eS~rOnsP2Q5UN~6Bbq5MJhFe~leIwAgTj8E~bMWjDZvVht~UZootLOo57~Sbp1gmMeRy~f89C7fWBcE~MM6RXH_rlN~WLDE0_23UO~WCXA6OxHJa~6n5sWl9nNm; __utmb=235335808.10.10.1641805707; QSI_S_ZN_5hF4My7Ad6VNNAi=r:10:34'

                    }
        return headers

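gevent_download.py also calls Urlmanage().head2() for the image requests, but that method is missing from the listing above. A minimal sketch, assuming the image host mainly needs the same browser-like headers plus the per-work referer that pro() fills in:

# url_mange.py, inside class Urlmanage -- sketch only, the original head2() is not shown
def head2(self):
    headers2 = dict(self.head1())  # reuse the browser-like headers from head1()
    headers2['referer'] = ''       # pro() overwrites this per work with the artwork page URL
    return headers2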



