Solving the 521 Status Code Returned When Scraping Web Pages with Python


Table of Contents

- Problem Description
- Cause Analysis
- Solution
  - Method 1
  - Method 2
  - Method 3
    - Code 1, Code 2, Code 3, Code 4, Code 5, Test Code


Project Environment: Python 3.8

Problem Description:

When using a Python crawler to fetch the detail pages linked from a listing page, the HTML returned for each detail page is abnormally short instead of the full page content.


Cause Analysis:

The target site was requested too frequently and its anti-crawler protection kicked in: instead of the real page, the server answers with status code 521 and a small piece of obfuscated JavaScript. A real browser executes that script, sets a __jsl_clearance cookie and reloads the page; a plain HTTP client does not, so it only ever receives the short challenge page.
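A quick way to confirm the diagnosis is to issue a bare request and look at the status code and the body length (a minimal sketch, using the same detail-page URL as the code below):

import requests

resp = requests.get('http://www.mafengwo.cn/poi/5423409.html')
print(resp.status_code)   # typically 521 while the JS challenge is active
print(len(resp.text))     # only a short script challenge instead of the full detail page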


Solution:

If none of the methods below solves the problem, a fallback is to save the raw source of the pages you want to crawl first and do the parsing on the saved files later.
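For instance, a page that does come through can be dumped to disk and parsed offline later (a minimal sketch; the file name is arbitrary):

import requests

resp = requests.get('http://www.mafengwo.cn/poi/5423409.html')
with open('detail_page.html', 'w', encoding='utf-8') as f:
    f.write(resp.text)   # parse the saved file later, e.g. with BeautifulSoup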

Method 1:

Switch to a different VPN, i.e. run the program from another machine, so that the requests reach the target site from a different IP address.
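If switching machines is not practical, routing the traffic through a proxy changes the outgoing IP in the same spirit. This is not from the original article, just a sketch using the proxies parameter of requests with a placeholder proxy address:

import requests

proxies = {
    'http': 'http://127.0.0.1:7890',    # placeholder; put a real proxy endpoint here
    'https': 'http://127.0.0.1:7890',
}
resp = requests.get('http://www.mafengwo.cn/poi/5423409.html', proxies=proxies, timeout=5)
print(resp.status_code)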

Method 2:

Copy the request headers of the target page from your browser's developer tools and add them to the code.

Adjust the header values according to the target site.

import requests


def askURL(url):
    head = {  # simulate a browser's request headers for the target server
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.mafengwo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
    }
    # assumed completion: send the request with these headers and return the HTML
    response = requests.get(url, headers=head)
    return response.text
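Usage is then a single call; printing the length is a quick check that a full page, and not the short 521 challenge, came back (assuming the completed askURL above):

html = askURL('http://www.mafengwo.cn/poi/5423409.html')
print(len(html))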
Method 3:

Execute the JavaScript that comes back with the 521 response to compute the __jsl_clearance cookie (this is what a real browser does), then request the page a second time carrying that cookie. The listings below are successive attempts at this approach.

Code 1:

import re
import requests
import execjs

url = 'http://www.mafengwo.cn/poi/5423409.html'
# NOTE: `head` used below is the same browser-header dict shown in Method 2;
# define it at module level before running this listing.

# response = requests.get(url)
# # first-visit cookies
# cookie1 = response.cookies
# # JS challenge code
# js_code = response.text



def get_521_content(url, head):
    # first request: returns 521 plus the JS challenge and the __jsluid cookie
    req = requests.get(url, headers=head)
    cookies = req.cookies
    cookies = '; '.join(['='.join(item) for item in cookies.items()])
    txt_521 = req.text
    # extract the JS challenge between the script tags
    # (the regex was lost in the page rendering and is reconstructed here)
    txt_521 = ''.join(re.findall('<script>(.*?)</script>', txt_521))
    return (txt_521, cookies)


def fixed_fun(function):
    # first stage: turn eval(...) into return(...) so execjs hands back the
    # decoded second-stage script instead of running it
    func_return = function.replace('eval', 'return')
    content = execjs.compile(func_return)

    # name of the outer function, e.g. "var l=function(){...}" -> "l"
    fn = function.split('=')[0].split(' ')[1]
    evaled_func = content.call(fn)
    # print(evaled_func)

    # second stage: strip the browser-only statements so the remaining code can
    # run inside execjs; the inner function name 'l' is hard-coded here
    # (Code 2 below extracts it dynamically)
    mode_func = evaled_func.replace('while(window._phantom||window.__phantomas){};', ''). \
        replace('document.cookie=', 'return').replace(';if((function(){try{return !!window.addEventListener;}', ''). \
        replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',l,false);}", ''). \
        replace("else{document.attachEvent('onreadystatechange',l);}", '').replace(
        r"setTimeout('location.href=location.href.replace(/[?|&]captcha-challenge/,'')',1500);", '')
    content = execjs.compile(mode_func)
    cookies = content.call('l')
    __jsl_clearance = cookies.split(';')[0]
    return __jsl_clearance


def cookie_dict(js, id):
    # merge the "__jsl_clearance=..." and "__jsluid=..." strings into one cookie dict;
    # maxsplit=1 keeps any '=' padding inside the cookie values intact
    cookie = {}
    js = js.split('=', 1)
    id = id.split('=', 1)
    cookie[js[0]] = js[1]
    cookie[id[0]] = id[1]
    return cookie


if __name__ == '__main__':
    func = get_521_content(url, head)
    content = func[0]

    cookie_id = func[1]
    cookie_js = fixed_fun(func[0])
    dicted_cookie = cookie_dict(cookie_js, cookie_id)
    # dicted_cookie is what the second request has to carry; Code 2 below completes that step
    print(dicted_cookie)

Code 2:

import re
import requests
import execjs

url = 'http://www.mafengwo.cn/poi/5423409.html'

head = {  # simulate a browser's request headers for the target server
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.mafengwo.cn',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
}




def get_521_content(url):
    # first request: expect 521 plus the JS challenge
    req = requests.get(url, headers=head, timeout=5)
    print(req.status_code, req.text)
    if req.status_code == 521:
        cookies = dict(req.cookies.items())
        print(cookies)
        # extract the JS challenge between the script tags
        # (the regex was lost in the page rendering and is reconstructed here)
        js_con = ''.join(re.findall('<script>(.*?)</script>', req.text))
        if js_con:
            __jsl_clearance = fixed_fun(js_con, url)
            if __jsl_clearance:
                key, value = __jsl_clearance.split('=', 1)
                cookies[key] = value
                return cookies


# Execute the JS challenge to obtain the __jsl_clearance key/value for the cookies
def fixed_fun(js_con, url):  # js_con: the JS returned by the first request

    func_return = js_con.replace('eval(', 'return(')
    print('after replacing eval with return: ', func_return)
    content = execjs.compile(func_return)
    # fn = js_con.split('=')[0].split(' ')[1]
    # only ['document.cookie']
    fn = js_con.split('=')[0].split(' ')[1]
    evaled_func = content.call(fn)
    print('after the first JS evaluation: ', evaled_func)
    fn = evaled_func.split('=')[0].split(' ')[1]  # name of the dynamically generated function
    # get the <a> tag assigned to innerHTML; the split markers were lost in the
    # page rendering, so the delimiters below are a reconstruction
    aa = evaled_func.split("innerHTML='")
    aa = aa[1].split("';")[0] if len(aa) >= 2 else ''
    mode_func = evaled_func. \
        replace(
        "setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')',1500);document.cookie=",
        'return'). \
        replace(';if((function(){try{return !!window.addEventListener;}', ''). \
        replace(
        "}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
        ''). \
        replace(
        "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
        ''). \
        replace("return'__jsl_clearance", "var window={};return '__jsl_clearance"). \
        replace(
        "var " + fn + "=document.createElement('div');" + fn + ".innerHTML='" + aa + "';" + fn + "=" + fn + ".firstChild.href",
        "var " + fn + "='" + url + "'")
    print('JS code after the second round of replacements:', mode_func)
    try:
        content = execjs.compile(mode_func)
        cookies = content.call(fn)
        __jsl_clearance = cookies.split(';')[0]
        print(__jsl_clearance)
        return __jsl_clearance
    except Exception:
        print('JS execution error:', mode_func)
        return None


# Second request for the detail page, carrying the decoded cookies
def con_spider(cookies, url):
    response = requests.get(url, headers=head, cookies=cookies, timeout=5)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        print(response.status_code)
        print(response.text)
        return response
    else:
        print('unexpected status code on the second request:', response.status_code)
        return None


if __name__ == "__main__":
    cookies = get_521_content(url)
    con_spider(cookies, url)
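Since the original problem was fetching many detail pages linked from a listing page, the two helpers above can be reused in a loop. A minimal sketch; the list of detail URLs is a placeholder for whatever you extract from the listing page:

if __name__ == "__main__":
    detail_urls = [
        'http://www.mafengwo.cn/poi/5423409.html',   # placeholder detail-page URLs
    ]
    for detail_url in detail_urls:
        cookies = get_521_content(detail_url)
        if cookies:
            con_spider(cookies, detail_url)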

Code 3:
# resource: https://www.cnblogs.com/gongs/p/10524710.html

import re
import urllib.error
import urllib.request

import execjs
import requests

url = 'http://www.mafengwo.cn/poi/5423409.html'

head = {  # simulate a browser's request headers for the target server
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    # "Cache-Control": "max-age=0",
    # "Connection": "keep-alive",
    # "cookie": 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"www.mafengwo.cn";s:2:"ft";s:19:"2022-01-10+21:47:27";}; __jsluid_h=aa6e6e4350e2fd0e52cc227da10e26b5; __omc_chl=; __omc_r=; __mfwc=direct; uva=s:78:"a:3:{s:2:"lt";i:1641822448;s:10:"last_refer";s:6:"direct";s:5:"rhost";s:0:"";}";; __mfwurd=a:3:{s:6:"f_time";i:1641822448;s:9:"f_rdomain";s:0:"";s:6:"f_host";s:3:"www";}; __mfwuuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; UM_distinctid=17e443e711c512-05dd7ff73ec639-5e181552-144000-17e443e711dc58; login=mafengwo; mafengwo=16a582a6e0ca5f6c73654cb640343886_35627906_61e15d7be119c7.29366428_61e15d7be11a11.54996187; __jsl_clearance=1642341544.979|0|fafiHNHGZB+baEyxg5NVjPfVXm0=; PHPSESSID=s4foj9fhkm3mq8rs64omagvvp2; mfw_uid=35627906; __mfwa=1641822449293.40635.14.1642238623523.1642341546971; __mfwlv=1642341546; __mfwvn=10; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642215122,1642218839,1642238624,1642341547; CNZZDATA30065558=cnzz_eid=1067569765-1641819345-&ntime=1642337760; bottom_ad_status=0; uol_throttle=35627906; __mfwb=8cc49c72508e.10.direct; __mfwlt=1642343676; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642343676',
    # "Host": "www.mafengwo.cn",
    # "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
}


def getResponse():
    """
    Send the first request and return the response (usually the 521 challenge page).
    :return: requests.Response
    """
    response = requests.get(url, headers=head)
    return response


def getJslid(response):
    """
    Join the cookies of the first response (e.g. __jsluid_h) into a "k=v; k=v" string.
    :param response: the first response
    :return: cookie string
    """
    cook = response.cookies
    return '; '.join(['='.join(item) for item in cook.items()])


def getClearance(response):
    """
    Execute the JS challenge from the first response and return the
    "__jsl_clearance=..." cookie fragment.
    :return: clearance string
    """
    # extract the JS challenge between the script tags
    # (the regex was lost in the page rendering and is reconstructed here)
    txt = ''.join(re.findall('<script>(.*?)</script>', response.text))
    func_return = txt.replace('eval', 'return')
    print(func_return)

    content = execjs.compile(func_return)
    print(type(content))
    # content = open("jsdom_document").read()
    # print(content)
    # execjs._exceptions.ProgramError: ReferenceError: document is not defined
    eval_func = content.call('x')  # 'x' is the outer function name used by the challenge script in this capture

    name = re.findall(r'var (.*?)=function.*', eval_func)[0]

    mode_func = eval_func.replace('while(window._phantom||window.__phantomas){};', ''). \
        replace('document.cookie=', 'return').replace('if((function(){try{return !!window.addEventListener;}', ''). \
        replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',%s,false)}" % name, ''). \
        replace("else{document.attachEvent('onreadystatechange',%s)}" % name, '').replace(
        r"setTimeout('location.href=location.pathname+location.search.replace(/[?|&]captcha-challenge/,'')',1500);",
        '')

    content = execjs.compile(mode_func)
    cookies = content.call(name)
    # print(cookies)
    clearance = cookies.split(';')[0]

    return clearance


def structurecookie(cook, clearance):
    """
    Build the full cookie string: first-response cookies + __jsl_clearance.
    :return: cookie string
    """
    cookie = cook + ';' + clearance
    print(cookie)

    return cookie


if __name__ == '__main__':
    response = getResponse()
    clearance = getClearance(response)
    cook = getJslid(response)

    head = {  # headers for the second request, now carrying the computed cookie
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        # cookie assembled from this run's __jsluid + __jsl_clearance values
        # (instead of a cookie copied from a browser session)
        "cookie": structurecookie(cook, clearance),
        "Host": "www.mafengwo.cn",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
    }

    # # print the status code of a plain requests call
    # req = requests.get(url, headers=head).status_code
    # print(req)

    # second request with urllib, carrying the new headers and cookie
    request = urllib.request.Request(url, headers=head)

    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode(encoding="utf-8", errors="ignore")
        print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print("status code: %s" % e.code)
        if hasattr(e, "reason"):
            print("reason: %s" % e.reason)

# response = requests.get(url)
# print(response)

# # cookie1
# cookie1 = response.cookies
# print(cookie1)
# # JS code
# js_code = response.text
# print(js_code)
