【转载】爬虫篇：获取数据——urllib库的基础知识（总结）_python

注：本文章大部分代码的案例节选于《Python3网络爬虫开发实战（第2版）》。

一、发出请求 1、urlopen方法

# Python版本：3.6
# -*- coding:utf-8 -*-
"""
urlopen()方法的API:
urllib.request.urlopen(url,data=None,[timeout,]*,cafile=None,capath=None,cadefault=False,context=None)
"""
import urllib.request

# Open the page; the context manager closes the connection when the block exits.
with urllib.request.urlopen('https://www.python.org') as response:
    # Uncomment to dump the page source:
    # print(response.read().decode('utf-8'))
    # Type of the response object
    print(type(response))  # <class 'http.client.HTTPResponse'>
    # HTTP status code
    print(response.status)  # 200
    # All response headers, as a list of (name, value) tuples
    print(response.getheaders())
    # Value of a single header, here Content-Type
    print(response.getheader('Content-Type'))

扩展1：下载网页到本地磁盘 a.先读取网站的源代码，再用文件操作写入、保存到本地磁盘

import urllib.request

# Fetch the page; the context manager closes the connection afterwards.
with urllib.request.urlopen('https://www.baidu.com/') as request_url:
    # Keep the body as raw bytes: the file below is opened in binary mode
    # ('wb'), and writing a decoded str to it would raise TypeError.
    html = request_url.read()
# Save the page to local disk, byte for byte as received.
with open('html_1.html', mode='wb') as f:
    f.write(html)

b.使用urllib.request.urlretrieve方法直接下载到本地磁盘

import urllib.request

# Download the page straight to a local file.
# urlretrieve returns a (filename, headers) tuple.
fileName = urllib.request.urlretrieve("https://www.geeksforgeeks.org/", 'html_2.html')
print("fileName:",fileName) # fileName: ('html_2.html', <http.client.HTTPMessage object at 0x...>)

扩展2：获取网站信息 a.文件头信息（网页信息）

# The context manager closes the connection once the headers have been printed.
with urllib.request.urlopen('https://www.baidu.com') as response:
    # info() gives the whole header block
    print("获取文件头信息：", response.info())  # Accept-Ranges: bytes Cache-Control: no-cache ...
    # getheaders() gives the same data as a list of (name, value) tuples
    print("获取文件头信息 - 列表嵌套元组的形式输出：", response.getheaders())  # [('Accept-Ranges', 'bytes'), ('Cache-Control', 'no-cache'), ('Content-Length', '227'), ...]
    # getheader(name) looks up one header
    print("获取某项文件头信息，如Server:",response.getheader('Server')) # BWS/1.1

b.状态码：status、getcode()

# getcode() and .status both report the HTTP status code.
# The context manager closes the connection afterwards.
with urllib.request.urlopen('https://www.baidu.com') as response:
    print("获取状态码：", response.getcode())  # 200
    print("获取状态码：", response.status)  # 200

2、data参数

# Python版本：3.6
# -*- coding:utf-8 -*-

import urllib.request
import urllib.parse

# urlopen needs the request body as bytes: URL-encode the form fields,
# then encode the resulting string as UTF-8.
data = urllib.parse.urlencode({'name': 'germey'}).encode('utf-8')
print(data)
# Supplying data turns the request into a POST.
# The context manager closes the connection afterwards.
with urllib.request.urlopen('https://www.httpbin.org/post', data=data) as response:
    print(response.url)
    print(response.read().decode('utf-8'))

3、timeout参数

案例1：

import urllib.request

try:
    # timeout=3: give up if the server has not responded within 3 seconds.
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen('http://www.google.com', timeout=3) as html:
        data = html.read()
    print(len(data))
except Exception as e:
    # Deliberately broad: this demo only reports whatever went wrong
    # (timeout, DNS failure, HTTP error, ...) instead of crashing.
    print('异常了...', str(e))

案例2：

import socket
import urllib.request
import urllib.error

try:
    # A 0.1 s timeout is short enough that the request is expected to time out.
    response = urllib.request.urlopen('https://www.httpbin.org/get', timeout=0.1)
except socket.timeout:
    # A timeout that fires after the request was sent (while waiting for the
    # response) may surface as a bare socket.timeout instead of a URLError.
    print('time out')
except urllib.error.URLError as e:
    # A timeout while connecting is wrapped: e.reason holds the socket.timeout.
    if isinstance(e.reason, socket.timeout):
        print('time out')
    else:
        # Report other failures instead of silently swallowing them.
        print(e.reason)
4、其他参数（了解）

# Python版本：3.6
# -*- coding:utf-8 -*-
import urllib.request
import ssl

# context: an ssl.SSLContext carrying the TLS settings for the request.
context = ssl.create_default_context()

# The separate cafile= / capath= arguments of urlopen are deprecated since
# Python 3.6 and removed in 3.13. To trust a custom CA file or directory,
# load it into the context instead:
#     context.load_verify_locations(cafile=..., capath=...)
with urllib.request.urlopen('https://www.sogou.com', context=context) as res:
    print(res.status) # 200
    print(res.read().decode('utf-8'))

5、Request()：不完整请求，需要与urlopen()结合使用

参数说明：

class urllib.request.Request(url,data=None,headers={},origin_req_host=None,unverifiable=False,method=None)
第一个参数 url 用于请求url，这是必选参数，其它的都是可选参数
第二个参数 data 如果要传数据，必须传入 bytes 类型的。如果数据是字典，可以先用 urllib.parse 模块里的 urlencode 方法进行编码
第三个参数 headers 是一个字典，这就是请求头，可以通过 headers 参数直接构造此项，也可以通过调用实例的 add_header 方法添加
第四个参数 origin_req_host 指的是请求方的 host 名称或 IP 地址
第五个参数 unverifiable 表示请求是否是无法验证，默认取值是 False，意思是用户没有足够的权限来接收这个请求的结果。
		例如：请求一个 HTML 文档中的图片，但是没有自动抓取图像的权限，这里 unverifiable 的值就是 True 
第六个参数 method 是一个字符串，用来指示请求使用的方法，例如 GET、POST、PUT等

headers参数

from urllib.request import Request, urlopen

# Present a browser User-Agent instead of urllib's default one.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/97.0.4692.71 Safari/537.36 '
}
# Request only describes the request; urlopen actually sends it.
req = Request('https://python.org', headers=headers)
# The context manager closes the connection afterwards.
with urlopen(req) as response:
    print(response.getcode())
    print(response.read().decode('utf-8'))

传入多个参数，比如url、headers、data参数

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib import parse, request

url = 'https://www.httpbin.org/post'
headers = {
    # The value must not repeat the header name: a leading "User-Agent: "
    # would be sent as part of the user-agent string itself.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0',
    'Host': 'www.httpbin.org'
}
# Named `form` rather than `dict` so the builtin dict type is not shadowed.
form = {'name': 'germey'}
# URL-encode the form fields and convert them to the bytes a request body needs.
data = parse.urlencode(form).encode('utf-8')
print(data)  # b'name=germey'
# Describe the request: target URL, body, headers and HTTP method.
req = request.Request(url=url, data=data, headers=headers, method='POST')
# Send it; the context manager closes the connection afterwards.
with request.urlopen(req) as response:
    # Print the response body
    print(response.read().decode('utf-8'))

使用add_header方法添加headers

from urllib.request import Request, urlopen

req = Request('https://www.baidu.com')
# Headers can also be attached after construction, one at a time.
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/97.0.4692.71 Safari/537.36 ')
# The context manager closes the connection afterwards.
with urlopen(req) as res:
    print(res.status)

6、验证（常用于登录弹窗）

在访问某些网站时，例如：https://ssr3.scrape.center，可能会弹出这样的认证窗口，这样的网页怎么爬取？可借助HTTPBasicAuthHandler类完成。

from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

url = 'https://ssr3.scrape.center'
username = "admin"
password = 'admin'

# Build the handler that answers HTTP Basic Auth challenges.
# HTTPBasicAuthHandler takes a password manager; add_password registers
# the credentials to use for this URL (realm None = any realm).
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)
auth_handler = HTTPBasicAuthHandler(p)
print(auth_handler)
# Build an opener around the handler; requests sent through this opener
# supply the credentials when the server asks for them.
opener = build_opener(auth_handler)
print(opener)
try:
    # opener.open() performs the authenticated request; the body read here
    # is the page source returned after authentication succeeded.
    # The context manager closes the connection afterwards.
    with opener.open(url) as result:
        html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)

7、代理ip 写法1

import urllib
import urllib.request

def use_proxy(proxy_addr,url):
    """Fetch *url* through the proxy at *proxy_addr* and return the body as text.

    Args:
        proxy_addr: Proxy address as ``'host:port'``.
        url: The http:// or https:// URL to fetch.

    Returns:
        The response body decoded as UTF-8.

    Side effect: the opener is installed globally, so later
    ``urllib.request.urlopen`` calls also go through the proxy.
    """
    # Register the proxy for both schemes. ProxyHandler only applies an
    # entry to URLs of the matching scheme, so an 'http'-only mapping would
    # let https:// URLs bypass the proxy entirely.
    proxy = urllib.request.ProxyHandler({'http': proxy_addr, 'https': proxy_addr})
    # Build an opener that routes requests through the proxy handler.
    opener = urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
    # Kept from the original: make this opener the process-wide default.
    urllib.request.install_opener(opener)
    # The context manager closes the connection once the body is read.
    with opener.open(url) as response:
        return response.read().decode('utf-8')

# Proxy address in host:port form
proxy_addr = '209.141.56.127:80'
# Fetch the page through the proxy and report the size of the decoded body
data = use_proxy(proxy_addr,'https://www.so.com/')
print(len(data))

写法2

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

url = 'https://blog.csdn.net/'
# One entry per URL scheme. A dict cannot hold two 'http' keys (the later
# one silently replaces the earlier), and the target URL is https, so the
# second proxy is registered under 'https'.
proxy_handler = ProxyHandler({
    'http': 'http://209.141.56.127:80',
    'https': 'http://123.253.97.89:8080',
})
opener = build_opener(proxy_handler)
try:
    # The context manager closes the connection afterwards.
    with opener.open(url) as response:
        print(response.read().decode('utf-8'))
except URLError as e:
    print(e.reason)

扩展：获取免费代理ip

以下程序的功能，从66免费代理网站，抓取免费代理：

import requests
from lxml import html
import time

# One shared session for all page requests.
S = requests.Session()

# Browser-like request headers sent with every page request.
target_haeders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

for pn in range(10):
    target_url = f'http://www.66ip.cn/{pn}.html'
    # Download the listing page and decode it as GBK.
    response = S.get(url=target_url, headers=target_haeders)
    response.encoding = 'gbk'
    context = response.text
    # Parse the HTML into an element tree.
    ht = html.fromstring(context)
    # First two cells of every table row after the first row
    # (presumably the IP and port columns - verify against the page).
    tr_li = ht.xpath('//tr[position()>1]/td[position()<3]')
    # Cells alternate: odd ones are followed by ':', even ones end the line,
    # so each pair prints as "first:second".
    for i, li in enumerate(tr_li, start=1):
        print(li.text, end=':' if i % 2 == 1 else '\n')
    # Pause between pages.
    time.sleep(2)

8、获取网站的Cookie 案例1：获取网站的Cookie

# Python版本：3.6
# -*- coding:utf-8 -*-

import http.cookiejar,urllib.request

# The jar collects cookies; HTTPCookieProcessor feeds it from responses.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# Only the cookies matter here, so the response is closed right away.
with opener.open('https://www.baidu.com') as response:
    pass
for item in cookie:
    print(item.name + '=' + item.value)

运行结果如下：

BAIDUID=2A6D858098753BD500794CA457AAD0F5:FG=1
BIDUPSID=2A6D858098753BD53D9F605F2F6720C7
PSTM=1649861956
BD_NOT_HTTPS=1

案例2：以文本形式保存cookie

# Python版本：3.6
# -*- coding:utf-8 -*-

import urllib.request,http.cookiejar

filename = 'cookie1.txt'
# MozillaCookieJar is a CookieJar that can save itself to a text file.
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# Only the cookies matter here, so the response is closed right away.
with opener.open('https://www.baidu.com') as response:
    pass
# Also write cookies marked as discardable and cookies that have expired.
cookie.save(ignore_discard=True,ignore_expires=True)

程序执行后，其cookie1.txt文件的内容如下：

# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file!  Do not edit.

.baidu.com	TRUE	/	FALSE	1681398291	BAIDUID	D69EC146D8DB2030A82CB587DFE99742:FG=1
.baidu.com	TRUE	/	FALSE	3797345938	BIDUPSID	D69EC146D8DB20303EA44C8D2690E3D0
.baidu.com	TRUE	/	FALSE	3797345938	PSTM	1649862292
www.baidu.com	FALSE	/	FALSE	1649862591	BD_NOT_HTTPS	1

案例3：LWP(libwww-perl)格式记录的cookie a、保存成LWP(libwww-perl)格式的cookie文件

import urllib.request, http.cookiejar

"""
http.cookiejar.LWPCookieJar()
保存成LWP(libwww-perl)格式的cookie文件
"""

filename = 'cookie2.txt'
# LWPCookieJar saves cookies in the libwww-perl "Set-Cookie3" format.
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# Only the cookies matter here, so the response is closed right away.
with opener.open('https://www.baidu.com') as response:
    pass
# Also write cookies marked as discardable and cookies that have expired.
cookie.save(ignore_discard=True, ignore_expires=True)

程序执行后，其cookie2.txt文件的内容如下：

#LWP-Cookies-2.0
Set-Cookie3: BAIDUID="6AA8678D4A29C7D9FF786F5A30C007F4:FG=1"; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2023-04-15 15:12:13Z"; comment=bd; version=0
Set-Cookie3: BIDUPSID=6AA8678D4A29C7D9BA1979358931B470; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2090-05-03 18:26:20Z"; version=0
Set-Cookie3: PSTM=1650035534; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2090-05-03 18:26:20Z"; version=0
Set-Cookie3: BD_NOT_HTTPS=1; path="/"; domain="www.baidu.com"; path_spec; expires="2022-04-15 15:17:13Z"; version=0

b、读取LWP格式的cookie文件

import urllib.request, http.cookiejar

"""
读取LWP格式的cookie文件
"""
cookie = http.cookiejar.LWPCookieJar()
# Load the saved cookies, including discardable and expired ones.
cookie.load('cookie2.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# Requests through this opener send the loaded cookies.
# The context manager closes the connection afterwards.
with opener.open('https://www.baidu.com') as response:
    print(response.read().decode('utf-8'))

9、gzip压缩的网站怎么爬取内容（gzip是压缩，不是加密）

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib.request import urlopen, Request
from io import BytesIO
import gzip


def get_html(url):
    """Fetch *url* and print its HTML, decompressing a gzip-encoded body.

    gzip is a compression format, not encryption: a server that answers
    with ``Content-Encoding: gzip`` sends compressed bytes, which must be
    decompressed before they can be decoded as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        None. The decoded page source is printed to stdout.
    """
    headers = {
        'Accept': "text/html,application/xhtml+xml,application/xml;"
                  "q=0.9,image/avif,image/webp,image/apng,*/*;"
                  "q=0.8,application/signed-exchange;"
                  "v=b3;q=0.9",
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/98.100.4758.66 Safari/537.36 '
    }
    res_url = Request(url, headers=headers)
    # The context manager closes the connection once the body is read.
    with urlopen(res_url) as res:
        html = res.read()
        content_encoding = res.info().get('Content-Encoding') or ''
    # Content-coding names are case-insensitive, so normalize before comparing.
    if content_encoding.strip().lower() == 'gzip':
        html = gzip.decompress(html)
    # errors='ignore' drops bytes that are not valid UTF-8
    # (errors='replace' would substitute a replacement character instead).
    print(html.decode('utf-8', errors='ignore'))


if __name__ == '__main__':
    url = 'https://www.douyu.com'
    get_html(url)

二、处理异常 1、HTTPError a、处理可能不存在的网页：

HTTPError，用来处理HTTP请求错误，例如认证请求失败等。它有如下3个属性：
a、code:返回HTTP状态码，例如，404表示网页不存在，500表示服务器内部错误等
b、reason:同父类一样，用于返回错误的原因,可能返回字符串，也可能返回是对象
c、headers:返回请求头

import urllib.request
import urllib.error
# Typical use: requesting a page that may not exist.
try:
    urllib.request.urlopen('http://1000phone.com/1')
except urllib.error.HTTPError as e:
    # HTTP status code of the failed request, e.g. 404
    print(e.code)
    # Reason for the failure
    print(e.reason)
    # Headers that came with the error response
    print(e.headers)

b、HTTPError无法处理不存在的域名网站的处理

import urllib.request
import urllib.error
"""
程序报错，HTTPError无法处理不存在的网址异常
"""
# Intentional failure demo: per the article this domain does not exist, so the
# error raised is not an HTTPError and escapes the handler below.
try:
    urllib.request.urlopen('http://www.xyxyxz.com') # raises; not caught below
except urllib.error.HTTPError as e:
    print(e.reason)

2、URLError a、处理不存在的网页

import urllib.request
import urllib.error
"""
code（404，...）属性
"""
# URLError also covers HTTPError (its subclass), so check for `code` before
# using it: only HTTP-level errors carry a status code.
try:
    urllib.request.urlopen("http://www.1000phone.cc")
except urllib.error.URLError as e:
    if hasattr(e,'code'): # hasattr checks whether e has a `code` attribute
        print(e.code) # print the status code
    print(e.reason)

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib import request,error
"""
URLError是HTTPError的父类
"""
try:
    # Request a page that does not exist; the error is caught as URLError
    # below and, per the article, prints "Not Found".
    response = request.urlopen('https://cuiqingcai.com/404')
except error.URLError as e:
    print(e.reason)  # Not Found

b、reason属性返回的不一定是字符串，也可能是一个对象

# Python版本：3.6
# -*- coding:utf-8 -*-

import urllib.error
import socket
import urllib.request

try:
    # A 0.01 s timeout is short enough that the request is expected to time out.
    response = urllib.request.urlopen('https://www.baidu.com', timeout=0.01)
except socket.timeout:
    # A timeout that fires after the request was sent (while waiting for the
    # response) may surface as a bare socket.timeout instead of a URLError.
    print('time out')
except urllib.error.URLError as e:
    # reason is not always a string; here it is an exception object.
    print(type(e.reason))
    # A wrapped timeout shows up as a socket.timeout instance in e.reason.
    if isinstance(e.reason, socket.timeout):
        print('time out')

3、混合型：使用HTTPError + URLError处理异常

import urllib.request
import urllib.error
# 常用于处理不存在的网址或HTTPError中不存在的网址（404、）
"""
HTTPError:它是URLError的子类
URLError:它是HTTPError的父类
"""
try:
    urllib.request.urlopen('http://www.1000phone.cc')
# Catch the subclass (HTTPError) first ...
except urllib.error.HTTPError as e:
    print(e.code)
    print(e.headers)
    print('HTTPError:',e.reason)
# ... then the parent class (URLError) for every other URL-level failure.
except urllib.error.URLError as e:
    print('URLError:',e.reason)
else:
    # Runs only when no exception was raised.
    print('Request Successfully')

三、解析链接

ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
scheme：协议
netloc：域名
path：路径
params：参数
query：查询条件
fragment：锚点

1、分离(抽取)URL的不同组件：urlparse()

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib.parse import urlparse

"""
用法：urllib.parse.urlparse(urlstring,scheme='',allow_fragments=True)
urlstring：必填项，即待解析的URL
scheme:这是默认的协议（例如http或https等）。如果待解析的URL没有带协议信息，就会将这个作为默认协议
"""

# URL used by most of the examples below.
FULL_URL = 'https://www.baidu.com/index.html;user?id=5#comment'

# Example 1: split a URL into its six components.
result = urlparse(FULL_URL)
print(type(result))  # <class 'urllib.parse.ParseResult'>
print(result)  # ParseResult(scheme='https', netloc='www.baidu.com',
               # path='/index.html', params='user', query='id=5', fragment='comment')


# Example 2: no scheme in the URL itself. The default scheme is filled in,
# but the domain is not recognised and ends up as part of the path.
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)  # ParseResult(scheme='https', netloc='',
               # path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')


# Example 3: the URL carries its own https:// prefix and scheme='https' is
# passed as well.
result = urlparse(FULL_URL, scheme='https')
print(result)  # ParseResult(scheme='https', netloc='www.baidu.com',
               # path='/index.html', params='user', query='id=5', fragment='comment')


# Example 4: allow_fragments=False. The fragment is no longer split off;
# here it stays attached to the query.
result = urlparse(FULL_URL, allow_fragments=False)
print(result)  # ParseResult(scheme='https', netloc='www.baidu.com',
               # path='/index.html', params='user', query='id=5#comment', fragment='')

# Example 4-1: with no params and no query, the fragment stays on the path.
result = urlparse('https://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)  # ParseResult(scheme='https', netloc='www.baidu.com',
               # path='/index.html#comment', params='', query='', fragment='')
# Components can be read by attribute name or by index.
# By attribute:
scheme, netloc, path = result.scheme, result.netloc, result.path
print(scheme, netloc, path)  # https www.baidu.com /index.html#comment
# By index:
print(result[1])  # www.baidu.com

2、合并链接：urlunparse（urlparse的对立方法，用于构造url）

from urllib.parse import urlunparse

# urlunparse needs exactly six components; any other length raises an error.
data = [
    'https',          # scheme
    'www.baidu.com',  # netloc
    'index.html',     # path
    'user',           # params
    'a=6',            # query
    'comment',        # fragment
]
url = urlunparse(data)
print(url)  # https://www.baidu.com/index.html;user?a=6#comment

3、分段链接：urlsplit（与urlparse方法相似；忽略参数，将参数解析到path中，其它不变）

from urllib.parse import urlsplit

# urlsplit does not separate params; ';user' stays part of the path.
result = urlsplit('https://www.baid.com/index.html;user?id=6#comment')
print(result)  # SplitResult(scheme='https', netloc='www.baid.com',
               # path='/index.html;user', query='id=6', fragment='comment')
# Components can be read by attribute name or by index.
scheme, netloc = result.scheme, result[1]
print(scheme, netloc)  # https www.baid.com

4、组合成完整链接：urlunsplit（与urlsplit方法相似，合并链接）

from urllib.parse import urlunsplit

# urlunsplit takes five components: scheme, netloc, path, query, fragment.
data = [
    'https',
    'www.baidu.com',
    'index.html',
    'a=6',
    'comment',
]
joined = urlunsplit(data)
print(joined)  # https://www.baidu.com/index.html?a=6#comment

5、合并链接：urljoin（与urlunparse和urlunsplit一样用于生成链接，但做法不同：以base_url为基础补全new_url）

from urllib.parse import urljoin
# urljoin(base_url, new_url)
# base_url supplies three things: scheme, netloc and path. Whatever the new
# link (new_url) is missing is filled in from base_url; whatever new_url
# already has is used as-is and base_url's value is ignored.
#
# Rule of thumb: look at the second argument - whichever of scheme, netloc
# and path it lacks is taken from the first argument (the base).
# The query and fragment of base_url are not carried over when new_url has a path.
print(urljoin('https://www.baidu.com','FAQ.html'))  # https://www.baidu.com/FAQ.html
print(urljoin('https://www.baidu.com','https://cuiqingcai.com/FAQ.html'))  # https://cuiqingcai.com/FAQ.html
print(urljoin('https://www.baidu.com/about.html','https://cuiqingcai.com/FAQ.html'))  # https://cuiqingcai.com/FAQ.html
print(urljoin('https://www.baidu.com?wd=abc','https://www.cuiqingcai.com/FAQ.html?question=2'))  # https://www.cuiqingcai.com/FAQ.html?question=2
print(urljoin('https://www.baidu.com?wd=abc','FAQ.html?question=2'))  # https://www.baidu.com/FAQ.html?question=2
print(urljoin('https://www.baidu.com','FAQ.html?question=2'))  # https://www.baidu.com/FAQ.html?question=2
print(urljoin('https://www.baidu.com','?question=2'))  # https://www.baidu.com?question=2

6、序列化与反序列：用于解析url中的中文编码问题 A、序列化（简单地讲就是，对中文字符编码） 1、urlencode方法：构造get请求参数时使用

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib.parse import urlencode

'''urlencode方法将params序列化为GET请求的参数'''
# Query parameters to serialize; non-ASCII values get percent-encoded.
params = {'name': '张三', 'age': 25}
base_url = 'https://www.baidu.com?'
# urlencode turns the dict into "key=value&key=value" form.
query_string = urlencode(params)
url = f'{base_url}{query_string}'
print(url)  # https://www.baidu.com?name=%E5%BC%A0%E4%B8%89&age=25

实例案例：

import urllib.request
import urllib.parse

# URL-encode the parameters, then convert them to UTF-8 bytes.
values = {'q':'python'}
data = urllib.parse.urlencode(values).encode('utf-8')
# print(data) # b'q=python'
# Build the request from the URL plus the encoded body.
url = 'https://tutorialspoint.com'
req = urllib.request.Request(url,data)
# print(req) # <urllib.request.Request object at 0x...>
# Send it; the context manager closes the connection afterwards.
with urllib.request.urlopen(req) as resp:
    # print(resp) # <http.client.HTTPResponse object at 0x...>
    respData = resp.read()
print(respData)

以上代码简化为：

from urllib import request,parse
url = 'https://tutorialspoint.com'
# URL-encode the parameters, then convert them to UTF-8 bytes.
data = parse.urlencode({'q':'python'}).encode('utf-8')
req = request.Request(url,data)
# The context manager closes the connection afterwards.
with request.urlopen(req) as resp:
    print(resp.read())

2、quote方法：URL编码（将中文字符转化为URL编码）

# Python版本：3.6
# -*- coding:utf-8 -*-

"""
quote()：将内容转化为URL编码的格式。特别是，当URL中带有中文参数时，
         可能导致乱码问题，quote方法可以将中文字符转化为URL编码。
"""
from urllib.parse import quote

# Search keyword containing non-ASCII characters.
keyword = '五笔输入法'
# quote() percent-encodes the keyword so it is safe inside a URL.
encoded_keyword = quote(keyword)
url = 'https://www.baidu.com/s?ie=utf-8&wd=' + encoded_keyword
print(url)  # https://www.baidu.com/s?ie=utf-8&wd=%E4%BA%94%E7%AC%94%E8%BE%93%E5%85%A5%E6%B3%95

B、反序列（简单地讲就是，对URL编码进行解码） 1、parse_qs方法：将一串get请求参数转回字典

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib.parse import parse_qs

# A percent-encoded GET query string.
query = 'name=%E5%BC%A0%E4%B8%89&age=25'
# parse_qs decodes it into a dict whose values are lists of strings.
as_dict = parse_qs(query)
print(as_dict)  # {'name': ['张三'], 'age': ['25']}

2、parse_qsl方法：将参数转化为由元组组成的列表

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib.parse import parse_qsl

# A percent-encoded GET query string.
query = 'name=%E5%BC%A0%E4%B8%89&age=25'
# parse_qsl decodes it into a list of (key, value) tuples.
as_pairs = parse_qsl(query)
print(as_pairs)  # [('name', '张三'), ('age', '25')]

3、unquote方法：URL解码（即将url编码转化为中文字符）

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib.parse import unquote

# A URL whose query value is percent-encoded.
url = 'https://www.baidu.com/s?ie=utf-8&wd=%E4%BA%94%E7%AC%94%E8%BE%93%E5%85%A5%E6%B3%95'
# unquote() turns the percent-escapes back into the original characters.
decoded = unquote(url)
print(decoded)  # https://www.baidu.com/s?ie=utf-8&wd=五笔输入法

四、分析Robots协议

方法一：在声明对象时，传入robots.txt文件的链接
rp = RobotFileParser('https://www.baidu.com/robots.txt')

方法二：通过set_url()方法设置robots.txt文件的链接
rp.set_url('https://www.baidu.com/robots.txt')

1、在RobotFileParser方法中传入robots.txt文件的链接

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib.robotparser import RobotFileParser

# Pass the robots.txt URL to the constructor instead of calling set_url().
rp = RobotFileParser('https://www.baidu.com/robots.txt')

# Download and parse the robots.txt file.
rp.read()

# (user agent, URL) pairs to check; the trailing comment is the result
# recorded in the article.
checks = [
    ('Baiduspider', 'https://www.baidu.com'),            # True: Baidu's crawler may fetch the root
    ('Baiduspider', 'https://www.baidu.com/homepage/'),  # True: ... and /homepage/
    ('Googlebot', 'https://www.baidu.com/homepage/'),    # False: Google's crawler may not fetch /homepage/
    ('', 'https://www.baidu.com'),                       # False: no crawler name, falls back to "User-agent: *"
]
for agent, target in checks:
    print(rp.can_fetch(agent, target))

2、通过set_url()方法设置robots.txt文件的链接

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
# Point the parser at the robots.txt file with set_url()
rp.set_url('https://www.baidu.com/robots.txt')
# Download and parse the file
rp.read()

# Baidu's crawler is allowed to fetch the site root
print(rp.can_fetch('Baiduspider', 'https://www.baidu.com'))  # True
# Baidu's crawler is allowed to fetch /homepage/
print(rp.can_fetch('Baiduspider', 'https://www.baidu.com/homepage/'))  # True
# Google's crawler is not allowed to fetch /homepage/
print(rp.can_fetch('Googlebot', 'https://www.baidu.com/homepage/'))  # False
# No crawler name given: the "User-agent: *" rules apply, which disallow everything
print(rp.can_fetch('', 'https://www.baidu.com'))  # False

3、使用parse方法执行对robots.txt文件的读取和分析

# Python版本：3.6
# -*- coding:utf-8 -*-

from urllib.request import urlopen
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
# Download robots.txt ourselves and hand its lines to parse().
# The context manager closes the connection once the body is read.
with urlopen('https://www.baidu.com/robots.txt') as response:
    robots_lines = response.read().decode('utf-8').split('\n')
rp.parse(robots_lines)
# Baidu's crawler is allowed to fetch the site root
print(rp.can_fetch('Baiduspider', 'https://www.baidu.com'))  # True
# Baidu's crawler is allowed to fetch /homepage/
print(rp.can_fetch('Baiduspider', 'https://www.baidu.com/homepage/'))  # True
# Google's crawler is not allowed to fetch /homepage/
print(rp.can_fetch('Googlebot', 'https://www.baidu.com/homepage/'))  # False
# All other crawlers are disallowed everywhere
print(rp.can_fetch('', 'https://www.baidu.com'))  # False

4、实例案例

# Python版本：3.6
# -*- coding:utf-8 -*-

import requests
from urllib.parse import urlencode
from urllib.robotparser import RobotFileParser

# Load and parse the site's robots.txt up front.
rp = RobotFileParser("https://www.baidu.com/robots.txt")
rp.read()

# Two candidate URLs: a search-result page and the home page.
dic = {'wd': '爬虫'}
url = 'https://www.baidu.com/s?ie=utf-8&' + urlencode(dic)
url2 = 'https://www.baidu.com/'

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/98.0.4758.80 Safari/537.36'
}

"""
can_fetch方法的第1个参数，填爬虫名称，必须与https://www.baidu.com/robots.txt中爬虫名称一致
"""
# Fetch the first URL that robots.txt allows for the 'Baiduspider' agent.
if rp.can_fetch('Baiduspider', url):
    print(f'正在爬取url：{url}')
    res = requests.get(url=url, headers=headers)
    print(res.status_code)
    # continue with further processing here
elif rp.can_fetch('Baiduspider', url2):
    print(f'正在爬取url2：{url2}') # prints: 正在爬取url2：https://www.baidu.com/
    res = requests.get(url=url2, headers=headers)
    print(res.status_code) # 200
    # continue with further processing here
else:
    # Neither URL may be fetched according to robots.txt.
    print('网站不允许爬虫机器人，爬取网站的内容。')