urllib 库的使用

开始

urllib 是 Python 内置的 HTTP 请求模块,其包含以下四个模块

  • request:HTTP 请求模块

  • error: 异常处理模块

  • parse: 工具模块,拆分、解析、合并

  • robotparser: 识别网站 robots.txt 文件,判断哪些网站可以爬。

请求发送

  1. urlopen()

1
2
3
4
5
6
import urllib.request

# urlopen() performs a GET request and returns a response object.
response = urllib.request.urlopen('https://www.baidu.com')
print(response.read().decode('utf-8'))  # read() returns raw bytes; decode as UTF-8 to get readable text
print(type(response))                   # the response object's type
print(response.status)                  # HTTP status code of the response
print(response.getheaders())            # list of (header, value) tuples (comment was garbled in the original)

运行结果

  • data 参数

    data 参数为可选;如果需要使用,要先用 bytes() 将参数转换为字节流编码格式

    1
    2
    3
    4
    5
    import urllib.request
    import urllib.parse

    # POSTing requires `data` to be bytes: urlencode() builds the form
    # string, bytes() encodes it as UTF-8.
    form = urllib.parse.urlencode({'B': '1'})
    data = bytes(form, encoding='utf-8')
    response = urllib.request.urlopen('http://httpbin.org/post', data=data)
    print(response.read().decode('utf8'))
运行结果

我们的数据出现在了 form 字段里,这是我们用 POST 方法传递的数据

  • timeout 参数

    1
    2
    3
    4
    5
    6
    7
    8
    import socket
    import urllib.error  # fixed: urllib.error was used but never imported
    import urllib.request

    # When the server does not answer within `timeout` seconds, urlopen()
    # raises URLError whose .reason is a socket.timeout instance.
    try:
        # (original try/except bodies were not indented — broken Python)
        response = urllib.request.urlopen('http://www.google.com', timeout=1)
        print(response.read().decode('utf-8'))
    except urllib.error.URLError as e:
        if isinstance(e.reason, socket.timeout):
            print('TIME OUT')
  1. Request()

    urllib.request — Extensible library for opening URLs — Python 3.10.6 documentation

    Request 能实现更多参数的请求

    1
    2
    3
    # Signature of urllib.request.Request.__init__ as quoted from the
    # official documentation; the body is intentionally omitted here.
    def __init__(self, url, data=None, headers={},
    origin_req_host=None, unverifiable=False,
    method=None):
    • URL: 请求 URL

    • data: 必须为 bytes (),如果是字典可以用 urllib.parse.urlencode()

    • headers: 为一个字典

    • origin_req_host: 请求方的 host 名称或 IP 地址

    • unverifiable: 表示这个请求是否是无法验证的

    • method: 请求方法

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    from urllib import request, parse

    # Request lets us attach headers and choose the HTTP method explicitly.
    url = 'http://httpbin.org/post'
    headers = {
        'Host': 'httpbin.org',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47'
    }
    # fixed: removed a dead duplicate assignment and a variable that
    # shadowed the builtin `dict` — only the second value was ever sent
    data = bytes(parse.urlencode({'B': '1'}), encoding='utf-8')
    req = request.Request(url, data=data, headers=headers, method='POST')

    # Alternative way to add a header after construction:
    # req = request.Request(url=url, data=data, method='POST')
    # req.add_header('Host', 'httpbin.org')
    response = request.urlopen(req)
    print(response.read().decode('utf-8'))  # fixed: this line was truncated mid-call in the original

进阶用法

Handler

现在我们介绍 Handler

比如:

HTTPDefaultErrorHandler: 用于处理响应错误,错误会抛出 HTTPError 类型的异常

HTTPRedirectHandler: 处理重定向

HTTPCookieProcessor: 处理 Cookies

ProxyHandler: 设置代理,默认代理为空

HTTPPasswordMgr: 用于管理密码,维护了用户名和密码

HTTPBasicAuthHandler: 用于管理认证

Cookies

1
2
3
4
5
6
7
import http.cookiejar
import urllib.request

# Capture the cookies Baidu sets and persist them to disk in Mozilla format.
filename = 'cookies.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# keep session cookies and already-expired cookies in the dump as well
cookie.save(ignore_discard=True, ignore_expires=True)
1
2
3
4
5
6
7
import http.cookiejar
import urllib.request

# Capture the cookies Baidu sets and print each name/value pair.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    # fixed: this line was truncated in the original; each item is a Cookie
    print(item.name + '=' + item.value)
1
2
3
4
5
6
7
import http.cookiejar
import urllib.request

# Load previously saved cookies and reuse them for a new request.
# NOTE(review): cookies.txt must have been written by LWPCookieJar.save();
# a MozillaCookieJar dump will not load here — confirm which format was saved.
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookies.txt', ignore_expires=True, ignore_discard=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))  # fixed: read is a method — it must be called

异常处理

URLError

1
2
3
4
5
from urllib import request, error

# URLError is raised for failures at the network/protocol level;
# e.reason describes what went wrong.
try:
    # (original try/except bodies were not indented — broken Python)
    response = request.urlopen('http://mrharsh.top/index.htm')
except error.URLError as e:
    print(e.reason)

HTTPError

1
2
3
4
5
6
7
from urllib import request, error

# HTTPError additionally carries the HTTP status code of the failed request.
try:
    # (original try/except bodies were not indented — broken Python)
    response = request.urlopen('http://mrharsh.top/index.htm')
except error.HTTPError as e:
    print(e.reason)
    print(e.code)

URLError 是 HTTPError 的父类

1
2
3
4
5
6
7
8
9
10
11
from urllib import request, error

# Catch the more specific HTTPError first, then fall back to URLError
# (HTTPError is a subclass of URLError, so order matters).
try:
    # (original try/except bodies were not indented — broken Python)
    response = request.urlopen('http://mrharsh.top/index.htm')
except error.HTTPError as e:
    print(e.reason)
    print(e.code)
except error.URLError as e:
    print(e.reason)
else:
    print('SUCCESS!!')

解析链接

urlparse

1
2
3
from urllib.parse import urlparse

# Break a (very long) Baidu search URL into its six named components:
# scheme://netloc/path;params?query#fragment
target = 'https://www.baidu.com/s?wd=%E7%89%B9%E6%9C%97%E6%99%AE%E6%88%96%E5%B0%86%E9%9D%A2%E4%B8%B4%E6%9C%80%E9%AB%9830%E5%B9%B4%E5%88%91%E6%9C%9F&tn=baiduhome_pg&usm=1&rsv_idx=2&ie=utf-8&rsv_pq=beff06110014fb0c&oq=%E9%BB%91%E5%B1%B1%E5%8F%91%E7%94%9F%E6%9E%AA%E5%87%BB%E4%BA%8B%E4%BB%B6%E8%87%B411%E6%AD%BB6%E4%BC%A4&rsv_t=203cnvqhWV3frPIJkl4SxrljZIVFGNv6ZJkChXgM4YBU2qiBwG0dd0rTfPGYEJdqPkDg&rqid=beff06110014fb0c&rsf=a7183c00b74706d91bca215ee108b466_1_15_2&rsv_dl=0_right_fyb_pchot_20811&sa=0_right_fyb_pchot_20811'
result = urlparse(target)
print(result)

urlunparse()

1
2
3
from urllib.parse import urlunparse

# urlunparse() assembles exactly six parts:
# scheme, netloc, path, params, query, fragment.
data = ['https', 'wwww.baidu.com', 'index.html', 'user', 'a=6', 'comment']
combined = urlunparse(data)
print(combined)

urlsplit

1
2
3
4
from urllib.parse import urlsplit

# urlsplit() is like urlparse() but without the params component,
# yielding 5 parts: scheme, netloc, path, query, fragment.
result = urlsplit('https://www.baidu.com/s?wd=python&pn=10&oq=python&tn=baiduhome_pg&ie=utf-8&usm=4&rsv_idx=2&rsv_pq=cb482fb3002cef17&rsv_t=6415j2DcN1b3Ov7Yj0M6vjanBErrN6Meq3YKeNQ%2BhsGLzD1xnsUAYddpkN%2FlYnFdsQ07')
# fixed: the loop body was not indented in the original
for part in result:
    print(part)

urlunsplit()

1
2
3
from urllib.parse import urlunsplit

# urlunsplit() needs exactly 5 parts: scheme, netloc, path, query, fragment.
data = ['http', 'www.baidu.com', 'index.html', 'a=0', 'c']
joined = urlunsplit(data)
print(joined)

urljoin()

1
2
from urllib.parse import urljoin

# urljoin() resolves a relative link against a base URL.
base = 'http://www.baidu.com'
print(urljoin(base, 'index.html'))

urlencode

在 GET 中加参数

1
2
3
4
5
6
7
8
from urllib.parse import urlencode

# urlencode() serializes a dict into a query string for a GET request.
params = {
    'name': 's',
    "1": "2"
}
# fixed: the original base URL was missing the ':' after 'http' and the
# '?' that separates the path from the query string
base_url = 'http://www.baidu.com?'
urll = base_url + urlencode(params)
print(urll)

parse_qs()

反序列化

1
2
3
4
from urllib.parse import urlsplit, parse_qs

# parse_qs() deserializes a query string back into {key: [values]} form.
search_url = ('https://www.baidu.com/s?wd=python&pn=10&oq=python&tn=baiduhome_pg'
              '&ie=utf-8&usm=4&rsv_idx=2&rsv_pq=cb482fb3002cef17'
              '&rsv_t=6415j2DcN1b3Ov7Yj0M6vjanBErrN6Meq3YKeNQ%2BhsGLzD1xnsUAYddpkN%2FlYnFdsQ07')
result = urlsplit(search_url)
print(parse_qs(result.query))

quote()

将内容转为 URL 编码

1
2
3
4
from urllib.parse import quote, urljoin

# quote() percent-encodes text so it can be embedded safely in a URL.
keyword = 'python'
url = f"http://www.baidu.com/s?wd={quote(keyword)}"
print(url)

unquote()

1
2
3
4
5
from urllib.parse import quote, urljoin, unquote

# Round trip: percent-encode a non-ASCII keyword, then decode it back.
keyword = '你好'
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)          # encoded form
print(unquote(url)) # decoded back to the original characters

Robots 协议

robotparser()

  • set_url():设置 robots.txt 链接

  • read():读取 robots.txt 并分析

  • parse():解析 robots.txt 文件

  • can_fetch():传入 User-Agent 和 URL, 判断是否可爬取

  • mtime():返回上次抓取的时间

  • modified():设置当前时间为上次抓取时间

1
2
3
4
5
6
from urllib.robotparser import RobotFileParser

# Check whether bilibili's robots.txt allows crawling a given video page.
video_url = ('https://www.bilibili.com/video/BV1fN4y1u7s1'
             '?spm_id_from=333.1007.tianma.2-3-6.click')
rp = RobotFileParser()
rp.set_url('https://www.bilibili.com/robots.txt')
rp.read()  # fetch and parse robots.txt
print(rp.can_fetch('Yisouspider', video_url))  # a specific named spider
print(rp.can_fetch('*', video_url))            # any user agent