Applications of Requests
GET requests
import requests
### Add params
data = {
    'name': 'admin',
    'password': 'password'
}
### Add headers
headers = {
    'Host': 'pss.bdstatic.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.baidu.com/baidu?tn=monline_3_dg&ie=utf-8&wd=%E6%A0%A1%E5%9B%AD%E7%BD%91%E7%99%BB%E5%BD%95%E5%85%A8%E6%98%AFget',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'script',
    'Sec-Fetch-Mode': 'no-cors',
    'Sec-Fetch-Site': 'cross-site',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'TE': 'trailers'
}
response = requests.get('', params=data, headers=headers)
print(response.text)
### If the response body is JSON, it can be output as a str or parsed with the json() method
print(response.json())
### If the response is an HTML page, regular expressions can be used to extract content
import re
pattern = re.compile('<h1.*?>.*?</h1>', re.S)
title = re.findall(pattern, response.text)
print(title)
### For video, images, or audio, the response is binary data; use content
print(response.content)
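Binary content can be written straight to a file opened in binary mode. Below is a minimal sketch, assuming a hypothetical image URL:

import requests

## hypothetical image URL, used only for illustration
response = requests.get('https://example.com/favicon.ico')
with open('favicon.ico', 'wb') as f:  ## 'wb' writes the raw bytes
    f.write(response.content)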
POST requests
Basic usage example
import requests
url = ''
## Set the form data
data = {
    'name': 'admin',
    'password': 'password'
}
## Set headers
headers = {
    'Host': 'pss.bdstatic.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.baidu.com/baidu?tn=monline_3_dg&ie=utf-8&wd=%E6%A0%A1%E5%9B%AD%E7%BD%91%E7%99%BB%E5%BD%95%E5%85%A8%E6%98%AFget',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'script',
    'Sec-Fetch-Mode': 'no-cors',
    'Sec-Fetch-Site': 'cross-site',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'TE': 'trailers'
}
## Send the POST request
response = requests.post(url, data=data, headers=headers)
print(response.status_code)  ## status code
print(response.headers)  ## response headers
print(response.cookies)  ## cookies
print(response.url)  ## URL
print(response.history)  ## request history
## requests has built-in status codes; e.g. requests.codes.ok corresponds to status_code 200
exit() if not response.status_code == requests.codes.ok else print("Successful")
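Besides form data, a JSON body can be posted with the json parameter, which serializes the dict and sets the Content-Type to application/json automatically. A small sketch against httpbin:

import requests

data = {'name': 'admin', 'password': 'password'}
response = requests.post('https://www.httpbin.org/post', json=data)  ## sends a JSON body
print(response.json()['json'])  ## httpbin echoes the parsed JSON back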
Advanced usage
File upload
import requests
files = {'file': open('filepath', 'rb')}  ## open the file in binary mode
response = requests.post(url, files=files)
print(response.text)
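The value in files can also be a tuple that sets the uploaded filename and content type explicitly. A sketch, assuming a local favicon.ico exists:

import requests

url = 'https://www.httpbin.org/post'  ## httpbin echoes uploads back
files = {'file': ('favicon.ico', open('favicon.ico', 'rb'), 'image/x-icon')}
response = requests.post(url, files=files)
print(response.json()['files'].keys())  ## the uploaded field names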
Setting cookies
import requests

class Spider:
    def __init__(self):
        self.cookies = requests.cookies.RequestsCookieJar()  ## create a RequestsCookieJar instance

    def Update(self, cookies):
        self.cookies.update(cookies)

    def Crawler_get(self, url):
        self.response = requests.get(url, cookies=self.cookies)
## Reading cookies from a response
r = requests.get(url)
print(r.cookies)  ## cookies is of type RequestsCookieJar
for key, value in r.cookies.items():  ## items() yields the cookies as (name, value) tuples
    print(key + ':' + value)
## Use the cookies to visit the site
MySpider = Spider()  ## create a Spider object
MySpider.Update(r.cookies)  ## update its cookies
MySpider.Crawler_get(url)  ## start crawling with them
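Cookies copied from a browser usually arrive as one header string such as 'key1=value1; key2=value2'; these can be filed into a RequestsCookieJar with set(). A sketch with a hypothetical cookie string:

import requests

cookie_str = 'number=123456789; token=abcdef'  ## hypothetical string copied from a browser
jar = requests.cookies.RequestsCookieJar()
for item in cookie_str.split('; '):
    key, value = item.split('=', 1)
    jar.set(key, value)  ## add each cookie to the jar
response = requests.get('https://www.httpbin.org/cookies', cookies=jar)
print(response.text)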
Maintaining a session
Maintaining a session guarantees that successive requests are made as the same user, without carrying cookies by hand, which is much more convenient. In essence it simulates the same browser visiting different pages of the same site. An example follows.
import requests
requests.get('https://www.httpbin.org/cookies/set/number/123456789')  ## this URL sets a cookie
response = requests.get('http://www.httpbin.org/cookies')  ## this URL echoes the cookies of the request
print(response.text)  ## the cookie set above does not appear here
import requests
s = requests.Session()
s.get('https://www.httpbin.org/cookies/set/number/123456789')
response = s.get('http://www.httpbin.org/cookies')
print(response.text)  ### the cookie is returned this time
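Note that a Session keeps cookies automatically, but headers passed to a single request are not remembered; defaults meant for every request should be set on s.headers. A short sketch, with a hypothetical User-Agent:

import requests

s = requests.Session()
s.headers.update({'User-Agent': 'MyCrawler/1.0'})  ## applied to every request in this session
response = s.get('https://www.httpbin.org/headers')
print(response.text)  ## the echoed headers include the session-wide User-Agent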
SSL certificate verification
Some sites may not have their HTTPS certificate configured properly, or the certificate may not be signed by a trusted CA; visiting them then raises an SSL certificate error, so the crawler needs to handle this automatically.
import requests
response = requests.get(url, verify=False)
print(response.status_code)  ## prints 200, but emits a warning asking us to specify a certificate; it can be suppressed
as follows:
import requests
from requests.packages import urllib3
urllib3.disable_warnings()
response = requests.get(url, verify=False)
print(response.status_code)
Alternatively, the warnings can be captured into the logging system and ignored that way:
import logging
import requests
logging.captureWarnings(True)
response = requests.get(url, verify=False)
print(response.status_code)
A local certificate can also be supplied as the client certificate, either as a single file or as a tuple of two file paths:
import requests
response = requests.get(url, cert=('/path/server.crt', '/path/server.key'))
print(response.status_code)
Timeout settings
import requests
response = requests.get(url, timeout=1)  ## raise an exception if the request takes longer than 1 second
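timeout can also be a (connect, read) tuple, and exceeding either limit raises requests.exceptions.Timeout, which is worth catching. A sketch:

import requests

try:
    ## 1 second to connect, 2 seconds to read the response
    response = requests.get('https://www.httpbin.org/get', timeout=(1, 2))
    print(response.status_code)
except requests.exceptions.Timeout:
    print('request timed out')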
Authentication
urllib has HTTPBasicAuthHandler to handle authentication, and requests likewise provides a corresponding module.
import requests
from requests.auth import HTTPBasicAuth
r = requests.get(url, auth=HTTPBasicAuth('admin', 'password'))
## or, more concisely, pass a tuple directly
r = requests.get(url, auth=('admin', 'password'))
print(r.status_code)
In addition, requests supports other authentication schemes, such as OAuth; the OAuth1 class is provided by the requests_oauthlib package, which must be installed separately.
import requests
from requests_oauthlib import OAuth1
auth = OAuth1('Your_App_Key', 'Your_App_Secret', 'User_OAuth_Token', 'User_OAuth_Token_Secret')
response = requests.get(url, auth=auth)
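requests.auth also ships HTTPDigestAuth for HTTP digest authentication; a sketch against httpbin's digest test endpoint:

import requests
from requests.auth import HTTPDigestAuth

## httpbin provides a digest-auth test endpoint for exactly these credentials
r = requests.get('https://www.httpbin.org/digest-auth/auth/user/passwd',
                 auth=HTTPDigestAuth('user', 'passwd'))
print(r.status_code)  ## 200 once the digest handshake succeeds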
Proxy settings
urllib has ProxyHandler to handle proxies, and requests likewise provides corresponding support.
import requests
proxies = {
    'http': 'http://',
    'https': 'https://'
}
requests.get(url, proxies=proxies)
If the proxy requires authentication, it can be set with a URL of the form http://user:password@host:port:
import requests
proxies = {
    'http': 'http://user:password@',
    'https': 'https://user:password@'
}
requests.get(url, proxies=proxies)
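requests can also use SOCKS proxies once the requests[socks] extra is installed (pip install "requests[socks]"); a sketch with a hypothetical local SOCKS5 proxy:

import requests

proxies = {
    'http': 'socks5://user:password@localhost:1080',  ## hypothetical SOCKS5 proxy
    'https': 'socks5://user:password@localhost:1080'
}
requests.get('https://www.httpbin.org/get', proxies=proxies)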
A first look at the implementation of Requests
In fact, when the requests library is used, it internally constructs a Request object, attaches the various parameters to it, sends it, and returns a Response object once the request succeeds.
Before being sent, the Request object is converted into a PreparedRequest.
from requests import Request, Session
url = ''
data = {'': ''}
headers = {'': '', '': ''}
s = Session()
req = Request('POST', url, data=data, headers=headers)  ## construct a Request object
prepped = s.prepare_request(req)  ## convert the Request into a PreparedRequest
r = s.send(prepped)  ## send the PreparedRequest and get its Response
print(r.text)
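One benefit of preparing the request yourself is that the PreparedRequest can be inspected, or tweaked, before it goes out. A self-contained sketch against httpbin:

from requests import Request, Session

s = Session()
req = Request('POST', 'https://www.httpbin.org/post', data={'name': 'admin'})
prepped = s.prepare_request(req)
print(prepped.url)  ## the final URL after encoding
print(prepped.body)  ## the encoded form body, e.g. 'name=admin'
prepped.headers['X-Debug'] = '1'  ## headers can still be modified before sending
r = s.send(prepped)
print(r.status_code)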