Python爬虫快速上手教程

  • 时间:
  • 浏览:0

       总结Python中requests常用的API

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

# Shared request settings: requests performs the HTTP calls and
# BeautifulSoup processes the returned HTML pages.
n_timeout = 36  # request timeout, in seconds
s_url = "https://www.baidu.com"

# Browser-like headers sent with every request.
o_header = {
    "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.8",
    "Connection": "keep-alive",
    "Referer": s_url,
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/3000.0.2661.102 Safari/537.36"
    ),
}


# Handle a resource file: the URL returns binary content such as an image,
# audio or video file, which is saved to disk.
s_local_url = "img.jpg"  # local file name the download is written to

# requests only accepts absolute URLs; passing the bare "img.jpg" raises
# requests.exceptions.MissingSchema. The file name is therefore resolved
# against the site root (assumes the resource lives directly under s_url).
s_resource_url = urljoin(s_url + "/", s_local_url)

# The context manager returns the connection to the pool even if writing fails.
with requests.get(s_resource_url, stream=True, headers=o_header, timeout=n_timeout) as resp_stream:
    # Fail loudly on 4xx/5xx instead of saving an error page as the image.
    resp_stream.raise_for_status()
    with open(s_local_url, 'wb') as fp:
        # With stream=True the body is fetched lazily; writing it chunk by
        # chunk avoids holding the whole file in memory (reading .content
        # would load it all at once and defeat streaming).
        for chunk in resp_stream.iter_content(chunk_size=8192):
            fp.write(chunk)

# Handle HTML: the URL returns a web page that is parsed with BeautifulSoup.

# Fetch the page with a GET request.
resp = requests.get(s_url, headers=o_header, timeout=n_timeout)
resp.raise_for_status()  # do not parse an HTTP error page as if it were content

# Decode the body with the encoding detected from its content rather than
# the one guessed from the response headers.
resp.encoding = resp.apparent_encoding

# BeautifulSoup expects markup (str/bytes), not the Response object itself.
soup__html = BeautifulSoup(resp.text, "lxml")


# Find the <a> whose id attribute is "h". find() returns None when nothing
# matches, so guard before reading .text.
soup__h = soup__html.find("a", id="h")
if soup__h is not None:
    print(soup__h.text)


# Find every <img> whose class is "abc". find_all() returns the list of
# matches; iterating the single Tag returned by find() would walk that tag's
# children instead.
soup__img_s = soup__html.find_all("img", class_="abc")
for soup__img in soup__img_s:
    # .get() yields None rather than raising KeyError when src is missing.
    print(soup__img.get("src"), soup__img.text)


# Find the first <a> whose "abc" attribute equals "opq".
soup__a = soup__html.find("a", attrs={"abc": "opq"})
if soup__a is not None:
    print(soup__a.text)


# Find every <a> whose "abc" attribute matches the regex (e.g. "opq 1",
# "opq 2"). BeautifulSoup applies the pattern with search(), so any value
# containing "opq" matches.
soup__a_s = soup__html.find_all("a", attrs={"abc": re.compile(r"opq(\s\w+)?")})
for soup__a in soup__a_s:
    print(soup__a.text)