import urllib.request as req

url = "URL of the page to crawl"
request = req.Request(url, headers={
    # the over-18 confirmation cookie used on PTT
    "cookie": "over18=1",
    # copy the User-Agent shown for the html document under the site's Network > Headers > Request Headers
    "User-Agent": "..."
})
with req.urlopen(request) as res:  # pass the Request object so the headers are actually sent
    data = res.read().decode("utf-8")
print(data)
Using the BeautifulSoup 4 package:
pip install beautifulsoup4
import bs4

root = bs4.BeautifulSoup(data, "html.parser")
# print(root.title.string)
titles = root.find_all("div", class_="title")  # a ResultSet (list of tags), so it has no .a of its own
for title in titles:
    if title.a is not None:  # deleted posts have no <a> inside the title div
        print(title.a.string)
Taking the pttweb site as an example: url="https://www.pttweb.cc/bbs/Gossiping". urllib.request cannot pick up cookies that the site sets dynamically, so the more automated requests package is used instead (a short sketch of how a Session keeps those cookies follows the snippet below):
import requests as req  # requests now replaces urllib.request (see the full example at the end)
headers = { ... }
session = req.Session()
response = session.get(url, headers=headers)
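A Session keeps whatever cookies the server sets between requests, which is the dynamic-cookie behaviour urllib.request lacks here. A minimal sketch (the PTT URL and the over18 cookie are only illustrative):
session = req.Session()
session.cookies.set("over18", "1", domain=".ptt.cc")  # cookies can also be seeded by hand
first = session.get("https://www.ptt.cc/bbs/Gossiping/index.html", headers=headers)
print(session.cookies.get_dict())  # cookies collected so far are reused automatically
second = session.get("https://www.ptt.cc/bbs/Gossiping/index.html", headers=headers)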
In the Headers tab, the entries in Request Headers prefixed with ":" are HTTP/2 pseudo-headers. Client packages do not necessarily support them (as of ~Sep 2021), and they are not necessarily related to scraping permissions, so they can simply be dropped (a sketch for stripping them out follows this list):
:Authority:www.ptt.cc
:Method:GET
:Path:/bbs/Gossiping/index.html
:Scheme:https
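A small sketch, assuming a headers dict copied wholesale from DevTools (the raw_headers dict below is hypothetical), that drops every ":"-prefixed pseudo-header before handing the rest to requests:
raw_headers = {
    ":authority": "www.ptt.cc",
    ":method": "GET",
    "User-Agent": "Mozilla/5.0",
    "cookie": "over18=1",
}
headers = {k: v for k, v in raw_headers.items() if not k.startswith(":")}  # keep only real headers
response = session.get(url, headers=headers)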
With urllib.request (req aliased to urllib.request, as in the first snippet):
with req.urlopen(request) as res:
    data = res.read().decode("utf-8")
With requests, fetching through the Session (this is what carries the dynamic cookies along):
res = session.get(url, headers=headers)
data = res.content.decode('utf-8')
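As a side note on the decode step, requests can also pick the encoding itself from the charset the server declares; this is standard requests behaviour rather than anything pttweb-specific:
data = res.text  # requests decodes using the Content-Type charset, guessing when it is missing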
# root here is rebuilt from the pttweb page, e.g. root = bs4.BeautifulSoup(data, "html.parser")
titles = root.find_all("span", class_="e7-show-if-device-is-not-xs")
for title in titles:
    print(title.text.strip())
For a tag like <div data-xs>, attrs has to be used instead (the hyphen means the attribute cannot be passed as a keyword argument):
titles = root.find_all("div", attrs={"data-xs": True})
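If it reads better, the same match can also be written as a CSS attribute selector through BeautifulSoup's select():
titles = root.select("div[data-xs]")  # every <div> that carries a data-xs attribute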
.text: returns the text of all descendant tags inside the element (the same as calling .get_text()).
.string: returns content only when the tag has exactly one child node of NavigableString type; otherwise it returns None (for example, when the tag also contains children such as <b>).
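A small self-contained snippet (made up here, not taken from the pttweb page) showing the difference:
demo = bs4.BeautifulSoup("<div class='title'>Re: <b>hello</b></div>", "html.parser")
tag = demo.find("div")
print(tag.text)    # prints "Re: hello", the text of every descendant node
print(tag.string)  # prints None, because the div has two children: a string and a <b> tag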
If the response body comes back Brotli-compressed (Content-Encoding: br), decompress it before decoding:
import brotli
if response.headers.get('Content-Encoding') == 'br':
    decompressed_data = brotli.decompress(response.content)
    text = decompressed_data.decode('utf-8')
    print(text)
else:
    print(response.text)
Or, if the body is still gzip/zlib compressed, with zlib:
import zlib
# wbits = MAX_WBITS | 32 lets zlib auto-detect a gzip or zlib header
decompressed_data = zlib.decompress(response.content, zlib.MAX_WBITS | 32)
Note that requests normally decompresses gzip and deflate on its own, so manual decompression only matters when the body really is still encoded.
import requests as req
import bs4
url="https://www.pttweb.cc/bbs/Gossiping"
headers = {
# "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
# "Accept-Encoding":"gzip, deflate, br",
# "Accept-Language":"zh-TW,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,und;q=0.5,zh-CN;q=0.4,ja;q=0.3",
# "Cache-Control":"no-cache",
# "Cookie":"_ga=GA1.1.12609530.1687623116; PTTweb_v2_guestId_persistent=561355565; PTTweb_v2_authKey_persistent=y81n71zmu3d9dfnofiofckqbfy; PTTweb_v2_guestId=561355565; PTTweb_v2_authKey=y81n71zmu3d9dfnofiofckqbfy; _ga_F0HJ7JBSPD=GS1.1.1687627721.2.0.1687627721.0.0.0",
# "Pragma":"no-cache",
# "Sec-Fetch-Dest":"document",
# "Sec-Fetch-Mode":"navigate",
# "Sec-Fetch-Site":"none",
# "Sec-Fetch-User":"?1",
# "Upgrade-Insecure-Requests":"1",
# "User-Agent":"Mozilla/5.0 (iPad; CPU OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1 Edg/114.0.0.0"
}
session = req.Session()
response = session.get(url, headers=headers)
data = response.content.decode('utf-8')
# beautifulsoup4: parse the text/html response into a document tree
root = bs4.BeautifulSoup(data, "html.parser")
titles = root.find_all("span", class_="e7-show-if-device-is-not-xs")
for title in titles:
    print(title.text.strip())