tags: python,BeautifulSoup

Web Data Scraping and Analysis

Using the urllib package
Receiving data: the parse result is returned as a ParseResult object.
Use the requests package to read a page's raw source (a short sketch follows the urlparse example below). If requests is not installed, install it with pip install requests.
Use the ParseResult object to access the parts of a URL:

```python
# FileName: urlparse.py
from urllib.parse import urlparse

url = 'https://hackmd.io:80/xZrScthwQdqfIdaIDyYvFA?view=1'
url_obj = urlparse(url)

print(type(url_obj))     # <class 'urllib.parse.ParseResult'> (a ParseResult object)
print(url_obj)
print('Scheme:', format(url_obj.scheme))                         # https
print('Network location (domain):', format(url_obj.netloc))      # hackmd.io:80
print('Path:', format(url_obj.path))                             # /xZrScthwQdqfIdaIDyYvFA
print('Parameters:', format(url_obj.params))
print('GET parameters (query string):', format(url_obj.query))   # view=1
print('Fragment:', format(url_obj.fragment))
print('Port:', format(url_obj.port))                             # 80

# The same fields can also be read by index instead of by attribute name:
# print()
# print('Access by index')
# print('Scheme:', format(url_obj[0]))
# print('Network location (domain):', format(url_obj[1]))
# print('Path:', format(url_obj[2]))
# print('Parameters:', format(url_obj[3]))
# print('GET parameters (query string):', format(url_obj[4]))
# print('Fragment:', format(url_obj[5]))
# Note: the port has no index position; it is only available as url_obj.port

# S20200718 By YTC
```
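As mentioned above, the requests package can fetch a page's raw source before any parsing is done. Below is a minimal sketch, assuming the same HackMD URL used in the urlparse example; the file name is hypothetical and the actual response body depends on the live site.

```python
# FileName: fetch_source.py  (illustrative sketch)
import requests

url = 'https://hackmd.io/xZrScthwQdqfIdaIDyYvFA?view=1'
response = requests.get(url)      # send an HTTP GET request
response.encoding = 'utf-8'       # decode the body as UTF-8

print(response.status_code)       # 200 when the request succeeds
print(response.text[:200])        # first 200 characters of the HTML source
```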

Analyzing more complex pages with BeautifulSoup

```python
# FileName: beautifulsoup.py
# If BeautifulSoup is not installed, install it with: pip install beautifulsoup4

# Sample HTML document used throughout this example
html_doc = '''
<html>
<meta charset="UTF-8">
<head><title>網頁標題</title></head>
<style>
.gray{color: gray;}
</style>
<body>
<p class="title"><b>文件標題</b></p>
<div class="gray">故事內容</div>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://expamle.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://expamle.com/lacie" class="sister" id="link2" id="btn2">Lacie</a> and
<a href="http://expamle.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<div class="gray">故事結束</div>
</body>
</html>
'''

from bs4 import BeautifulSoup

# Parse the document; 'html.parser' is Python's built-in parser
sp = BeautifulSoup(html_doc, 'html.parser')

d1 = sp.select('title')                      # returns the page title tag
d2 = sp.text                                 # page text with all HTML tags removed
d3 = sp.find("div")                          # first matching tag
d4 = sp.find("a", {"id": "link1"})           # first tag with the given attribute value
d5 = sp.find_all("div")                      # every matching tag
d6 = sp.find_all(["title", "a"])             # every tag whose name is in the list
d7 = sp.find_all("a", {"class": "sister"})   # alternatively: find_all(tag, {attribute: value})
d8 = sp.select('#btn2')                      # CSS selector by #id (or .class)
d9 = sp.select("html head title")            # walk down the tree tag by tag
d10 = sp.select("body p b")                  # walk down the tree tag by tag
d11 = sp.select(".gray")                     # CSS class selector

print(d1)              # [<title>網頁標題</title>]
print(d2)
print(d3)              # <div class="gray">故事內容</div>
print(d3.text)         # 故事內容
print(d4.get("href"))  # get() reads a tag attribute -> http://expamle.com/elsie
print(d5)              # [<div class="gray">故事內容</div>, <div class="gray">故事結束</div>]
print(d5[0].text)      # 故事內容
print(d6)              # the <title> tag followed by the three <a> tags

for link in d6:
    href = link.get("href")
    if href is not None and href.startswith("http://"):  # keep only hrefs that start with "http://"
        print(href)

print(d7)
print(d8)
print(d9)
print(d10)
print(d11)             # [<div class="gray">故事內容</div>, <div class="gray">故事結束</div>]
print(d11[1].text)     # 故事結束

# S20200718 By YTC
```
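The same parsing calls work on HTML fetched from a live site. Below is a minimal sketch combining requests with BeautifulSoup; the URL is only a placeholder taken from the earlier example, and the tags it prints (the title and every absolute link) are assumptions about the page, not a fixed recipe.

```python
# Illustrative sketch: fetch a page with requests, then parse it with BeautifulSoup
import requests
from bs4 import BeautifulSoup

url = 'https://hackmd.io/xZrScthwQdqfIdaIDyYvFA?view=1'   # placeholder URL
response = requests.get(url)
response.encoding = 'utf-8'

sp = BeautifulSoup(response.text, 'html.parser')
print(sp.title)                     # the page's <title> tag, if any

# Collect every link on the page, filtering hrefs the same way as the loop above
for a in sp.find_all('a'):
    href = a.get('href')
    if href is not None and href.startswith('http'):
        print(href)
```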

S20200718 By YTC
M20200720
