tags: python,BeautifulSoup

Web Data Scraping and Analysis

Using the urllib package
Receiving data: the parse result is returned as a ParseResult object.
Use the requests package to read a page's raw source (a short sketch follows the urlparse example below). If requests is not installed, install it with pip install requests.
Use the ParseResult object to access the parts of a URL:

```python
# FileName: urlparse.py
from urllib.parse import urlparse

url = 'https://hackmd.io:80/xZrScthwQdqfIdaIDyYvFA?view=1'
url_obj = urlparse(url)

print(type(url_obj))     # <class 'urllib.parse.ParseResult'> (a ParseResult object)
print(url_obj)
print('Scheme:', format(url_obj.scheme))                         # https
print('Network location (domain):', format(url_obj.netloc))      # hackmd.io:80
print('Path:', format(url_obj.path))                             # /xZrScthwQdqfIdaIDyYvFA
print('Parameters:', format(url_obj.params))
print('GET parameters (query string):', format(url_obj.query))   # view=1
print('Fragment:', format(url_obj.fragment))
print('Port:', format(url_obj.port))                             # 80

# The same fields can also be read by index instead of by attribute name:
# print()
# print('Access by index')
# print('Scheme:', format(url_obj[0]))
# print('Network location (domain):', format(url_obj[1]))
# print('Path:', format(url_obj[2]))
# print('Parameters:', format(url_obj[3]))
# print('GET parameters (query string):', format(url_obj[4]))
# print('Fragment:', format(url_obj[5]))
# Note: the port has no index position; it is only available as url_obj.port

# S20200718 By YTC
```
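As mentioned above, the requests package can fetch a page's raw source before any parsing is done. Below is a minimal sketch, assuming the same HackMD URL used in the urlparse example; the file name is hypothetical and the actual response body depends on the live site.

```python
# FileName: fetch_source.py  (illustrative sketch)
import requests

url = 'https://hackmd.io/xZrScthwQdqfIdaIDyYvFA?view=1'
response = requests.get(url)      # send an HTTP GET request
response.encoding = 'utf-8'       # decode the body as UTF-8

print(response.status_code)       # 200 when the request succeeds
print(response.text[:200])        # first 200 characters of the HTML source
```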

Analyzing more complex pages with BeautifulSoup

```python
# FileName: beautifulsoup.py
# If BeautifulSoup is not installed, install it with: pip install beautifulsoup4

# Sample HTML document used throughout this example
html_doc = '''
<html>
<meta charset="UTF-8">
<head><title>網頁標題</title></head>
<style>
.gray{color: gray;}
</style>
<body>
<p class="title"><b>文件標題</b></p>
<div class="gray">故事內容</div>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://expamle.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://expamle.com/lacie" class="sister" id="link2" id="btn2">Lacie</a> and
<a href="http://expamle.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<div class="gray">故事結束</div>
</body>
</html>
'''

from bs4 import BeautifulSoup

# Parse the document; 'html.parser' is Python's built-in parser
sp = BeautifulSoup(html_doc, 'html.parser')

d1 = sp.select('title')                      # returns the page title tag
d2 = sp.text                                 # page text with all HTML tags removed
d3 = sp.find("div")                          # first matching tag
d4 = sp.find("a", {"id": "link1"})           # first tag with the given attribute value
d5 = sp.find_all("div")                      # every matching tag
d6 = sp.find_all(["title", "a"])             # every tag whose name is in the list
d7 = sp.find_all("a", {"class": "sister"})   # alternatively: find_all(tag, {attribute: value})
d8 = sp.select('#btn2')                      # CSS selector by #id (or .class)
d9 = sp.select("html head title")            # walk down the tree tag by tag
d10 = sp.select("body p b")                  # walk down the tree tag by tag
d11 = sp.select(".gray")                     # CSS class selector

print(d1)              # [<title>網頁標題</title>]
print(d2)
print(d3)              # <div class="gray">故事內容</div>
print(d3.text)         # 故事內容
print(d4.get("href"))  # get() reads a tag attribute -> http://expamle.com/elsie
print(d5)              # [<div class="gray">故事內容</div>, <div class="gray">故事結束</div>]
print(d5[0].text)      # 故事內容
print(d6)              # the <title> tag followed by the three <a> tags

for link in d6:
    href = link.get("href")
    if href is not None and href.startswith("http://"):  # keep only hrefs that start with "http://"
        print(href)

print(d7)
print(d8)
print(d9)
print(d10)
print(d11)             # [<div class="gray">故事內容</div>, <div class="gray">故事結束</div>]
print(d11[1].text)     # 故事結束

# S20200718 By YTC
```
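The same parsing calls work on HTML fetched from a live site. Below is a minimal sketch combining requests with BeautifulSoup; the URL is only a placeholder taken from the earlier example, and the tags it prints (the title and every absolute link) are assumptions about the page, not a fixed recipe.

```python
# Illustrative sketch: fetch a page with requests, then parse it with BeautifulSoup
import requests
from bs4 import BeautifulSoup

url = 'https://hackmd.io/xZrScthwQdqfIdaIDyYvFA?view=1'   # placeholder URL
response = requests.get(url)
response.encoding = 'utf-8'

sp = BeautifulSoup(response.text, 'html.parser')
print(sp.title)                     # the page's <title> tag, if any

# Collect every link on the page, filtering hrefs the same way as the loop above
for a in sp.find_all('a'):
    href = a.get('href')
    if href is not None and href.startswith('http'):
        print(href)
```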

S20200718 By YTC
M20200720
