python
,BeautifulSoup
使用urllib套件
接收資料: 傳送到ParseResult物件
用requests函式讀取原始碼。 若沒有requests 需用pip install requests安裝
用ParseResult物件,取得網址資料:
# FileName: urlparse.py
# Split a URL into its components with urllib.parse.urlparse and print each one.
from urllib.parse import urlparse

url = 'https://hackmd.io:80/xZrScthwQdqfIdaIDyYvFA?view=1'
url_obj = urlparse(url)

print(type(url_obj))  # <class 'urllib.parse.ParseResult'>
print(url_obj)

# (printed label, ParseResult attribute name) pairs, in display order.
for label, attr in (
    ('通訊協定:', 'scheme'),               # https
    ('網站名稱(網域):', 'netloc'),         # hackmd.io:80
    ('路徑:', 'path'),                     # /xZrScthwQdqfIdaIDyYvFA
    ('查詢的參數:', 'params'),
    ('Get參數(query查詢字串):', 'query'),   # view=1
    ('框架名稱:', 'fragment'),
    ('通訊埠:', 'port'),                   # 80
):
    # print() converts each value to text itself, so the value is passed as-is.
    print(label, getattr(url_obj, attr))

# Index-based alternative: ParseResult is also a 6-item tuple, so
# url_obj[0] .. url_obj[5] return scheme, netloc, path, params, query and
# fragment respectively. The port has no tuple index; read it via url_obj.port.
# S20200718 By YTC
用BeautifulSoup進行複雜網頁分析
# FileName: beautifulsoup.py
# BeautifulSoup example: parse an in-memory HTML document and look elements up
# by tag name, attribute, and CSS selector.
# Install the parser package first if it is missing: pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests  # kept from the original notes; not used by this offline example

# Sample page. The markup is intentionally sloppy (a duplicate id on the second
# link, a stray comma inside the third <a> tag) so the parser's handling of it
# shows up in the printed results below. Do not "clean up" this literal.
html_doc='''
<html>
<meta charset="UTF-8">
<head><title>網頁標題</title></head>
<style>
.gray{color: gray;}
</style>
<body>
<p class="title"><b>文件標題</b></p>
<div class ="gray">故事內容</div>
<p class="story">
Onec upon a time there were three little sisters;
and their name were
<a href="http://expamle.com/elsie" class ="sister" id="link1">Elsie</a>,
<a href="http://expamle.com/lacie" class ="sister" id="link2" id="btn2">Lacie</a> and
<a href="http://expamle.com/tillie" class ="sister", id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<div class ="gray">故事結束</div>
</body>
</html>
'''

# Parse the document with Python's built-in 'html.parser' backend.
sp = BeautifulSoup(html_doc, 'html.parser')

d1 = sp.select('title')                     # list of <title> elements
d2 = sp.text                                # page text with the HTML tags removed
d3 = sp.find("div")                         # first <div> only
d4 = sp.find("a", {"id": "link1"})          # first <a> whose id is "link1"
d5 = sp.find_all("div")                     # every <div>
d6 = sp.find_all(["title", "a"])            # every <title> and every <a>
d7 = sp.find_all("a", {"class": "sister"})  # find_all(tag, {attribute: value})
d8 = sp.select('#btn2')                     # CSS selector by id
d9 = sp.select("html head title")           # descend tag by tag
d10 = sp.select("body p b")                 # descend tag by tag
d11 = sp.select(".gray")                    # CSS selector by class

print(d1)  # [<title>網頁標題</title>]
print(d2)
print(d3)  # <div class="gray">故事內容</div>
print(d3.text)  # 故事內容
print(d4.get("href"))  # get() reads a tag attribute -> http://expamle.com/elsie
print(d5)  # [<div class="gray">故事內容</div>, <div class="gray">故事結束</div>]
print(d5[0].text)  # 故事內容
# Expected output of the next print, as recorded in the original notes:
# [<title>網頁標題</title>,
#  <a class="sister" href="http://expamle.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://expamle.com/lacie" id="btn2">Lacie</a>,
#  <a ,="" class="sister" href="http://expamle.com/tillie" id="link3">Tillie</a>]
print(d6)

# Print only the absolute http:// links. <title> has no href, so get()
# returns None for it and the identity check skips that element.
for link in d6:
    href = link.get("href")
    if href is not None and href.startswith("http://"):
        print(href)

print(d7)
print(d8)
print(d9)
print(d10)
print(d11)  # [<div class="gray">故事內容</div>, <div class="gray">故事結束</div>]
print(d11[1].text)  # 故事結束
# S20200718 By YTC
S20200718 By YTC
M20200720
or
or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up
Syntax | Example | Reference | |
---|---|---|---|
# Header | Header | 基本排版 | |
- Unordered List |
|
||
1. Ordered List |
|
||
- [ ] Todo List |
|
||
> Blockquote | Blockquote |
||
**Bold font** | Bold font | ||
*Italics font* | Italics font | ||
~~Strikethrough~~ | |||
19^th^ | 19th | ||
H~2~O | H2O | ||
++Inserted text++ | Inserted text | ||
==Marked text== | Marked text | ||
[link text](https:// "title") | Link | ||
 | Image | ||
`Code` | Code |
在筆記中貼入程式碼 | |
```javascript var i = 0; ``` |
|
||
:smile: | ![]() |
Emoji list | |
{%youtube youtube_id %} | Externals | ||
$L^aT_eX$ | LaTeX | ||
:::info This is an alert area. ::: |
This is an alert area. |
On a scale of 0-10, how likely is it that you would recommend HackMD to your friends, family or business associates?
Please give us some advice and help us improve HackMD.
Syncing