from bs4 import BeautifulSoup
# Raw HTML source
html_doc = """
<html>
<head>
<title>這是HTML文件標題</title>
</head>
<body>
<h1 id="article" class="banner">網頁標題</h1>
<p data-author='aaron'>文章段落</p>
<a href="https://www.aaronlife.com/ref1">參考資料連結1</a>
<a href="https://www.aaronlife.com/ref2">參考資料連結2</a>
<p>這是一份<b class="boldtext">HTML文件</b>。</p>
<h2 id="article">網頁標題2</h2>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'html.parser')  # Create a BeautifulSoup object
print(soup.prettify())
print(soup.title)  # The first <title> tag
print(soup.html.body.h1)  # Navigate down the tree by tag name
print(soup.a)  # The first <a> tag
print(soup.title.string)  # Text inside the <title> tag
tag_p = soup.find_all('p')  # All <p> tags
for p in tag_p:
    print(p.getText())
tag_a = soup.find_all('a')  # All <a> tags
for a in tag_a:
    print(a.getText(), '-', a.get('href'))
tags = soup.find_all(['a', 'p'], limit=2)  # limit caps the number of matches
for tag in tags:
    print(tag)
tag = soup.find(['a', 'p'])  # Find the first tag matching the criteria
print(tag)
h1_tags = soup.html.body.find_all('h1', recursive=False)  # Search only the direct children of <body>
print(h1_tags)
tags = soup.find_all('h2', id='article')  # Match by id attribute
print(tags)
tags = soup.find_all(attrs={'data-author': 'aaron'})  # Attribute names containing a dash must be wrapped in a dict
print(tags)
tags = soup.find_all(class_='banner')  # class is a Python keyword, so the argument is class_
print(tags)
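Besides find_all, BeautifulSoup also accepts CSS selectors through select(). A minimal sketch, reusing the soup object built from html_doc above:
# CSS selectors with select(); each call returns a list of matching tags
print(soup.select('p[data-author="aaron"]'))  # attribute selector
print(soup.select('h1#article'))              # id selector
print(soup.select('p > b.boldtext'))          # child + class selector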
from bs4 import BeautifulSoup
import requests
import csv
import time
# List that collects the parsed exchange-rate rows
result = []
# Bank of Taiwan exchange-rate URL
url = 'https://rate.bot.com.tw/xrt?Lang=zh-TW'
# Fetch the page with requests
response = requests.get(url)
html_doc = response.text
soup = BeautifulSoup(html_doc, 'html.parser')  # or 'lxml' (pip3 install lxml) for faster parsing
rate_table = soup.find('table').find('tbody')
rate_table_row = rate_table.find_all('tr')  # One <tr> per currency
for row in rate_table_row:
    columns = row.find_all('td')
    # Holds one parsed row of data
    data = []
    for c in columns:
        if c.attrs['data-table'] == '幣別':
            # The currency name sits in the last <div> of the cell
            divs = c.find_all('div')
            last_div = divs[-1]
            data.append(last_div.string.strip())
        elif c.getText().find('查詢') != 0:
            # Keep the rate cells; skip link cells whose text starts with '查詢'
            data.append(c.getText().strip())
    if len(data) != 0:
        result.append(tuple(data))
now = time.localtime()
file_name = time.strftime('%Y%m%d_%H%M%S.csv', now)  # Timestamped file name
with open(file_name, 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Header row: currency, cash buy, cash sell, spot buy, spot sell (kept in Chinese to match the site)
    writer.writerow(['幣別', '現金買入', '現金賣出', '即期買入', '即期賣出'])
    writer.writerows(result)
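To sanity-check the export, the file can be read straight back with the same csv module. A minimal sketch, reusing file_name from above:
# Read the CSV back and print each exchange-rate row
with open(file_name, 'r', encoding='utf-8', newline='') as csvfile:
    for row in csv.reader(csvfile):
        print(row)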
from selenium import webdriver
import time
# Initialize the WebDriver (Selenium 4+ locates chromedriver automatically;
# older versions took the driver path, e.g. webdriver.Chrome('chromedriver'))
driver = webdriver.Chrome()
# Wait up to 10 seconds when locating elements
driver.implicitly_wait(10)
driver.get('https://rate.bot.com.tw/xrt?Lang=zh-TW')
print(driver.title)
time.sleep(5)  # Pause 5 seconds
html_doc = driver.page_source  # Grab the rendered page source
print(html_doc)
Note:
See the course handout for Selenium installation steps and caveats.
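If you pin a specific chromedriver binary on Selenium 4, the path goes through a Service object rather than a positional argument. A minimal sketch, assuming the binary sits in the working directory:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Point Selenium 4 at a specific chromedriver binary
service = Service('./chromedriver')
driver = webdriver.Chrome(service=service)
driver.quit()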
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
driver = webdriver.Chrome()  # Selenium 4+; older versions: webdriver.Chrome('chromedriver')
driver.implicitly_wait(10)
driver.get('https://www.google.com')
print(driver.title)
try:
    search = driver.find_element(By.NAME, 'q')  # Locate the search box
    print(search.tag_name)
    search.send_keys('amkor')  # Type the query
    time.sleep(2)
    search.send_keys(Keys.ENTER)  # Press Enter
    time.sleep(1)
    # driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    titles = driver.find_elements(By.CLASS_NAME, 'LC20lb')  # Result titles
    addrs = driver.find_elements(By.CLASS_NAME, 'yuRUbf')   # Result URL containers
    for title, addr_el in zip(titles, addrs):
        addr = addr_el.find_element(By.TAG_NAME, 'a').get_attribute('href')
        print(f'{title.text} - {addr}')
except NoSuchElementException:
    print('Could not locate the element')
time.sleep(5)
print('end')
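The fixed time.sleep pauses above are fragile; Selenium's explicit waits block only as long as needed. A minimal sketch against the same results page, reusing the driver from above (the LC20lb class name is taken from the example):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for at least one result title to appear
wait = WebDriverWait(driver, 10)
titles = wait.until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'LC20lb'))
)
print(len(titles), 'results located')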