
2023-09-01 Day 5 Python Introductory Hands-on Class Notes

tags: python

Static scraping - BeautifulSoup

from bs4 import BeautifulSoup

# Raw HTML source
html_doc = """
<html>
<head>
<title>This is the HTML document title</title>
</head>
<body>
<h1 id="article" class="banner">Page heading</h1>
<p data-author='aaron'>Article paragraph</p>
<a href="https://www.aaronlife.com/ref1">Reference link 1</a>
<a href="https://www.aaronlife.com/ref2">Reference link 2</a>
<p>This is an <b class="boldtext">HTML document</b>.</p>
<h2 id="article">Page heading 2</h2>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, 'html.parser') # create the BeautifulSoup object

print(soup.prettify())
print(soup.title)  # the first <title> tag
print(soup.html.body.h1)
print(soup.a)

print(soup.title.string)
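
A side note on .string vs. get_text(): .string returns text only when a tag has exactly one child node, otherwise it returns None, while get_text() concatenates all nested text. A quick check against the sample document above:

# The second <p> has mixed children (text plus a <b> tag)
mixed_p = soup.find_all('p')[1]
print(mixed_p.string)      # None: more than one child node
print(mixed_p.get_text())  # "This is an HTML document."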

tag_p = soup.find_all('p')

for p in tag_p:
    print(p.getText())

tag_a = soup.find_all('a')
for a in tag_a:
    print(a.getText(), '-', a.get('href'))
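
Tag attributes can be read two ways: dict-style indexing raises KeyError when the attribute is missing, while .get() returns None, which is why the loop above uses .get(). For example:

first_a = soup.a
print(first_a['href'])        # dict-style access; raises KeyError if the attribute is missing
print(first_a.get('target'))  # .get() returns None instead of raising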

tags = soup.find_all(['a', 'p'], limit=2) # limit caps the number of matches
for tag in tags:
    print(tag)

tag = soup.find(['a', 'p']) # find the first tag matching either name
print(tag)

h1_tags = soup.html.body.find_all('h1', recursive=False) # recursive=False: direct children of <body> only
print(h1_tags)

tags = soup.find_all('h2', id='article')
print(tags)

tags = soup.find_all(attrs={'data-author': 'aaron'})  # attribute names containing a dash must go through the attrs dict
print(tags)

tags = soup.find_all(class_='banner')  # class is a Python keyword, so the kwarg is class_
print(tags)
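
The same lookups can also be written as CSS selectors with select() / select_one(), which BeautifulSoup supports as well; a few equivalents of the find_all() calls above:

print(soup.select_one('h1#article.banner'))   # id + class
print(soup.select('p[data-author="aaron"]'))  # attribute selector
print(soup.select('body > h1'))               # direct children only, like recursive=False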

Static scraping - Scrape the Bank of Taiwan exchange-rate page and save the rates as CSV

from bs4 import BeautifulSoup
import requests
import csv
import time

# list that accumulates the parsed exchange-rate rows
result = []

# Bank of Taiwan exchange-rate page URL
url = 'https://rate.bot.com.tw/xrt?Lang=zh-TW'

# fetch the page with requests
response = requests.get(url)

html_doc = response.text

soup = BeautifulSoup(html_doc, 'html.parser')  # or the faster 'lxml' parser (pip3 install lxml)

rate_table = soup.find('table').find('tbody')

rate_table_row = rate_table.find_all('tr') # one <tr> per currency row

for row in rate_table_row:
    columns = row.find_all('td')

    # holds the parsed fields of one row
    data = []

    for c in columns:
        if c.get('data-table') == '幣別':  # the currency-name column (the site labels it in Chinese)
            divs = c.find_all('div')
            last_div = divs[-1] if divs else None  # the plain currency name sits in the last <div>

            data.append(last_div.string.strip())
        elif c.getText().find('查詢') != 0:  # skip the "查詢" (query) link cells, keep the rate figures
            data.append(c.getText().strip())

    if len(data) != 0:
        result.append(tuple(data))

now = time.localtime()
file_name = time.strftime('%Y%m%d_%H%M%S.csv', now)  # timestamped file name, e.g. 20230901_143000.csv

with open(file_name, 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Currency', 'Cash buy', 'Cash sell', 'Spot buy', 'Spot sell'])
    writer.writerows(result)
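
The request above sends requests' default headers, and some sites reject the default python-requests User-Agent. A minimal, more defensive sketch (the UA string here is just an illustrative placeholder, and timeout/raise_for_status are optional hardening):

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}  # placeholder browser UA
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page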

Dynamic scraping - Selenium

Use Selenium to load the Bank of Taiwan page (no parsing; just a test that the environment works)

from selenium import webdriver
import time

# initialize the WebDriver (Selenium 4.6+ downloads a matching chromedriver automatically;
# older versions took a driver path, e.g. webdriver.Chrome('chromedriver'))
driver = webdriver.Chrome()

# implicit wait: give elements up to 10 seconds to appear
driver.implicitly_wait(10)

driver.get('https://rate.bot.com.tw/xrt?Lang=zh-TW')

print(driver.title)

time.sleep(5)  # pause for 5 seconds

html_doc = driver.page_source # grab the rendered page source
print(html_doc)

driver.quit()  # close the browser when done
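
Since driver.page_source is just an HTML string, it can be handed straight to BeautifulSoup, reusing the static-scraping approach on the Selenium-rendered page:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'html.parser')
for row in soup.find('table').find('tbody').find_all('tr'):
    print(row.get_text(' ', strip=True))  # one line of text per currency row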

Note: see the course handout for Selenium installation steps and caveats.
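
If the environment has no display (e.g. a server), Chrome can also run headless; a minimal sketch using Selenium 4 options (the flag name varies slightly across Chrome versions):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless=new')  # older Chrome builds use plain '--headless'
driver = webdriver.Chrome(options=options)
driver.get('https://rate.bot.com.tw/xrt?Lang=zh-TW')
print(driver.title)
driver.quit()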

Dynamic scraping - Search Google for amkor and retrieve the results

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()  # see the version note in the previous example
driver.implicitly_wait(10)
driver.get('https://www.google.com')
print(driver.title)

try:
    search = driver.find_element(By.NAME, 'q') # locate the search box
    print(search.tag_name)
    search.send_keys('amkor')  # type the search text
    time.sleep(2)
    search.send_keys(Keys.ENTER)  # press Enter
    time.sleep(1)
    # driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')

    items = driver.find_elements(By.CLASS_NAME, "LC20lb") # result titles (Google's class names change over time)
    addrs = driver.find_elements(By.CLASS_NAME, "yuRUbf") # result URL containers

    results = zip(items, addrs)  # renamed from "all" to avoid shadowing the built-in

    for title, addr_div in results:
        addr = addr_div.find_element(By.TAG_NAME, 'a').get_attribute('href')
        print(f'{title.text} - {addr}')

except NoSuchElementException:
    print('Could not locate the element')

time.sleep(5)
driver.quit()  # close the browser when done
print('end')
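
The fixed time.sleep() calls above are simple but fragile; Selenium's explicit waits block only until a condition holds. A sketch of the same search-box lookup with WebDriverWait (the 10-second timeout is an arbitrary choice):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.google.com')
search = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.NAME, 'q'))  # wait until the search box exists
)
search.send_keys('amkor')
driver.quit()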