# 2023-09-01 Day 5 Python Introductory Hands-on Class Notes

###### tags: `python`

## Static scraping - BeautifulSoup

```
from bs4 import BeautifulSoup

# Raw HTML source
html_doc = """
<html>
<head>
    <title>這是HTML文件標題</title>
</head>
<body>
    <h1 id="article" class="banner">網頁標題</h1>
    <p data-author='aaron'>文章段落</p>
    <a href="https://www.aaronlife.com/ref1">參考資料連結1</a>
    <a href="https://www.aaronlife.com/ref2">參考資料連結2</a>
    <p>這是一份<b class="boldtext">HTML文件</b>。</p>
    <h2 id="article">網頁標題2</h2>
</body>
</html>
"""

# Build the BeautifulSoup object
soup = BeautifulSoup(html_doc, 'html.parser')

print(soup.prettify())

print(soup.title)         # the first <title> tag
print(soup.html.body.h1)
print(soup.a)
print(soup.title.string)

tag_p = soup.find_all('p')
for p in tag_p:
    print(p.getText())

tag_a = soup.find_all('a')
for a in tag_a:
    print(a.getText(), '-', a.get('href'))

tags = soup.find_all(['a', 'p'], limit=2)  # limit caps the number of matches
for tag in tags:
    print(tag)

tag = soup.find(['a', 'p'])  # the first tag that matches
print(tag)

h1_tags = soup.html.body.find_all('h1', recursive=False)  # search direct children only
print(h1_tags)

tags = soup.find_all('h2', id='article')
print(tags)

tags = soup.find_all(attrs={'data-author': 'aaron'})  # attribute names containing a dash must be wrapped in a dict
print(tags)

tags = soup.find_all(class_='banner')
print(tags)
```

## Static scraping - scrape the Bank of Taiwan exchange rate page and save the rates to a CSV file

```
from bs4 import BeautifulSoup
import requests
import csv
import time

# List that collects the parsed exchange-rate rows
result = []

# Bank of Taiwan exchange rate URL
url = 'https://rate.bot.com.tw/xrt?Lang=zh-TW'

# Fetch the page with requests
response = requests.get(url)
html_doc = response.text

# html.parser is built in; the faster lxml parser needs: pip3 install lxml
soup = BeautifulSoup(html_doc, 'html.parser')

# Get the HTML row for each exchange rate
rate_table = soup.find('table').find('tbody')
rate_table_row = rate_table.find_all('tr')

for row in rate_table_row:
    columns = row.find_all('td')

    # Holds one parsed row
    data = []
    for c in columns:
        if c.attrs['data-table'] == '幣別':
            # The currency name sits in the last <div> of the cell
            divs = c.find_all('div')
            last_div = divs[-1] if divs else None
            data.append(last_div.string.strip())
        elif c.getText().find('查詢') != 0:  # skip cells whose text starts with 查詢 (query links)
            data.append(c.getText().strip())

    if len(data) != 0:
        result.append(tuple(data))

# Name the CSV file after the current time
now = time.localtime()
file_name = time.strftime('%Y%m%d_%H%M%S.csv', now)

with open(file_name, 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['幣別', '現金買入', '現金賣出', '即期買入', '即期賣出'])
    writer.writerows(result)
```

## Dynamic scraping - Selenium

#### Use Selenium to fetch the Bank of Taiwan page (no parsing; just a check that the environment works)

```
from selenium import webdriver
import time

# Initialize the WebDriver (Selenium 4.6+ downloads a matching
# chromedriver by itself, so no driver path argument is needed)
driver = webdriver.Chrome()

# Implicit wait while pages load
driver.implicitly_wait(10)

driver.get('https://rate.bot.com.tw/xrt?Lang=zh-TW')
print(driver.title)

time.sleep(5)  # pause for 5 seconds

# Grab the rendered page source
html_doc = driver.page_source
print(html_doc)
```

> **Note:**
> See the handout for the Selenium installation steps and caveats.
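The example above pauses with a fixed `time.sleep(5)`. Selenium also provides explicit waits that block only until a condition is met, which is usually faster and more reliable. The following is a minimal sketch (not from the handout), assuming the exchange-rate `<table>` appears once the page has rendered:

```
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://rate.bot.com.tw/xrt?Lang=zh-TW')

# Wait at most 10 seconds, returning as soon as a <table> is present
table = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.TAG_NAME, 'table')))
print(table.tag_name)

driver.quit()
```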
## Dynamic scraping - search Google for amkor and retrieve the search results

```
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Chrome()
driver.implicitly_wait(10)

driver.get('https://www.google.com')
print(driver.title)

try:
    search = driver.find_element(By.NAME, 'q')  # locate the search box
    print(search.tag_name)

    search.send_keys('amkor')     # type the search text
    time.sleep(2)
    search.send_keys(Keys.ENTER)  # press Enter
    time.sleep(1)

    # driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')

    titles = driver.find_elements(By.CLASS_NAME, 'LC20lb')  # locate the result titles
    blocks = driver.find_elements(By.CLASS_NAME, 'yuRUbf')  # locate the result link blocks

    for title, block in zip(titles, blocks):
        addr = block.find_element(By.TAG_NAME, 'a').get_attribute('href')
        print(f'{title.text} - {addr}')
except NoSuchElementException:
    print('Could not locate the element')

time.sleep(5)
print('end')
```
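For unattended runs, Chrome can also be started without a visible window. A minimal sketch, assuming a Chrome build new enough for the new headless mode (`--headless=new`, Chrome 109+); older builds would use `--headless` instead:

```
from selenium import webdriver

# Run Chrome with no visible window
options = webdriver.ChromeOptions()
options.add_argument('--headless=new')
driver = webdriver.Chrome(options=options)

driver.get('https://rate.bot.com.tw/xrt?Lang=zh-TW')
print(driver.title)  # the page still loads and renders; only the UI is hidden

driver.quit()
```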