# Project 2 - End-to-End Data Pipeline (TechNews)

### Scraper - TechNews search page

* The scraper is packaged as a reusable function: pass a search keyword into it and it produces a JSON-formatted text file containing, for every article found on the search results page, the article title, link, date, and the full body text fetched by following the link.

```py=
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import json


def searchTechnews(searchword):
    # Plug the user-supplied keyword into the TechNews search URL
    TARGET_URL = f'https://technews.tw/google-search/?googlekeyword={searchword}'

    # Launch headless Chrome with Selenium
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    driver.get(TARGET_URL)

    # Locate the sort-by dropdown of the embedded Google custom search
    option_menu = driver.find_element(By.CLASS_NAME, 'gsc-selected-option')
    option_menu.click()

    # Switch the sort order to "Date" (the option label is 日期 on the site)
    wait = WebDriverWait(driver, 10)
    date_option = wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//div[@class="gsc-option" and text()="日期"]')))
    date_option.click()
    time.sleep(6)

    # Parse the result list with BeautifulSoup (scraping it directly with Selenium would also work)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    articleList = []
    elements = soup.find_all("div", {"class": "gs-webResult"})

    # Collect every article's title, URL, and date
    for i in elements:
        articleTitle = i.findChild("div").findChild("div").findChild("a").text
        articleUrl = i.findChild("div").findChild("div").findChild("a").get('href')
        if "科技新報" not in articleTitle:
            date = i.findChild("div").findNextSibling("div").findChild("span").findNextSibling("span").text[3:]
            value = {
                'Date': date,
                'Article': articleTitle,
                'Url': articleUrl
            }
            articleList.append(value)
    # Drop the last entry, which is not a real article in this result layout
    articleList.pop()

    # Fetch each article's body text and store it back into the list
    for articles in articleList:
        options2 = webdriver.ChromeOptions()
        options2.add_argument("--headless")
        driver2 = webdriver.Chrome(options=options2)
        driver2.get(articles['Url'])
        soup2 = BeautifulSoup(driver2.page_source, 'html.parser')
        paragraph = soup2.find("div", {"class": "indent"})
        paraSection = paragraph.findAll("p")
        content = ""
        for p in paraSection:
            content += p.text.strip()
        articles['contents'] = content
        driver2.quit()
    driver.quit()
    print(articleList)

    # Write the result out as JSON
    with open('technews.txt', 'a', encoding='utf-8') as f:
        json.dump(articleList, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    searchTechnews('全家')
```

### Output

![](https://hackmd.io/_uploads/SJtHzWJUh.png)
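Since one run appends a single JSON array to `technews.txt`, the file can be loaded straight back for later analysis steps. A minimal sketch, assuming the file holds exactly one JSON array (i.e. the output of a single run); the variable names here are only illustrative:

```py=
import json
import pandas as pd

# Read back the articles written by searchTechnews('全家')
with open('technews.txt', encoding='utf-8') as f:
    articles = json.load(f)

# Columns: Date, Article, Url, contents
df = pd.DataFrame(articles)
print(df[['Date', 'Article']].head())
```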
### Scraper - fetching articles from the first three result pages

```py=
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import json


def searchTechnews(searchword):
    # Plug the user-supplied keyword into the TechNews search URL
    TARGET_URL = f'https://technews.tw/google-search/?googlekeyword={searchword}'

    # Launch headless Chrome with Selenium
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    driver.get(TARGET_URL)

    # Locate the sort-by dropdown of the embedded Google custom search
    option_menu = driver.find_element(By.CLASS_NAME, 'gsc-selected-option')
    option_menu.click()

    # Switch the sort order to "Date" (the option label is 日期 on the site)
    wait = WebDriverWait(driver, 10)
    date_option = wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//div[@class="gsc-option" and text()="日期"]')))
    date_option.click()
    time.sleep(6)

    # After sorting by date, scrape every article on the first three result pages
    for page in range(1, 4):
        if page >= 2:
            # Click the pager button for page n
            wait = WebDriverWait(driver, 10)
            page_button = wait.until(EC.element_to_be_clickable(
                (By.XPATH, f'//div[@class="gsc-cursor-page" and text()="{page}"]')))
            page_button.click()
            time.sleep(6)

        # Parse the result list with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        articleList = []
        elements = soup.find_all("div", {"class": "gs-webResult"})

        # Collect every article's title, URL, and date
        for i in elements:
            articleTitle = i.findChild("div").findChild("div").findChild("a").text
            articleUrl = i.findChild("div").findChild("div").findChild("a").get('href')
            if "科技新報" not in articleTitle:
                date = i.findChild("div").findNextSibling("div").findChild("span").findNextSibling("span").text[3:]
                value = {
                    'Date': date,
                    'Article': articleTitle,
                    'Url': articleUrl
                }
                articleList.append(value)
        # Drop the last entry, which is not a real article in this result layout
        articleList.pop()

        # Fetch each article's body text and store it back into the list
        for articles in articleList:
            options2 = webdriver.ChromeOptions()
            options2.add_argument("--headless")
            driver2 = webdriver.Chrome(options=options2)
            driver2.get(articles['Url'])
            soup2 = BeautifulSoup(driver2.page_source, 'html.parser')
            paragraph = soup2.find("div", {"class": "indent"})
            paraSection = paragraph.findAll("p")
            content = ""
            for p in paraSection:
                content += p.text.strip()
            articles['contents'] = content
            driver2.quit()

        # Append this page's articles to the JSON text file
        with open('technews_all.txt', 'a', encoding='utf-8') as f:
            json.dump(articleList, f, ensure_ascii=False, indent=4)

    driver.quit()


if __name__ == '__main__':
    searchTechnews('7-11')
```

### Fetching one year of stock prices for FamilyMart and 7-11

```py=
import requests
import datetime
import pandas as pd


# Fetch one year of daily closing prices for the given symbols
def get_symbols_info(symbols):
    url = "https://yfapi.net/v8/finance/spark"
    querystring = {
        "symbols": symbols,
        "interval": "1d",
        "range": "1y"
    }
    headers = {'x-api-key': "FxNI17Zq747urT6cSqf5Q361aFK2Acou6zkuczQi"}
    response = requests.request("GET", url, headers=headers, params=querystring)
    data = response.json()
    result = {}
    print(data)

    for k in symbols.split(","):
        # Convert each Unix timestamp to a YYYY/MM/DD string
        dt_list = []
        for i in data[k]['timestamp']:
            dt = datetime.datetime.fromtimestamp(i)
            act_datetime = dt.strftime("%Y/%m/%d")
            dt_list.append(act_datetime)
        # Map each trading date to its closing price
        info = dict(zip(dt_list, data[k]['close']))
        result[k] = info
    print(result)
    return result


if __name__ == '__main__':
    a = get_symbols_info('5903.TWO,1216.tw')
    print(a)
    df = pd.DataFrame(a)
    df.columns = ['全家', '7-11']
    df = df.dropna(axis=0)
    df.to_csv('stockprice.csv', encoding='utf-8')
```

###### tags: `python` `專題二` `數據一條龍` `爬蟲`
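To close the loop on the "end-to-end data" idea, the scraped article dates can be laid alongside the price table. A minimal sketch, assuming `technews.txt` holds a single JSON array and using the `全家` column and `Date` field produced by the scripts above; the `has_news` flag and the join itself are illustrative additions, not part of the original project:

```py=
import json
import pandas as pd

# One year of closing prices written by get_symbols_info (index = YYYY/MM/DD strings)
prices = pd.read_csv('stockprice.csv', index_col=0)

# Articles scraped for the FamilyMart keyword
with open('technews.txt', encoding='utf-8') as f:
    articles = pd.DataFrame(json.load(f))

# Flag trading days that have at least one scraped article.
# Note: the scraped Date strings may need normalising to YYYY/MM/DD before they match the index.
prices['has_news'] = prices.index.isin(articles['Date'])
print(prices.loc[prices['has_news'], ['全家']])
```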