# Project 2 - End-to-End Data Pipeline (Technews)
### Crawler - Technews Search Page
* Package the crawler as a module: the function takes a search keyword and writes a JSON text file containing, for every article found on the search results page, its title, link, date, and the body text fetched by following the link.
```py=
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import json


def searchTechnews(searchword):
    # Build the Technews search URL from the user-supplied keyword
    TARGET_URL = f'https://technews.tw/google-search/?googlekeyword={searchword}'
    # Launch Chrome in headless mode through Selenium
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    driver.get(TARGET_URL)
    # Locate the sort dropdown of the embedded Google Custom Search widget
    option_menu = driver.find_element(By.CLASS_NAME, 'gsc-selected-option')
    option_menu.click()
    # Switch the sort order to「日期」(by date)
    wait = WebDriverWait(driver, 10)
    date_option = wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//div[@class="gsc-option" and text()="日期"]')))
    date_option.click()
    time.sleep(6)
    # Parse the result list with BeautifulSoup (scraping it directly with Selenium would also work)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()
    articleList = []
    elements = soup.findAll("div", {"class": "gs-webResult"})
    # Extract the title, URL, and date of every article
    for element in elements:
        articleTitle = element.findChild("div").findChild("div").findChild("a").text
        articleUrl = element.findChild("div").findChild("div").findChild("a").get('href')
        # Skip results whose title is only the site name「科技新報」
        if "科技新報" in articleTitle:
            continue
        date = element.findChild("div").findNextSibling("div").findChild("span").findNextSibling("span").text[3:]
        articleList.append({
            'Date': date,
            'Article': articleTitle,
            'Url': articleUrl
        })
    # Drop the trailing entry, which is not an actual article
    articleList.pop()
    # Fetch the body of every article and store it back into the list
    for article in articleList:
        options2 = webdriver.ChromeOptions()
        options2.add_argument("--headless")
        driver2 = webdriver.Chrome(options=options2)
        driver2.get(article['Url'])
        # Wait until the article body (div.indent) is present before parsing
        WebDriverWait(driver2, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "indent")))
        soup2 = BeautifulSoup(driver2.page_source, 'html.parser')
        driver2.quit()
        paraSection = soup2.find("div", {"class": "indent"}).findAll("p")
        content = ""
        for p in paraSection:
            content += p.text.strip()
        article['contents'] = content
    print(articleList)
    # Append the result to a JSON text file
    with open('technews.txt', 'a', encoding='utf-8') as f:
        json.dump(articleList, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    searchTechnews('全家')
```
### Output

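No sample output is reproduced here. Assuming the run above succeeds, `technews.txt` holds a JSON array in which every element carries the `Date`, `Article`, `Url`, and `contents` keys written by `searchTechnews()`. A minimal sketch of reading one run back (the crawler opens the file in append mode, so a file holding several runs would need extra handling):
```py=
import json

# Load the crawl result back (assumes technews.txt holds a single run of searchTechnews)
with open('technews.txt', encoding='utf-8') as f:
    articles = json.load(f)

for article in articles:
    print(article['Date'], article['Article'], article['Url'])
    print(article['contents'][:50])  # first 50 characters of the article body
```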
### Crawler - Fetch the First Three Pages of Search Results
```py=
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import json


def searchTechnews(searchword):
    # Build the Technews search URL from the user-supplied keyword
    TARGET_URL = f'https://technews.tw/google-search/?googlekeyword={searchword}'
    # Launch Chrome in headless mode through Selenium
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    driver.get(TARGET_URL)
    # Locate the sort dropdown of the embedded Google Custom Search widget
    option_menu = driver.find_element(By.CLASS_NAME, 'gsc-selected-option')
    option_menu.click()
    # Switch the sort order to「日期」(by date)
    wait = WebDriverWait(driver, 10)
    date_option = wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//div[@class="gsc-option" and text()="日期"]')))
    date_option.click()
    time.sleep(6)
    # After sorting by date, collect every article on the first three result pages
    articleList = []
    for page in range(1, 4):
        if page >= 2:
            # Click the page-n button of the search widget
            wait = WebDriverWait(driver, 10)
            page_button = wait.until(EC.element_to_be_clickable(
                (By.XPATH, f'//div[@class="gsc-cursor-page" and text()="{page}"]')))
            page_button.click()
            time.sleep(6)
        # Parse the result list of the current page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        elements = soup.findAll("div", {"class": "gs-webResult"})
        # Extract the title, URL, and date of every article
        for element in elements:
            articleTitle = element.findChild("div").findChild("div").findChild("a").text
            articleUrl = element.findChild("div").findChild("div").findChild("a").get('href')
            # Skip results whose title is only the site name「科技新報」
            if "科技新報" in articleTitle:
                continue
            date = element.findChild("div").findNextSibling("div").findChild("span").findNextSibling("span").text[3:]
            articleList.append({
                'Date': date,
                'Article': articleTitle,
                'Url': articleUrl
            })
        # Drop the trailing entry of each page, which is not an actual article
        articleList.pop()
    driver.quit()
    # Fetch the body of every article and store it back into the list
    for article in articleList:
        options2 = webdriver.ChromeOptions()
        options2.add_argument("--headless")
        driver2 = webdriver.Chrome(options=options2)
        driver2.get(article['Url'])
        # Wait until the article body (div.indent) is present before parsing
        WebDriverWait(driver2, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "indent")))
        soup2 = BeautifulSoup(driver2.page_source, 'html.parser')
        driver2.quit()
        paraSection = soup2.find("div", {"class": "indent"}).findAll("p")
        content = ""
        for p in paraSection:
            content += p.text.strip()
        article['contents'] = content
    # Append the result to a JSON text file
    with open('technews_all.txt', 'a', encoding='utf-8') as f:
        json.dump(articleList, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    searchTechnews('7-11')
```
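The body-fetching loop in both scripts starts a fresh headless Chrome for every article, which is slow and leaves browser processes behind if an exception is raised. A possible refactor, sketched below and not part of the original project, reuses a single driver for all article pages; it relies on the same imports as the scripts above, and `fetch_contents` is a hypothetical helper name:
```py=
# Sketch only: fetch every article body with one shared headless driver.
# fetch_contents() is a hypothetical helper, not part of the original scripts.
def fetch_contents(articleList):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        for article in articleList:
            driver.get(article['Url'])
            # Wait until the article body (div.indent) shows up before parsing
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "indent")))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            paragraphs = soup.find("div", {"class": "indent"}).findAll("p")
            article['contents'] = "".join(p.text.strip() for p in paragraphs)
    finally:
        driver.quit()  # always release the browser, even on errors
    return articleList
```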
### Fetch One Year of Stock Prices for FamilyMart (全家) and 7-11
```py=
import requests
import datetime
import pandas as pd


# Fetch one year of daily closing prices for the given symbols
def get_symbols_info(symbols):
    url = "https://yfapi.net/v8/finance/spark"
    querystring = {
        "symbols": symbols,
        "interval": "1d",
        "range": "1y"
    }
    headers = {'x-api-key': "FxNI17Zq747urT6cSqf5Q361aFK2Acou6zkuczQi"}
    response = requests.get(url, headers=headers, params=querystring)
    data = response.json()
    result = {}
    print(data)
    for k in symbols.split(","):
        # Convert every Unix timestamp into a YYYY/MM/DD date string
        dt_list = []
        for ts in data[k]['timestamp']:
            dt = datetime.datetime.fromtimestamp(ts)
            dt_list.append(dt.strftime("%Y/%m/%d"))
        # Map each trading date to its closing price
        result[k] = dict(zip(dt_list, data[k]['close']))
    print(result)
    return result


if __name__ == '__main__':
    a = get_symbols_info('5903.TWO,1216.tw')
    print(a)
    df = pd.DataFrame(a)
    # Rename the symbol columns, drop rows with missing prices, and save to CSV
    df.columns = ['全家', '7-11']
    df = df.dropna(axis=0)
    df.to_csv('stockprice.csv', encoding='utf-8')
```
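A quick way to sanity-check the saved prices is to load `stockprice.csv` back with pandas; a minimal sketch, assuming the script above has already produced the file:
```py=
import pandas as pd

# Inspect the CSV written by the script above
df = pd.read_csv('stockprice.csv', index_col=0)
print(df.tail())                     # the most recent trading days
print(df.pct_change().describe())    # rough summary of daily returns
```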
###### tags: `python` `Project 2` `Data Pipeline` `Web Scraping`