專題二 - 數據一條龍 (數位時代)

# 專題二 - 數據一條龍 (數位時代)

### 爬蟲 - 數位時代搜尋頁面
* 將爬蟲程式模組化，函式內帶入搜尋關鍵字，可產生搜尋頁面搜尋到的文章標題、文章連結、文章日期、與點入連結後的文章內文之Json格式文字檔。

```py=
import datetime
import json
import re
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Seconds to wait for JavaScript-rendered content before giving up.
PAGE_TIMEOUT = 10
# File that receives the scraped articles as JSON.
OUTPUT_PATH = 'digitaltimes.txt'


def _newHeadlessChrome():
    """Start and return a headless Chrome WebDriver session."""
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    return webdriver.Chrome(options=options)


def _waitForElement(driver, cssSelector):
    """Block until *cssSelector* is present, or until PAGE_TIMEOUT elapses."""
    try:
        WebDriverWait(driver, PAGE_TIMEOUT).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, cssSelector))
        )
    except TimeoutException:
        # Best effort: the caller parses whatever has rendered so far and
        # copes with the element being absent.
        pass


def _normalizeDate(rawDate, today=None):
    """Clean a search-result date and resolve relative "N 天前" dates.

    "N 天前" (N days ago) is converted to an absolute "%Y年%m月%d日" date
    relative to *today* (defaults to the current date). Any other text is
    returned with dots removed and surrounding whitespace stripped.
    """
    date = rawDate.replace('.', '').strip()
    daysAgo = re.search(r'(\d+)\s*天前', date)
    if daysAgo:
        if today is None:
            today = datetime.date.today()
        delta = datetime.timedelta(days=int(daysAgo.group(1)))
        date = (today - delta).strftime("%Y年%m月%d日")
    return date


def _parseSearchResults(html):
    """Return a list of {'Date', 'Article', 'Url'} dicts found in *html*."""
    soup = BeautifulSoup(html, 'html.parser')
    articles = []
    for result in soup.find_all("div", {"class": "gs-webResult"}):
        try:
            link = result.find("div").find("div").find("a")
            articleTitle = link.text
            articleUrl = link.get('href')
            # The date is the first 11 characters of a deeply nested snippet
            # node; the path mirrors the search widget's result markup.
            rawDate = (
                result.find("div")
                .find_next_sibling("div")
                .find_next_sibling("div")
                .find("div")
                .find_next_sibling("div")
                .find("div")
                .find_next_sibling("div")
                .text[:11]
            )
        except AttributeError:
            # A node on the expected path is missing; skip this result
            # instead of aborting the whole run.
            continue
        # Results whose title contains the site name are skipped.
        if "數位時代" in articleTitle or not articleUrl:
            continue
        articles.append({
            'Date': _normalizeDate(rawDate),
            'Article': articleTitle,
            'Url': articleUrl,
        })
    return articles


def _extractArticleText(html):
    """Return the concatenated <p> text of the article body in *html*.

    Returns an empty string when the "DynamicComp" container is not found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find("div", {"class": "DynamicComp"})
    if body is None:
        return ""
    return "".join(p.text.strip() for p in body.find_all("p"))


def searchDigitalTimes(searchword):
    """Scrape BusinessNext (數位時代) search results for *searchword*.

    For every result the title, URL, date and article text are collected.
    The list is printed, written to OUTPUT_PATH as JSON, and returned.

    Args:
        searchword: Keyword to search for; it is URL-escaped before use.

    Returns:
        A list of dicts with the keys 'Date', 'Article', 'Url' and
        'contents'.
    """
    # Build the BusinessNext search URL, sorted by date, from the keyword.
    keyword = quote(searchword, safe='')
    TARGET_URL = f'https://www.bnext.com.tw/search/kw/{keyword}#gsc.tab=0&gsc.q={keyword}&gsc.sort=date'

    # One headless Chrome session is reused for every page and is always
    # closed, so no browser processes are left behind.
    driver = _newHeadlessChrome()
    try:
        driver.get(TARGET_URL)
        # Results are rendered by JavaScript; wait for them before parsing.
        _waitForElement(driver, "div.gs-webResult")
        articleList = _parseSearchResults(driver.page_source)

        # NOTE(review): the original always discarded the last parsed result
        # (presumably a trailing non-article entry - confirm). Kept as is,
        # but guarded so an empty result list no longer raises IndexError.
        if articleList:
            articleList.pop()

        # Fetch the body text of every article and store it on its entry.
        for article in articleList:
            driver.get(article['Url'])
            _waitForElement(driver, "div.DynamicComp")
            article['contents'] = _extractArticleText(driver.page_source)
    finally:
        driver.quit()

    print(articleList)

    # Write the JSON data. The file is overwritten because appending a
    # second JSON document to it would make the file unparseable.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(articleList, f, ensure_ascii=False, indent=4)

    return articleList


if __name__ == '__main__':
    searchDigitalTimes('7-11')
```

#### Output
![](https://hackmd.io/_uploads/Sk1ZFhxLn.png)

###### tags: `python` `爬蟲` `數據一條龍`