# 爬取Google Trend資料

### 規格

#### 實現步驟

1. 觀察Google Trend網頁規則(透過Google Chrome開發工具)。
2. 從Google Trend網頁抓取指定天數的資料(最多30天)。
3. 寫入PostgreSQL資料庫。

#### 會用到的模組

| 模組名稱 | 說明 |
| -------- | -------------------------------------- |
| bs4 | BeautifulSoup網頁解析模組 |
| requests | HTTP模組,用來直接透過網路讀取網頁資料 |
| re | Python內建正規表示式模組 |
| json | Python內建解析json模組 |
| datetime | Python內建時間相關功能模組 |
| psycopg2 | 連接PostgreSQL資料庫 |

##### 安裝BeautifulSoup 4模組

```
$ pip3 install bs4
```

## 爬取Google Trend關鍵字

```python=
import datetime
import json
import re

import requests
from bs4 import BeautifulSoup

# Look back at most 30 days
days = 30


def fetch_trend(fetch_date):
    """Fetch one day of Google daily trends and print each related-query list.

    Args:
        fetch_date: The day to query, as a 'YYYYMMDD' string.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = 'https://trends.google.com.tw/trends/api/dailytrends?hl=zh-TW&tz=-480&ed=' + fetch_date + '&geo=TW&ns=15'
    print(url)
    # A timeout keeps an unresponsive server from hanging the crawler forever.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    # Strip the ")]}'," line that precedes the JSON payload so json.loads accepts it.
    data = re.sub(r'\)\]\}\',\n', '', resp.text)
    trend_tree = json.loads(data)
    trending_days = trend_tree['default']['trendingSearchesDays']
    if not trending_days:
        print('Empty trend data at ' + fetch_date)
        return

    # Convert once, under a new name, instead of rebinding the parameter
    # on every loop iteration.
    fetch_date_int = int(fetch_date)
    for i in trending_days[0]['trendingSearches']:
        # These fields are only extracted here to show where they live in
        # the payload; this first version prints nothing but related queries.
        title = i['title']['query']
        formattedTraffic = i['formattedTraffic']
        # Fall back to 'none' when the entry carries no image URL.
        imageUrl = i.get('image', {}).get('imageUrl', 'none')
        articles = i['articles']
        shareUrl = i['shareUrl']
        # Related searches: keep only the keyword text
        relatedQueries = [r['query'] for r in i['relatedQueries']]
        if relatedQueries:
            print(relatedQueries)


def main():
    """Fetch the trends of each of the previous `days` days, oldest day first."""
    # Use today as the end of the window
    day_end = datetime.datetime.today()
    for d in range(days, 0, -1):
        day_start = day_end - datetime.timedelta(days=d)
        format_day = day_start.strftime('%Y%m%d')
        print(d, format_day)
        fetch_trend(format_day)


if __name__ == "__main__":
    main()
```

## 寫入PostgreSQL資料庫

##### 安裝psycopg2模組

```
$ pip3 install psycopg2
```

## 資料庫Table規格

```sql
CREATE TABLE google_trend (
    id serial PRIMARY KEY,
    title varchar(32),
    formattedTraffic varchar(32),
    relatedQueries text,
    imageUrl varchar(1024),
    articles text,
    shareUrl varchar(1024),
    fetch_date int
);
```

## 完整程式碼

```python
import datetime
import json
import re

import psycopg2
import requests
from bs4 import BeautifulSoup

# Look back at most 30 days
days = 30


def connect_postgresql():
    """Open a PostgreSQL connection using the settings below.

    Returns:
        The connection object returned by psycopg2.connect.
    """
    # Connection settings (replace the placeholders with real values)
    host = "localhost"
    dbname = "{資料庫名稱}"
    user = "{帳號}"
    password = "{密碼}"
    sslmode = "disable"

    # Construct connection string
    conn_string = "host={0} user={1} dbname={2} password={3} sslmode={4}".format(host, user, dbname, password, sslmode)
    conn = psycopg2.connect(conn_string)
    print("連線完成")
    return conn


def insert_data(cursor, title, formattedTraffic, relatedQueries, imageUrl, articles, shareUrl, fetch_date):
    """Print one trend entry and insert it into the google_trend table.

    Args:
        cursor: Database cursor used to execute the INSERT.
        title: The trending search query.
        formattedTraffic: Traffic label taken from the trend entry.
        relatedQueries: List of related query strings; stored as str(list).
        imageUrl: Image URL of the entry, or 'none'.
        articles: Article data of the entry; stored as str(articles).
        shareUrl: Share URL of the entry.
        fetch_date: The queried day as an integer (YYYYMMDD).

    The statement is not committed here; the caller commits.
    """
    print(title, '=', formattedTraffic)
    print('relatedQueries=', str(relatedQueries))
    print('imageUrl=', imageUrl)
    print('articles=', str(articles))
    print('shareUrl=', shareUrl)
    print('date=', fetch_date)
    # Values are passed as parameters so the driver handles quoting.
    cursor.execute(
        "INSERT INTO google_trend "
        "(title, formattedTraffic, relatedQueries, imageUrl, articles, shareUrl, fetch_date) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s);",
        (title, formattedTraffic, str(relatedQueries), imageUrl, str(articles), shareUrl, fetch_date))


def fetch_trend(fetch_date, cursor):
    """Fetch one day of Google daily trends and insert every entry via `cursor`.

    Args:
        fetch_date: The day to query, as a 'YYYYMMDD' string.
        cursor: Database cursor handed on to insert_data.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = 'https://trends.google.com.tw/trends/api/dailytrends?hl=zh-TW&tz=-480&ed=' + fetch_date + '&geo=TW&ns=15'
    print(url)
    # A timeout keeps an unresponsive server from hanging the crawler forever.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    # Strip the ")]}'," line that precedes the JSON payload so json.loads accepts it.
    data = re.sub(r'\)\]\}\',\n', '', resp.text)
    trend_tree = json.loads(data)
    trending_days = trend_tree['default']['trendingSearchesDays']
    if not trending_days:
        print('Empty trend data at ' + fetch_date)
        return

    # Convert once, under a new name, instead of rebinding the parameter
    # on every loop iteration.
    fetch_date_int = int(fetch_date)
    for i in trending_days[0]['trendingSearches']:
        title = i['title']['query']
        formattedTraffic = i['formattedTraffic']
        # Fall back to 'none' when the entry carries no image URL.
        imageUrl = i.get('image', {}).get('imageUrl', 'none')
        articles = i['articles']
        shareUrl = i['shareUrl']
        # Related searches: keep only the keyword text
        relatedQueries = [r['query'] for r in i['relatedQueries']]
        insert_data(cursor, title, formattedTraffic, relatedQueries, imageUrl, articles, shareUrl, fetch_date_int)


def main():
    """Store the trends of each of the previous `days` days, oldest day first.

    All inserts share one cursor and are committed together after the last
    day has been fetched. The connection is closed even if a fetch fails,
    in which case nothing is committed.
    """
    # Use today as the end of the window
    day_end = datetime.datetime.today()
    # Open the database connection
    conn = connect_postgresql()
    try:
        # One cursor for the whole run; the with-block closes it on exit.
        with conn.cursor() as cursor:
            for d in range(days, 0, -1):
                day_start = day_end - datetime.timedelta(days=d)
                format_day = day_start.strftime('%Y%m%d')
                print(d, format_day)
                fetch_trend(format_day, cursor)
        conn.commit()
    finally:
        # Release the database connection
        conn.close()


if __name__ == "__main__":
    main()
```

> 注意:
>
> `host`、`dbname`、`user`、`password`變數值需換成實際資料庫設定值才能正常執行。