觀察Google Trends網頁規則(透過Google Chrome開發者工具)。
從Google Trends網頁抓取指定天數的資料(最多30天)。
將資料寫入PostgreSQL資料庫。
模組名稱 | 說明 |
---|---|
bs4 | BeautifulSoup網頁解析模組 |
requests | HTTP模組,用來直接透過網路讀取網頁資料 |
json | Python內建解析json模組 |
datetime | Python內建時間相關功能模組 |
psycopg2 | 連接PostgreSQL資料庫 |
$ pip3 install bs4
import re
import json
import requests
from bs4 import BeautifulSoup
import datetime
# Number of days to look back (at most 30)
days = 30
def fetch_trend(fetch_date):
    """Fetch one day of Google Trends daily searches (geo=TW) and print related queries.

    Args:
        fetch_date: Date as a ``YYYYMMDD`` string; it is sent as the ``ed``
            query parameter of the daily-trends endpoint.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    url = 'https://trends.google.com.tw/trends/api/dailytrends?hl=zh-TW&tz=-480&ed=' + fetch_date + '&geo=TW&ns=15'
    print(url)
    # Without a timeout requests may wait forever on a stalled connection.
    resp = requests.get(url, timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON decode error.
    resp.raise_for_status()
    # Remove the ")]}'," + newline marker so the remainder parses as JSON
    # (presumably the endpoint's anti-JSON-hijacking prefix).
    data = re.sub(r'\)\]\}\',\n', '', resp.text)
    trend_tree = json.loads(data)
    trending_days = trend_tree['default']['trendingSearchesDays']
    if not trending_days:
        print('Empty trend data at ' + fetch_date)
        return
    # Convert once into a separate name instead of rebinding the parameter
    # inside the loop.
    date_number = int(fetch_date)
    for entry in trending_days[0]['trendingSearches']:
        # These fields are read to show what each entry provides; this
        # version of the function only prints the related queries.
        title = entry['title']['query']
        formattedTraffic = entry['formattedTraffic']
        imageUrl = entry['image'].get('imageUrl', 'none')
        articles = entry['articles']
        shareUrl = entry['shareUrl']
        # Related searches: keep only the keyword text
        relatedQueries = [related['query'] for related in entry['relatedQueries']]
        if relatedQueries:
            print(relatedQueries)
def main():
    """Call ``fetch_trend`` for each of the previous ``days`` days, oldest first."""
    # Today marks the end of the window
    today = datetime.datetime.today()
    for offset in reversed(range(1, days + 1)):
        target = today - datetime.timedelta(days=offset)
        stamp = target.strftime('%Y%m%d')
        print(offset, stamp)
        fetch_trend(stamp)


if __name__ == "__main__":
    main()
$ pip3 install psycopg2
-- Holds the trending-search entries written by the script's insert_data().
CREATE TABLE google_trend (
id serial PRIMARY KEY, -- auto-incrementing row id
title varchar(32), -- search keyword (title.query of the entry)
formattedTraffic varchar(32), -- formattedTraffic value of the entry, kept as text
relatedQueries text, -- str() of the Python list of related query keywords
imageUrl varchar(1024), -- image URL, or 'none' when the entry has no imageUrl
articles text, -- str() of the entry's articles value
shareUrl varchar(1024), -- shareUrl value of the entry
fetch_date int -- date as an integer in YYYYMMDD form
);
import re
import json
import requests
from bs4 import BeautifulSoup
import datetime
import psycopg2
# Number of days to look back (at most 30)
days = 30
def connect_postgresql():
    """Open a psycopg2 connection using the settings defined in this function.

    The ``dbname``, ``user`` and ``password`` values are placeholders and must
    be replaced with real database settings before the script can connect.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    # Connection settings
    host = "localhost"
    dbname = "{資料庫名稱}"
    user = "{帳號}"
    password = "{密碼}"
    sslmode = "disable"
    # Pass the settings as keyword arguments so psycopg2 takes care of
    # quoting; a hand-formatted DSN string breaks as soon as a value (for
    # example the password) contains a space, quote or backslash.
    conn = psycopg2.connect(
        host=host,
        user=user,
        dbname=dbname,
        password=password,
        sslmode=sslmode,
    )
    print("連線完成")
    return conn
def insert_data(cursor, title, formattedTraffic, relatedQueries, imageUrl, articles, shareUrl, fetch_date):
    """Print one trend entry and insert it into the ``google_trend`` table.

    Args:
        cursor: Object whose ``execute(sql, params)`` runs the INSERT.
        title: Value for the ``title`` column.
        formattedTraffic: Value for the ``formattedTraffic`` column.
        relatedQueries: Related queries; stored as ``str(relatedQueries)``.
        imageUrl: Value for the ``imageUrl`` column.
        articles: Articles data; stored as ``str(articles)``.
        shareUrl: Value for the ``shareUrl`` column.
        fetch_date: Value for the ``fetch_date`` column.
    """
    # Both structured values are logged and stored in their str() form.
    related_text = str(relatedQueries)
    articles_text = str(articles)

    print(title, '=', formattedTraffic)
    for label, value in (
        ('relatedQueries=', related_text),
        ('imageUrl=', imageUrl),
        ('articles=', articles_text),
        ('shareUrl=', shareUrl),
        ('date=', fetch_date),
    ):
        print(label, value)

    row = (title, formattedTraffic, related_text, imageUrl, articles_text, shareUrl, fetch_date)
    cursor.execute("INSERT INTO google_trend (title, formattedTraffic, relatedQueries, imageUrl, articles, shareUrl, fetch_date) VALUES (%s, %s, %s, %s, %s, %s, %s);", row)
def fetch_trend(fetch_date, cursor):
    """Fetch one day of Google Trends daily searches (geo=TW) and store each entry.

    Args:
        fetch_date: Date as a ``YYYYMMDD`` string; it is sent as the ``ed``
            query parameter and passed to ``insert_data`` as an integer.
        cursor: Database cursor handed to ``insert_data`` for every entry.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    url = 'https://trends.google.com.tw/trends/api/dailytrends?hl=zh-TW&tz=-480&ed=' + fetch_date + '&geo=TW&ns=15'
    print(url)
    # Without a timeout requests may wait forever on a stalled connection.
    resp = requests.get(url, timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON decode error.
    resp.raise_for_status()
    # Remove the ")]}'," + newline marker so the remainder parses as JSON
    # (presumably the endpoint's anti-JSON-hijacking prefix).
    data = re.sub(r'\)\]\}\',\n', '', resp.text)
    trend_tree = json.loads(data)
    trending_days = trend_tree['default']['trendingSearchesDays']
    if not trending_days:
        print('Empty trend data at ' + fetch_date)
        return
    # Convert once into a separate name instead of rebinding the parameter
    # inside the loop.
    date_number = int(fetch_date)
    for entry in trending_days[0]['trendingSearches']:
        title = entry['title']['query']
        formattedTraffic = entry['formattedTraffic']
        # Entries without an image URL are stored with the literal 'none'.
        imageUrl = entry['image'].get('imageUrl', 'none')
        articles = entry['articles']
        shareUrl = entry['shareUrl']
        # Related searches: keep only the keyword text
        relatedQueries = [related['query'] for related in entry['relatedQueries']]
        insert_data(cursor, title, formattedTraffic, relatedQueries, imageUrl, articles, shareUrl, date_number)
def main():
    """Fetch the previous ``days`` days of trends and store them in PostgreSQL.

    Days are processed oldest first. All inserts are committed together once
    every day has been fetched; the connection is closed even when a fetch
    or insert raises, in which case nothing is committed.
    """
    # Today marks the end of the window
    day_end = datetime.datetime.today()
    # Open the database connection
    conn = connect_postgresql()
    try:
        # A single cursor serves every insert and is closed by the
        # with-block, rather than creating one unclosed cursor per day.
        with conn.cursor() as cursor:
            for d in range(days, 0, -1):
                day_start = day_end - datetime.timedelta(days=d)
                format_day = day_start.strftime('%Y%m%d')
                print(d, format_day)
                fetch_trend(format_day, cursor)
        conn.commit()
    finally:
        # Release the database connection
        conn.close()


if __name__ == "__main__":
    main()
注意:`host`、`dbname`、`user`、`password` 變數值需換成實際資料庫設定值才能正常執行。