Try   HackMD

爬取Google Trend資料

規格

實現步驟

  1. 觀察Google Trend網頁規則(透過Google Chrome開發工具)。

  2. 從Google Trend網頁抓取指定天數的資料(最多30天)。

  3. 將抓取到的資料寫入PostgreSQL資料庫。

會用到的模組

| 模組名稱 | 說明 |
| --- | --- |
| bs4 | BeautifulSoup網頁解析模組 |
| requests | HTTP模組,用來直接透過網路讀取網頁資料 |
| re | Python內建正規表示式模組 |
| json | Python內建的JSON解析模組 |
| datetime | Python內建時間相關功能模組 |
| psycopg2 | 連接PostgreSQL資料庫 |
安裝BeautifulSoup 4模組
$ pip3 install bs4

爬取Google Trend關鍵字

import re
import json
import requests
from bs4 import BeautifulSoup
import datetime

# Look back at most 30 days
days = 30


def fetch_trend(fetch_date):
    """Fetch one day's daily trends for Taiwan and print the related queries.

    Args:
        fetch_date: Day to query, as a ``YYYYMMDD`` string.

    Raises:
        requests.RequestException: If the request times out, fails, or
            returns an HTTP error status.
    """
    url = 'https://trends.google.com.tw/trends/api/dailytrends?hl=zh-TW&tz=-480&ed=' + fetch_date + '&geo=TW&ns=15'
    print(url)

    # A timeout keeps a stalled connection from hanging the whole crawl, and
    # raise_for_status reports HTTP errors directly instead of letting them
    # show up later as a JSON parse failure.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    # Drop the ")]}'," line from the response text before parsing it as JSON
    data = re.sub(r'\)\]\}\',\n', '', resp.text)
    trend_tree = json.loads(data)

    trending_days = trend_tree['default']['trendingSearchesDays']
    if not trending_days:
        print('Empty trend data at ' + fetch_date)
        return

    # Convert once under a separate name so the string parameter is not
    # rebound on every loop iteration.
    fetch_date_int = int(fetch_date)

    for item in trending_days[0]['trendingSearches']:
        # These fields are extracted to show what each entry offers; this
        # step only prints the related queries.
        title = item['title']['query']
        formattedTraffic = item['formattedTraffic']
        # Fall back to 'none' when the entry has no image URL
        imageUrl = (item.get('image') or {}).get('imageUrl', 'none')
        articles = item['articles']
        shareUrl = item['shareUrl']

        # Related searches: keep only the keyword text
        relatedQueries = [r['query'] for r in item['relatedQueries']]
        if relatedQueries:
            print(relatedQueries)


def main():
    """Fetch trends for each of the past ``days`` days, oldest day first."""
    # Use today as the end date
    day_end = datetime.datetime.today()

    for d in range(days, 0, -1):
        day_start = day_end - datetime.timedelta(days=d)
        format_day = day_start.strftime('%Y%m%d')
        print(d, format_day)
        fetch_trend(format_day)


if __name__ == "__main__":
    main()

寫入PostgreSQL資料庫

安裝psycopg2模組
$  pip3 install psycopg2

資料庫Table規格

-- One row per trending keyword per fetched day.
CREATE TABLE google_trend (
    id               SERIAL PRIMARY KEY,
    title            VARCHAR(32),    -- trending search query text
    formattedTraffic VARCHAR(32),    -- traffic label as returned by the API
    relatedQueries   TEXT,           -- str() of the list of related query keywords
    imageUrl         VARCHAR(1024),  -- image URL, or 'none' when the entry has none
    articles         TEXT,           -- str() of the entry's articles list
    shareUrl         VARCHAR(1024),  -- share link of the entry
    fetch_date       INTEGER         -- queried day as an integer in YYYYMMDD form
);

完整程式碼

import re
import json
import requests
from bs4 import BeautifulSoup
import datetime
import psycopg2

# Look back at most 30 days
days = 30

def connect_postgresql():
    """Open a connection to the PostgreSQL database.

    The placeholder values for ``dbname``, ``user`` and ``password`` must be
    replaced with the real database settings before this can succeed.

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Raises:
        Whatever ``psycopg2.connect`` raises when the connection cannot be
        established.
    """
    # Connection settings
    host = "localhost"
    dbname = "{資料庫名稱}"
    user = "{帳號}"
    password = "{密碼}"
    sslmode = "disable"

    # Pass the settings as keyword arguments instead of formatting them into
    # a DSN string: a value containing spaces, quotes or backslashes
    # (typically the password) would otherwise be mis-parsed.
    conn = psycopg2.connect(
        host=host,
        user=user,
        dbname=dbname,
        password=password,
        sslmode=sslmode,
    )
    print("連線完成")

    return conn

def insert_data(cursor, title, formattedTraffic, relatedQueries, imageUrl, articles, shareUrl, fetch_date):
    """Print one trend entry and insert it into the ``google_trend`` table.

    ``relatedQueries`` and ``articles`` are stored as their ``str()`` text.
    The statement is executed on ``cursor`` but not committed here.
    """
    related_text = str(relatedQueries)
    articles_text = str(articles)

    # Echo the row to stdout before writing it
    print(title, '=', formattedTraffic)
    print('relatedQueries=', related_text)
    print('imageUrl=', imageUrl)
    print('articles=', articles_text)
    print('shareUrl=', shareUrl)
    print('date=', fetch_date)

    # Values are bound as parameters rather than formatted into the SQL text
    statement = (
        "INSERT INTO google_trend "
        "(title, formattedTraffic, relatedQueries, imageUrl, articles, shareUrl, fetch_date) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s);"
    )
    row = (
        title,
        formattedTraffic,
        related_text,
        imageUrl,
        articles_text,
        shareUrl,
        fetch_date,
    )
    cursor.execute(statement, row)


def fetch_trend(fetch_date, cursor):
    """Fetch one day's daily trends for Taiwan and insert each keyword.

    Args:
        fetch_date: Day to query, as a ``YYYYMMDD`` string.
        cursor: Database cursor handed to ``insert_data`` for every keyword.

    Raises:
        requests.RequestException: If the request times out, fails, or
            returns an HTTP error status.
    """
    url = 'https://trends.google.com.tw/trends/api/dailytrends?hl=zh-TW&tz=-480&ed=' + fetch_date + '&geo=TW&ns=15'
    print(url)

    # A timeout keeps a stalled connection from hanging the whole crawl, and
    # raise_for_status reports HTTP errors directly instead of letting them
    # show up later as a JSON parse failure.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    # Drop the ")]}'," line from the response text before parsing it as JSON
    data = re.sub(r'\)\]\}\',\n', '', resp.text)
    trend_tree = json.loads(data)

    trending_days = trend_tree['default']['trendingSearchesDays']
    if not trending_days:
        print('Empty trend data at ' + fetch_date)
        return

    # Convert once under a separate name so the string parameter is not
    # rebound on every loop iteration.
    fetch_date_int = int(fetch_date)

    for item in trending_days[0]['trendingSearches']:
        title = item['title']['query']
        formattedTraffic = item['formattedTraffic']
        # Fall back to 'none' when the entry has no image URL
        imageUrl = (item.get('image') or {}).get('imageUrl', 'none')
        articles = item['articles']
        shareUrl = item['shareUrl']

        # Related searches: keep only the keyword text
        relatedQueries = [r['query'] for r in item['relatedQueries']]

        insert_data(cursor, title, formattedTraffic, relatedQueries, imageUrl, articles, shareUrl, fetch_date_int)

def main():
    """Fetch the past ``days`` days of trends and store them in PostgreSQL.

    Days are processed oldest first and today itself is not fetched. All
    inserts are committed together after the last day; if any fetch raises,
    nothing is committed and the cursor and connection are still closed.
    """
    # Use today as the end date
    day_end = datetime.datetime.today()

    # Open the database connection
    conn = connect_postgresql()
    try:
        # Share one cursor across all days so it can be closed reliably;
        # calling conn.cursor() again would only create another cursor.
        cursor = conn.cursor()
        try:
            for d in range(days, 0, -1):
                day_start = day_end - datetime.timedelta(days=d)
                format_day = day_start.strftime('%Y%m%d')
                print(d, format_day)

                fetch_trend(format_day, cursor)

            conn.commit()
        finally:
            cursor.close()
    finally:
        # Release the database connection even when a fetch failed
        conn.close()


# Run the crawler only when this file is executed as a script
if __name__ == "__main__":
    main()

注意:

`host`、`dbname`、`user`、`password` 等變數值需換成實際的資料庫設定值,程式才能正常執行。