###### tags: SQL # 第五章: :::spoiler 計算行為數和相關比率 ``` WITH stats AS ( --計算日誌中的全體UU(不重複使用者)數 SELECT COUNT(DISTINCT session) AS total_uu FROM action_log ) SELECT l.action, --行為數 COUNT(DISTINCT l.session) AS action_uu, --全體UU COUNT(1) action_count, --全體UU s.total_uu, --利用率: Action UU / 全體 UU 100.0 * COUNT(DISTINCT l.session) / s.total_uu AS usage_rate, --每人平均行為數: 行為數 / Action UU 1.0 * COUNT(1) / COUNT(DISTINCT l.session) AS count_per_user FROM action_log l --將日誌全體的UU數和所有record 聯結 CROSS JOIN stats AS s GROUP BY l.action, s.total_uu ``` ::: --- :::spoiler 判斷登入狀態 ``` WITH action_log_with_status AS ( SELECT session, user_id, action, --user_id 不是null 或空字串時,判定為login CASE WHEN COALESCE(user_id, '') <> '' THEN 'login' ELSE 'guest' END AS login_status FROM action_log ) SELECT * FROM action_log_with_status ``` ::: --- :::spoiler 依據login_status 狀態彙總行為數 ``` WITH action_log_with_status AS ( SELECT session, user_id, action, --user_id 不是null 或空字串時,判定為login CASE WHEN COALESCE(user_id, '') <> '' THEN 'login' ELSE 'guest' END AS login_status FROM action_log ) SELECT COALESCE(action, 'all') AS action, COALESCE(login_status, 'all') AS login_status, COUNT(DISTINCT session) AS action_uu, COUNT(1) AS action_count FROM action_log_with_status GROUP BY ROLLUP(action, login_status) ``` ::: --- :::spoiler 分組彙總會員與非會員數據 ``` WITH action_log_with_status AS ( SELECT session, user_id, action, --將log以時間軸排序,只要是曾經登入過的session, --在那之後就會將日誌中的會員狀態(member_status)設為member CASE WHEN COALESCE(MAX(user_id) OVER(PARTITION BY session ORDER BY stamp ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), '') <> '' THEN 'member' ELSE 'none' END AS member_status, stamp FROM action_log ) SELECT * FROM action_log_with_status ``` ::: --- :::spoiler 計算使用者生日 ``` WITH mst_users_with_int_birth_date AS ( SELECT *, 20170101 AS int_specific_date, --以整數型態表示特定日(2017/01/01) CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date FROM mst_users ), mst_users_with_age AS ( SELECT *, --在特定日期(2017年1月1日)的年齡 --將生日和特定日期以整數型態呈現,再將兩者差值除以10000 floor((int_specific_date - int_birth_date) / 10000) AS age FROM mst_users_with_int_birth_date ) SELECT user_id, sex, birth_date, age FROM mst_users_with_age ``` ::: --- :::spoiler 依性別和年齡區分年齡層 ``` WITH mst_users_with_int_birth_date AS ( SELECT *, 20170101 AS int_specific_date, --以整數型態表示特定日(2017/01/01) CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date FROM mst_users ), mst_users_with_age AS ( SELECT *, --在特定日期(2017年1月1日)的年齡 --將生日和特定日期以整數型態呈現,再將兩者差值除以10000 floor((int_specific_date - int_birth_date) / 10000) AS age FROM mst_users_with_int_birth_date ), mst_users_with_category AS( SELECT user_id, sex, age, CONCAT( CASE WHEN 20 <= age THEN sex ELSE '' END, CASE WHEN age BETWEEN 4 AND 12 THEN 'C' WHEN age BETWEEN 12 AND 19 THEN 'T' WHEN age BETWEEN 20 AND 34 THEN '1' WHEN age BETWEEN 35 AND 49 THEN '2' WHEN age >= 50 THEN '3' END ) AS category FROM mst_users_with_age ) SELECT * FROM mst_users_with_category ``` ::: --- :::spoiler 計算各年齡層人數 ``` WITH mst_users_with_int_birth_date AS ( SELECT *, 20170101 AS int_specific_date, --以整數型態表示特定日(2017/01/01) CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date FROM mst_users ), mst_users_with_age AS ( SELECT *, --在特定日期(2017年1月1日)的年齡 --將生日和特定日期以整數型態呈現,再將兩者差值除以10000 floor((int_specific_date - int_birth_date) / 10000) AS age FROM mst_users_with_int_birth_date ), mst_users_with_category AS( SELECT user_id, sex, age, CONCAT( CASE WHEN 20 <= age THEN sex ELSE '' END, CASE WHEN age BETWEEN 4 AND 12 THEN 'C' WHEN age BETWEEN 12 AND 19 THEN 'T' WHEN age BETWEEN 20 AND 34 THEN '1' WHEN age BETWEEN 35 AND 49 THEN '2' WHEN age >= 50 THEN '3' END ) AS category FROM mst_users_with_age ) SELECT category, COUNT(1) AS user_count FROM mst_users_with_category GROUP BY category ``` ::: --- :::spoiler 彙總使用者的訪問頻率 ``` WITH mst_users_with_int_birth_date AS ( SELECT *, 20170101 AS int_specific_date, --以整數型態表示特定日(2017/01/01) CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date FROM mst_users ), mst_users_with_age AS ( SELECT *, --在特定日期(2017年1月1日)的年齡 --將生日和特定日期以整數型態呈現,再將兩者差值除以10000 floor((int_specific_date - int_birth_date) / 10000) AS age FROM mst_users_with_int_birth_date ), mst_users_with_category AS( SELECT user_id, sex, age, CONCAT( CASE WHEN 20 <= age THEN sex ELSE '' END, CASE WHEN age BETWEEN 4 AND 12 THEN 'C' WHEN age BETWEEN 12 AND 19 THEN 'T' WHEN age BETWEEN 20 AND 34 THEN '1' WHEN age BETWEEN 35 AND 49 THEN '2' WHEN age >= 50 THEN '3' END ) AS category FROM mst_users_with_age ) SELECT p.category AS product_category, u.category AS user_category, COUNT(*) AS purchase_count FROM action_log p JOIN mst_users_with_category u ON p.user_id = u.user_id WHERE action = 'purchase' --設定購買日誌的數據使用條件 GROUP BY p.category, u.category ORDER BY p.category, u.category ``` ::: --- :::spoiler 計算組成百分比和累計百分比(一週內使用者的使用天數和組成百分比) ``` WITH action_log_with_dt AS( SELECT *, substring(stamp, 1, 10) AS dt FROM action_log ) , action_day_count_per_user AS ( SELECT user_id, COUNT(DISTINCT dt) AS action_day_count FROM action_log_with_dt WHERE dt BETWEEN '2016-11-01' AND '2016-11-07' --以一週間的數據為對象 GROUP BY user_id ) -- SELECT * -- FROM action_day_count_per_user SELECT action_day_count, COUNT(DISTINCT user_id) AS user_count, --組成百分比 100.0 * COUNT(DISTINCT user_id) / SUM(COUNT(DISTINCT user_id)) OVER() AS composition_ratio, --累計百分比 100.0 * SUM(COUNT(DISTINCT user_id)) OVER(ORDER BY action_day_count ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) / SUM(COUNT(DISTINCT user_id)) OVER() AS cumulative_ratio FROM action_day_count_per_user GROUP BY action_day_count ORDER BY action_day_count ``` ::: --- :::spoiler 彙總每個使用者的行為旗標 ``` WITH user_action_flag AS ( --針對每位使用者所執行的action,給予0或1的旗標 SELECT user_id, SIGN(SUM(CASE WHEN action = 'purchase' THEN 1 ELSE 0 END)) AS has_purchase, SIGN(SUM(CASE WHEN action = 'review' THEN 1 ELSE 0 END)) AS has_review, SIGN(SUM(CASE WHEN action = 'favorite' THEN 1 ELSE 0 END)) AS has_favorite FROM action_log GROUP BY user_id ) SELECT * FROM user_action_flag; ``` ::: --- :::spoiler 計算action的所有組合方式之使用者數 ``` action_venn_diagram AS ( SELECT has_purchase, has_review, has_favorite FROM user_action_flag GROUP BY CUBE(has_purchase, has_review, has_favorite) ) SELECT * FROM action_venn_diagram ORDER BY has_purchase, has_review, has_favorite ``` ::: --- :::spoiler 承上,不使用CUBE句,以標準SQL的敘述來表現CUBE的結果 ``` WITH user_action_flag AS ( --針對每位使用者所執行的action,給予0或1的旗標 SELECT user_id, SIGN(SUM(CASE WHEN action = 'purchase' THEN 1 ELSE 0 END)) AS has_purchase, SIGN(SUM(CASE WHEN action = 'review' THEN 1 ELSE 0 END)) AS has_review, SIGN(SUM(CASE WHEN action = 'favorite' THEN 1 ELSE 0 END)) AS has_favorite FROM action_log GROUP BY user_id ), action_venn_diagram AS ( ----在action的各種組合方式下,分別計算使用者數,最後UNION ALL 統整結果 --彙總以3個action進行組合使用的數據 SELECT has_purchase, has_review, has_favorite, COUNT(1) AS users FROM user_action_flag GROUP BY has_purchase, has_review, has_favorite --彙總以3種action中的其中2種actoin,進行組合使用數據 UNION ALL SELECT NULL AS has_purchase, has_review, has_favorite, COUNT(1) AS users FROM user_action_flag GROUP BY has_review, has_favorite UNION ALL SELECT has_purchase, NULL AS has_review, has_favorite, COUNT(1) AS users FROM user_action_flag GROUP BY has_purchase, has_favorite UNION ALL SELECT has_purchase, has_review, NULL AS has_favorite, COUNT(1) AS users FROM user_action_flag GROUP BY has_purchase, has_review --彙總以3種action中的其中一種action,進行組合使用的數據 UNION ALL SELECT NULL AS has_purchase, NULL AS has_review, has_favorite, COUNT(1) AS users FROM user_action_flag GROUP BY has_favorite UNION ALL SELECT NULL AS has_purchase, has_review, NULL AS has_favorite, COUNT(1) AS users FROM user_action_flag GROUP BY has_review UNION ALL SELECT has_purchase, NULL AS has_review, NULL AS has_favorite, COUNT(1) AS users FROM user_action_flag GROUP BY has_purchase --不用action進行分組,彙總全部使用者數 UNION ALL SELECT NULL AS has_purchase, NULL AS has_review, NULL AS has_favorite, COUNT(1) AS users FROM user_action_flag ) SELECT * FROM action_venn_diagram ORDER BY has_purchase, has_review, has_favorite ``` ::: --- :::spoiler 新增欄位值為NULL的虛擬紀錄,取得和CUBE句一樣的結果 ``` WITH user_action_flag AS ( --針對每位使用者所執行的action,給予0或1的旗標 SELECT user_id, SIGN(SUM(CASE WHEN action = 'purchase' THEN 1 ELSE 0 END)) AS has_purchase, SIGN(SUM(CASE WHEN action = 'review' THEN 1 ELSE 0 END)) AS has_review, SIGN(SUM(CASE WHEN action = 'favorite' THEN 1 ELSE 0 END)) AS has_favorite FROM action_log GROUP BY user_id ), action_venn_diagram AS ( SELECT mod_has_purchase AS has_purchase, mod_has_review AS has_review, mod_has_favorite AS has_favorite, COUNT(1) AS users FROM user_action_flag --新增各欄位值為NULL的虛擬紀錄 --在BigQuery 使用CROSS JOIN 和 unnest函數 CROSS JOIN unnest(array[has_purchase, NULL]) AS mod_has_purchase CROSS JOIN unnest(array[has_review, NULL]) AS mod_has_review CROSS JOIN unnest(array[has_favorite, NULL]) AS mod_has_favorite GROUP BY mod_has_purchase, mod_has_review, mod_has_favorite ) SELECT * FROM action_venn_diagram ORDER BY has_purchase, has_review, has_favorite ``` ::: --- :::spoiler 整理建立圓餅圖所需之數據 ``` WITH user_action_flag AS ( --針對每位使用者所執行的action,給予0或1的旗標 SELECT user_id, SIGN(SUM(CASE WHEN action = 'purchase' THEN 1 ELSE 0 END)) AS has_purchase, SIGN(SUM(CASE WHEN action = 'review' THEN 1 ELSE 0 END)) AS has_review, SIGN(SUM(CASE WHEN action = 'favorite' THEN 1 ELSE 0 END)) AS has_favorite FROM action_log GROUP BY user_id ), action_venn_diagram AS ( SELECT mod_has_purchase AS has_purchase, mod_has_review AS has_review, mod_has_favorite AS has_favorite, COUNT(1) AS users FROM user_action_flag --新增各欄位值為NULL的虛擬紀錄 --在BigQuery 使用CROSS JOIN 和 unnest函數 CROSS JOIN unnest(array[has_purchase, NULL]) AS mod_has_purchase CROSS JOIN unnest(array[has_review, NULL]) AS mod_has_review CROSS JOIN unnest(array[has_favorite, NULL]) AS mod_has_favorite GROUP BY mod_has_purchase, mod_has_review, mod_has_favorite ) SELECT --將0, 1旗標轉換成字串 CASE has_purchase WHEN 1 THEN 'purchase' WHEN 0 THEN 'not purchase' ELSE 'any' END AS has_purchase, CASE has_review WHEN 1 THEN 'review' WHEN 0 THEN 'not review' ELSE 'any' END AS has_review, CASE has_favorite WHEN 1 THEN 'favorite' WHEN 0 THEN 'not favorite' ELSE 'any' END AS has_favorite, users, --計算在全體不重複使用者中,所佔的比例 100.0 * users / NULLIF( --因為全部action旗標欄位都為NULL的使用者數就是全部的不重複使用者數(UU) --所以用視窗函數取得那行的使用者數 SUM( CASE WHEN has_purchase IS NULL AND has_review IS NULL AND has_favorite IS NULL THEN users ELSE 0 END ) OVER() , 0) AS ratio FROM action_venn_diagram ORDER BY has_purchase, has_review, has_favorite ``` ::: --- :::spoiler 依消費降冪排序,將使用者依序分成10等份 ``` WITH user_purchase_amount AS ( SELECT user_id, SUM(amount) AS purchase_amount FROM action_log WHERE action = 'purchase' GROUP BY user_id ) , users_with_decile AS ( SELECT user_id, purchase_amount, ntile(10) OVER(ORDER BY purchase_amount DESC) as decile FROM user_purchase_amount ) SELECT * FROM users_with_decile ``` :::