###### tags: SQL
# 第五章:
:::spoiler 計算行為數和相關比率
```
WITH stats AS (
--計算日誌中的全體UU(不重複使用者)數
SELECT COUNT(DISTINCT session) AS total_uu
FROM action_log
)
SELECT l.action,
--行為數
COUNT(DISTINCT l.session) AS action_uu,
--全體UU
COUNT(1) action_count,
--全體UU
s.total_uu,
--利用率: Action UU / 全體 UU
100.0 * COUNT(DISTINCT l.session) / s.total_uu AS usage_rate,
--每人平均行為數: 行為數 / Action UU
1.0 * COUNT(1) / COUNT(DISTINCT l.session) AS count_per_user
FROM action_log l
--將日誌全體的UU數和所有record 聯結
CROSS JOIN
stats AS s
GROUP BY l.action,
s.total_uu
```
:::
---
:::spoiler 判斷登入狀態
```
WITH action_log_with_status AS (
SELECT session,
user_id,
action,
--user_id 不是null 或空字串時,判定為login
CASE WHEN COALESCE(user_id, '') <> '' THEN 'login' ELSE 'guest' END AS login_status
FROM action_log
)
SELECT *
FROM action_log_with_status
```
:::
---
:::spoiler 依據login_status 狀態彙總行為數
```
WITH action_log_with_status AS (
SELECT session,
user_id,
action,
--user_id 不是null 或空字串時,判定為login
CASE WHEN COALESCE(user_id, '') <> '' THEN 'login' ELSE 'guest' END AS login_status
FROM action_log
)
SELECT COALESCE(action, 'all') AS action,
COALESCE(login_status, 'all') AS login_status,
COUNT(DISTINCT session) AS action_uu,
COUNT(1) AS action_count
FROM action_log_with_status
GROUP BY
ROLLUP(action, login_status)
```
:::
---
:::spoiler 分組彙總會員與非會員數據
```
WITH action_log_with_status AS (
SELECT session,
user_id,
action,
--將log以時間軸排序,只要是曾經登入過的session,
--在那之後就會將日誌中的會員狀態(member_status)設為member
CASE
WHEN COALESCE(MAX(user_id) OVER(PARTITION BY session ORDER BY stamp ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), '') <> '' THEN 'member'
ELSE 'none'
END AS member_status,
stamp
FROM action_log
)
SELECT *
FROM action_log_with_status
```
:::
---
:::spoiler 計算使用者生日
```
WITH mst_users_with_int_birth_date AS (
SELECT *,
20170101 AS int_specific_date, --以整數型態表示特定日(2017/01/01)
CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
FROM mst_users
),
mst_users_with_age AS (
SELECT *,
--在特定日期(2017年1月1日)的年齡
--將生日和特定日期以整數型態呈現,再將兩者差值除以10000
floor((int_specific_date - int_birth_date) / 10000) AS age
FROM mst_users_with_int_birth_date
)
SELECT user_id,
sex,
birth_date,
age
FROM mst_users_with_age
```
:::
---
:::spoiler 依性別和年齡區分年齡層
```
WITH mst_users_with_int_birth_date AS (
SELECT *,
20170101 AS int_specific_date, --以整數型態表示特定日(2017/01/01)
CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
FROM mst_users
),
mst_users_with_age AS (
SELECT *,
--在特定日期(2017年1月1日)的年齡
--將生日和特定日期以整數型態呈現,再將兩者差值除以10000
floor((int_specific_date - int_birth_date) / 10000) AS age
FROM mst_users_with_int_birth_date
),
mst_users_with_category AS(
SELECT user_id,
sex,
age,
CONCAT(
CASE
WHEN 20 <= age THEN sex
ELSE ''
END,
CASE
WHEN age BETWEEN 4 AND 12 THEN 'C'
WHEN age BETWEEN 12 AND 19 THEN 'T'
WHEN age BETWEEN 20 AND 34 THEN '1'
WHEN age BETWEEN 35 AND 49 THEN '2'
WHEN age >= 50 THEN '3'
END
) AS category
FROM mst_users_with_age
)
SELECT *
FROM mst_users_with_category
```
:::
---
:::spoiler 計算各年齡層人數
```
WITH mst_users_with_int_birth_date AS (
SELECT *,
20170101 AS int_specific_date, --以整數型態表示特定日(2017/01/01)
CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
FROM mst_users
),
mst_users_with_age AS (
SELECT *,
--在特定日期(2017年1月1日)的年齡
--將生日和特定日期以整數型態呈現,再將兩者差值除以10000
floor((int_specific_date - int_birth_date) / 10000) AS age
FROM mst_users_with_int_birth_date
),
mst_users_with_category AS(
SELECT user_id,
sex,
age,
CONCAT(
CASE
WHEN 20 <= age THEN sex
ELSE ''
END,
CASE
WHEN age BETWEEN 4 AND 12 THEN 'C'
WHEN age BETWEEN 12 AND 19 THEN 'T'
WHEN age BETWEEN 20 AND 34 THEN '1'
WHEN age BETWEEN 35 AND 49 THEN '2'
WHEN age >= 50 THEN '3'
END
) AS category
FROM mst_users_with_age
)
SELECT category,
COUNT(1) AS user_count
FROM mst_users_with_category
GROUP BY category
```
:::
---
:::spoiler 彙總使用者的訪問頻率
```
WITH mst_users_with_int_birth_date AS (
SELECT *,
20170101 AS int_specific_date, --以整數型態表示特定日(2017/01/01)
CAST(replace(substring(birth_date, 1, 10), '-', '') AS integer) AS int_birth_date
FROM mst_users
),
mst_users_with_age AS (
SELECT *,
--在特定日期(2017年1月1日)的年齡
--將生日和特定日期以整數型態呈現,再將兩者差值除以10000
floor((int_specific_date - int_birth_date) / 10000) AS age
FROM mst_users_with_int_birth_date
),
mst_users_with_category AS(
SELECT user_id,
sex,
age,
CONCAT(
CASE
WHEN 20 <= age THEN sex
ELSE ''
END,
CASE
WHEN age BETWEEN 4 AND 12 THEN 'C'
WHEN age BETWEEN 12 AND 19 THEN 'T'
WHEN age BETWEEN 20 AND 34 THEN '1'
WHEN age BETWEEN 35 AND 49 THEN '2'
WHEN age >= 50 THEN '3'
END
) AS category
FROM mst_users_with_age
)
SELECT p.category AS product_category,
u.category AS user_category,
COUNT(*) AS purchase_count
FROM action_log p
JOIN mst_users_with_category u
ON p.user_id = u.user_id
WHERE action = 'purchase' --設定購買日誌的數據使用條件
GROUP BY p.category,
u.category
ORDER BY p.category,
u.category
```
:::
---
:::spoiler 計算組成百分比和累計百分比(一週內使用者的使用天數和組成百分比)
```
WITH action_log_with_dt AS(
SELECT *,
substring(stamp, 1, 10) AS dt
FROM action_log
)
, action_day_count_per_user AS (
SELECT user_id,
COUNT(DISTINCT dt) AS action_day_count
FROM action_log_with_dt
WHERE dt BETWEEN '2016-11-01' AND '2016-11-07' --以一週間的數據為對象
GROUP BY user_id
)
-- SELECT *
-- FROM action_day_count_per_user
SELECT action_day_count,
COUNT(DISTINCT user_id) AS user_count,
--組成百分比
100.0 * COUNT(DISTINCT user_id) / SUM(COUNT(DISTINCT user_id)) OVER() AS composition_ratio,
--累計百分比
100.0 * SUM(COUNT(DISTINCT user_id)) OVER(ORDER BY action_day_count ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) / SUM(COUNT(DISTINCT user_id)) OVER() AS cumulative_ratio
FROM action_day_count_per_user
GROUP BY action_day_count
ORDER BY action_day_count
```
:::
---
:::spoiler 彙總每個使用者的行為旗標
```
WITH user_action_flag AS (
--針對每位使用者所執行的action,給予0或1的旗標
SELECT user_id,
SIGN(SUM(CASE WHEN action = 'purchase' THEN 1 ELSE 0 END)) AS has_purchase,
SIGN(SUM(CASE WHEN action = 'review' THEN 1 ELSE 0 END)) AS has_review,
SIGN(SUM(CASE WHEN action = 'favorite' THEN 1 ELSE 0 END)) AS has_favorite
FROM action_log
GROUP BY user_id
)
SELECT *
FROM user_action_flag;
```
:::
---
:::spoiler 計算action的所有組合方式之使用者數
```
action_venn_diagram AS (
SELECT has_purchase,
has_review,
has_favorite
FROM user_action_flag
GROUP BY
CUBE(has_purchase, has_review, has_favorite)
)
SELECT *
FROM action_venn_diagram
ORDER BY has_purchase, has_review, has_favorite
```
:::
---
:::spoiler 承上,不使用CUBE句,以標準SQL的敘述來表現CUBE的結果
```
WITH user_action_flag AS (
--針對每位使用者所執行的action,給予0或1的旗標
SELECT user_id,
SIGN(SUM(CASE WHEN action = 'purchase' THEN 1 ELSE 0 END)) AS has_purchase,
SIGN(SUM(CASE WHEN action = 'review' THEN 1 ELSE 0 END)) AS has_review,
SIGN(SUM(CASE WHEN action = 'favorite' THEN 1 ELSE 0 END)) AS has_favorite
FROM action_log
GROUP BY user_id
),
action_venn_diagram AS (
----在action的各種組合方式下,分別計算使用者數,最後UNION ALL 統整結果
--彙總以3個action進行組合使用的數據
SELECT has_purchase,
has_review,
has_favorite,
COUNT(1) AS users
FROM user_action_flag
GROUP BY has_purchase,
has_review,
has_favorite
--彙總以3種action中的其中2種actoin,進行組合使用數據
UNION ALL
SELECT NULL AS has_purchase,
has_review,
has_favorite,
COUNT(1) AS users
FROM user_action_flag
GROUP BY has_review, has_favorite
UNION ALL
SELECT has_purchase,
NULL AS has_review,
has_favorite,
COUNT(1) AS users
FROM user_action_flag
GROUP BY has_purchase, has_favorite
UNION ALL
SELECT has_purchase,
has_review,
NULL AS has_favorite,
COUNT(1) AS users
FROM user_action_flag
GROUP BY has_purchase, has_review
--彙總以3種action中的其中一種action,進行組合使用的數據
UNION ALL
SELECT NULL AS has_purchase,
NULL AS has_review,
has_favorite,
COUNT(1) AS users
FROM user_action_flag
GROUP BY has_favorite
UNION ALL
SELECT NULL AS has_purchase,
has_review,
NULL AS has_favorite,
COUNT(1) AS users
FROM user_action_flag
GROUP BY has_review
UNION ALL
SELECT has_purchase,
NULL AS has_review,
NULL AS has_favorite,
COUNT(1) AS users
FROM user_action_flag
GROUP BY has_purchase
--不用action進行分組,彙總全部使用者數
UNION ALL
SELECT NULL AS has_purchase,
NULL AS has_review,
NULL AS has_favorite,
COUNT(1) AS users
FROM user_action_flag
)
SELECT *
FROM action_venn_diagram
ORDER BY has_purchase,
has_review,
has_favorite
```
:::
---
:::spoiler 新增欄位值為NULL的虛擬紀錄,取得和CUBE句一樣的結果
```
WITH user_action_flag AS (
--針對每位使用者所執行的action,給予0或1的旗標
SELECT user_id,
SIGN(SUM(CASE WHEN action = 'purchase' THEN 1 ELSE 0 END)) AS has_purchase,
SIGN(SUM(CASE WHEN action = 'review' THEN 1 ELSE 0 END)) AS has_review,
SIGN(SUM(CASE WHEN action = 'favorite' THEN 1 ELSE 0 END)) AS has_favorite
FROM action_log
GROUP BY user_id
),
action_venn_diagram AS (
SELECT mod_has_purchase AS has_purchase,
mod_has_review AS has_review,
mod_has_favorite AS has_favorite,
COUNT(1) AS users
FROM user_action_flag
--新增各欄位值為NULL的虛擬紀錄
--在BigQuery 使用CROSS JOIN 和 unnest函數
CROSS JOIN unnest(array[has_purchase, NULL]) AS mod_has_purchase
CROSS JOIN unnest(array[has_review, NULL]) AS mod_has_review
CROSS JOIN unnest(array[has_favorite, NULL]) AS mod_has_favorite
GROUP BY mod_has_purchase,
mod_has_review,
mod_has_favorite
)
SELECT *
FROM action_venn_diagram
ORDER BY has_purchase,
has_review,
has_favorite
```
:::
---
:::spoiler 整理建立圓餅圖所需之數據
```
WITH user_action_flag AS (
--針對每位使用者所執行的action,給予0或1的旗標
SELECT user_id,
SIGN(SUM(CASE WHEN action = 'purchase' THEN 1 ELSE 0 END)) AS has_purchase,
SIGN(SUM(CASE WHEN action = 'review' THEN 1 ELSE 0 END)) AS has_review,
SIGN(SUM(CASE WHEN action = 'favorite' THEN 1 ELSE 0 END)) AS has_favorite
FROM action_log
GROUP BY user_id
),
action_venn_diagram AS (
SELECT mod_has_purchase AS has_purchase,
mod_has_review AS has_review,
mod_has_favorite AS has_favorite,
COUNT(1) AS users
FROM user_action_flag
--新增各欄位值為NULL的虛擬紀錄
--在BigQuery 使用CROSS JOIN 和 unnest函數
CROSS JOIN unnest(array[has_purchase, NULL]) AS mod_has_purchase
CROSS JOIN unnest(array[has_review, NULL]) AS mod_has_review
CROSS JOIN unnest(array[has_favorite, NULL]) AS mod_has_favorite
GROUP BY mod_has_purchase,
mod_has_review,
mod_has_favorite
)
SELECT
--將0, 1旗標轉換成字串
CASE has_purchase
WHEN 1 THEN 'purchase'
WHEN 0 THEN 'not purchase'
ELSE 'any'
END AS has_purchase,
CASE has_review
WHEN 1 THEN 'review'
WHEN 0 THEN 'not review'
ELSE 'any'
END AS has_review,
CASE has_favorite
WHEN 1 THEN 'favorite'
WHEN 0 THEN 'not favorite'
ELSE 'any'
END AS has_favorite,
users,
--計算在全體不重複使用者中,所佔的比例
100.0 * users / NULLIF(
--因為全部action旗標欄位都為NULL的使用者數就是全部的不重複使用者數(UU)
--所以用視窗函數取得那行的使用者數
SUM(
CASE
WHEN has_purchase IS NULL AND
has_review IS NULL AND
has_favorite IS NULL
THEN users
ELSE 0
END
) OVER()
, 0) AS ratio
FROM action_venn_diagram
ORDER BY
has_purchase,
has_review,
has_favorite
```
:::
---
:::spoiler 依消費降冪排序,將使用者依序分成10等份
```
WITH user_purchase_amount AS (
SELECT user_id,
SUM(amount) AS purchase_amount
FROM action_log
WHERE action = 'purchase'
GROUP BY user_id
) ,
users_with_decile AS (
SELECT user_id,
purchase_amount,
ntile(10) OVER(ORDER BY purchase_amount DESC) as decile
FROM user_purchase_amount
)
SELECT *
FROM users_with_decile
```
:::