使用PostgreSQL做不定時段統計

這是延伸的問題,之前有固定間隔的,也介紹了date_bin().
上一次的
 這次的

苦工大師






















































with t1 as (
select case
          when ts >= timestamp '2023-10-11 09:00:00' 
           and ts <  timestamp '2023-10-11 10:00:00'
            then timestamp '2023-10-11 09:00:00'
          when ts >= timestamp '2023-10-11 10:00:00'
           and ts <  timestamp '2023-10-11 10:08:00'
            then timestamp '2023-10-11 10:00:00'
          when ts >= timestamp '2023-10-11 10:08:00'
           and ts <  timestamp '2023-10-11 10:18:00'
            then timestamp '2023-10-11 10:08:00'
          when ts >= timestamp '2023-10-11 10:18:00'
           and ts <  timestamp '2023-10-11 10:27:00'
            then timestamp '2023-10-11 10:18:00'
          when ts >= timestamp '2023-10-11 10:27:00'
           and ts <  timestamp '2023-10-11 10:34:00'
            then timestamp '2023-10-11 10:27:00'
          when ts >= timestamp '2023-10-11 10:34:00'
           and ts <  timestamp '2023-10-11 10:44:00'
            then timestamp '2023-10-11 10:34:00'
          when ts >= timestamp '2023-10-11 10:44:00'
           and ts <  timestamp '2023-10-11 11:00:00'
            then timestamp '2023-10-11 10:44:00'
          when ts >= timestamp '2023-10-11 11:00:00'
           and ts <  timestamp '2023-10-11 11:15:00'
            then timestamp '2023-10-11 11:00:00'
          when ts >= timestamp '2023-10-11 11:15:00'
           and ts <  timestamp '2023-10-11 11:30:00'
            then timestamp '2023-10-11 11:15:00'
          else null
        end as bin
     , qty
  from booking
)
select bin
     , count(*)
     , sum(qty)
  from t1
 group by rollup(bin)
 order by bin nulls last;
 
         bin         | count | sum 
---------------------+-------+-----
 2023-10-11 09:00:00 |    31 |  71
 2023-10-11 10:00:00 |    11 |  28
 2023-10-11 10:08:00 |    25 |  65
 2023-10-11 10:18:00 |    18 |  46
 2023-10-11 10:27:00 |    12 |  30
 2023-10-11 10:34:00 |    13 |  33
 2023-10-11 10:44:00 |    25 |  62
 2023-10-11 11:00:00 |    13 |  36
 2023-10-11 11:15:00 |    12 |  35
 NULL                |   160 | 406
(10 rows)

麻煩,也容易出錯,而且當該時段沒資料時,不會顯示出來,這裡就不多做處理,因為不是主角.

輔助表的方式







































create temp table booking_aux (
  sts timestamp
, ets timestamp
);

insert into booking_aux values
(timestamp '2023-10-11 09:00:00', timestamp '2023-10-11 10:00:00'),
(timestamp '2023-10-11 10:00:00', timestamp '2023-10-11 10:08:00'),
(timestamp '2023-10-11 10:08:00', timestamp '2023-10-11 10:18:00'),
(timestamp '2023-10-11 10:18:00', timestamp '2023-10-11 10:27:00'),
(timestamp '2023-10-11 10:27:00', timestamp '2023-10-11 10:34:00'),
(timestamp '2023-10-11 10:34:00', timestamp '2023-10-11 10:44:00'),
(timestamp '2023-10-11 10:44:00', timestamp '2023-10-11 11:00:00'),
(timestamp '2023-10-11 11:00:00', timestamp '2023-10-11 11:15:00'),
(timestamp '2023-10-11 11:15:00', timestamp '2023-10-11 11:30:00');

select ba.sts as bin
     , count(*)
     , sum(qty)
  from booking b
 right join booking_aux ba
    on b.ts >= ba.sts
   and b.ts <  ba.ets
 group by rollup(bin)
 order by bin nulls last;

         bin         | count | sum 
---------------------+-------+-----
 2023-10-11 09:00:00 |    31 |  71
 2023-10-11 10:00:00 |    11 |  28
 2023-10-11 10:08:00 |    25 |  65
 2023-10-11 10:18:00 |    18 |  46
 2023-10-11 10:27:00 |    12 |  30
 2023-10-11 10:34:00 |    13 |  33
 2023-10-11 10:44:00 |    25 |  62
 2023-10-11 11:00:00 |    13 |  36
 2023-10-11 11:15:00 |    12 |  35
 NULL                |   160 | 406
(10 rows)

方便一點了.

輔助表搭配PostgreSQL Range Data Type















































create temp table booking_aux2 (
  bin timestamp
, timsrange tsrange 
);

insert into booking_aux2
select sts, tsrange(sts, ets)
  from booking_aux;

select * from booking_aux2;

         bin         |                   timsrange                   
---------------------+-----------------------------------------------
 2023-10-11 09:00:00 | ["2023-10-11 09:00:00","2023-10-11 10:00:00")
 2023-10-11 10:00:00 | ["2023-10-11 10:00:00","2023-10-11 10:08:00")
 2023-10-11 10:08:00 | ["2023-10-11 10:08:00","2023-10-11 10:18:00")
 2023-10-11 10:18:00 | ["2023-10-11 10:18:00","2023-10-11 10:27:00")
 2023-10-11 10:27:00 | ["2023-10-11 10:27:00","2023-10-11 10:34:00")
 2023-10-11 10:34:00 | ["2023-10-11 10:34:00","2023-10-11 10:44:00")
 2023-10-11 10:44:00 | ["2023-10-11 10:44:00","2023-10-11 11:00:00")
 2023-10-11 11:00:00 | ["2023-10-11 11:00:00","2023-10-11 11:15:00")
 2023-10-11 11:15:00 | ["2023-10-11 11:15:00","2023-10-11 11:30:00")
(9 rows)


select ba.bin
     , count(*)
     , sum(qty)
  from booking b
 right join booking_aux2 ba
    on b.ts <@ ba.timsrange
 group by rollup(bin)
 order by bin nulls last;

         bin         | count | sum 
---------------------+-------+-----
 2023-10-11 09:00:00 |    31 |  71
 2023-10-11 10:00:00 |    11 |  28
 2023-10-11 10:08:00 |    25 |  65
 2023-10-11 10:18:00 |    18 |  46
 2023-10-11 10:27:00 |    12 |  30
 2023-10-11 10:34:00 |    13 |  33
 2023-10-11 10:44:00 |    25 |  62
 2023-10-11 11:00:00 |    13 |  36
 2023-10-11 11:15:00 |    12 |  35
 NULL                |   160 | 406
(10 rows)

這就PostgreSQL才有的資料型態,會比較方便,也會快一些,因為只要做一次比較運算 <@, 不用做兩種比較運算.

使用PostgreSQL 的Array

觀察上面的時段,頭尾銜接部分會出現重複輸入.之前我有其他分享,有介紹了Array.我們可以將輔助表轉為Array的形式.





































































with t1 (tsarray) as (
select '{"2023-10-11 09:00:00", "2023-10-11 10:00:00",
         "2023-10-11 10:08:00", "2023-10-11 10:18:00",
         "2023-10-11 10:27:00", "2023-10-11 10:34:00",
         "2023-10-11 10:44:00", "2023-10-11 11:00:00",
         "2023-10-11 11:15:00"}'
         ::timestamp[]
)
select width_bucket(b.ts, t1.tsarray) as bin
     , count(*) as cnt
     , sum(qty) as sumqty
  from booking b
     , t1
 group by rollup(bin);
 
 bin  | cnt | sumqty 
------+-----+--------
 NULL | 160 |    406
    4 |  18 |     46
    6 |  13 |     33
    2 |  11 |     28
    9 |  12 |     35
    7 |  25 |     62
    3 |  25 |     65
    1 |  31 |     71
    5 |  12 |     30
    8 |  13 |     36
(10 rows)

-- 雖然有計算出來,但是透過 width_bucket()得到的分組是整數型態
-- 我們還可以透過 unnest() 再將 Array展開
with t1 (tsarray) as (
select '{"2023-10-11 09:00:00", "2023-10-11 10:00:00",
         "2023-10-11 10:08:00", "2023-10-11 10:18:00",
         "2023-10-11 10:27:00", "2023-10-11 10:34:00",
         "2023-10-11 10:44:00", "2023-10-11 11:00:00",
         "2023-10-11 11:15:00"}'
         ::timestamp[]
), bookingcount as (
select width_bucket(b.ts, t1.tsarray) as bin
     , count(*) as cnt
     , sum(qty) as sumqty
  from booking b
     , t1
 group by rollup(bin)
)
select ts_min
     , sn
     , coalesce(bc.cnt, 0) as cnt
     , coalesce(bc.sumqty, 0) as sumqty
  from t1
 cross join unnest(t1.tsarray) with ordinality as c(ts_min, sn)
  full outer join bookingcount bc
    on c.sn = bc.bin
 order by c.sn nulls last;

       ts_min        |  sn  | cnt | sumqty 
---------------------+------+-----+--------
 2023-10-11 09:00:00 |    1 |  31 |     71
 2023-10-11 10:00:00 |    2 |  11 |     28
 2023-10-11 10:08:00 |    3 |  25 |     65
 2023-10-11 10:18:00 |    4 |  18 |     46
 2023-10-11 10:27:00 |    5 |  12 |     30
 2023-10-11 10:34:00 |    6 |  13 |     33
 2023-10-11 10:44:00 |    7 |  25 |     62
 2023-10-11 11:00:00 |    8 |  13 |     36
 2023-10-11 11:15:00 |    9 |  12 |     35
 NULL                | NULL | 160 |    406
(10 rows)

這樣我們就透過Array,搭配width_bucket(), unnest() 來解決不定時段統計的問題.當然也不見得是時段,凡是有範圍的,都可以處理,來得到直方圖.
若是固定的,那就更好處理了.這樣就產生了.







with t1 as (
select array_agg(a) as tsarray
  from generate_series(
          timestamp '2023-10-11 09:00:00'
        , timestamp '2023-10-11 11:15:00'
        , interval '15 minutes') a
)

這會與date_bin() 等效,但是沒有date_bin()方便.
不過width_bucket() 在PostgreSQL 很早就有了,date_bin()是14版之後才有.

結語

PostgreSQL的Array可以協助我們做反正規化,減少table,提供很多彈性.

感謝

感謝海綿寶寶 (antijava) 大大.

使用PostgreSQL做不定時段統計

苦工大師

輔助表的方式

輔助表搭配PostgreSQL Range Data Type

使用PostgreSQL 的Array

結語

感謝

Read more

PostgreSQL Upsert 的探討

Postgresql 的冪等操作探討

正規化探討

巧用 array 處理 tags 續篇