TopN extension

參考資料

Github

https://github.com/citusdata/postgresql-topn

簡報及介紹

https://speakerdeck.com/citusdata/the-topn-extension-maintaining-top-10-lists-at-scale-pgconf-eu-2019-furkan-sahin

https://www.citusdata.com/blog/2018/03/27/topn-for-your-postgres-database/

安裝

下載

從 github 下載, 會存成 zip, 解壓縮.

unzip postgresql-topn-master.zip

目前的版本是 2.3.1
topn v2.3.1 (November 27, 2020)

​​​​Adds PostgreSQL 13 support
​​​​Supports parallel queries and aggregates

編譯

make
make install

/bin/sh /usr/local/lib/postgresql/pgxs/src/makefiles/../../config/install-sh -c -d '/usr/local/share/postgresql/extension'
/bin/sh /usr/local/lib/postgresql/pgxs/src/makefiles/../../config/install-sh -c -d '/usr/local/share/postgresql/extension'
/bin/sh /usr/local/lib/postgresql/pgxs/src/makefiles/../../config/install-sh -c -d '/usr/local/lib/postgresql'
/usr/bin/install -c -m 644 .//topn.control '/usr/local/share/postgresql/extension/'
/usr/bin/install -c -m 644 .//topn--2.0.0--2.1.0.sql .//topn--2.1.0--2.2.0.sql .//topn--2.2.0--2.2.1.sql .//topn--2.2.1--2.2.2.sql .//topn--2.2.2--2.3.0.sql .//topn--2.3.0--2.3.1.sql topn--2.0.0.sql topn--2.1.0.sql topn--2.2.0.sql topn--2.2.1.sql topn--2.2.2.sql topn--2.3.0.sql topn--2.3.1.sql '/usr/local/share/postgresql/extension/'
/usr/bin/install -c -m 755  topn.so '/usr/local/lib/postgresql/'

create extension

|=> psql -U pagila
psql (13.1)
Type "help" for help.

pagila[pagila]# create extension topn with schema public;
CREATE EXTENSION
Time: 266.651 ms
pagila[pagila]#* commit;
COMMIT
Time: 5.081 ms
pagila[pagila]# \dx topn
          List of installed extensions
 Name | Version | Schema |     Description      
------+---------+--------+----------------------
 topn | 2.3.1   | public | type for top-n JSONB
(1 row)

探索 extension

pagila[pagila]# \dx+ topn

           Objects in extension "topn"
               Object description                
-------------------------------------------------
 function topn(jsonb,integer)
 function topn_add(jsonb,text)
 function topn_add_agg(text)
 function topn_add_trans(internal,text)
 function topn_deserialize(bytea,internal)
 function topn_pack(internal)
 function topn_serialize(internal)
 function topn_union(jsonb,jsonb)
 function topn_union_agg(jsonb)
 function topn_union_internal(internal,internal)
 function topn_union_trans(internal,jsonb)
 operator +(jsonb,jsonb)
 type topn_record
(13 rows)

pagila[pagila]# \df topn*
                               List of functions
 Schema |        Name         | Result data type  | Argument data types | Type 
--------+---------------------+-------------------+---------------------+------
 public | topn                | SETOF topn_record | jsonb, integer      | func
 public | topn_add            | jsonb             | jsonb, text         | func
 public | topn_add_agg        | jsonb             | text                | agg
 public | topn_add_trans      | internal          | internal, text      | func
 public | topn_deserialize    | internal          | bytea, internal     | func
 public | topn_pack           | jsonb             | internal            | func
 public | topn_serialize      | bytea             | internal            | func
 public | topn_union          | jsonb             | jsonb, jsonb        | func
 public | topn_union_agg      | jsonb             | jsonb               | agg
 public | topn_union_internal | internal          | internal, internal  | func
 public | topn_union_trans    | internal          | internal, jsonb     | func
(11 rows)

範例

下載

wget http://examples.citusdata.com/customer_reviews_2000.csv.gz
gzip -d customer_reviews_2000.csv.gz

建立 table 及 載入資料

CREATE TABLE customer_reviews
(
    customer_id TEXT,
    review_date DATE,
    review_rating INTEGER,
    review_votes INTEGER,
    review_helpful_votes INTEGER,
    product_id CHAR(10),
    product_title TEXT,
    product_sales_rank BIGINT,
    product_group TEXT,
    product_category TEXT,
    product_subcategory TEXT,
    similar_product_ids CHAR(10)[]
);

\COPY customer_reviews FROM 'customer_reviews_2000.csv' WITH CSV;

commit;

會出現 COPY 1260658 的訊息

roll-up table

-- create a roll-up table to capture most popular products
create table popular_products (
  review_date date UNIQUE
, agg_data jsonb
);

-- Create different summaries by grouping top reviews for each date (day, month, year)

insert into popular_products
select review_date
     , topn_add_agg(product_id)
  from customer_reviews
 group by review_date;

commit;

查詢 roll-up table

select review_date
     , topn(agg_data, 1)
     , (topn(agg_data, 1)).*
  from popular_products
 limit 5;

 review_date |      topn       |    item    | frequency 
-------------+-----------------+------------+-----------
 2000-01-16  | (0151004439,15) | 0151004439 |        15
 2000-01-06  | (B00000JJCZ,13) | B00000JJCZ |        13
 2000-01-30  | (0807281956,10) | 0807281956 |        10
 2000-01-05  | (B00000JJCZ,17) | B00000JJCZ |        17
 2000-02-05  | (630575070X,17) | 630575070X |        17
(5 rows)

select review_date
     , topn(agg_data, 1)
     , (topn(agg_data, 1)).*
  from popular_products
 order by frequency desc
 limit 10;

 review_date |       topn       |    item    | frequency 
-------------+------------------+------------+-----------
 2000-07-09  | (0786229276,705) | 0786229276 |       705
 2000-07-09  | (0807282596,705) | 0807282596 |       705
 2000-07-08  | (0939173379,493) | 0939173379 |       493
 2000-07-08  | (0807282588,493) | 0807282588 |       493
 2000-07-10  | (0807282588,434) | 0807282588 |       434
 2000-07-10  | (0786229276,434) | 0786229276 |       434
 2000-10-03  | (B00004Z511,243) | B00004Z511 |       243
 2000-10-03  | (B00004XONN,243) | B00004XONN |       243
 2000-07-11  | (0939173379,179) | 0939173379 |       179
 2000-07-11  | (0439139600,179) | 0439139600 |       179
(10 rows)

select review_date
     , topn(agg_data, 10)
     , (topn(agg_data, 10)).*
  from popular_products
 order by frequency desc
 limit 20;
 
 review_date |       topn       |    item    | frequency 
-------------+------------------+------------+-----------
 2000-07-09  | (0939173379,705) | 0939173379 |       705
 2000-07-09  | (0807282588,705) | 0807282588 |       705
 2000-07-09  | (0786229276,705) | 0786229276 |       705
 2000-07-09  | (0439139597,705) | 0439139597 |       705
 2000-07-09  | (0439139600,705) | 0439139600 |       705
 2000-07-09  | (0807282596,705) | 0807282596 |       705
 2000-07-08  | (0939173379,493) | 0939173379 |       493
 2000-07-08  | (0786229276,493) | 0786229276 |       493
 2000-07-08  | (0807282596,493) | 0807282596 |       493
 2000-07-08  | (0807282588,493) | 0807282588 |       493
 2000-07-08  | (0439139597,493) | 0439139597 |       493
 2000-07-08  | (0439139600,493) | 0439139600 |       493
 2000-07-10  | (0439139600,434) | 0439139600 |       434
 2000-07-10  | (0939173379,434) | 0939173379 |       434
 2000-07-10  | (0439139597,434) | 0439139597 |       434
 2000-07-10  | (0786229276,434) | 0786229276 |       434
 2000-07-10  | (0807282596,434) | 0807282596 |       434
 2000-07-10  | (0807282588,434) | 0807282588 |       434
 2000-10-03  | (B00004XONN,243) | B00004XONN |       243
 2000-10-03  | (B00004Z511,243) | B00004Z511 |       243
(20 rows)

-- 使用 topn_union_agg 來聚合數個 

select (topn(topn_union_agg(agg_data), 10)).*
  from popular_products
 where review_date >= date '2000-01-01'
   and review_date <  date '2000-02-01'
 order by 2 desc;

    item    | frequency 
------------+-----------
 0375408738 |       217
 0375404368 |       217
 0345417623 |       217
 0375404376 |       217
 0807281751 |       204
 0939173344 |       204
 043936213X |       204
 0807286001 |       204
 0807281956 |       204
 0786222727 |       204
(10 rows)

-- 一年期間 每月的前兩名
select date_trunc('month', review_date)::date as review_month
     , (topn(topn_union_agg(agg_data), 2)).*
  from popular_products
 where review_date >= date '2000-01-01'
   and review_date <  date '2001-01-01'
 group by review_month
 order by review_month, frequency;
 
 review_month |    item    | frequency 
--------------+------------+-----------
 2000-01-01   | 0375408738 |       217
 2000-01-01   | 0345417623 |       217
 2000-02-01   | 0440236673 |       291
 2000-02-01   | 0375409726 |       291
 2000-03-01   | B00004NRPZ |       635
 2000-03-01   | B00004NRU3 |       637
 2000-04-01   | B0000667J7 |       399
 2000-04-01   | B00003CX5P |       399
 2000-05-01   | B00004SCX6 |       380
 2000-05-01   | B00004T1HK |       380
 2000-06-01   | B00004SCX6 |       202
 2000-06-01   | B00004TL21 |       259
 2000-07-01   | 0786229276 |      2734
 2000-07-01   | 0807282596 |      2736
 2000-08-01   | 0786229276 |       350
 2000-08-01   | 0807282596 |       350
 2000-09-01   | B00007CWQI |       238
 2000-09-01   | 039914563X |       238
 2000-10-01   | B00004XONN |       961
 2000-10-01   | B00004Z511 |       962
 2000-11-01   | B000050I5N |       343
 2000-11-01   | B000050HSL |       343
 2000-12-01   | B00005NBAU |       224
 2000-12-01   | B00004YS7F |       230
(24 rows)

另一個 roll-up 範例

topn.number_of_counters

https://github.com/citusdata/postgresql-topn#config-settings

show topn.number_of_counters;
 topn.number_of_counters 
-------------------------
 1000

set topn.number_of_counters to 100;

commit;

建立 table

-- Create table to insert summaries.

create table popular_products2 (
  review_summary jsonb
, year int not null
, month int not null
);

-- Create different summaries by grouping the reviews according to their year and month.

insert into popular_products2
select topn_add_agg(product_id)
     , extract(year from review_date)::int as year
     , extract(month from review_date)::int as month
  from customer_reviews
 group by year, month;

INSERT 0 12
Time: 3113.382 ms (00:03.113)

commit;

topn_union_agg()

https://github.com/citusdata/postgresql-topn#topn_union_aggtopntypecolumn

This is the aggregate for union operation. It merges the JSONB counter lists and returns the final JSONB which stores overall result.

overall_result table

create table overall_result (
 merged_summary jsonb
);

insert into overall_result
select topn_union_agg(review_summary)
  from popular_products2;
  
INSERT 0 1
Time: 50.210 ms

找出全部 最大的 20個, 使用 topn()

select (topn(merged_summary, 20)).*
  from overall_result
 order by 2 desc, 1;

    item    | frequency 
------------+-----------
 0807282596 |      3282
 0807282588 |      3278
 0439139597 |      3277
 0786229276 |      3277
 0939173379 |      3276
 0439139600 |      3272
 B00004NRU3 |      1274
 B00004NRPZ |      1254
 B00004Z511 |      1181
 B00004XONN |      1179
 0590353403 |      1134
 0807281751 |      1129
 0807286001 |      1092
 0807281956 |      1034
 0786222727 |      1029
 043936213X |       981
 0939173344 |       974
 630575070X |       959
 6305750750 |       958
 630575067X |       939
(20 rows)

window functions 應用

create table daily_populars (
  date date
, agg_data jsonb
);

insert into daily_populars
select date_trunc('day', review_date)
     , topn_add_agg(product_id)
  from customer_reviews
 group by 1;

commit;

select date
     , topn_union_agg(agg_data) over seven_days
  from daily_populars
window seven_days as (order by date asc rows 6 preceding)
order by 1
limit 10;

結果如:

with t(date, merged7) as ( select date , topn_union_agg(agg_data) over seven_days from daily_populars window seven_days as (order by date asc rows 6 preceding) order by 1 limit 10 ) select date , (topn(merged7, 5)).* from t; date | item | frequency ------------+------------+----------- 2000-01-01 | 0939173344 | 12 2000-01-01 | 0807286001 | 12 2000-01-01 | 043936213X | 12 2000-01-01 | 0807281956 | 12 2000-01-01 | 0590353403 | 12 2000-01-02 | 0807282324 | 19 2000-01-02 | 0807282316 | 19 2000-01-02 | 0807286028 | 19 2000-01-02 | 0439136369 | 19 2000-01-02 | 0747545111 | 19 2000-01-03 | 043936213X | 27 2000-01-03 | 0590353403 | 27 2000-01-03 | 0807281751 | 27 2000-01-03 | 0786222727 | 27 2000-01-03 | 0939173344 | 27 2000-01-04 | 0345417623 | 39 2000-01-04 | 0375408738 | 39 2000-01-04 | 0375404376 | 39 2000-01-04 | 0375404368 | 39 2000-01-04 | 0939173344 | 35 2000-01-05 | 0375408738 | 47 2000-01-05 | 0375404368 | 47 2000-01-05 | 0375404376 | 47 2000-01-05 | 0345417623 | 47 2000-01-05 | 0786222727 | 40 2000-01-06 | 0375404376 | 54 2000-01-06 | 0375408738 | 54 2000-01-06 | 0375404368 | 54 2000-01-06 | 0345417623 | 54 2000-01-06 | 0939173344 | 46 2000-01-07 | 0375408738 | 60 2000-01-07 | 0345417623 | 60 2000-01-07 | 0375404368 | 60 2000-01-07 | 0375404376 | 60 2000-01-07 | 0807281751 | 52 2000-01-08 | 0375408738 | 63 2000-01-08 | 0375404376 | 63 2000-01-08 | 0375404368 | 63 2000-01-08 | 0345417623 | 63 2000-01-08 | 0060392983 | 52 2000-01-09 | 0345417623 | 76 2000-01-09 | 0375408738 | 76 2000-01-09 | 0375404368 | 76 2000-01-09 | 0375404376 | 76 2000-01-09 | 0060392983 | 66 2000-01-10 | 0060392983 | 85 2000-01-10 | 006103116X | 85 2000-01-10 | 0375404376 | 72 2000-01-10 | 0375404368 | 72 2000-01-10 | 0375408738 | 72 (50 rows)