# 老師出題
## 63029 - 十六縣持卡人前十大國外消費金額及筆數(依簽帳筆數排名)
> card = LOAD '/dataset/63029/63029.csv' USING PigStorage(',') AS (yaer_month: chararray, country: chararray, tw_city: chararray, amt:int, count:int);
```
欄位名稱
年月 year_month:chararray
國別 country:chararray
城市 tw_city:chararray
金額 amt:int
筆數 count:int
原始資料集內容
year_month country tw_city amt count
2014-01,英國,基隆市,20933333,30063
2014-01,盧森堡,基隆市,2575891,5276
2014-01,日本,基隆市,20238233,3895
```
### 請問在2017年,哪一個縣市消費的總筆數最多 ?
> card = LOAD '/dataset/63029/63029.csv' USING PigStorage(',') AS (yaer_month: chararray, country: chararray, tw_city: chararray, amt:int, count:int);
foreach_card = FOREACH card GENERATE SUBSTRING($0,0,4) AS year:int,$1 ,$2 , $3, $4;
filter_card = FILTER foreach_card BY $0 == 2017;
gp_card = GROUP filter_card BY $2;
count_card = FOREACH gp_card GENERATE $0,SUM($1.$4);
asc_card = ORDER count_card BY $1;
dump asc_card;
```
(彰化縣,1665951)
```
### 請問在2017年,哪一個縣市消費的總金額最多 ?
card = LOAD '/dataset/63029/63029.csv' USING PigStorage(',') AS (yaer_month: chararray, country: chararray, tw_city: chararray, amt:int, count:int);
foreach_card = FOREACH card GENERATE SUBSTRING($0,0,4) AS year:int,$1 ,$2 , $3, $4;
filter_card = FILTER foreach_card BY $0 == 2017;
gp_card = GROUP filter_card BY $2;
count_card = FOREACH gp_card GENERATE $0,SUM($1.$3);
asc_card = ORDER count_card BY $1;
dump asc_card;
(新竹市,3523342728)
### 請問在2017年,哪一個縣市平均單筆消費金額最高(縣市消費總金額 / 縣市消費總筆數) ?
card = LOAD '/dataset/63029/63029.csv' USING PigStorage(',') AS (yaer_month: chararray, country: chararray, tw_city: chararray, amt:int, count:int);
foreach_card = FOREACH card GENERATE SUBSTRING($0,0,4) AS year:int,$1 ,$2 , $3, $4;
filter_card = FILTER foreach_card BY $0 == 2017;
gp_card = GROUP filter_card BY $2;
count_card = FOREACH gp_card GENERATE $0,ROUND_TO((double)SUM($1.$3)/SUM($1.$4),2);
asc_card = ORDER count_card BY $1;
dump asc_card;
(新竹市,2230.28)
## 38315 -- 十六縣居民跨縣市消費樣態
> pay = LOAD '/dataset/38315/38315.csv' USING PigStorage(',') AS (yaer: chararray,area:chararray,type:chararray,cards:chararray,total_trans:int,total_amt:biginteger, inter_city_trans:int, inter_city_amt:biginteger);
```
欄位名稱
年月 year:chararray
地區 area:chararray
類別 type:chararray
卡數 cards:int
總交易筆數 total_trans:int
總交易金額[新台幣] total_amt:biginteger
跨縣市交易筆數 inter_city_trans:int
跨縣市交易金額[新台幣] inter_city_amt:biginteger
原始資料集內容
year area type cards total_trans total_amt inter_city_trans inter_city_amt
2014-01,基隆市,ALL,241213,942118,2265105458,714100,1803194168
2014-02,基隆市,ALL,229540,813048,1842401862,641756,1533509566
2014-03,基隆市,ALL,239793,918036,1992896656,722421,1658553554
```
### 請問2014年~2018年,年度總交易金額(所有月份相加)排名第一的縣市分別是哪幾個 ?
pay = LOAD '/dataset/38315/38315.csv' USING PigStorage(',') AS (yaer: chararray,area:chararray,type:chararray,cards:chararray,total_trans:int,total_amt:biginteger, inter_city_trans:int, inter_city_amt:biginteger);
foreach_pay = FOREACH pay GENERATE SUBSTRING($0,0,4) AS year:int, $1,$2,$3,$4,$5,$6,$7;
filter_pay = FILTER foreach_pay BY $0 == 2014;
gp_pay = GROUP filter_pay BY $1;
sum_pay = FOREACH gp_pay GENERATE $0,SUM($1.$5);
asc_pay = ORDER sum_pay BY $1 ASC;
dump asc_pay;
Ans:
(2014,彰化縣,56841192618)
pay = LOAD '/dataset/38315/38315.csv' USING PigStorage(',') AS (yaer: chararray,area:chararray,type:chararray,cards:chararray,total_trans:int,total_amt:biginteger, inter_city_trans:int, inter_city_amt:biginteger);
foreach_pay = FOREACH pay GENERATE SUBSTRING($0,0,4) AS year:int, $1,$2,$3,$4,$5,$6,$7;
filter_pay = FILTER foreach_pay BY $0 == 2015;
gp_pay = GROUP filter_pay BY $1;
sum_pay = FOREACH gp_pay GENERATE $0,SUM($1.$5);
asc_pay = ORDER sum_pay BY $1 ASC;
dump asc_pay;
Ans:
(2015,彰化縣,61262174503)
pay = LOAD '/dataset/38315/38315.csv' USING PigStorage(',') AS (yaer: chararray,area:chararray,type:chararray,cards:chararray,total_trans:int,total_amt:biginteger, inter_city_trans:int, inter_city_amt:biginteger);
foreach_pay = FOREACH pay GENERATE SUBSTRING($0,0,4) AS year:int, $1,$2,$3,$4,$5,$6,$7;
filter_pay = FILTER foreach_pay BY $0 == 2016;
gp_pay = GROUP filter_pay BY $1;
sum_pay = FOREACH gp_pay GENERATE $0,SUM($1.$5);
asc_pay = ORDER sum_pay BY $1 ASC;
dump asc_pay;
Ans:
(2016,彰化縣,63749693551)
pay = LOAD '/dataset/38315/38315.csv' USING PigStorage(',') AS (yaer: chararray,area:chararray,type:chararray,cards:chararray,total_trans:int,total_amt:biginteger, inter_city_trans:int, inter_city_amt:biginteger);
foreach_pay = FOREACH pay GENERATE SUBSTRING($0,0,4) AS year:int, $1,$2,$3,$4,$5,$6,$7;
filter_pay = FILTER foreach_pay BY $0 == 2017;
gp_pay = GROUP filter_pay BY $1;
sum_pay = FOREACH gp_pay GENERATE $0,SUM($1.$5);
asc_pay = ORDER sum_pay BY $1 ASC;
dump asc_pay;
Ans:
(2017, 彰化縣,62817665332)
pay = LOAD '/dataset/38315/38315.csv' USING PigStorage(',') AS (yaer: chararray,area:chararray,type:chararray,cards:chararray,total_trans:int,total_amt:biginteger, inter_city_trans:int, inter_city_amt:biginteger);
foreach_pay = FOREACH pay GENERATE SUBSTRING($0,0,4) AS year:int, $1,$2,$3,$4,$5,$6,$7;
filter_pay = FILTER foreach_pay BY $0 == 2018;
gp_pay = GROUP filter_pay BY $1;
sum_pay = FOREACH gp_pay GENERATE $0,SUM($1.$5);
asc_pay = ORDER sum_pay BY $1 ASC;
dump asc_pay;
Ans:
(2018, 新竹市,49747507243)
pay = LOAD '/dataset/38315/38315.csv' USING PigStorage(',') AS (yaer: chararray,area:chararray,type:chararray,cards:chararray,total_trans:int,total_amt:biginteger, inter_city_trans:int, inter_city_amt:biginteger);
foreach_pay = FOREACH pay GENERATE SUBSTRING($0,0,4) AS year:int, $1,$2,$3,$4,$5,$6,$7;
filter_pay = FILTER foreach_pay BY $0 > 2013 AND NOT $0 > 2018;
gp_pay = GROUP filter_pay BY ($0,$1);
sum_pay = FOREACH gp_pay GENERATE $0,SUM($1.$5);
asc_pay = ORDER sum_pay BY $1 ASC;
dump asc_pay;
((2018,連江縣),345165176)
((2014,連江縣),390528588)
((2015,連江縣),442532536)
((2016,連江縣),475653650)
((2017,連江縣),482723834)
((2018,金門縣),2373566134)
((2018,澎湖縣),2434969213)
((2014,金門縣),2723281819)
((2016,金門縣),2897490251)
((2015,金門縣),2929771049)
((2014,澎湖縣),3139405357)
((2017,金門縣),3241866523)
((2015,澎湖縣),3492111113)
((2017,澎湖縣),3558284500)
((2016,澎湖縣),3641610493)
((2018,台東縣),5058981454)
((2014,台東縣),7024607825)
((2017,台東縣),7507746697)
((2015,台東縣),7581667697)
((2016,台東縣),7727220839)
((2018,花蓮縣),10117712404)
((2018,嘉義縣),11695852610)
((2018,嘉義市),13447386732)
((2018,南投縣),13956596000)
((2014,花蓮縣),14081076270)
((2018,宜蘭縣),14665948172)
((2017,花蓮縣),15256609655)
((2015,花蓮縣),15258406032)
((2014,嘉義縣),15555706546)
((2016,花蓮縣),15725763834)
((2018,雲林縣),16342747825)
((2017,嘉義縣),16733453285)
((2015,嘉義縣),16749277778)
((2014,嘉義市),17102158402)
((2016,嘉義縣),17582707719)
((2014,南投縣),18332868927)
((2018,基隆市),18461366675)
((2015,嘉義市),18940876335)
((2014,宜蘭縣),19251884737)
((2017,嘉義市),19427659359)
((2015,南投縣),19609248115)
((2017,南投縣),20158969471)
((2016,南投縣),20165402540)
((2016,嘉義市),20259023976)
((2018,苗栗縣),20260492143)
((2014,雲林縣),21076065604)
((2015,宜蘭縣),21124157428)
((2017,宜蘭縣),21784889888)
((2018,屏東縣),22137074625)
((2016,宜蘭縣),22249642559)
((2015,雲林縣),22417678145)
((2017,雲林縣),23298755256)
((2016,雲林縣),23543939306)
((2014,基隆市),25777429750)
((2014,苗栗縣),25988975709)
((2015,基隆市),27890984309)
((2015,苗栗縣),28114699927)
((2017,基隆市),28194616815)
((2016,基隆市),28627732850)
((2017,苗栗縣),28735053276)
((2016,苗栗縣),29350067479)
((2014,屏東縣),30966259026)
((2017,屏東縣),32214371495)
((2015,屏東縣),33565720047)
((2016,屏東縣),34369024031)
((2018,新竹縣),41955156147)
((2018,彰化縣),43965600868)
((2014,新竹縣),45501896505)
((2014,新竹市),49627611024)
((2018,新竹市),49747507243)
((2015,新竹縣),49938665531)
((2017,新竹縣),52184071413)
((2016,新竹縣),52436209126)
((2015,新竹市),55975622125)
((2014,彰化縣),56841192618)
((2017,新竹市),58118082302)
((2016,新竹市),59051881498)
((2015,彰化縣),61262174503)
((2017,彰化縣),62817665332)
((2016,彰化縣),63749693551)
### 請問2014年,新竹市當年度的跨縣市交易金額中位數為 ?
pay = LOAD '/dataset/38315/38315.csv' USING PigStorage(',') AS (yaer: chararray,area:chararray,type:chararray,cards:chararray,total_trans:int,total_amt:biginteger, inter_city_trans:int, inter_city_amt:biginteger);
foreach_pay = FOREACH pay GENERATE SUBSTRING($0,0,4) AS year:int, $1,$2,$3,$4,$5,$6,$7;
filter_pay = FILTER foreach_pay BY $0 == 2014 AND $1 == '新竹市';
desc_pay = ORDER filter_pay BY $7 DESC;
rank_pay = RANK desc_pay;
filter_pay_6 = FILTER rank_pay BY $0 == 6;
filter_pay_7 = FILTER rank_pay BY $0 == 7;
cross_pay = CROSS filter_pay_6,filter_pay_7;
mediean_pay = FOREACH cross_pay GENERATE ((bigdecimal)$8 + (bigdecimal)$17) /2;
dump mediean_pay;
(3145372711.500000)
## 12197_A1 -- A1類交通事故資料
> accident = LOAD '/dataset/12197_A1/12197_A1.csv' USING PigStorage(',') AS (time: chararray, dead: chararray, injured: chararray, vehicle: chararray);
```
欄位名稱
發生時間 time:chararray
死亡人數 dead:chararray
受傷人數 injured:chararray
車種 vehicle:chararray
原始資料集
ctime dead injured vehicle
106年01月01日 02時35分00秒,死亡1,受傷0,普通重型-機車
106年01月01日 02時43分00秒,死亡1,受傷0,普通重型-機車
106年01月01日 03時35分00秒,死亡2,受傷0,普通重型-機車
```
## 12197_A2 – A2類交通事故資料
> accident2 = LOAD '/dataset/12197_A2/12197_A2.csv' USING PigStorage(',') AS (time: chararray, dead: chararray, injured: chararray, vehicle: chararray);
```
欄位名稱
發生時間 time:chararray
死亡人數 dead:chararray
受傷人數 injured:chararray
車種 vehicle:chararray
原始資料集內
ctime dead injured vehicle
106年01月01日 00時00分00秒,死亡0,受傷1,自用-小客車
106年01月01日 00時00分00秒,死亡0,受傷2,自用-小客車
106年01月01日 00時03分00秒,死亡0,受傷1,自用-小客車
```
### 請統計每天發生車禍的次數,並找出單日發生車禍次數最多的排名,取前三名(不分 A1類, A2類) ?
accident = LOAD '/dataset/12197_A1/12197_A1.csv' USING PigStorage(',') AS (time: chararray, dead: chararray, injured: chararray, vehicle: chararray);
accident2 = LOAD '/dataset/12197_A2/12197_A2.csv' USING PigStorage(',') AS (time: chararray, dead: chararray, injured: chararray, vehicle: chararray);
foreach_accident = FOREACH accident GENERATE SUBSTRING($0,0,10),$1,$2,$3;
gp_accident = GROUP foreach_accident BY $0;
count_accident = FOREACH gp_accident GENERATE $0, COUNT($1);
foreach_accident2 = FOREACH accident2 GENERATE SUBSTRING($0,0,10),$1,$2,$3;
gp_accident2 = GROUP foreach_accident2 BY $0;
count_accident2 = FOREACH gp_accident2 GENERATE $0, COUNT($1);
join_accident = JOIN count_accident BY $0, count_accident2 BY $0;
add_accident = FOREACH join_accident GENERATE $0, $1+$3;
asc_accident = ORDER add_accident BY $1 ASC;
dump asc_accident;
(106年01月26日,1098)
(106年01月13日,1064)
(106年06月12日,1045)
### 請統計每天傷亡人數,並找出單日傷亡人數最多的排名,取前三名(不分 A1類, A2類) ?
accident = LOAD '/dataset/12197_A1/12197_A1.csv' USING PigStorage(',') AS (time: chararray, dead: chararray, injured: chararray, vehicle: chararray);
accident2 = LOAD '/dataset/12197_A2/12197_A2.csv' USING PigStorage(',') AS (time: chararray, dead: chararray, injured: chararray, vehicle: chararray);
foreach_accident = FOREACH accident GENERATE SUBSTRING($0,0,10),SUBSTRING($1,2,3) AS dead:int,SUBSTRING($2,2,3) AS injured:int,$3;
gp_accident = GROUP foreach_accident BY $0;
sum_accident = FOREACH gp_accident GENERATE $0, SUM($1.$1), SUM($1.$2);
foreach_accident2 = FOREACH accident2 GENERATE SUBSTRING($0,0,10),SUBSTRING($1,2,3) AS dead:int,SUBSTRING($2,2,3) AS injured:int,$3;
gp_accident2 = GROUP foreach_accident2 BY $0;
sum_accident2 = FOREACH gp_accident2 GENERATE $0, SUM($1.$1), SUM($1.$2);
join_accident = JOIN sum_accident BY $0, sum_accident2 BY $0;
add_accident = FOREACH join_accident GENERATE $0, $1+$2+$4+$5;
asc_accident = ORDER add_accident BY $1 ASC;
dump asc_accident;
(106年01月26日,1437)
(106年06月12日,1365)
(106年01月13日,1363)
### 請問106年裡哪一個區段發生的車禍次數最多(不分 A1類, A2類) ?
accident = LOAD '/dataset/12197_A1/12197_A1.csv' USING PigStorage(',') AS (time: chararray, dead: chararray, injured: chararray, vehicle: chararray);
accident2 = LOAD '/dataset/12197_A2/12197_A2.csv' USING PigStorage(',') AS (time: chararray, dead: chararray, injured: chararray, vehicle: chararray);
union_accident = UNION accident, accident2;
foreach_accident = FOREACH union_accident GENERATE (int)SUBSTRING($0,11,13),SUBSTRING($1,2,3) AS dead:int,SUBSTRING($2,2,3) AS injured:int;
foreach_accident2 = FOREACH foreach_accident GENERATE $0, $1, $2, (
CASE
WHEN 0 <= $0 AND $0 < 6 THEN '凌晨'
WHEN 6 <= $0 AND $0 < 12 THEN '早上'
WHEN 12 <= $0 AND $0 < 18 THEN '下午'
WHEN 18 <= $0 AND $0 < 24 THEN '晚上'
END
);
gp_accident = GROUP foreach_accident2 BY $3;
count_accident = FOREACH gp_accident GENERATE $0,COUNT($1);
dump count_accident;
(下午,101390)
(早上,100528)
(晚上,69769)
(凌晨,11232)
## 999001 -- 連線紀錄
> connect = LOAD '/dataset/999001/999001.tsv' USING PigStorage('\t') AS (connection:int, type:chararray, protocol:chararray, service:chararray, timestamp:int, local_host:chararray, local_port:int, remote_host:chararray, remote_port:int);
```
欄位名稱
連線編號 connection:int
連線形式 type:chararray
通訊協定 protocol:chararray
服務程式 service:chararray
時間戳記 timestamp:int
本機位址 local_host:chararray
本機通訊埠 local_port:int
遠端位址 remote_host:chararray
遠端通訊埠 remote_port:int
原始資料集內容
connection type protocol service ctimestamp local _host local _port remote_ host remote_ port
3 accept tcp mssqld 1502196257 192.168.131.6 1433 222.174.114.42 56803
4 accept tcp mssqld 1502196259 192.168.131.6 1433 222.174.114.42 56889
5 accept tcp smbd 1502196284 192.168.131.4 445 218.81.136.101 2671
```
## 999002 - 攻擊紀錄
> attack = LOAD '/dataset/999002/999002.tsv' USING PigStorage('\t') AS (download:int, connection:int, download_url:chararray, download_md5_hash:chararray);
```
欄位名稱
下載記錄編號 download:int
連線編號 connection:int
下載網址 download_url:chararray
下載檔案雜湊值 download_md5_hash:chararray
原始資料集內容
download connection download_url download_md5_hash
1 1220 http://118.89.159.61:12347/Se.exe eccb44e2a6cb4ece00f17f2a56d918f4
2 1222 http://118.89.159.61:12347/svcyr.exe 0a2c841961c6b4b6b09a9bfc9a79a94c
3 1225 http://118.89.159.61:12347/Se.exe eccb44e2a6cb4ece00f17f2a56d918f4
```
### 請使用999001,統計出哪一種通訊協定最常被使用,並由使用次數由大到小進行排序?
connect = LOAD '/dataset/999001/999001.tsv' USING PigStorage('\t') AS (connection:int, type:chararray, protocol:chararray, service:chararray, timestamp:int, local_host:chararray, local_port:int, remote_host:chararray, remote_port:int);
gp_connect = GROUP connect BY $2;
count_connect = FOREACH gp_connect GENERATE $0, COUNT($1);
desc_connect = ORDER count_connect BY $1 DESC;
dump desc_connect;
(udp,50969)
(tcp,40870)
(tls,121)
### 請使用999001, 999002資料集,找出最常被攻擊的服務程式
connect = LOAD '/dataset/999001/999001.tsv' USING PigStorage('\t') AS (connection:int, type:chararray, protocol:chararray, service:chararray, timestamp:int, local_host:chararray, local_port:int, remote_host:chararray, remote_port:int);
attack = LOAD '/dataset/999002/999002.tsv' USING PigStorage('\t') AS (download:int, connection:int, download_url:chararray, download_md5_hash:chararray);
join_connect_attack = JOIN connect BY $0, attack BY $1;
gp_join_connect_attack = GROUP join_connect_attack BY $3;
count_join_connect_attack = FOREACH gp_join_connect_attack GENERATE $0, COUNT($1);
asc_count_join_connect_attack = ORDER count_join_connect_attack BY $1 ASC;
dump asc_count_join_connect_attack;
(smbd,636)
### 請使用999001, 999002,找出最常被攻擊次數最多的前5名遠端位址,如果次數相同則一起輸出並依IP位址降冪排序
> connect = LOAD '/dataset/999001/999001.tsv' USING PigStorage('\t') AS (connection:int, type:chararray, protocol:chararray, service:chararray, timestamp:int, local_host:chararray, local_port:int, remote_host:chararray, remote_port:int);
attack = LOAD '/dataset/999002/999002.tsv' USING PigStorage('\t') AS (download:int, connection:int, download_url:chararray, download_md5_hash:chararray);
join_connect_attack = JOIN connect BY $0, attack BY $1;
gp_join_connect_attack = GROUP join_connect_attack BY $7;
count_join_connect_attack = FOREACH gp_join_connect_attack GENERATE $0, COUNT($1);
asc_count_join_connect_attack = ORDER count_join_connect_attack BY $1 ASC;
dump asc_count_join_connect_attack;
```
(222.186.21.168,50)
(201.116.204.150,49)
(186.92.96.253,26)
(59.63.141.145,16)
(111.121.193.205,14)
(61.158.163.238,14)
```
### 請問2017-08-20總共有多少次連線,被攻擊多少次,平均多少次連線會遭到一次攻擊 ?
```
2017-08-20 00:00:00 的時間戳記為 1503158400
2017-08-21 00:00:00 的時間戳記為 1503244800
【註】時間戳記是從1970年1月1日開始所經過的秒數
```
> connect = LOAD '/dataset/999001/999001.tsv' USING PigStorage('\t') AS (connection:int, type:chararray, protocol:chararray, service:chararray, timestamp:int, local_host:chararray, local_port:int, remote_host:chararray, remote_port:int);
attack = LOAD '/dataset/999002/999002.tsv' USING PigStorage('\t') AS (download:int, connection:int, download_url:chararray, download_md5_hash:chararray);
> filter_connect = FILTER connect BY $4 > 1503158399 AND NOT $4 > 1503244800;
connectTime = GROUP filter_connect ALL;
foreach_connectTime = FOREACH connectTime GENERATE COUNT($1);
dump foreach_connectTime;
Ans:
(1690)
> join_connect_attack = JOIN filter_connect BY $0, attack BY $1;
gp_join_connect_attack = GROUP join_connect_attack ALL;
foreach_connectTime = FOREACH gp_join_connect_attack GENERATE COUNT($1);
dump foreach_connectTime;
```
Ans:
(1)
```
```
Ans:
1,1690,1
```
### 請問999001.tsv資料集總共記錄著幾天的資料 ? (只要當天只有一筆資料就算做有紀錄)
> connect = LOAD '/dataset/999001/999001.tsv' USING PigStorage('\t') AS (connection:int, type:chararray, protocol:chararray, service:chararray, timestamp:long, local_host:chararray, local_port:int, remote_host:chararray, remote_port:int);
connectmile = FOREACH connect GENERATE $0, $1, $2, $3, $4*1000, $5, $6, $7, $8;
dateconnect = FOREACH connectmile GENERATE $0, $1, $2, $3,ToDate($4) AS timestamp, $5, $6, $7, $8;
subconnect = FOREACH dateconnect GENERATE $0,$1,$2,$3,SUBSTRING(ToString($4),0,10), $5, $6, $7, $8;
gp_subconnect = GROUP subconnect BY $4;
count_subconnect = FOREACH gp_subconnect GENERATE $0, COUNT($1);
gp_count_subconnect = GROUP count_subconnect ALL;
count_gp_count_subconnect = FOREACH gp_count_subconnect GENERATE COUNT($1);
dump count_gp_count_subconnect;
```
(28)
```
## 6086 - 107年幼兒園名錄
> kid = LOAD '/dataset/6086/6086.csv' Using PigStorage (',') AS(code:chararray,school:chararray,city:chararray,address:chararray, phone:chararray);
```
欄位名稱
代碼 code:chararray
學校名稱 school:chararray
縣市名稱 city:chararray
地址 address:chararray
電話 phone:chararray
原始資料集內容
code school city address phone
011K02,新北市私立溫特爾幼兒園,新北市,新北市三峽區龍埔里5鄰三樹路336號1、2、3樓,(02)26718181
011K03,新北市私立新榮富新莊幼兒園,新北市,新北市新莊區中信里13鄰中和街204巷1、3號,218巷2、6號,中信里12鄰中和街1、3、5、7、11號,(02)29922174
011K04,新北市私立崇儒幼兒園,新北市,新北市中和區安樂里16鄰宜安路56之1號1樓,(02)29435789
```
## 6087 - 107年國民小學名錄
> element = LOAD '/dataset/6087/6087.csv' Using PigStorage (',') AS(code:chararray,school:chararray,city:chararray,address:chararray, phone:chararray,url:chararray);
```
欄位名稱
代碼 code:chararray
學校名稱 school:chararray
縣市名稱 city:chararray
地址 address:chararray
電話 phone:chararray
網址 url:chararray
原始資料集內容
code school city address phone url
011601,私立育才國小,新北市,新北市永和區福和路125巷20號,(02)29214630,http://www.ytes.ntpc.edu.tw
011602,私立聖心國小,新北市,新北市八里區龍米路一段261號,(02)26182330,http://lshes.com/school/
011603,私立及人國小,新北市,新北市永和區文化路172號,(02)29212145,http://www.cjps.ntpc.edu.tw
```
## 999003 - 107年6月行政區分齡兒童及少年性別人口統計_縣市
> teens = LOAD '/dataset/999003/999003.csv' USING PigStorage(',') AS (country_id: int, country: chararray, a0a5_cnt: int, a6a11_cnt:int, a12a17_cnt:int);
```
欄位名稱
縣市代碼 country_id:int
縣市名稱 country:chararray
0-5歲兒童人口數 a0a5_cnt:int
6-11歲兒童人口數 a6a11_cnt:int
12-17歲少年人口數 a12a17_cnt:int
原始資料集內容
country_id country a0a5_cnt a6a11_cnt a12a17_cnt
10002,宜蘭縣,20907,21718,27474
10015,花蓮縣,15380,15338,19695
09020,金門縣,6373,4437,5451
```
### 全台灣共有幾所小學和幼稚園 ? 幼稚園比小學的比例為和 ?
> element = LOAD '/dataset/6087/6087.csv' Using PigStorage (',') AS(code:chararray,school:chararray,city:chararray,address:chararray, phone:chararray,url:chararray);
kid = LOAD '/dataset/6086/6086.csv' Using PigStorage (',') AS(code:chararray,school:chararray,city:chararray,address:chararray, phone:chararray);
gp_element = GROUP element ALL;
count_element = FOREACH gp_element GENERATE COUNT($1);
(2633)
gp_kid = GROUP kid ALL;
count_kid = FOREACH gp_kid GENERATE COUNT($1);
(6720)
```
(2.55,6720,2633)
```
### 請輸出全台灣各縣市的6~11歲人口數量與小學數量的比例 (不須約分),並依照縣市名稱進行升冪排序
> element = LOAD '/dataset/6087/6087.csv' Using PigStorage (',') AS(code:chararray,school:chararray,city:chararray,address:chararray, phone:chararray,url:chararray);
teens = LOAD '/dataset/999003/999003.csv' USING PigStorage(',') AS (country_id: int, country: chararray, a0a5_cnt: int, a6a11_cnt:int, a12a17_cnt:int);
gp_element = GROUP element BY $2;
count_element = FOREACH gp_element GENERATE $0,COUNT($1);
filter_teens = FOREACH teens GENERATE $1, $3;
join_element_teens = JOIN filter_teens BY $0, count_element BY $0;
foreach_element_teens = FOREACH join_element_teens GENERATE $0,CONCAT((chararray)$1,':',(chararray)$3),$1/$3;
## 請找出台中市各區的小學和幼稚園數量排名,並且依照區域名稱進行降冪排序
> kid = LOAD '/dataset/6086/6086.csv' Using PigStorage (',') AS(code:chararray,school:chararray,city:chararray,address:chararray, phone:chararray);
filter_kid = FILTER kid BY $2 == '臺中市';
foreach_kid = FOREACH filter_kid GENERATE $1,REPLACE($3,'臺中市','');
foreach_kid_a = FOREACH foreach_kid GENERATE $1,STRSPLIT($1,'區',2);
gp_foreach_kid_a = GROUP foreach_kid_a BY $1.$0;
count_foreach_kid_a = FOREACH gp_foreach_kid_a GENERATE $0,COUNT($1);
```
(中,2)
(北,35)
(南,22)
(東,22)
(西,29)
(北屯,56)
(南屯,48)
(后里,20)
(和平,7)
(外埔,7)
(大安,5)
(大甲,21)
(大肚,18)
(大里,56)
(大雅,26)
(太平,42)
(新社,5)
(東勢,16)
(梧棲,16)
(沙鹿,28)
(清水,22)
(潭子,27)
(烏日,24)
(石岡,4)
(神岡,26)
(西屯,63)
(豐原,46)
(霧峰,24)
(龍井,20)
```
> element = LOAD '/dataset/6087/6087.csv' Using PigStorage (',') AS(code:chararray,school:chararray,city:chararray,address:chararray, phone:chararray,url:chararray);
filter_element = FILTER element BY $2 == '臺中市';
foreach_element = FOREACH filter_element GENERATE $1,REPLACE($3,'臺中市','');
foreach_element_a = FOREACH foreach_element GENERATE $1,STRSPLIT($1,'區',2);
gp_foreach_element_a = GROUP foreach_element_a BY $1.$0;
count_foreach_element_a = FOREACH gp_foreach_element_a GENERATE $0,COUNT($1);
dump count_foreach_element_a;
```
(中,1)
(北,9)
(南,4)
(東,6)
(西,6)
(北屯,18)
(南屯,10)
(后里,6)
(和平,8)
(外埔,5)
(大安,4)
(大甲,10)
(大肚,7)
(大里,12)
(大雅,8)
(太平,14)
(新社,8)
(東勢,10)
(梧棲,7)
(沙鹿,7)
(清水,11)
(潭子,7)
(烏日,8)
(石岡,2)
(神岡,5)
(西屯,15)
(豐原,10)
(霧峰,11)
(龍井,7)
```
> join_element_kid = JOIN count_foreach_element_a BY $0, count_foreach_kid_a BY $0;
foreach_join_element_kid = FOREACH join_element_kid GENERATE $0,$1,$3;
desc_foreach_join_element_kid = ORDER foreach_join_element_kid BY $0 DESC;
dump desc_foreach_join_element_kid;
```
(龍井,7,20)
(霧峰,11,24)
(豐原,10,46)
```
## 8066 - 農產品交易行情
> farm = LOAD '/dataset/8066/8066.csv' USING PigStorage(',') AS (date: chararray, crop_code: chararray, crop_name: chararray, market_code: chararray, market_name:chararray, upper_price:float, middle_price:float, lower_price:float, average_price:float, trading_volume:int);
```
欄位名稱
交易日期 date:chararray
作物代號 crop_code:chararray
作物名稱 crop_name:chararray
市場代號 market_code:chararray
市場名稱 market_name:chararray
上價 upper_price:float
中價 middle_price:float
下價 lower_price:float
平均價 average_price:float
交易量 trading_volume:int
原始資料集內容
cdata crop_code crop_name market_code market_name upper_price middle_price lower_price average_price trading_volume
107.09.11,11,椰子,104,台北二,31,20,15.7,21.3,849
107.09.11,129,椰子-進口剝殼,104,台北二,88.3,72.8,60,73.3,60
107.09.11,31,釋迦,104,台北二,114,73.5,46.8,76.3,3581
```
### 請問107.09.11 當日的全台灣的農產品總交易量?
> farm = LOAD '/dataset/8066/8066.csv' USING PigStorage(',') AS (date: chararray, crop_code: chararray, crop_name: chararray, market_code: chararray, market_name:chararray, upper_price:float, middle_price:float, lower_price:float, average_price:float, trading_volume:int);
filter_farm = FILTER farm BY $0 == '107.09.11';
gp_farm = GROUP filter_farm ALL;
sum_farm = FOREACH gp_farm GENERATE SUM($1.$9);
dump sum_farm;
```
(5785551)
```
### 台灣的農產品總交易量排名中酪梨是排名第幾名("百香果-其他"與"百香果-改良種"皆算百香果)
> farm = LOAD '/dataset/8066/8066.csv' USING PigStorage(',') AS (date: chararray, crop_code: chararray, crop_name: chararray, market_code: chararray, market_name:chararray, upper_price:float, middle_price:float, lower_price:float, average_price:float, trading_volume:int);
foreach_farm = FOREACH farm GENERATE STRSPLIT($2,'-',2), $9;
gp_farm = GROUP foreach_farm BY $0.$0;
sum_farm = FOREACH gp_farm GENERATE $0, SUM($1.$1);
desc_farm = ORDER sum_farm BY $1 DESC;
rank_farm = RANK desc_farm;
dump rank_farm;
```
(1,文旦柚,589354)
(2,甘藍,413468)
(3,梨,371113)
(4,西瓜,283485)
(5,火龍果,240436)
(6,番石榴,214843)
(7,香蕉,173653)
(8,蘋果,148990)
(9,包心白菜,119909)
(10,木瓜,119784)
(11,百香果,118033)
(12,絲瓜,117886)
(13,柿子,111822)
(14,蘿蔔,103494)
(15,胡蘿蔔,93191)
(16,花胡瓜,92686)
(17,蕃茄,90111)
(18,鳳梨,89134)
(19,洋蔥,86078)
(20,葡萄,83123)
(21,萵苣菜,82514)
(22,胡瓜,72953)
(23,小白菜,69603)
(24,竹筍,66834)
(25,青蔥,64657)
(26,其他蔬菜,64207)
(27,奇異果,61389)
(28,雜柑,60028)
(29,蕹菜,58347)
(30,龍眼,57147)
(31,玉米,55463)
(32,甘藷,54942)
(33,茄子,46977)
(34,南瓜,46893)
(35,扁蒲,45597)
(36,甜椒,43792)
(37,冬瓜,42055)
(38,花椰菜,39163)
(39,芒果,39029)
(40,苦瓜,37582)
(41,芽菜類,35858)
(42,馬鈴薯,33928)
(43,甘藷葉,33329)
(44,青江白菜,32554)
(45,椰子,29546)
(46,莧菜,28108)
(47,桃子,28100)
(48,薑,27269)
(49,酪梨,26897)
(50,油菜,26121)
(51,甘薯,25373)
(52,敏豆,25305)
(53,包心白,25303)
(54,茭白筍,25041)
(55,甜橙,23694)
(56,釋迦,23663)
(57,辣椒,23248)
(58,韭菜,22631)
(59,芋,20233)
(60,青花苔,19741)
(61,李,19219)
(62,菜豆,18870)
(63,甜瓜,18063)
(64,洋香瓜,17442)
(65,芥藍菜,16980)
(66,小番茄,16306)
(67,楊桃,15257)
(68,大蒜,14632)
(69,芹菜,14507)
(70,芥菜,14486)
(71,甘蔗,14052)
```
### 請找出各品種的農作物,有多少市場在賣? 並且使用降冪排序("百香果-其他"與"百香果-改良種" 算兩種農作物)
> farm = LOAD '/dataset/8066/8066.csv' USING PigStorage(',') AS (date: chararray, crop_code: chararray, crop_name: chararray, market_code: chararray, market_name:chararray, upper_price:float, middle_price:float, lower_price:float, average_price:float, trading_volume:int);
gp_farm = GROUP farm BY $2;
count_farm = FOREACH gp_farm GENERATE $0, COUNT($1);
desc_farm = ORDER count_farm BY $1 DESC;
dump desc_farm;
```
(包心白菜-包白,15)
(絲瓜-絲瓜,15)
(茄子-麻荸茄,14)
(苦瓜-白大米,14)
(花胡瓜,14)
(胡瓜-黑刺,14)
(冬瓜-白皮,14)
(花椰菜-青梗,13)
(小白菜-土白菜,13)
(甘藍-初秋,13)
```
### 請找出各個市場總交易量排名前三名的農作物
> farm = LOAD '/dataset/8066/8066.csv' USING PigStorage(',') AS (date: chararray, crop_code: chararray, crop_name: chararray, market_code: chararray, market_name:chararray, upper_price:float, middle_price:float, lower_price:float, average_price:float, trading_volume:int);
gp_farm = GROUP farm BY ($4,$3,$2);
sum_farm = FOREACH gp_farm GENERATE $0, SUM($1.$9);
filter_sum = FILTER sum_farm BY $0.$0 == '台北二';
desc_sum = ORDER filter_sum BY $1 DESC;
dump desc_sum;
```
((台北二,104,文旦柚-文旦柚),87041)
((台北二,104,甘藍-改良種),59232)
((台北二,104,番石榴-珍珠芭),26064)
```
> farm = LOAD '/dataset/8066/8066.csv' USING PigStorage(',') AS (date: chararray, crop_code: chararray, crop_name: chararray, market_code: chararray, market_name:chararray, upper_price:float, middle_price:float, lower_price:float, average_price:float, trading_volume:int);
gp_farm = GROUP farm BY ($4,$3,$2);
sum_farm = FOREACH gp_farm GENERATE $0, SUM($1.$9);
filter_sum = FILTER sum_farm BY $0.$0 == '高雄市場';
desc_sum = ORDER filter_sum BY $1 DESC;
dump desc_sum;
```
((高雄市場,800,白竹-白竹),730)
((高雄市場,800,電信蘭葉-電信蘭葉),707)
((高雄市場,800,水燭葉-水燭葉),685)
```
> farm = LOAD '/dataset/8066/8066.csv' USING PigStorage(',') AS (date: chararray, crop_code: chararray, crop_name: chararray, market_code: chararray, market_name:chararray, upper_price:float, middle_price:float, lower_price:float, average_price:float, trading_volume:int);
gp_farm = GROUP farm BY ($4,$3,$2);
sum_farm = FOREACH gp_farm GENERATE $0, SUM($1.$9);
filter_sum = FILTER sum_farm BY $0.$0 == '台北市場';
desc_sum = ORDER filter_sum BY $1 DESC;
dump desc_sum;
((台北市場,105,大菊-白天星),2894)
### 請問那些玫瑰的品種是台北市場有但是高雄市場沒有 ?
> farm = LOAD '/dataset/8066/8066.csv' USING PigStorage(',') AS (date: chararray, crop_code: chararray, crop_name: chararray, market_code: chararray, market_name:chararray, upper_price:float, middle_price:float, lower_price:float, average_price:float, trading_volume:int);
filter_farm2 = FILTER farm BY $2 matches '.*玫瑰.*' AND $4 == '高雄市場';
foreach_farm2 = FOREACH filter_farm2 GENERATE $2;
dump foreach_farm2;
```
(石玫瑰-石玫瑰,高雄市場)
(玫瑰-玫瑰,高雄市場)
(玫瑰-佳娜紅,高雄市場)
(玫瑰-紅衣武士,高雄市場)
(玫瑰-萬年紅,高雄市場)
(玫瑰-卡門,高雄市場)
(玫瑰-黛安娜粉,高雄市場)
(玫瑰-卡蜜拉,高雄市場)
(玫瑰-翡翠香檳,高雄市場)
(玫瑰-白,高雄市場)
(玫瑰-翡翠白,高雄市場)
(玫瑰-紫天王,高雄市場)
(進口玫瑰-混合色,高雄市場)
```
> farm = LOAD '/dataset/8066/8066.csv' USING PigStorage(',') AS (date: chararray, crop_code: chararray, crop_name: chararray, market_code: chararray, market_name:chararray, upper_price:float, middle_price:float, lower_price:float, average_price:float, trading_volume:int);
filter_farm = FILTER farm BY $2 matches '.*玫瑰.*' AND $4 == '台北市場';
foreach_farm = FOREACH filter_farm GENERATE $2;
dump foreach_farm;
```
(石玫瑰-石玫瑰,台北市場)
(玫瑰-玫瑰,台北市場)
(玫瑰-萬年紅,台北市場)
(玫瑰-卡門,台北市場)
(玫瑰-黛安娜粉,台北市場)
(玫瑰-卡蜜拉,台北市場)
(玫瑰-翡翠香檳,台北市場)
(玫瑰-白,台北市場)
(玫瑰-翡翠白,台北市場)
(玫瑰-紫天王,台北市場)
-------------------
(玫瑰-新櫻紅,台北市場)
(玫瑰-荷蘭紅,台北市場)
(玫瑰-埔里之星,台北市場)
(玫瑰-新橙色,台北市場)
(玫瑰-粉,台北市場)
(玫瑰-桃紅,台北市場)
(玫瑰-甜心,台北市場)
(玫瑰-翡翠粉,台北市場)
(玫瑰-維納斯,台北市場)
(玫瑰-珍愛你,台北市場)
(玫瑰-綠,台北市場)
(玫瑰-紫,台北市場)
(玫瑰-紫愛你,台北市場)
(玫瑰-冰火,台北市場)
(玫瑰-雪之戀,台北市場)
```
> join_result = JOIN foreach_farm by $0 LEFT OUTER, foreach_farm2 by $0;
filter_result = FILTER join_result BY $1 is null;
dump filter_result;
```
(玫瑰-冰火,)
(玫瑰-埔里之星,)
(玫瑰-新橙色,)
(玫瑰-新櫻紅,)
(玫瑰-桃紅,)
(玫瑰-珍愛你,)
(玫瑰-甜心,)
(玫瑰-粉,)
(玫瑰-紫,)
(玫瑰-紫愛你,)
(玫瑰-綠,)
(玫瑰-維納斯,)
(玫瑰-翡翠粉,)
(玫瑰-荷蘭紅,)
(玫瑰-雪之戀,)
```
## 24333 –「ATM位置」查詢一覽表
> atm = LOAD '/dataset/24333/24333.csv' USING PigStorage(',') AS (code: chararray, name: chararray, location: chararray, city: chararray, address:chararray);
```
欄位名稱
裝設金融機構代號 code:chararray
裝設金融機構名稱 name:chararray
裝設地點 location:chararray
裝設縣市 city:chararray
裝設地址 address:chararray
原始資料集內容
code name location city address
004,臺灣銀行,士林分行,台北市,台北市士林區中山北路六段197號
004,臺灣銀行,大同公司,台北市,台北市中山區中山北路三段22號
004,臺灣銀行,大安分行,台北市,台北市大安區敦化南路二段69號1樓
```
### 使用 24333 資料集找出各縣市的ATM數量,並找出前三名
(使用數量由大到小排序,若數量一樣則使用縣市升冪排序)
> atm = LOAD '/dataset/24333/24333.csv' USING PigStorage(',') AS (code: chararray, name: chararray, location: chararray, city: chararray, address:chararray);
gp_atm = GROUP atm BY $3;
count_atm = FOREACH gp_atm GENERATE $0, COUNT($1);
asc_atm = ORDER count_atm BY $1 ASC;
dump asc_atm;
```
(台北市,3973)
(新北市,3774)
(台中市,2551)
```
### 請使用9621資料集,找出全台灣總共有多少學士生、碩士生、博士生
> colleage = LOAD '/dataset/9621/9621.csv' Using PigStorage (',') AS(city:chararray,school:chararray,faculty:chararray,courses:chararray,level:chararray,student:int,teachers:int);
gp_colleage = GROUP colleage BY $4;
sum_colleage = FOREACH gp_colleage GENERATE $0, SUM($1.$5);
dump sum_colleage;
```
(博士,28346)
(學士,503130)
(碩士,168783)
```
```
Ans:
(503130, 168783,28346)
```
### 請使用9621 資料集,分別找出台、清、交、成四所大學的學生總人數(不分學碩博)
並且依照學生數量由大到小排序,若人數相同則依學校名稱進行倒排序
> colleage = LOAD '/dataset/9621/9621.csv' Using PigStorage (',') AS(city:chararray,school:chararray,faculty:chararray,courses:chararray,level:chararray,student:int,teachers:int);
filter_colleage = FILTER colleage BY $0 == '臺北市' AND $1 == '國立臺灣大學';
gp_colleage = GROUP filter_colleage BY $1;
count_colleage = FOREACH gp_colleage GENERATE $0, SUM($1.$5);
dump count_colleage;
```
(國立臺灣大學,31802)
```
> colleage = LOAD '/dataset/9621/9621.csv' Using PigStorage (',') AS(city:chararray,school:chararray,faculty:chararray,courses:chararray,level:chararray,student:int,teachers:int);
filter_colleage = FILTER colleage BY $1 matches '.*清華大學.*';
gp_colleage = GROUP filter_colleage BY $1;
count_colleage = FOREACH gp_colleage GENERATE $0, SUM($1.$5);
dump count_colleage;
```
(國立清華大學,16664)
```
> colleage = LOAD '/dataset/9621/9621.csv' Using PigStorage (',') AS(city:chararray,school:chararray,faculty:chararray,courses:chararray,level:chararray,student:int,teachers:int);
filter_colleage = FILTER colleage BY $1 matches '.*交通大學.*';
gp_colleage = GROUP filter_colleage BY $1;
count_colleage = FOREACH gp_colleage GENERATE $0, SUM($1.$5);
dump count_colleage;
```
(國立交通大學,14169)
```
> colleage = LOAD '/dataset/9621/9621.csv' Using PigStorage (',') AS(city:chararray,school:chararray,faculty:chararray,courses:chararray,level:chararray,student:int,teachers:int);
filter_colleage = FILTER colleage BY $1 matches '.*成功大學.*';
gp_colleage = GROUP filter_colleage BY $1;
count_colleage = FOREACH gp_colleage GENERATE $0, SUM($1.$5);
dump count_colleage;
```
(國立成功大學,21252)
```
```
ANS:
(國立臺灣大學,31802)
(國立成功大學,21252)
(國立清華大學,16664)
(國立交通大學,14169)
```
### 請找出全台灣各縣市的警察局數量排名(不分派出所、分局等),並由小到大排序,如果數量一樣則依縣市名稱進行降冪排序
> police = LOAD '/dataset/5958/5958.csv' USING PigStorage(',') AS (name: chararray, code: chararray, address: chararray);
foreach_police = FOREACH police GENERATE $0,SUBSTRING($2,0,3);
gp_police = GROUP foreach_police BY $1;
count_police = FOREACH gp_police GENERATE $0,COUNT($1);
asc_police = ORDER count_police BY $1 ASC;
dump asc_police;
```
Ans:
(連江縣,7)
(金門縣,9)
(嘉義市,15)
(新竹市,19)
(基隆市,28)
(澎湖縣,28)
(新竹縣,61)
(宜蘭縣,62)
(臺東縣,67)
(苗栗縣,69)
(雲林縣,72)
(花蓮縣,73)
(彰化縣,81)
(嘉義縣,87)
(屏東縣,91)
(桃園市,95)
(南投縣,99)
(臺北市,107)
(臺中市,133)
(高雄市,153)
(臺南市,163)
(新北市,176)
```
### 請使用9621資料集,找出全台灣總共有學士生、碩士生、博士生的公私立比例(國立、市立皆是公立生)
> colleage = LOAD '/dataset/9621/9621.csv' Using PigStorage (',') AS(city:chararray,school:chararray,faculty:chararray,courses:chararray,level:chararray,student:int,teachers:int);
filter_colleage = FILTER colleage BY $1 matches '國立.*' OR $1 matches '市立.*';
gp_colleage = GROUP filter_colleage BY $4;
count_colleage = FOREACH gp_colleage GENERATE $0, COUNT($1);
```
(學士,828)
(碩士,1970)
(博士,698)
```
> gp_colleage = GROUP colleage BY $4;
count_colleage = FOREACH gp_colleage GENERATE $0, COUNT($1);
```
(博士,939)
(學士,1986)
(碩士,3675)
```
```
Ans:
(碩士,1705)
```