---
title: '使用tidyverse/tidyr/dplyr資料處理'
disqus: hackmd
---
使用tidyverse/tidyr/dplyr資料處理
===
## Table of Contents
[TOC]
## Loading Packages
```gherkin=1
library(dplyr)
library(tidyr)
install.packages('nycflights13') #example dataset
library(nycflights13)
summary(flights)
dim(flights)
```
## filter()用來選擇滿足特定條件的資料,先填資料框架名稱,再填篩選的欄位與條件
---
也可以用subset
```gherkin=1
head(filter(flights,month==1,day==1,carrier=='UA'))
head(flights[flights$month == 1 & flights$day == 1 & flights$carrier == 'UA', ])
```
## slice()用來挑選row
---
也可以直接挑選哪排
```gherkin=1
slice(flights, 1:10)
```
## arrange()和filter()功能差不多,只是有加上重新排列的功能,或是用desc排列
---
加上desc()會有由大到小的排序功能
```gherkin=1
head(arrange(flights,year,month,day,dep_time))
head(arrange(flights,desc(dep_time)))
```
## select()特別適合用於大資料量,但只有想要看特定幾個資料欄位
用vector來指定要的欄位名稱
```gherkin=1
head(select(flights,carrier))
select(flights,c(origin,carrier))
```
## rename()重新命名欄位
```gherkin=1
head(rename(flights,airline_car = carrier))
```
## 比起純粹用select(),加上distinct()可以得到刪除重複欄位的結果
```gherkin=1
distinct(select(flights,carrier))
select(flights,carrier) %>% distinct()
select(flights,c(origin,carrier)) %>% distinct()
```
## mutate()新增/刪減/變化欄位,可以從目前存在的欄位去做變化成新欄位,有點像是awk
```gherkin=1
head(mutate(flights, total_time = arr_delay-dep_delay))
```
## transmute()只輸出新增欄位結果
```gherkin=1
head(transmute(flights, new_col = arr_delay-dep_delay))
```
## 隨意取特定行數(百分比例)的資料
```gherkin=1
sample_n(flights,5)
sample_frac(flights,0.00002)
```
## gather()可將wide table欄位打散變成long table。下面這個例子,則是把季度特別變成一個變數。
```gherkin=1
#example 1
comp <- c(1,1,1,2,2,2,3,3,3)
yr <- c(2015,2016,2017,2016,2015,2017,2017,2016,2015)
q1 <- runif(9, min=0, max=100)
q2 <- runif(9, min=0, max=100)
q3 <- runif(9, min=0, max=100)
q4 <- runif(9, min=0, max=100)
df <- data.frame(comp=comp,year=yr,Qtr1 = q1,Qtr2 = q2,Qtr3 = q3,Qtr4 = q4)
df
df %>% gather(Quarter,Revenue,Qtr1:Qtr4) #gather(df,Quarter,Revenue,Qtr1:Qtr4)
#example 2
data.frame(
a=letters[1:10],
b=1:10,
c=sample(LETTERS[1:3],10,replace=TRUE),
d=sample(1:10,10,replace=TRUE)
) -> x
x
gather(x,variable,value,b,d)
gather(x,variable,value,-a,-c) -> y; #指定留下不要變成long form的column name也可以
y
gather(x,variable,value); #只指定key與value的話,會全部都gather起來. XD
gather(x,variable,value,-a) #指定留下不要變成long form的column name
```
## spread()和gather()相反,指定用哪兩個些變數展開。
gather變成long table
spread變成wide table
```gherkin=1
#example 1
df
df %>% gather(Quarter,Revenue,Qtr1:Qtr4)
df %>% gather(Quarter,Revenue,Qtr1:Qtr4) %>% spread(Quarter, Revenue)
stocks <- data.frame(
time = as.Date('2017-01-01') + 0:9,
X = rnorm(10, 2, 1), #rnorm(隨機樣本數,平均值,標準差)
Y = rnorm(10, 2, 3),
Z = rnorm(10, 2, 5)
)
stocksm <- stocks %>% gather(stock, price, -time)
stocksm %>% spread(stock, price)
stocksm %>% spread(time, price)
#example 2
spread(y,variable,value)
```
## separate()將單一欄位文字拆成不同欄位。
sep指定分隔符號(default:"[^[:alnum:]]+")
col指定分開後的column name
```gherkin=1
#example 1
df <- data.frame(x = c(NA, "b.h", "i.o", "g.w"))
df
df %>% separate(x, col=c("BIG", "HOW"))
```
## unite()把不同欄位黏在一起。和separate()是相反的功能。
sep用來指定粘接樣式,default是_
```gherkin=1
#example 1
head(mtcars)
unite_(mtcars, "vs.am", c("vs","am"),sep = '.')
mtcars %>% unite(vs_am, vs, am)
mtcars %>% unite(vs_am, vs, am) %>% separate(vs_am, c("vs", "am"))
#example 2
unite(x,ab,a,b) -> z
separate(z,ab,into=c("a","b"))
#用arrange作lexicographic order
arrange(x,a)
arrange(x,c)
arrange(x,c,b,a)
arrange(x,c,-b)
```
## count(),取出某一欄做table,可以加sort排序
```gherkin=1
count(x,c)
count(x,a,c)
count(mtcars,cyl)
count(mtcars,cyl,sort=TRUE)
count(mtcars,cyl,vs)
count(mtcars,cyl,vs,sort=TRUE)
```
## 複習一下filter
```gherkin=1
filter(x,d>4)
filter(x,d>1.2 & c != "B")
```
## 複習一下select跟rename
```gherkin=1
select(x,a,b)
select(x,-c)
select(x,z=a,d)
rename(x,z=a)
```
## 複習一下mutate and transmute
```gherkin=1
mutate(x,d=2*b,c=tolower(c),e=b+d) -> z; z
transmute(x,d=2*b,c=tolower(c),e=b+d)
```
## group_by(),有點像是lapply或tapply
group_by雖然不會改變整個data看起來的樣子
但是會改變object與dplyr其他function的互動
常常會跟summarise()一起使用
```gherkin=1
by_cyl <- mtcars %>% group_by(cyl)
#有group tag的object,會分各group tag進行動作
by_cyl %>% summarise(
disp = mean(disp),
hp = mean(hp)
)
#沒有group tag的object,只會對整體進行動作
mtcars %>% summarise(
disp = mean(disp),
hp = mean(hp)
)
#比較一下差異~~~
by_cyl %>% filter(disp == max(disp))
mtcars %>% filter(disp == max(disp))
#同時用多個group factors,可以同時summarize多個factors
by_cyl_vs <- mtcars %>% group_by(cyl,vs)
by_cyl_vs %>% summarise()
by_cyl %>% summarise()
#在考慮多個factors時特別方便,可以達成tapply一樣的效果
by_cyl_vs %>% summarise(mean_mpg=mean(mpg))
by_cyl %>% summarise(mean_mpg=mean(mpg))
```
## ungroup() 解除data.frame的grouping tage
```gherkin=1
#ungroup可以去除grouping
by_cyl %>% ungroup()
mtcars %>%
group_by(., cyl) %>%
ungroup(.) %>%
summarise(.,
hp=mean(hp),
mpg=mean(mpg))
```
## Join operations join, expand.grid
```gherkin=1
x <- expand.grid(a=1:3,b=1:5);
head(x)
y <- expand.grid(a=1:2,b=1:5,c=factor(c("F","G")));
head(y)
left_join(x,y,by=c('a','b'))
right_join(x,y,by=c('a','b'))
inner_join(x,y,by=c('a','b'))
full_join(x,y,by=c('a','b'))
full_join(x,y,by='a')
inner_join(x,y,by='a')
#another toy example
tab1 = data.frame(name = c("Alexis", "Bernie", "Charlie"),
children = 1:3,
stringsAsFactors = FALSE)
tab2 = data.frame(name = c("Alexis", "Bernie", "David"),
age = c(54, 34, 63),
stringsAsFactors = FALSE)
tab1
tab2
inner_join(x=tab1, y=tab2, by="name")
left_join(x=tab1, y=tab2, by="name")
right_join(x=tab1, y=tab2, by="name")
full_join(x=tab1, y=tab2, by="name")
```
 
 
## pivot_wider(),有點像是spread
```gherkin=1
#簡單的經典用法,指定哪兩個要spread,一個當column(with name),一個作為value
fish_encounters
fish_encounters %>%
pivot_wider(names_from = station, values_from = seen)
# 把NA變成0. XD.....千萬不要亂用,除非你知道填這個0是要幹嘛
fish_encounters %>%
pivot_wider(names_from = station, values_from = seen, values_fill = 0)
# 還可以同時spread兩個values,類似tapply再cbind這樣
# 這個例子就是將estimate展開,moe展開,然後cbind在一起,當然key會幫你對好
us_rent_income
us_rent_income %>%
pivot_wider(names_from = variable, values_from = c(estimate))
us_rent_income %>%
pivot_wider(names_from = variable, values_from = c(estimate, moe))
# 多個`names_from` 或是 `values_from`
# 使用`names_sep` 或是 `names_glue` 來控制輸出名稱
# name_sep指定column name的結合方式
us_rent_income %>%
pivot_wider(
names_from = variable,
names_sep = ".",
values_from = c(estimate, moe)
)
# 或是costoumized column name
us_rent_income %>%
pivot_wider(
names_from = variable,
names_glue = "{variable}_{.value}",
values_from = c(estimate, moe)
)
# Can perform aggregation with values_fn
warpbreaks <- as_tibble(warpbreaks[c("wool", "tension", "breaks")])
warpbreaks
warpbreaks %>%
pivot_wider(
names_from = wool,
values_from = breaks,
values_fn = mean
)
```
## pivot_longer(),有點像是gather
```gherkin=1
#給要留下的ID欄不要被gather,指定key(names_to)跟value(values_to)
relig_income
relig_income %>% pivot_longer(!religion, names_to = "income", values_to = "count")
# names_prefix可以用來移除column name的前綴,wk1, wk2 ==> 1, 2
billboard
billboard %>%
pivot_longer(
cols = starts_with("wk"),
names_to = "week",
names_prefix = "wk",
values_to = "rank",
values_drop_na = TRUE
)
# Multiple variables stored in column names
who %>% pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = c("diagnosis", "gender", "age"),
names_pattern = "new_?(.*)_(.)(.*)",
values_to = "count"
)
# Multiple observations per row
anscombe
anscombe %>%
pivot_longer(everything(),
names_to = c(".value", "set"),
names_pattern = "(.)(.)"
)
```
###### tags: `tidyverse` `tidyr` `dplyr` `dbplyr`