---
title: '使用purrr'
disqus: hackmd
---
使用purrr
===
library(purrr)
library(tibble)
library(tidyverse)
data(mtcars)
個人是覺得map跟lapply一樣好用,只是習慣問題,但是多會一點並無不好,
尤其是看得懂別人,用不同風格寫的code,要改也比較方便。
我們先來看一個例子:fitting a model to each subgroup and extracting a coefficient of the model
```gherkin=1
by_cyl <- split(mtcars, mtcars$cyl)
#style 1: for loop (程式碼長,需一行一行確認變數)
slopes <- double(length(by_cyl))
for (i in seq_along(by_cyl)) {
model <- lm(mpg ~ wt, data = by_cyl[[i]])
slopes[[i]] <- coef(model)[[2]]
}
slopes
#> [1] -5.65 -2.78 -2.19
#style 2: apply family (程式碼短,但要超熟手才能快速解讀)
models <- lapply(by_cyl, function(data) lm(mpg ~ wt, data = data))
vapply(models, function(x) coef(x)[[2]], double(1))
#> 4 6 8
#> -5.65 -2.78 -2.19
#style 3: apply family + pipe (程式碼長,依然要熟手才能快速解讀)
by_cyl %>%
lapply(function(data) lm(mpg ~ wt, data = data)) %>%
lapply(coef) %>%
vapply(function(x) x[[2]], double(1))
#> 4 6 8
#> -5.65 -2.78 -2.19
#style 4: 這樣是不是漂亮多了?
by_cyl %>%
map(~ lm(mpg ~ wt, data = .x)) %>%
map(coef) %>%
map_dbl(2)
#> 4 6 8
#> -5.65 -2.78 -2.19
```
# Table of Contents
[TOC]
# map family
## map

map(.x= , .f= ) always returns a list
map_dbl() always returns a double vector
map_chr() always returns a character vector
map_lgl() always returns a logical vector
map_int() always returns a integer vector
```gherkin=1
#當function輸出是單一value時
SQR <- function(input){
output <- sqrt(input)*10
return(output)
}
#輸入
input_parameters<-sample(1:100,30)
#跟lapply一樣,會輸出list
map(input_parameters, SQR)
map(input_parameters, ~ SQR(.x)) # Anonymous functions
lapply(input_parameters,SQR)
#但如果使用map_dbl,可以變成vector
map_dbl(input_parameters, SQR)
map_dbl(input_parameters, ~ SQR(.x)) # Anonymous functions
#當function輸出是vector時
SQR.vec <- function(input){
output1 <- sqrt(input)
output2 <- log2(input)
return(c(output1,output2))
}
#跟lapply一樣,會輸出list,但不可使用map_dbl,會出錯
map(input_parameters, SQR.vec)
map(input_parameters, ~ SQR(.x)) # Anonymous functions
lapply(input_parameters,SQR.vec)
```
## Anonymous functions
可以直接省略寫出function(x)
```gherkin=1
map_dbl(mtcars, function(x) length(unique(x)))
map_dbl(mtcars, ~ length(unique(.x)))
```
## Work with object like JSON
對於格式較為自由度高的物件,也可以利用purrr來進行處理
```gherkin=1
x <- list(
list(-1, x = 1, y = c(2), z = "a"),
list(-2, x = 4, y = c(5, 6), z = "b"),
list(-3, x = 8, y = c(9, 10, 11))
)
# Select by name, 給文字直接抓list tag
map_dbl(x, "x")
#> [1] 1 4 8
#ex.
trials <- map(1:100, ~ t.test(rpois(12, 9), rpois(8, 7))) #做了上百次的t-test
map_dbl(trials,"p.value") #抓出每一次的p-value
# Or by position, 給數字抓column
map_dbl(x, 1)
#> [1] -1 -2 -3
# Or by both, 抓某個list裡面第幾個column
map_dbl(x, list("y", 1))
#> [1] 2 5 9
# 如果list有缺這個tag,會變成回報錯誤
map_chr(x, "z")
#> Error: Result 3 must be a single string, not NULL of length 0
# 所以使用 .default value = NA 填補空值問題
map_chr(x, "z", .default = NA)
#> [1] "a" "b" NA
```
### 當一個object很複雜時,list裡面有list....
```gherkin=1
x <- list(
list(1, c(3, 9)),
list(c(3, 6), 7, c(4, 7, 6))
)
triple <- function(x) x * 3
map(x, map, .f = triple) #會出錯
x %>% map(map, triple) #這樣才對
x %>% map(~ map(.x, triple)) #不偷懶
x %>% map(~ .x %>% map(triple)) #也可以這樣寫
```
## 參數傳遞 (argument passing)
參數可以是一個logic,例如na.rm=TRUE;或是一個vector,例如breaks=c(-1,0,1)


```gherkin=1
x <- list(1:3, c(1:10, NA))
map_dbl(x, ~ mean(.x, na.rm = TRUE))
map_dbl(x, mean, na.rm = TRUE)
product <- function(x, y){ x * y }
x <- 1:4
map_dbl(x, product, runif(1))
map_dbl(x, ~ product(.x, runif(1)))
trims <- c(0, 0.2, 0.4, 0.6)
x <- rcauchy(500)
map_dbl(.x=trims, ~ mean(x, trim = .x)) #把input .x打出來,增加可讀性
```
### 同一data跑各種參數;同一function跑各種data
```gherkin=1
#同一data跑各種參數
#先寫好各個分析要用的參數,放進去list裡面
formulas <- list(
mpg ~ disp,
mpg ~ I(1 / disp),
mpg ~ disp + wt,
mpg ~ I(1 / disp) + wt
)
#把
map(.x=formulas, ~lm(formula=.x,data=mtcars))
map(.x=formulas, ~lm(formula=.x,data=mtcars)) %>% map(., summary) #可以繼續pipe下去
#同一function跑各種data
bootstrap <- function(df) {
df[sample(nrow(df), replace = TRUE), , drop = FALSE]
}
bootstraps <- map(1:10, ~ bootstrap(mtcars))
map(.x=bootstraps, ~lm(mpg ~ disp,data=.x))
#可以繼續pipe下去取出某一結果
map(.x=bootstraps, ~lm(mpg ~ disp,data=.x)) %>%
map(., summary) %>%
map(., "r.squared")
```
---

經由上面可以發現,purrr的設計其實就是適應各種input x output,對每一種形式來特化指令
## modify: 輸入與輸出同格式
---
```gherkin=1
df <- data.frame(
x = 1:3,
y = 6:4
)
map(df, ~ .x * 2)
#> $x
#> [1] 2 4 6
#>
#> $y
#> [1] 12 10 8
modify(df, ~ .x * 2)
#> x y
#> 1 2 12
#> 2 4 10
#> 3 6 8
# 可以用來把同一個row的資料別上其他的name....... XD
# 其實很簡單,輸入的row number給了參數的空間,後面指定index,把參數空間填滿輸出
modify(mtcars, 1)
modify(mtcars[1,], 1)
modify(mtcars[1:2,], 1)
modify(mtcars[1:3,], 1)
modify(mtcars[1:2,], 2)
modify(mtcars[1:3,], 3)
```
## map2:2個以上
---
map是輸入一組需逐一計算的vector,但當需要兩組成對時,這時就可以使用map2函數,
概念為輸入兩組成對且需逐一計算的vector,成對帶入後置函數內,完成計算。

```gherkin=1
#The function want to be used
Score <- function(input1, input2){
output <- sqrt(input1) + log2(input2)
return(output)
}
#輸入
input1 <- sample(1:100,30)
input2 <- sample(1:100,30)
output <- map2_dbl(input1,
input2,
Score)
#這時就可以發現lapply會非常難寫這種........
```
比較一下map


```gherkin=1
xs <- map(1:8, ~ runif(10))
xs[[1]][[1]] <- NA
ws <- map(1:8, ~ rpois(10, 5) + 1)
map_dbl(xs, mean)
#> [1] NA 0.463 0.551 0.453 0.564 0.501 0.371 0.443
map_dbl(xs, weighted.mean, w = ws)
#> Error in weighted.mean.default(.x[[i]], ...): 'x' and 'w' must have the same
#> length
map2_dbl(xs, ws, weighted.mean)
#> [1] NA 0.451 0.603 0.452 0.563 0.510 0.342 0.464
```
### 超方便:資料與function互相獨立維護
這裡另外給個好用的範例:利用欄位名當作index,對資料中同名欄位用list中同名function作計算處理
好處是資料不變的情況下,可以變更單一function,再map2回去資料就可以
或是function不變,將修改過後的資料,再map2回去資料就可以
概念上就是資料與處理流程可以分開maintain,方便function的版本維護
例如:把2個functions放進一個list
1. 把資料其中兩欄取出,用map2分別用list裡的兩個同名functions運算
2. 指定其中兩欄,用map去找到list裡面,使用與資料欄位名相同的function運算 (缺點是產出的結果沒有name只有index)
```gherkin=1
#map2
trans <- list(
disp = function(x) x * 0.0163871,
am = function(x) factor(x, labels = c("auto", "manual"))
)
nm <- names(trans)
map2(trans, mtcars[nm], function(f, var) f(var))
#map
map(nm, ~ trans[[.x]](mtcars[[.x]]))
```
more argument, ex. na.rm=TRUE

```gherkin=1
map2_dbl(xs, ws, weighted.mean, na.rm = TRUE)
#> [1] 0.504 0.451 0.603 0.452 0.563 0.510 0.342 0.464
```
map2() recycles its inputs to make sure that they’re the same length:

map2(x, y, f) 會變得與 map(x, f, y)一樣,
## walk: execute funciton while not return results
有時候不需要將執行的結果拿會RAM,例如寫出資料集或是cat出message
* map ==> walk
* map2 => walk2
```gherkin=1
## walk
welcome <- function(x) {
cat("Welcome ", x, "!\n", sep = "")
}
names <- c("Hadley", "Jenny")
map(names, welcome)
#> Welcome Hadley!
#> Welcome Jenny!
#> [[1]]
#> NULL
#>
#> [[2]]
#> NULL
walk(names, welcome)
#> Welcome Hadley!
#> Welcome Jenny!
## walk2
temp <- tempfile()
dir.create(temp)
cyls <- split(mtcars, mtcars$cyl)
paths <- file.path(temp, paste0("cyl-", names(cyls), ".csv"))
#執行寫出這個動作,但是不去將存的結果拿回RAM,因為已經在cyls了....
walk2(cyls, paths, write.csv)
#這個同等於以下這三個指令
#write.csv(cyls[[1]], paths[[1]])
#write.csv(cyls[[2]], paths[[2]])
#write.csv(cyls[[3]], paths[[3]])
dir(temp)
#> [1] "cyl-4.csv" "cyl-6.csv" "cyl-8.csv"
```
## imap(): indexed map(), iterate by index
imap(x, f) 同等於
* map2(x, names(x), f) : 如果x有column name
* map2(x, seq_along(x), f) : 如果x沒有column name
### imap() 在建立labels相當有用:
```gherkin=1
imap_chr(iris, ~ paste0("The first value of ", .y, " is ", .x[[1]]))
#> Sepal.Length
#> "The first value of Sepal.Length is 5.1"
#> Sepal.Width
#> "The first value of Sepal.Width is 3.5"
#> Petal.Length
#> "The first value of Petal.Length is 1.4"
#> Petal.Width
#> "The first value of Petal.Width is 0.2"
#> Species
#> "The first value of Species is setosa"
```
If the vector is unnamed, the second argument will be the index:
但如果vector沒有name,其順序或index會直接被當成第二個參數
例如 list 1 => "1", list 2 => "2"
```gherkin=1
x <- map(1:6, ~ sample(1000, 10)) # 6個亂數vectors,個放在一個list裡面
imap_chr(x, ~ paste0("The highest value of ", .y, " is ", max(.x)))
#> [1] "The highest value of 1 is 975" "The highest value of 2 is 915"
#> [3] "The highest value of 3 is 982" "The highest value of 4 is 955"
#> [5] "The highest value of 5 is 971" "The highest value of 6 is 696"
```
應用在iwalk
```gherkin=1
#walk2 and iwalk
temp <- tempfile()
dir.create(temp)
cyls <- split(mtcars, mtcars$cyl)
#walk2, 分兩行
paths <- file.path(temp, paste0("cyl-", names(cyls), ".csv"))
walk2(cyls, paths, write.csv)
#iwalk, 一行結束,但是要熟手比較能讀懂
iwalk(cyls, ~write.csv(x=.x, file=file.path(temp, paste0("cyl-", .y, ".csv"))))
```
## pmap(): 任意input
Input: a list of equal-length vectors.)(就當作很像是data.frame吧!!)

所以map2(x, y, f) 同等於 pmap(list(x, y), f).
* pmap(list(xs, ws), weighted.mean)
* map2_dbl(xs, ws, weighted.mean)

```gherkin=1
pmap_dbl(list(xs, ws), weighted.mean)
pmap_dbl(list(xs, ws), weighted.mean, na.rm = TRUE)
```

```gherkin=1
trims <- c(0, 0.1, 0.2, 0.5)
x <- rcauchy(1000)
map_dbl(trims, ~ mean(x, trim = .x)) #將參數當作.x,分開一個個丟入mean,並指定在參數trim使用(有x有.x很複雜)
map_dbl(trims, function(trim) mean(x, trim = trim)) #不偷懶的寫,但是有點長了
#切成簡單的兩步驟
par_trim <- list(trim = trims) # step 1 製作參數dataframe: 將參數做成list,然後命名為mean用的參數名trim
pmap_dbl(par_trim, mean, x = x) # step 2 用pmap將參數丟入mean使用在x上
#> [1] -6.6740 0.0210 0.0235 0.0151
```
但通常pmap使用上,與data.frame可以做出很棒的配合
使用tibble::tribble()來製作data.frame,可以讓使用者定義row-by-row,而非傳統的col-by-col
為何要這樣做的原因是:
回想上面的圖一下,我們是不是將參數row-by-row的放入map family裡面呢?
這個概念就是:
1. 使用tribble::tribble製作出一個參數空間
* 配合要使用的function,命名column name
* 每一條row即是,該次要跑function用的參數
2. 將這個參數空間利用pmap應用於該function上
這個好處是:能夠一目瞭然,這次要跑的參數是哪一些!!!!
下面這個例子:如何使用不同參數,模擬出隨機uniform distribution
```gherkin=1
#column name很重要,需要配合要丟進去的function裡面
params <- tibble::tribble(
~ n, ~ min, ~ max,
1L, 0, 1,
2L, 10, 100,
3L, 100, 1000
)
pmap(params, runif)
#> [[1]]
#> [1] 0.332
#>
#> [[2]]
#> [1] 53.5 47.6
#>
#> [[3]]
#> [1] 231 715 515
#這跟下面的結果一樣,雖然在小批量看不出差異,但是當你要跑10000組........
runif(n = 1L, min = 0, max = 1)
runif(n = 2L, min = 10, max = 100)
runif(n = 3L, min = 100, max = 1000)
```
# Reduce family
處理大數據的好幫手
## reduce
以下這兩個相等,但是reduce避免了使用巢狀結構,但這比較適用於需要遞迴處理單一參數的task,因為function不可以變
* reduce(1:4, f)
* f(f(f(1, 2), 3), 4)

比如我們要 intersect/union所有list裡面的值
```gherkin=1
l <- map(1:4, ~ sample(1:10, 15, replace = T))
str(l)
#這樣寫也可以,但是list有100的話??
out <- l[[1]]
out <- intersect(out, l[[2]])
out <- intersect(out, l[[3]])
out <- intersect(out, l[[4]])
out
#> [1] 8 4
reduce(l, intersect)
#> [1] 8 4
reduce(l, union)
#> [1] 7 1 8 3 2 4 10 5 9 6
```

也可以跟map一樣,混入奇怪的參數(誤),放在function後面
## accumulate
其實跟reduce一樣,不過會將過程一一保留下來,依序存成list。
開發時建議使用,方便確認每一步是否依照預想的進行。
```gherkin=1
accumulate(l, intersect)
#> [[1]]
#> [1] 7 1 8 8 3 8 2 4 7 10 10 3 7 10 10
#>
#> [[2]]
#> [1] 1 8 3 2 4 10
#>
#> [[3]]
#> [1] 8 4 10
#>
#> [[4]]
#> [1] 8 4
x <- c(4, 3, 10)
reduce(x, `+`)
#> [1] 17
accumulate(x, `+`)
#> [1] 4 7 17
```
## multiple input


目前尚未有ex.努力中
```gherkin=1
```
# Predicate functionals (比對數值時好用)
* some(.x, .p) returns TRUE if any element matches.
* every(.x, .p) returns TRUE if all elements match.
* none(.x, .p) returns TRUE if no element matches.
* detect(.x, .p) returns the value of the first match.
* detect_index(.x, .p) returns the location of the first match.
* keep(.x, .p) keeps all matching elements
* discard(.x, .p) drops all matching elements.
```gherkin=1
df <- data.frame(x = 1:3, y = c("a", "b", "c"))
detect(df, is.factor)
#> NULL
detect_index(df, is.factor)
#> [1] 0
str(keep(df, is.factor))
#> 'data.frame': 3 obs. of 0 variables
str(discard(df, is.factor))
#> 'data.frame': 3 obs. of 2 variables:
#> $ x: int 1 2 3
#> $ y: chr "a" "b" "c"
```
## map variants
只針對特定的format才進行運算,其他跳過,這對數值與文字混合的list很適用
```gherkin=1
df <- data.frame(
num1 = c(0, 10, 20),
num2 = c(5, 6, 7),
chr1 = c("a", "b", "c"),
stringsAsFactors = FALSE
)
str(map_if(df, is.numeric, mean))
#> List of 3
#> $ num1: num 10
#> $ num2: num 6
#> $ chr1: chr [1:3] "a" "b" "c"
str(modify_if(df, is.numeric, mean))
#> 'data.frame': 3 obs. of 3 variables:
#> $ num1: num 10 10 10
#> $ num2: num 6 6 6
#> $ chr1: chr "a" "b" "c"
str(map(keep(df, is.numeric), mean))
#> List of 2
#> $ num1: num 10
#> $ num2: num 6
```
# Base functionals
## Matrices and arrays
```gherkin=1
#typical apply
a2d <- matrix(1:20, nrow = 5)
apply(a2d, 1, mean)
#> [1] 8.5 9.5 10.5 11.5 12.5
apply(a2d, 2, mean)
#> [1] 3 8 13 18
#the output is not always the same as the input
a1 <- apply(a2d, 1, identity)
identical(a2d, a1)
#> [1] FALSE
a2 <- apply(a2d, 2, identity)
identical(a2d, a2)
#> [1] TRUE
#MARGIN 2d
a3d <- array(1:24, c(2, 3, 4))
apply(a3d, 1, mean)
#> [1] 12 13
apply(a3d, c(1, 2), mean)
#> [,1] [,2] [,3]
#> [1,] 10 12 14
#> [2,] 11 13 15
```
## Mathematical concerns
* integrate() finds the area under the curve defined by f()
* uniroot() finds where f() hits zero
* optimise() finds the location of the lowest (or highest) value of f()
下面是一個使用sin()的例子
```gherkin=1
integrate(sin, 0, pi)
#> 2 with absolute error < 2.2e-14
str(uniroot(sin, pi * c(1 / 2, 3 / 2)))
#> List of 5
#> $ root : num 3.14
#> $ f.root : num 1.22e-16
#> $ iter : int 2
#> $ init.it : int NA
#> $ estim.prec: num 6.1e-05
str(optimise(sin, c(0, 2 * pi)))
#> List of 2
#> $ minimum : num 4.71
#> $ objective: num -1
str(optimise(sin, c(0, pi), maximum= FALSE))
#> List of 2
#> $ minimum : num 7.93e-05
#> $ objective: num 7.93e-05
str(optimise(sin, c(0, pi), maximum = TRUE))
#> List of 2
#> $ maximum : num 1.57
#> $ objective: num 1
```
```gherkin=1
```
###### tags: `tidyverse` `purrr` `functional program`