使用purrr - HackMD

--- title: '使用purrr' disqus: hackmd --- 使用purrr === library(purrr) library(tibble) library(tidyverse) data(mtcars) 個人是覺得map跟lapply一樣好用，只是習慣問題，但是多會一點並無不好，尤其是看得懂別人，用不同風格寫的code，要改也比較方便。我們先來看一個例子：fitting a model to each subgroup and extracting a coefficient of the model ```gherkin=1 by_cyl <- split(mtcars, mtcars$cyl) #style 1: for loop (程式碼長，需一行一行確認變數) slopes <- double(length(by_cyl)) for (i in seq_along(by_cyl)) { model <- lm(mpg ~ wt, data = by_cyl[[i]]) slopes[[i]] <- coef(model)[[2]] } slopes #> [1] -5.65 -2.78 -2.19 #style 2: apply family (程式碼短，但要超熟手才能快速解讀) models <- lapply(by_cyl, function(data) lm(mpg ~ wt, data = data)) vapply(models, function(x) coef(x)[[2]], double(1)) #> 4 6 8 #> -5.65 -2.78 -2.19 #style 3: apply family + pipe (程式碼長，依然要熟手才能快速解讀) by_cyl %>% lapply(function(data) lm(mpg ~ wt, data = data)) %>% lapply(coef) %>% vapply(function(x) x[[2]], double(1)) #> 4 6 8 #> -5.65 -2.78 -2.19 #style 4: 這樣是不是漂亮多了？ by_cyl %>% map(~ lm(mpg ~ wt, data = .x)) %>% map(coef) %>% map_dbl(2) #> 4 6 8 #> -5.65 -2.78 -2.19 ``` # Table of Contents [TOC] # map family ## map ![](https://i.imgur.com/eZiH54v.png) map(.x= , .f= ) always returns a list map_dbl() always returns a double vector map_chr() always returns a character vector map_lgl() always returns a logical vector map_int() always returns a integer vector ```gherkin=1 #當function輸出是單一value時 SQR <- function(input){ output <- sqrt(input)*10 return(output) } #輸入 input_parameters<-sample(1:100,30) #跟lapply一樣，會輸出list map(input_parameters, SQR) map(input_parameters, ~ SQR(.x)) # Anonymous functions lapply(input_parameters,SQR) #但如果使用map_dbl，可以變成vector map_dbl(input_parameters, SQR) map_dbl(input_parameters, ~ SQR(.x)) # Anonymous functions #當function輸出是vector時 SQR.vec <- function(input){ output1 <- sqrt(input) output2 <- log2(input) return(c(output1,output2)) } #跟lapply一樣，會輸出list，但不可使用map_dbl，會出錯 map(input_parameters, SQR.vec) map(input_parameters, ~ SQR(.x)) # Anonymous functions lapply(input_parameters,SQR.vec) ``` ## Anonymous functions 可以直接省略寫出function(x) ```gherkin=1 map_dbl(mtcars, function(x) length(unique(x))) map_dbl(mtcars, ~ length(unique(.x))) ``` ## Work with object like JSON 對於格式較為自由度高的物件，也可以利用purrr來進行處理 ```gherkin=1 x <- list( list(-1, x = 1, y = c(2), z = "a"), list(-2, x = 4, y = c(5, 6), z = "b"), list(-3, x = 8, y = c(9, 10, 11)) ) # Select by name, 給文字直接抓list tag map_dbl(x, "x") #> [1] 1 4 8 #ex. trials <- map(1:100, ~ t.test(rpois(12, 9), rpois(8, 7))) #做了上百次的t-test map_dbl(trials,"p.value") #抓出每一次的p-value # Or by position, 給數字抓column map_dbl(x, 1) #> [1] -1 -2 -3 # Or by both, 抓某個list裡面第幾個column map_dbl(x, list("y", 1)) #> [1] 2 5 9 # 如果list有缺這個tag，會變成回報錯誤 map_chr(x, "z") #> Error: Result 3 must be a single string, not NULL of length 0 # 所以使用 .default value = NA 填補空值問題 map_chr(x, "z", .default = NA) #> [1] "a" "b" NA ``` ### 當一個object很複雜時，list裡面有list.... ```gherkin=1 x <- list( list(1, c(3, 9)), list(c(3, 6), 7, c(4, 7, 6)) ) triple <- function(x) x * 3 map(x, map, .f = triple) #會出錯 x %>% map(map, triple) #這樣才對 x %>% map(~ map(.x, triple)) #不偷懶 x %>% map(~ .x %>% map(triple)) #也可以這樣寫 ``` ## 參數傳遞 (argument passing) 參數可以是一個logic，例如na.rm=TRUE；或是一個vector，例如breaks=c(-1,0,1) ![](https://i.imgur.com/5b2QJBf.png) ![](https://i.imgur.com/3osTkHz.png) ```gherkin=1 x <- list(1:3, c(1:10, NA)) map_dbl(x, ~ mean(.x, na.rm = TRUE)) map_dbl(x, mean, na.rm = TRUE) product <- function(x, y){ x * y } x <- 1:4 map_dbl(x, product, runif(1)) map_dbl(x, ~ product(.x, runif(1))) trims <- c(0, 0.2, 0.4, 0.6) x <- rcauchy(500) map_dbl(.x=trims, ~ mean(x, trim = .x)) #把input .x打出來，增加可讀性 ``` ### 同一data跑各種參數；同一function跑各種data ```gherkin=1 #同一data跑各種參數 #先寫好各個分析要用的參數，放進去list裡面 formulas <- list( mpg ~ disp, mpg ~ I(1 / disp), mpg ~ disp + wt, mpg ~ I(1 / disp) + wt ) #把 map(.x=formulas, ~lm(formula=.x,data=mtcars)) map(.x=formulas, ~lm(formula=.x,data=mtcars)) %>% map(., summary) #可以繼續pipe下去 #同一function跑各種data bootstrap <- function(df) { df[sample(nrow(df), replace = TRUE), , drop = FALSE] } bootstraps <- map(1:10, ~ bootstrap(mtcars)) map(.x=bootstraps, ~lm(mpg ~ disp,data=.x)) #可以繼續pipe下去取出某一結果 map(.x=bootstraps, ~lm(mpg ~ disp,data=.x)) %>% map(., summary) %>% map(., "r.squared") ``` --- ![](https://i.imgur.com/zsIZogr.png) 經由上面可以發現，purrr的設計其實就是適應各種input x output，對每一種形式來特化指令 ## modify: 輸入與輸出同格式 --- ```gherkin=1 df <- data.frame( x = 1:3, y = 6:4 ) map(df, ~ .x * 2) #> $x #> [1] 2 4 6 #> #> $y #> [1] 12 10 8 modify(df, ~ .x * 2) #> x y #> 1 2 12 #> 2 4 10 #> 3 6 8 # 可以用來把同一個row的資料別上其他的name....... XD # 其實很簡單，輸入的row number給了參數的空間，後面指定index，把參數空間填滿輸出 modify(mtcars, 1) modify(mtcars[1,], 1) modify(mtcars[1:2,], 1) modify(mtcars[1:3,], 1) modify(mtcars[1:2,], 2) modify(mtcars[1:3,], 3) ``` ## map2：2個以上 --- map是輸入一組需逐一計算的vector，但當需要兩組成對時，這時就可以使用map2函數，概念為輸入兩組成對且需逐一計算的vector，成對帶入後置函數內，完成計算。 ![](https://i.imgur.com/8JJK3XD.png) ```gherkin=1 #The function want to be used Score <- function(input1, input2){ output <- sqrt(input1) + log2(input2) return(output) } #輸入 input1 <- sample(1:100,30) input2 <- sample(1:100,30) output <- map2_dbl(input1, input2, Score) #這時就可以發現lapply會非常難寫這種........ ``` 比較一下map ![](https://i.imgur.com/sgFxDLF.png) ![](https://i.imgur.com/RkSmIrf.png) ```gherkin=1 xs <- map(1:8, ~ runif(10)) xs[[1]][[1]] <- NA ws <- map(1:8, ~ rpois(10, 5) + 1) map_dbl(xs, mean) #> [1] NA 0.463 0.551 0.453 0.564 0.501 0.371 0.443 map_dbl(xs, weighted.mean, w = ws) #> Error in weighted.mean.default(.x[[i]], ...): 'x' and 'w' must have the same #> length map2_dbl(xs, ws, weighted.mean) #> [1] NA 0.451 0.603 0.452 0.563 0.510 0.342 0.464 ``` ### 超方便：資料與function互相獨立維護這裡另外給個好用的範例：利用欄位名當作index，對資料中同名欄位用list中同名function作計算處理好處是資料不變的情況下，可以變更單一function，再map2回去資料就可以或是function不變，將修改過後的資料，再map2回去資料就可以概念上就是資料與處理流程可以分開maintain，方便function的版本維護例如：把2個functions放進一個list 1. 把資料其中兩欄取出，用map2分別用list裡的兩個同名functions運算 2. 指定其中兩欄，用map去找到list裡面，使用與資料欄位名相同的function運算 (缺點是產出的結果沒有name只有index) ```gherkin=1 #map2 trans <- list( disp = function(x) x * 0.0163871, am = function(x) factor(x, labels = c("auto", "manual")) ) nm <- names(trans) map2(trans, mtcars[nm], function(f, var) f(var)) #map map(nm, ~ trans[[.x]](mtcars[[.x]])) ``` more argument, ex. na.rm=TRUE ![](https://i.imgur.com/0hSk9ZQ.png) ```gherkin=1 map2_dbl(xs, ws, weighted.mean, na.rm = TRUE) #> [1] 0.504 0.451 0.603 0.452 0.563 0.510 0.342 0.464 ``` map2() recycles its inputs to make sure that they’re the same length: ![](https://i.imgur.com/uysWj6R.png) map2(x, y, f) 會變得與 map(x, f, y)一樣， ## walk: execute funciton while not return results 有時候不需要將執行的結果拿會RAM，例如寫出資料集或是cat出message * map ==> walk * map2 => walk2 ```gherkin=1 ## walk welcome <- function(x) { cat("Welcome ", x, "!\n", sep = "") } names <- c("Hadley", "Jenny") map(names, welcome) #> Welcome Hadley! #> Welcome Jenny! #> [[1]] #> NULL #> #> [[2]] #> NULL walk(names, welcome) #> Welcome Hadley! #> Welcome Jenny! ## walk2 temp <- tempfile() dir.create(temp) cyls <- split(mtcars, mtcars$cyl) paths <- file.path(temp, paste0("cyl-", names(cyls), ".csv")) #執行寫出這個動作，但是不去將存的結果拿回RAM，因為已經在cyls了.... walk2(cyls, paths, write.csv) #這個同等於以下這三個指令 #write.csv(cyls[[1]], paths[[1]]) #write.csv(cyls[[2]], paths[[2]]) #write.csv(cyls[[3]], paths[[3]]) dir(temp) #> [1] "cyl-4.csv" "cyl-6.csv" "cyl-8.csv" ``` ## imap(): indexed map(), iterate by index imap(x, f) 同等於 * map2(x, names(x), f) ：如果x有column name * map2(x, seq_along(x), f) ：如果x沒有column name ### imap() 在建立labels相當有用: ```gherkin=1 imap_chr(iris, ~ paste0("The first value of ", .y, " is ", .x[[1]])) #> Sepal.Length #> "The first value of Sepal.Length is 5.1" #> Sepal.Width #> "The first value of Sepal.Width is 3.5" #> Petal.Length #> "The first value of Petal.Length is 1.4" #> Petal.Width #> "The first value of Petal.Width is 0.2" #> Species #> "The first value of Species is setosa" ``` If the vector is unnamed, the second argument will be the index: 但如果vector沒有name，其順序或index會直接被當成第二個參數例如 list 1 => "1", list 2 => "2" ```gherkin=1 x <- map(1:6, ~ sample(1000, 10)) # 6個亂數vectors，個放在一個list裡面 imap_chr(x, ~ paste0("The highest value of ", .y, " is ", max(.x))) #> [1] "The highest value of 1 is 975" "The highest value of 2 is 915" #> [3] "The highest value of 3 is 982" "The highest value of 4 is 955" #> [5] "The highest value of 5 is 971" "The highest value of 6 is 696" ``` 應用在iwalk ```gherkin=1 #walk2 and iwalk temp <- tempfile() dir.create(temp) cyls <- split(mtcars, mtcars$cyl) #walk2, 分兩行 paths <- file.path(temp, paste0("cyl-", names(cyls), ".csv")) walk2(cyls, paths, write.csv) #iwalk, 一行結束，但是要熟手比較能讀懂 iwalk(cyls, ~write.csv(x=.x, file=file.path(temp, paste0("cyl-", .y, ".csv")))) ``` ## pmap(): 任意input Input: a list of equal-length vectors.）(就當作很像是data.frame吧!!) ![](https://i.imgur.com/eUi8b4w.png) 所以map2(x, y, f) 同等於 pmap(list(x, y), f). * pmap(list(xs, ws), weighted.mean) * map2_dbl(xs, ws, weighted.mean) ![](https://i.imgur.com/q8K14fQ.png) ```gherkin=1 pmap_dbl(list(xs, ws), weighted.mean) pmap_dbl(list(xs, ws), weighted.mean, na.rm = TRUE) ``` ![](https://i.imgur.com/B9VBesf.png) ```gherkin=1 trims <- c(0, 0.1, 0.2, 0.5) x <- rcauchy(1000) map_dbl(trims, ~ mean(x, trim = .x)) #將參數當作.x，分開一個個丟入mean，並指定在參數trim使用(有x有.x很複雜) map_dbl(trims, function(trim) mean(x, trim = trim)) #不偷懶的寫，但是有點長了 #切成簡單的兩步驟 par_trim <- list(trim = trims) # step 1 製作參數dataframe: 將參數做成list，然後命名為mean用的參數名trim pmap_dbl(par_trim, mean, x = x) # step 2 用pmap將參數丟入mean使用在x上 #> [1] -6.6740 0.0210 0.0235 0.0151 ``` 但通常pmap使用上，與data.frame可以做出很棒的配合使用tibble::tribble()來製作data.frame，可以讓使用者定義row-by-row，而非傳統的col-by-col 為何要這樣做的原因是：回想上面的圖一下，我們是不是將參數row-by-row的放入map family裡面呢? 這個概念就是： 1. 使用tribble::tribble製作出一個參數空間 * 配合要使用的function，命名column name * 每一條row即是，該次要跑function用的參數 2. 將這個參數空間利用pmap應用於該function上這個好處是：能夠一目瞭然，這次要跑的參數是哪一些!!!! 下面這個例子：如何使用不同參數，模擬出隨機uniform distribution ```gherkin=1 #column name很重要，需要配合要丟進去的function裡面 params <- tibble::tribble( ~ n, ~ min, ~ max, 1L, 0, 1, 2L, 10, 100, 3L, 100, 1000 ) pmap(params, runif) #> [[1]] #> [1] 0.332 #> #> [[2]] #> [1] 53.5 47.6 #> #> [[3]] #> [1] 231 715 515 #這跟下面的結果一樣，雖然在小批量看不出差異，但是當你要跑10000組........ runif(n = 1L, min = 0, max = 1) runif(n = 2L, min = 10, max = 100) runif(n = 3L, min = 100, max = 1000) ``` # Reduce family 處理大數據的好幫手 ## reduce 以下這兩個相等，但是reduce避免了使用巢狀結構，但這比較適用於需要遞迴處理單一參數的task，因為function不可以變 * reduce(1:4, f) * f(f(f(1, 2), 3), 4) ![](https://i.imgur.com/xIZakvq.png) 比如我們要 intersect/union所有list裡面的值 ```gherkin=1 l <- map(1:4, ~ sample(1:10, 15, replace = T)) str(l) #這樣寫也可以，但是list有100的話?? out <- l[[1]] out <- intersect(out, l[[2]]) out <- intersect(out, l[[3]]) out <- intersect(out, l[[4]]) out #> [1] 8 4 reduce(l, intersect) #> [1] 8 4 reduce(l, union) #> [1] 7 1 8 3 2 4 10 5 9 6 ``` ![](https://i.imgur.com/1Yc1aRW.png) 也可以跟map一樣，混入奇怪的參數(誤)，放在function後面 ## accumulate 其實跟reduce一樣，不過會將過程一一保留下來，依序存成list。開發時建議使用，方便確認每一步是否依照預想的進行。 ```gherkin=1 accumulate(l, intersect) #> [[1]] #> [1] 7 1 8 8 3 8 2 4 7 10 10 3 7 10 10 #> #> [[2]] #> [1] 1 8 3 2 4 10 #> #> [[3]] #> [1] 8 4 10 #> #> [[4]] #> [1] 8 4 x <- c(4, 3, 10) reduce(x, `+`) #> [1] 17 accumulate(x, `+`) #> [1] 4 7 17 ``` ## multiple input ![](https://i.imgur.com/WXck9yc.png) ![](https://i.imgur.com/YdYkmxQ.png) 目前尚未有ex.努力中 ```gherkin=1 ``` # Predicate functionals (比對數值時好用) * some(.x, .p) returns TRUE if any element matches. * every(.x, .p) returns TRUE if all elements match. * none(.x, .p) returns TRUE if no element matches. * detect(.x, .p) returns the value of the first match. * detect_index(.x, .p) returns the location of the first match. * keep(.x, .p) keeps all matching elements * discard(.x, .p) drops all matching elements. ```gherkin=1 df <- data.frame(x = 1:3, y = c("a", "b", "c")) detect(df, is.factor) #> NULL detect_index(df, is.factor) #> [1] 0 str(keep(df, is.factor)) #> 'data.frame': 3 obs. of 0 variables str(discard(df, is.factor)) #> 'data.frame': 3 obs. of 2 variables: #> $ x: int 1 2 3 #> $ y: chr "a" "b" "c" ``` ## map variants 只針對特定的format才進行運算，其他跳過，這對數值與文字混合的list很適用 ```gherkin=1 df <- data.frame( num1 = c(0, 10, 20), num2 = c(5, 6, 7), chr1 = c("a", "b", "c"), stringsAsFactors = FALSE ) str(map_if(df, is.numeric, mean)) #> List of 3 #> $ num1: num 10 #> $ num2: num 6 #> $ chr1: chr [1:3] "a" "b" "c" str(modify_if(df, is.numeric, mean)) #> 'data.frame': 3 obs. of 3 variables: #> $ num1: num 10 10 10 #> $ num2: num 6 6 6 #> $ chr1: chr "a" "b" "c" str(map(keep(df, is.numeric), mean)) #> List of 2 #> $ num1: num 10 #> $ num2: num 6 ``` # Base functionals ## Matrices and arrays ```gherkin=1 #typical apply a2d <- matrix(1:20, nrow = 5) apply(a2d, 1, mean) #> [1] 8.5 9.5 10.5 11.5 12.5 apply(a2d, 2, mean) #> [1] 3 8 13 18 #the output is not always the same as the input a1 <- apply(a2d, 1, identity) identical(a2d, a1) #> [1] FALSE a2 <- apply(a2d, 2, identity) identical(a2d, a2) #> [1] TRUE #MARGIN 2d a3d <- array(1:24, c(2, 3, 4)) apply(a3d, 1, mean) #> [1] 12 13 apply(a3d, c(1, 2), mean) #> [,1] [,2] [,3] #> [1,] 10 12 14 #> [2,] 11 13 15 ``` ## Mathematical concerns * integrate() finds the area under the curve defined by f() * uniroot() finds where f() hits zero * optimise() finds the location of the lowest (or highest) value of f() 下面是一個使用sin()的例子 ```gherkin=1 integrate(sin, 0, pi) #> 2 with absolute error < 2.2e-14 str(uniroot(sin, pi * c(1 / 2, 3 / 2))) #> List of 5 #> $ root : num 3.14 #> $ f.root : num 1.22e-16 #> $ iter : int 2 #> $ init.it : int NA #> $ estim.prec: num 6.1e-05 str(optimise(sin, c(0, 2 * pi))) #> List of 2 #> $ minimum : num 4.71 #> $ objective: num -1 str(optimise(sin, c(0, pi), maximum= FALSE)) #> List of 2 #> $ minimum : num 7.93e-05 #> $ objective: num 7.93e-05 str(optimise(sin, c(0, pi), maximum = TRUE)) #> List of 2 #> $ maximum : num 1.57 #> $ objective: num 1 ``` ```gherkin=1 ``` ###### tags: `tidyverse` `purrr` `functional program`