---
GA: UA-159972578-2
---

###### tags: `R` `Data Processing` `Data Preprocessing`

# Common R Tips

Other References:
1. [Wrangling (Rpubs Note)](http://rpubs.com/RitaTang/WranglingNote)
2. [Visualization](/@ritatang242/SJF_dAR37)
3. [Visualization (Rpubs Note)](https://rpubs.com/RitaTang/Visualization)
4. [Text Mining (Preprocessing)](/@ritatang242/B1qbL3yLL)
5. [Text Mining (Chinese)](/PFrfv9NJSuy7frOV-wuJoA?both)
6. [PCA (Principal Component Analysis)](/@ritatang242/r1lDWqiUL)

# Package, initialization
```{r}
pacman::p_load()        # install (if missing) and load packages, e.g. pacman::p_load(dplyr, data.table)
rm(list = ls()); gc()   # clear the workspace and release memory
```

# Encode
```{r}
a = cbind(df$area, df$product_name)
Encoding(a) = 'UTF-8'   # Encoding() only works on character vectors
```

# Deal with NA
| Symbol | Definition |
| -------- | -------- |
| NA | missing or undefined data |
| NULL | empty object (e.g. null/empty lists) |
| Inf and -Inf | positive and negative infinity |
| NaN | results that cannot be reasonably defined |

+ Remove NAs
```{r}
df = na.omit(df)        # equivalent: df %>% na.omit()
mean(data, na.rm = T)
```
```{r}
a = airquality[, colSums(is.na(airquality)) == 0]
# find the NAs first, then count them per column and keep only the columns with zero NAs
```
+ Pass NAs through (na.action, for modelling functions)
```{r}
lm(y ~ x, data = df, na.action = na.pass)   # na.pass keeps NA rows instead of dropping them; note na.action has no effect in mean()
```
+ Replace NA with 0
```{r}
mx[is.na(mx)] = 0
```
+ Mark a placeholder value as NA while reading
```{r}
Reduce(rbind, Map(function(x) read.csv(x, na.strings = '.', stringsAsFactors = F)[, -1], c('cars1.csv','cars2.csv')))
# Map reads cars1.csv and cars2.csv with one anonymous function (the result is a list),
# replacing every "." value with NA,
# then Reduce rbinds the list into a single data frame
```

# Date Format
```{r}
as.Date("2018-10-10", format = "%Y-%m-%d")
as.character(r$BOARD_DATE) %>% as.Date(., "%Y%m%d")
as.Date(rail_df$BOARD_DATE %>% as.character(), format = '%Y%m%d')
```

# Data Frame Type
+ data.frame
    + printing shows the entire data set
    + data.frame converts character columns to factors by default (before R 4.0)
+ data.table
    + data.table does not convert strings to factors
    + has no row names
    + column selection uses list()
```{r}
dt[, list(A, C)]   # equivalent: df[, c(1, 3)]
```
+ tibble
    + easy to inspect (printing shows only the first 10 rows)
    + fast to work with

# Read & Write
+ Packages
    + library(foreign): handles non-csv formats, e.g. SAS files
    + library(XML)
    + library(DBI): connect to relational databases
    + library(RMySQL): read from MySQL databases
+ Read Data
    + fread (fastest / for large files)
        + csv
        + library(data.table)
    + read_csv (medium speed)
        + csv (for xlsx/xls use readxl::read_excel)
        + library(readr)
    + read.csv (slowest)
        + csv
    + read_file
        + txt
        + library(readr)
    + read.table
        + txt
```{r}
library(data.table)
fread("TaiwanRailway.csv", sep = '\t', encoding = "UTF-8")   # fread() only accepts "unknown", "UTF-8" or "Latin-1"; for Big5 files use read.csv(..., fileEncoding = "Big5")
read.table("TaiwanRailway.csv", header = T, stringsAsFactors = F)
xmlToDataFrame("Desktop/A_lvr_land_A.XML")   # from library(XML)
```
+ Save / load .RData
```{r}
save(obj1, obj2, file = "../final.rdata")   # save the specified objects from the environment
save.image()                                # save everything in the current environment
load("../final.rdata")
```
+ Write Data
    + fwrite
    + write.csv
    + saveXML
    + write.foreign
```{r}
fwrite(dt, "TaiwanRailway.csv")                          # data.table
write.csv(df, "TaiwanRailway.csv", row.names = FALSE)
saveXML(xml, "test.xml")                                 # XML
write.foreign(df, datafile = "df.txt", codefile = "df.sas", package = "SAS")   # foreign
```

# Function
```{r}
predtr = predict(mod4, tr)
MAE = function(n, y, y_hat) {               # mean absolute error
  return( (1/n) * sum(abs(y - y_hat)) )
}
MAE(nrow(tr), tr$Salary, predtr)            # 223.9275
# (1/nrow(tr)) * sum(abs(tr$Salary - predtr))   # 223.9275
```

# Sorting
+ sort
    + returns the sorted values themselves (A-F)
```{r}
x = c("D","A","C","F","B","E")
sort(x, decreasing = F)
```
> [1] "A" "B" "C" "D" "E" "F"
+ rank
    + returns each element's rank, printed in the original (unsorted) order
```{r}
rank(x)
```
> [1] 4 1 3 6 2 5
> D: rank 4, A: rank 1
+ order
    + returns the original indices arranged in sorted order
```{r}
order(x, decreasing = F)
```
> [1] 2 5 3 1 6 4
> A: index[2], B: index[5]
+ arrange (see Dplyr)
    + works on data frames only

# Dplyr
<iframe width="100%" height="500px" src="https://hackmd.io/@ritatang242/BkyUfSKrL"></iframe>

# Join
+ merge (base R)
+ left_join
+ right_join
+ inner_join
+ full_join
+ semi_join

# SQL
```{r}
sqldf::sqldf("select * from A")
# where                -- like filter()
# group by             -- like group_by()
# is not null          -- non-NA values
# order by x limit 5   -- like arrange(): sort by x and keep five rows
```
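Put together, those clauses map naturally onto a single query. A minimal runnable sketch (my own toy query on the built-in `mtcars` data, not from the original note):

```{r}
# Minimal sketch: the SQL clauses above combined in one sqldf query
library(sqldf)

dat <- mtcars
dat$cyl[1] <- NA   # introduce a missing value to demonstrate IS NOT NULL

sqldf("SELECT   cyl, AVG(mpg) AS avg_mpg
       FROM     dat
       WHERE    hp > 90             -- like dplyr::filter()
         AND    cyl IS NOT NULL     -- keep non-NA values only
       GROUP BY cyl                 -- like dplyr::group_by()
       ORDER BY avg_mpg DESC        -- like dplyr::arrange()
       LIMIT    5                   -- keep at most five rows
      ")
```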
# The apply Family
## 1. apply
```{r}
apply(data, 1, FUN)   # MARGIN: 1 means "row"; 2 means "column"
```
## 2. lapply
+ replaces a loop
+ returns a list
```{r}
lapply(data, FUN)
lapply(list(sum, mean, sd), FUN = function(f) f(data))   # apply several functions to the same data
```
## 3. sapply
+ replaces a loop
+ returns a simplified result (vector/matrix)
```{r}
sapply(data, FUN)
```
## 4. tapply
+ applies a function within each level of a categorical variable
```{r}
tapply(iris$Sepal.Length, INDEX = iris$Species, FUN = mean)
# INDEX: the grouping factor
# tapply() combines the grouping idea of table() with a function call
```
# aggregate
```{r}
aggregate(price ~ cut + color, data = diamonds, mean)
# group by cut and color and compute the mean price (diamonds comes from ggplot2)
aggregate(x = mtcars$mpg, by = list("cyl" = mtcars$cyl), FUN = mean)
#   cyl        x
# 1   4 26.66364
# 2   6 19.74286
# 3   8 15.10000
```
# by
```{r}
by(data = mtcars$mpg, INDICES = list("cyl" = mtcars$cyl), FUN = mean)
# cyl: 4
# [1] 26.66364
# cyl: 6
# [1] 19.74286
# cyl: 8
# [1] 15.1
```
# Map Reduce
## Map
```{r}
Map(FUN, df$x)
```
Very similar to lapply: it runs the same function on each element (each observation) and collects the results in a list.
```{r}
genKPercentile = function(q1, q2, q3, q4){
  # build one quantile closure per requested percentile
  pct = Map(function(x) {function(y) quantile(y, x/100)}, c(q1, q2, q3, q4))
  names(pct) <- Map(function(z) paste0("pct_", z), c(q1, q2, q3, q4))
  return(pct)
}
pcts = genKPercentile(25, 75, 50, 10)
pcts$pct_50(mtcars$mpg)   # 50%; 19.2
pcts$pct_10(mtcars$mpg)   # 10%; 14.34
```
## Reduce
```{r}
Reduce(f, x, init, right = FALSE, accumulate = FALSE)
```
+ Folds a vector with a binary function, combining neighbouring elements one pair at a time (each result feeds into the next call) and returning the final value.
+ Reduce has two basic arguments: the first is a binary function, the second is the vector to fold over.
+ Arguments:
    + f: a function that takes two arguments
    + x: a vector
    + init: an initial value of the same type as the elements of x
    + right: if FALSE, elements are taken from the left end of the vector and passed to the function in turn; if TRUE, folding starts from the right end
    + accumulate: if TRUE, all intermediate results are returned; with the default (FALSE) only the final result is returned
```{r}
x = c(12,25,3,8)
Reduce(sum, x)                   # 48
# evaluation: sum(sum(sum(12,25),3),8)
Reduce(sum, x, accumulate = T)   # 12 37 40 48
```
```{r}
Reduce(sum, 1:5, 10)   # 25
# step by step:
a <- sum(10,1)
b <- sum(a,2)
c <- sum(b,3)
d <- sum(c,4)
sum(d,5)
```
## mapReduce
```{r}
mapReduce(grouping_variable, fun1, fun2, ..., data = df)
```
+ Completely different from base Map()/Reduce() above: mapReduce() (from the mapReduce package) runs the given functions within each group defined by a grouping variable.
+ A simple, direct way to compute grouped statistics, and several functions can be supplied at once.
+ The result is a matrix, much like aggregate() and sapply(), but mapReduce can run in parallel, which is its main advantage.
```
> dt
   sex age weight
1    m  27   45.6
2    f  25   55.9
3    m  40   49.0
4    f  28   59.5
5    m  38   53.5
6    f  32   48.9
7    m  36   45.9
8    f  26   53.4
9    m  32   48.4
10   f  31   54.2
```
Group by sex and compute these statistics: mean age, mean weight, SD of age, SD of weight.
```{r}
mapReduce(sex, mean(age), mean(weight), sd(age), sd(weight), data = dt)
#   [,1]  [,2]     [,3]     [,4]
# f 28.4 54.38 3.049590 3.858368
# m 34.6 48.48 5.176872 3.179151
```
# Identify item
## identical
Checks whether two objects are exactly equal.
```{r}
identical(AY$cust_id, subset(AX, train)$cust_id)
# [1] TRUE
```
## unique
Returns the distinct values (or combinations of values).
```{r}
unique(cust[, c(1:3)])   # how many distinct patterns the first three columns of cust contain
unique(cust$cust_id)     # all distinct cust_id values
```
## duplicated
Flags rows one by one: the first occurrence counts as new, and any later identical row is marked as a duplicate.
```{r}
a <- c(rep("A", 3), rep("B", 3), rep("C", 2))
b <- c(1,1,2,4,1,1,2,2)
df <- data.frame(a, b)
```
```{r}
duplicated(df)
# [1] FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE
```
```{r}
df[duplicated(df), ]
#   a b
# 2 A 1
# 6 B 1
# 8 C 2
```
```{r}
df[!duplicated(df), ]
#   a b
# 1 A 1
# 3 A 2
# 4 B 4
# 5 B 1
# 7 C 2
```
# Data Reshaping
## melt / gather (wide → long)
```{r}
melt(data = md, id.vars = c("ID","Time"))   # id.vars are the identifier columns (similar to a grouping key)
md = gather(reshape_data, key = "keyCol", value = "valCol", 3, 4)
```
## cast / spread (long → wide)
```{r}
dcast(data = md, formula = ID + Time ~ variable, value.var = "value")
spread(md, key = "keyCol", value = "valCol")
```
# case_when
Use case_when() to turn a continuous value into interval categories (typically inside dplyr::mutate()).
```{r}
case_when(Age >= 18  ~ "Adult",
          Age < 18   ~ "Child",
          is.na(Age) ~ "Unknown")
```
# switch
```{r}
# switch(branch, name1 = do_A, name2 = do_B, name3 = do_C, ...)
# the first argument selects a branch by position (number) or by name (string)
switch("first", first = 1 + 1, second = 1 + 2, third = 1 + 3)
# [1] 2
switch(3, "first", "second", "third", "fourth")
# [1] "third"
```
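switch() is also handy inside a function: when the selector is a string, a final unnamed argument serves as the default branch. A minimal sketch (a hypothetical centre() helper, not from the original note):

```{r}
# Minimal sketch: switch() inside a function, with an unnamed default branch
centre <- function(x, type) {
  switch(type,
         mean   = mean(x),
         median = median(x),
         mean(x, trim = 0.1))   # default branch when type matches no name
}
centre(mtcars$mpg, "median")    # 19.2
centre(mtcars$mpg, "other")     # falls through to the trimmed mean
```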