tags: R Data Processing 資料前處理

R 常用技巧

Other Reference:

  1. Wrangling (Rpubs Note)
  2. Visualization
  3. Visualization (Rpubs Note)
  4. Text Mining (Preprocessing)
  5. Text Mining (Chinese)
  6. PCA 主成分分析 (Principal Component Analysis)

Package, initialization

pacman::p_load()
rm(list = ls()); gc()

Encode

a = cbind(df$area, df$product_name)
Encoding(a) = 'UTF-8' # 只能放vector

Deal with NA

符號 定義
NA missing or undefined data
NULL empty object (e.g. null/empty lists)
Inf and -Inf positive and negative infinity
NaN results that cannot be reasonably defined
  • 刪除NA
df = na.omit(df)
# equivalent: total %>% na.omit()

mean(data, na.rm = T)
a = airquality[, colSums(is.na(airquality))==0] 
# 先找出誰是NA,再計算每個變數的NA總值來確認誰沒有NA,以此去除NA
  • 跳過NA
mean(data, na.action="na.pass")
  • 把NA變o
mx[is.na(mx)] = 0
  • 將NA變為一個類別
Reduce(rbind, Map(function(x) read.csv(x, na.strings = '.',
    stringsAsFactors = F)[, -1], c('cars1.csv','cars2.csv')))
# 創造一個functionMap,一次讀取cars1cars2兩個資料集(變成一個list)
# 並將值為"."的替換成NA
# 把這個list用rbind和reduce合在一起

Date Format

as.Date("2018-10-10", format = "%Y-%m-%d")
as.character(r$BOARD_DATE) %>% as.Date(.,"%Y%m%d")
as.Date(rail_df$BOARD_DATE %>% as.character(), format = '%Y%m%d')

Data Frame Type

  • data.frame
    • 預覽全部資料
    • data.frame會將字串資料轉換成factor
  • data.table
    • data.table不會
    • 沒有rowname
    • 語法使用list()
dt[, list(A, C)]
# equivalent: df[, c(1,3)]
  • tibble
    • 方便查詢(可以直接預覽前10筆)
    • 處理快速

Read & Write

  • 套件介紹

    • library(foreign) 可以處理非csv的檔案,如sas
    • library(XML)
    • library(DBI) 讀取關聯式資料庫
    • library(RMySQL) 讀取SQL資料庫
  • Read Data

    • fread (速度最快/大量處理)
      • csv
      • library(data.table)
    • read_csv (速度中等)
      • csv, xlsx, xls
      • library(readr)
    • read.csv (速度最慢)
      • csv
    • read_file
      • txt
      • library(readr)
    • read.table
      • txt
library(data.table)
fread("TaiwanRailway.csv", sep = '\t', encoding="big5")

read.table("TaiwanRailway.csv", header = T, stringsAsFactors = F)

xmlToDataFrame("Desktop/A_lvr_land_A.XML")
  • Write R.data
save(想儲存的變數名稱, file = "../final.rdata") # 將指定的環境變數保存
save.image() # save everthing in current environment

load("../final.rdata")
  • Write Data
    • fwrite
    • write.csv
    • saveXML
    • write.foreign
fwrite("TaiwanRailway.csv")
write.csv("TaiwanRailway.csv")
saveXML(xml, "test.xml")
write.foreign()

Function

predtr = predict(mod4, tr)

MAE = function(n,y,y_hat) { 
  return( (1/n) * sum(abs(y-y_hat)) )
  }

MAE(nrow(tr), tr$Salary, predtr)  # 223.9275
# (1/nrow(tr))*sum(abs(tr$Salary-predtr)) # 223.9275

排序

  • sort
    • 回傳排序後(A-F)本身實際值
x = c("D","A","C","F","B","E")
sort(x, decreasing = F)

[1] "A" "B" "C" "D" "E" "F"

  • rank
    • 回傳排序後(A-F)的排名, 依照排序前的順序打印
rank(x)

[1] 4 1 3 6 2 5
D: rank 4, A: rank 1

  • oder
    • 回傳排序後(A-F)在排序前的索引
order(x, decreasing = F)

[1] 2 5 3 1 6 4
A: index[2], B: index[5]

  • arrange (check Dplyr)
    • 只能對data.frame做

Dplyr

Join

  • merge
  • left_join
  • right_join
  • inner_join
  • all_join
  • semi_join

SQL

sqldf::sqldf("select * from A")
where # filter的概念
group by # group_by
is not null # 非NA值
order by x limit 5 # order,照x排列取五個

apply 系列

1. apply

apply(data, 1, FUN)
# margin: 1 means "row"; 2 means "column"

2. lapply

  • 取代迴圈
  • 回傳list
lapply(data, FUN)
lapply(c(sum,mean,prob), FUN = function(f) f(data))

3. sapply

  • 取代迴圈
  • 回傳matrix
sapply(data, FUN)

4. tapply

  • 對類別做function計算
tapply(data, INDEX = iris$Species, FUN = distinct_counts)
# INDEX: 分類
# tapply() 函數是融入 table() 函數功能的形式。

aggregate

aggregate(price ~ cut + color, data = diamonds, mean)
# 根據某個cut和color進行分群,求price平均數
aggregate(x=mtcars$mpg, by=list("cyl"= mtcars$cyl), FUN=mean)
#   cyl  x
# 1 4   26.66364
# 2 6   19.74286
# 3 8   15.10000

by

by(data=mtcars$mpg, INDICES=list("cyl"= mtcars$cyl), FUN=mean)
# cyl: 4
# [1] 26.66364
# cyl: 6
# [1] 19.74286
# cyl: 8
# [1] 15.1

Map Reduce

Map

Map(FUN, df$x) 

與lapply頗像,讓完整的df中的不同obs.分別執行同樣的function,再一起存入一個list

genKPercentile = function(q1, q2, q3, q4){
  pct = Map(function(x) {function(y) quantile(y, x/100)}, c(q1, q2, q3, q4))
  names(pct) <- Map(function(z) paste0("pct_", z), c(q1, q2, q3, q4))
  return(pct)
}

pcts = genKPercentile(25,75,50,10)
pcts$pct_50(mtcars$mpg)  # 50%; 19.2
pcts$pct_10(mtcars$mpg)  # 10%; 14.34

Reduce

Reduce(f, x, init, right = FALSE, accumulate = FALSE)
  • 將一個向量按相鄰的兩個元素依次作二元函数運算(一層包一層的疊加),最后输出结果。
  • Reduce的基本參數有兩個,參數1是二元函数,参數2是運算向量
  • 參數註解:
    • f: 有兩個參數的函数对象
    • x: 向量
    • init: 一個標準值,類型和x向量的元素相同
    • right: 當值為FALSE時,從向量的左側開始,依次向右側取出元素傳给函数;當值為TRUE時,則從向量的右側開始
    • accumulate:當值為TRUE時,會呈現逐步運算的结果,若為預設值,以上運算都只會输出最终结果
x = c(12,25,3,8)
Reduce(sum, x)  # 48  # 運算過程:sum(sum(sum(12,25),3),8)
Reduce(sum, x, accumulate = T)  # 37 40 48
Reduce(sum,1:5,10)  # 25
# 運算過程如下:
a <- sum(10,1)
b <- sum(a,2)
c <- sum(b,3)
d <- sum(c,4)
sum(d,5)

mapReduce

mapReduce(分組變量, 指定函數, data = 數據框名)
  • 與前面说的Map和Reduce功能完全不同,mapReduce提供了按分组變量運行指定函数的方式
  • 實現分組統計的一種簡便、直接的方式,且能同時指定多個函數
  • 回傳結果為一矩陣,同aggregate和sapply功能,但mapReduce可以實現平行運算,是為其優點
> dt
   sex   age   weight
1    m    27    45.6
2    f    25    55.9
3    m    40    49.0
4    f    28    59.5
5    m    38    53.5
6    f    32    48.9
7    m    36    45.9
8    f    26    53.4
9    m    32    48.4
10   f    31    54.2

按性別分組,計算統計指標:平均年齡,平均體重,年齡標準差,體重標準差

mapReduce(sex, mean(age),mean(weight),sd(age),sd(weight), data = dt)

#      [,1]     [,2]        [,3]        [,4]
# f    28.4    54.38    3.049590    3.858368
# m    34.6    48.48    5.176872    3.179151

Identify item

identical

辨認兩者是否完全相等

identical(AY$cust_id, subset(AX, train)$cust_id)
# [1] TRUE

unique

找出不重複的值(組合)

unique(cust[,c(1:3)]) #看cust第一到第三的欄位共有幾種pattern
unique(cust$cust_id) #找不重複的所有cust_id

duplicated

一個個判斷是否為重複,第一次出現是新值,第二次再出現一樣的即為重複。

a <- c(rep("A", 3), rep("B", 3), rep("C",2))
b <- c(1,1,2,4,1,1,2,2)
df <-data.frame(a,b)
duplicated(df)
# [1] FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE
df[duplicated(df), ]
#   a b
# 2 A 1
# 6 B 1
# 8 C 2
df[!duplicated(df), ]
#   a b
# 1 A 1
# 3 A 2
# 4 B 4
# 5 B 1
# 7 C 2

Data Reshaping

melt/gather(寬的→長的)

melt(data=md, id.vars=c("ID","Time"))
# id.vars即uid, 類似group_by

md = gather(reshape_data, key="ketCol", value="valCol", 3, 4)

cast/speard(長的→寬的)

dcast(data=md, formula= ID + Time ~ variable, value.var="value")

spread(reshape_data, key="ketCol", value="valCol", 3, 4)

case_when

利用case_when()函數將連續值變為區間類別

case_when(Age>=18 ~ "Adult",
    Age<18 ~ "Child",
    is.na(Age) ~ "Unknown")

switch

switch(指定執行第幾行/哪個名稱的程式碼, 
       第一行:做A, 
       第二行:做B,
       第三行:做C,...)
switch("first", first = 1 + 1, second = 1 + 2, third = 1 + 3)
# [1] 2
switch(3, "first", "second", "third", "fourth")
# [1] "third"