# R 期末考 ```r= normalize <- function(x) { return((x - min(x)) / (max(x) - min(x))) }#正規化 data.n <- as.data.frame(lapply(data.n, normalize)) #用lapply處理每項資料回傳list data.row <- data[,列名或數字] #取整行 data[,列名或數字] class.ind(data) #把所以值變成不同種類,填入布林記得轉data.frame set.seed #亂數種子 sample(範圍,樣本數,使否重複抽樣) #樣本數*0.8 表示取八成(十取八) seq(from=1, to=5, by=0.5) #產生一個「從1開始,每次加0.5,直到5為止」的數列(vector) cbind#列合併 rbind(a,b) #row合併 replace <- function(x) { sub("Yes","1",x) sub("No","0",x) } data.x<-as.data.frame(lappy(data.x,replace)) #把data.x的值轉成你要的樣子 library(neighbr) fit <- neighbr::knn(train_set=train_set, test_set=train_set[,-c(1,ncol(train_set))], k=3, categorical_target=NULL, #取分類Ex."Dingnosis" continuous_target= "mpg", #數字預設 comparison_measure="euclidean", return_ranked_neighbors=3, id="car_id") fit$test_set_scores #取得測試分數 head(data,幾筆)#顯示前幾筆資料 tail()是尾幾筆 a[order(a, decreasing=TRUE)] # 用order(),把數列由大排到小;從小排到大,decreasing = FALSE rm_q <- function(x) { x[x == '?'] <- NA return(x) }#清除NA as.numeric()#轉數值型 tb_1 <- function(x) { a <- class(x)#數值型態 b <- length(unique(x))#不同值個數 c <- sum(is.na(x))#NA數 d <- paste((c*100/length(x)),"%", sep = "")#NA% e <- length(x) - c#NonNA f <- paste((e*100/length(x)),"%", sep = "")#NonNA% return(c(a,b,c,d,e,f)) } tb_2 <- function(x) { a <- min(x, na.rm = TRUE) b <- max(x, na.rm = TRUE) c <- mean(x, na.rm = TRUE)#平均 d <- sd(x, na.rm = TRUE)#標準差 e <- c - 3 * d#上下三個標準差 f <- c + 3 * d return(c(a,b,c,d,e,f)) } tb_3 <- function(x) { a <- summary(x)#每個欄位的「最大值」、「最小值」、「平均值」、「中位數」... b <- paste((a*100/sum(a)),"%", sep = "")#paste接起字串 names(b) <- names(a)#給名稱 return(c(a,b)) } quantile(marketing$X..Estimated.salary.,c(0.25, 0.75), na.rm = TRUE) #四分衛數 marketing[,sapply(marketing, class)!="factor"] <- as.data.frame(lapply(marketing[,sapply(marketing, class)!="factor"],rm_outlier)) #對不是factor的欄位做rm_outlier #Sort library(plyr) /* var1 var2 var3 2 3 NA 13 5 5 6 15 4 4 NA 14 3 2 10 11 1 1 8 12 */ sort(X$var1) /* 1 2 3 4 5 */ sort(X$var1,decreasing=TRUE) /*5 4 3 2 1 */ sort(X$var2) /* 6 8 10 */ sort(X$var2,na.last=TRUE) /* 6 8 10 NA NA */ X[order(X$var1),] /* var1 var2 var3 1 1 8 12 3 2 10 11 2 3 NA 13 4 4 NA 14 5 5 6 15 */ library(plyr) arrange(X,var1,var3) /* var1 var2 var3 1 1 8 12 2 1 NA 14 3 2 10 11 4 3 NA 13 5 5 6 15 */ match返回x在table中的位置,沒有的話返回nomatch指定的值 "%in%" <- function(x, table) match(x, table, nomatch = 0) > 0 match(c(1,3,5,9,21),-10:10) [1] 12 14 16 20 NA c(1,3,5,9,21) %in% -10:10 [1] TRUE TRUE TRUE TRUE FALSE which輸入邏輯向量,返回真值所在的位置 which(c(TRUE, FALSE, TRUE, NA, FALSE, FALSE, TRUE)) [1] 1 3 7 which.max和which.min在向量中分別尋找最大值和最小值 which.max(c(1:4, 11, 0:5)) [1] 5 which.min(c(1:4, 11, 0:5)) [1] 6 cut(1:20,5)//分割成五等分 cut(1:20,c(1,5,9,13,17,20))//分割成以下區間 //縱向連接表 sqldf("select * from x union select * from y")#並集 #union all==rbind不考慮重複直接拼接 sqldf("select * from x INTERSECT select * from y") #交集 sqldf("select * from x EXCEPT select * from y") #差集 //橫向連接表 //交叉连接 x<-data.frame(id=c(1,2,3),x1=c("a","b","c")) y<-data.frame(id=c(4,3),y1=c("d","e")) sqldf("select * from x,y") //內連接 merge(x,y,by = "id", all = F)//第1種 library(dplyr,warn.conflicts = F)//第2種 inner_join(x, y , by = "id") sqldf("select * from x as a inner join y as b on a.id=b.id")//第3種 //左連接 merge(x, y, by = "id", all.x = TRUE)//第1種 left_join(x, y, by = "id")//第2種 sqldf("select * from x as a left join y as b on a.id=b.id")//第3種 //右連接 merge(x, y, by = "id", all.y = TRUE)//第1種 right_join(x, y, by = "id")//第2種 //全連接 merge(x, y, by = "id", all = TRUE)//第1種 full_join(x, y, by = "id")//第2種 //讀取excel library(openxlsx,quietly = T)//第1種 data<-read.xlsx("./hsb2.xlsx",sheet=1) detach("package:openxlsx") library(xlsx)//第2種 data<-read.xlsx("./hsb2.xlsx",sheetIndex=1,colIndex=2:10,rowIndex=1:10) SQL sqldf("select distinct[差異元素(unique)] market from sale") sqldf("select market,avg(sale) as mavg_sale from sale group by market having mavg_sale<32308 order by mavg_sale") ```