# R 期末考
```r=
normalize <- function(x) {
return((x - min(x)) / (max(x) - min(x)))
}#正規化
data.n <- as.data.frame(lapply(data.n, normalize))
#用lapply處理每項資料回傳list
data.row <- data[,列名或數字]
#取整行
data[,列名或數字]
class.ind(data)
#把所以值變成不同種類,填入布林記得轉data.frame
set.seed #亂數種子
sample(範圍,樣本數,使否重複抽樣)
#樣本數*0.8 表示取八成(十取八)
seq(from=1, to=5, by=0.5)
#產生一個「從1開始,每次加0.5,直到5為止」的數列(vector)
cbind#列合併
rbind(a,b) #row合併
replace <- function(x) {
sub("Yes","1",x)
sub("No","0",x)
}
data.x<-as.data.frame(lappy(data.x,replace))
#把data.x的值轉成你要的樣子
library(neighbr)
fit <- neighbr::knn(train_set=train_set,
test_set=train_set[,-c(1,ncol(train_set))],
k=3,
categorical_target=NULL,
#取分類Ex."Dingnosis"
continuous_target= "mpg",
#數字預設
comparison_measure="euclidean",
return_ranked_neighbors=3,
id="car_id")
fit$test_set_scores #取得測試分數
head(data,幾筆)#顯示前幾筆資料 tail()是尾幾筆
a[order(a, decreasing=TRUE)]
# 用order(),把數列由大排到小;從小排到大,decreasing = FALSE
rm_q <- function(x) {
x[x == '?'] <- NA
return(x)
}#清除NA
as.numeric()#轉數值型
tb_1 <- function(x) {
a <- class(x)#數值型態
b <- length(unique(x))#不同值個數
c <- sum(is.na(x))#NA數
d <- paste((c*100/length(x)),"%", sep = "")#NA%
e <- length(x) - c#NonNA
f <- paste((e*100/length(x)),"%", sep = "")#NonNA%
return(c(a,b,c,d,e,f))
}
tb_2 <- function(x) {
a <- min(x, na.rm = TRUE)
b <- max(x, na.rm = TRUE)
c <- mean(x, na.rm = TRUE)#平均
d <- sd(x, na.rm = TRUE)#標準差
e <- c - 3 * d#上下三個標準差
f <- c + 3 * d
return(c(a,b,c,d,e,f))
}
tb_3 <- function(x) {
a <- summary(x)#每個欄位的「最大值」、「最小值」、「平均值」、「中位數」...
b <- paste((a*100/sum(a)),"%", sep = "")#paste接起字串
names(b) <- names(a)#給名稱
return(c(a,b))
}
quantile(marketing$X..Estimated.salary.,c(0.25, 0.75), na.rm = TRUE)
#四分衛數
marketing[,sapply(marketing, class)!="factor"] <-
as.data.frame(lapply(marketing[,sapply(marketing, class)!="factor"],rm_outlier))
#對不是factor的欄位做rm_outlier
#Sort
library(plyr)
/*
var1 var2 var3
2 3 NA 13
5 5 6 15
4 4 NA 14
3 2 10 11
1 1 8 12
*/
sort(X$var1)
/* 1 2 3 4 5 */
sort(X$var1,decreasing=TRUE)
/*5 4 3 2 1 */
sort(X$var2)
/* 6 8 10 */
sort(X$var2,na.last=TRUE)
/* 6 8 10 NA NA */
X[order(X$var1),]
/* var1 var2 var3
1 1 8 12
3 2 10 11
2 3 NA 13
4 4 NA 14
5 5 6 15
*/
library(plyr)
arrange(X,var1,var3)
/* var1 var2 var3
1 1 8 12
2 1 NA 14
3 2 10 11
4 3 NA 13
5 5 6 15
*/
match返回x在table中的位置,沒有的話返回nomatch指定的值
"%in%" <- function(x, table) match(x, table, nomatch = 0) > 0
match(c(1,3,5,9,21),-10:10)
[1] 12 14 16 20 NA
c(1,3,5,9,21) %in% -10:10
[1] TRUE TRUE TRUE TRUE FALSE
which輸入邏輯向量,返回真值所在的位置
which(c(TRUE, FALSE, TRUE, NA, FALSE, FALSE, TRUE))
[1] 1 3 7
which.max和which.min在向量中分別尋找最大值和最小值
which.max(c(1:4, 11, 0:5))
[1] 5
which.min(c(1:4, 11, 0:5))
[1] 6
cut(1:20,5)//分割成五等分
cut(1:20,c(1,5,9,13,17,20))//分割成以下區間
//縱向連接表
sqldf("select * from x union select * from y")#並集
#union all==rbind不考慮重複直接拼接
sqldf("select * from x INTERSECT select * from y") #交集
sqldf("select * from x EXCEPT select * from y") #差集
//橫向連接表
//交叉连接
x<-data.frame(id=c(1,2,3),x1=c("a","b","c"))
y<-data.frame(id=c(4,3),y1=c("d","e"))
sqldf("select * from x,y")
//內連接
merge(x,y,by = "id", all = F)//第1種
library(dplyr,warn.conflicts = F)//第2種
inner_join(x, y , by = "id")
sqldf("select * from x as a inner join y as b on a.id=b.id")//第3種
//左連接
merge(x, y, by = "id", all.x = TRUE)//第1種
left_join(x, y, by = "id")//第2種
sqldf("select * from x as a left join y as b on a.id=b.id")//第3種
//右連接
merge(x, y, by = "id", all.y = TRUE)//第1種
right_join(x, y, by = "id")//第2種
//全連接
merge(x, y, by = "id", all = TRUE)//第1種
full_join(x, y, by = "id")//第2種
//讀取excel
library(openxlsx,quietly = T)//第1種
data<-read.xlsx("./hsb2.xlsx",sheet=1)
detach("package:openxlsx")
library(xlsx)//第2種
data<-read.xlsx("./hsb2.xlsx",sheetIndex=1,colIndex=2:10,rowIndex=1:10)
SQL
sqldf("select distinct[差異元素(unique)] market from sale")
sqldf("select market,avg(sale) as mavg_sale from sale
group by market having mavg_sale<32308 order by mavg_sale")
```