# R in multivariate
###### tags: `multivariate`
Hackmd note URL: https://hackmd.io/Dn0bJ72JRQ2ikWdiXvWxVw
## Create and select
vector and dataframe
``` R
# Toy data: 20 body measurements; first 10 rows male, last 10 female
chest <- c(34,37,38,36,38,43,40,38,40,41,36,36,34,33,36,37,34,36,38,35)
waist <- c(30,32,30,33,29,32,33,30,30,32,24,25,24,22,26,26,25,26,28,23)
hips  <- c(32,37,36,39,33,38,42,40,37,39,35,37,37,34,38,37,38,37,40,35)
gender <- rep(c("male", "female"), each = 10)
measure <- data.frame(chest, waist, hips, gender)
# Whitespace-separated data file with no header row
data <- read.csv("T5_5_FBEETLES.DAT", header = FALSE, sep = "")
# column names should be a character vector, not a list
colnames(data) <- c("Number", "group", "y1", "y2", "y3", "y4")
# Two equivalent ways to keep only the numeric columns of the female rows
subset(measure, gender == "female")[, 1:3]
subset(measure, gender == "female")[, c("chest", "waist", "hips")]
set.seed(1)                        # make the sampling below reproducible
sample(1:40, 6, replace = FALSE)   # 6 draws without replacement
rnorm(3, mean = 10, sd = 2)        # 3 normal deviates
factorial(3)
# confusion matrix: predicted labels vs true labels
table(data_pred, data_truth)
```
## Plot
``` R
# show
# show — X11() opens an interactive plotting window (Unix); the while loop
# below keeps a script alive until all device windows are closed.
X11()
title("Mileage / Price")
# placeholder: many plot calls accept add=T to draw onto the current plot
...(add=T)
while(names(dev.cur()) !='null device') Sys.sleep(1)
# png — redirect plotting to a file; dev.off() writes and closes it
png("q2_ab.png")
dev.off()
# Line Plot — first series with plot(type="l"), extra series via lines()
plot(mean1, type="l", col="red", ylim=c(100, 300))
lines(mean2, type="l", col="green")
# vertical reference line at the CV-chosen lasso lambda (log scale)
abline(v=log(cv.lasso$lambda.min), col="blue", lty=5.5 )
# Draftman plot (scatterplot matrix); point fill keyed by the factor in column 5
pairs(data[1:4], main="Draftman Plot of car dataset", pch=21, bg=c("red", "green3", "blue")[c(data[5])$C])
# boxplot of CP grouped by C, drawn horizontally
boxplot(CP~C, data=data, names=c("U.S.", "Japan", "Europe"), main="Mileage / Price", horizontal=T)
# contour — draw only the level curve at q1_crit
contour(devx, devy, q1, levels=c(q1_crit), label=c("Q1"), col="Blue", xlab="cr", ylab="st")
# text — mark a point and label it (adj=0: left-justified label)
points(.30, 10)
text(.30, 10 ,labels = " (.30, 10)", adj=0)
# qq — chi-squared Q-Q plot (e.g. for squared Mahalanobis distances, a
# multivariate-normality check); `dis` and `p` must exist in the workspace
par(mfrow=c(1,2))
qqplot(qchisq(ppoints(500), df=p), dis,
main = expression("Q-Q plot for" ~~ {chi^2}[nu == 2]), xlab="Chi-squared", ylab="sample", asp=1)
qqline(dis, distribution=function(x) qchisq(x, df=p), asp=1)
# histogram on the density scale with a gaussian kernel density overlay
hist(chest, prob=TRUE)
lines(density(chest, kernel=c("gaussian")))
```
## markdown
run it
`rmarkdown::render("markdown.Rmd")`
``` markdown
---
title: HW5
output: html_document
---
> 1. 123
Setup
``` {r}
data = read.csv("pilot.dat", header=F, sep="")
colnames(data) = list("group", "y1", "y2", "y3", "y4", "y5", "y6")
```
```
## test
``` R
# Per-group summary statistics for the four response columns of `data`
cols = c("y1", "y2", "y3", "y4")
p = length(cols)                      # number of variables
data1 = subset(data, group==1)[,cols] # group-1 observations only
n1 = nrow(data1)                      # group-1 sample size
mean1 = colMeans(data1)               # mean vector
var1 = var(data1)                     # sample covariance matrix
cor1 = cor(data1)                     # sample correlation matrix
```
F-test (two-sample Hotelling's T²)
``` R
# Two-sample Hotelling's T^2 with pooled covariance, converted to an F statistic
varpool_d = (var1_d * (n1 - 1) + var2_d * (n2 - 1)) / (n1 + n2 - 2)
# T^2 = (n1*n2/(n1+n2)) d' S_pool^-1 d, with d the mean-difference vector
# (a plain vector on the left of %*% is treated as a row vector)
t2_d = n1 * n2 / (n1 + n2) * (mean1_d - mean2_d) %*% solve(varpool_d) %*% (mean1_d - mean2_d)
# F = (n1+n2-p-1) / (p (n1+n2-2)) * T^2  ~  F(p, n1+n2-p-1) under H0
f_d = (n1 + n2 - p_d - 1) / p_d / (n1 + n2 - 2) * t2_d
fcrit_d = qf(0.95, p_d, n1 + n2 - p_d - 1)   # 5% critical value
```
t-test
``` R
# Two-sided t critical value at alpha = 0.05, pooled df = n1 + n2 - 2
fcrit_sum = qt(1 - 0.05 / 2, n1 + n2 - 2)
```
``` R
# Same Hotelling's T^2 test via the ICSNP package (responses ~ grouping factor)
library("ICSNP")
m1 <-with(measure, HotellingsT2(cbind(chest,waist,hips) ~ gender));
```
## Linear Regression
``` R
fit <- lm(y ~ x, data = data)
summary(fit)
anova(fit)
# NOTE(review): newdata must contain the model's predictor (here `x`);
# the `mileage` name suggests this line belongs with a different fit
predict(fit, data.frame(mileage = c(0)))
# one-hot: make C a factor so lm() builds dummy variables automatically
data$C <- factor(data$C)
# standardize columns to mean 0 / sd 1
norm_data <- scale(data[, 1:3])
# stepwise selection, forward from the intercept-only model
forward_aic <- step(lm(M ~ 1, data), ~ W + D + C, data = data, direction = "forward")
forward_bic <- step(lm(M ~ 1, data), ~ W + D + C, data = data, direction = "forward", k = log(n))  # k = log(n) -> BIC
backward_aic <- step(fit, data = data, direction = "backward")
backward_bic <- step(fit, data = data, direction = "backward", k = log(n))
# output all subsets
library(leaps)
fit <- regsubsets(M ~ ., data = data, intercept = TRUE, method = "exhaustive")
summary(fit)
# Set dummy variables explicitly (no intercept column)
norm_data <- model.matrix(~ . - 1, data)
## Lasso and ridge (alpha = 1 -> lasso, alpha = 0 -> ridge)
library(glmnet)
lambdas <- 10^seq(2, -4, by = -.1)
fit_lasso <- cv.glmnet(as.matrix(norm_data[, 2:6]), as.matrix(norm_data[, 1]), alpha = 1, lambda = lambdas)
fit_ridge <- cv.glmnet(as.matrix(norm_data[, 2:6]), as.matrix(norm_data[, 1]), alpha = 0, lambda = lambdas)
# predict at the CV-optimal lambda
test_x <- as.matrix(norm_data[, 2:6])
test_y <- as.matrix(norm_data[, 1])
pred_y <- predict(fit_lasso, s = fit_lasso$lambda.min, newx = test_x)
# R2 = 1 - SSE/SST (the original computed SSE/SST, which is 1 - R2;
# fixed so it is comparable with dev.ratio on the next line)
1 - sum((test_y - pred_y)^2) / sum(var(test_y) * (n - 1))
fit_lasso$glmnet.fit$dev.ratio[which(fit_lasso$glmnet.fit$lambda == fit_lasso$lambda.min)]
## Lars
library(lars)
fit_lar <- cv.lars(as.matrix(norm_data[, 2:6]), as.matrix(norm_data[, 1]), type = "lar")
coef(fit_lar)
# fitted values after 2 LAR steps
predict(fit_lar, norm_data[, 2:6], 2, mode = "step")$fit
```
By hand
``` R
# Forward stepwise selection by hand, minimizing AIC. Each pass adds the
# FIRST candidate that improves the criterion (greedy-first, as original).
fit <- lm(M ~ 1, data)
avail_set <- c("W", "D", "C")
best_set <- c()
# best_r <- summary(fit)$adj.r.squared   # alternative criterion (maximize)
best_r <- AIC(fit)
repeat {
if (length(avail_set) == 0) break
ok <- FALSE
for (i in seq_along(avail_set)) {
fit <- lm(paste("M ~", paste(c(best_set, avail_set[i]), collapse="+")), data)
# now_r <- summary(fit)$adj.r.squared
now_r <- AIC(fit)
if (now_r < best_r) {
ok <- TRUE
best_set <- c(best_set, avail_set[i])
# BUG FIX: drop the variable that was just added (index i) —
# the original `avail_set[-1]` always removed the first candidate
avail_set <- avail_set[-i]
best_r <- now_r
break
}
}
if (!ok)
break
}
```
Other packages
``` R
# alr3 will contains "lack of fit"
# alr3 adds a pure-error / lack-of-fit decomposition to the ANOVA table
library("alr3")
pureErrorAnova(fit)
# forward selection driven by adjusted R^2 via SignifReg
library("SignifReg")
fit_adj = add1SignifReg(lm(M ~ 1, data), criterion="r-adj")
```
## SVD and pca and FA
``` R
# SVD: data = U D V'
s <- svd(data)
# Reconstruct the original matrix. The original note used
# v %*% diag(d) %*% t(u), which rebuilds t(data), not data.
restore_data <- s$u %*% diag(s$d) %*% t(s$v)
# Project the standardized data onto the right singular vectors
# NOTE(review): s comes from svd(data), not svd(scale(data)) — confirm
# which matrix the PC scores are meant to be based on
pca_data_by_svd <- t(t(s$v) %*% t(scale(data)))
```
``` R
# PCA on the correlation matrix (cor=TRUE standardizes the variables)
pc_data = princomp(data ,cor=TRUE)
summary(pc_data, loadings=TRUE)   # variance explained + loadings table
# plot(pc_data)                        # variance bar plot
# screeplot(pc_data, type="lines")     # scree plot
# biplot(pc_data, col=c("red","green"))
# Scores recomputed by hand: standardized data times loadings.
# NOTE(review): princomp standardizes with divisor n while scale() uses
# n-1, so this differs slightly from pc_data$scores — confirm intent.
score = scale(data) %*% pc_data$loadings
var(score)   # component variances; off-diagonals ~ 0 (uncorrelated)
```
``` R
library(psych)
# Principal-component factor analysis of a correlation matrix, 2 factors,
# under three rotation choices
fa_data1 <- principal(data_cor, nfactors = 2, rotate = 'none')
fa_data2 <- principal(data_cor, nfactors = 2, rotate = 'varimax')
fa_data3 <- principal(data_cor, nfactors = 2, rotate = 'promax')
# Original had `cutoff=.` (incomplete, a syntax-level error at run time);
# .3 is the conventional loading-display cutoff — adjust if another was meant
print(fa_data1$loading, cutoff = .3)
# Principal-axis factoring, 3 factors, varimax rotation
FA_1 <- fa(dat[, c(2:7)], nfactors = 3, rotate = "varimax", fm = "pa", max.iter = 1)
library(rela)
paf1 <- paf(as.matrix(data), eigcrit = 1, convcrit = 0.0001)
library(psy)
# Maximum-likelihood factor analysis, 1 factor, regression scores
fit1 <- factanal(data, 1, scores = "regression", rotation = "none")
scree.plot(fit1$correlation)
abline(h = 1, col = "red")   # Kaiser criterion: keep eigenvalues > 1
```
## Canonical correlation
``` R
# Canonical correlation analysis by hand from the block correlation matrices
r11 <- cor(X)
r22 <- cor(Y)
r12 <- cor(X, Y)
r21 <- cor(Y, X)
# E1/E2: eigenvalues are the squared canonical correlations
E1 <- solve(r11) %*% r12 %*% solve(r22) %*%r21
E2 <- solve(r22) %*% r21 %*% solve(r11) %*%r12
# X and Y's coeffs
e1 <- eigen(E1)
e2 <- eigen(E2)
# result vector — keep the real part of the first two eigenvectors
(v1 = Re(e1$vectors)[, 1:2])
(v2 = Re(e2$vectors)[, 1:2])
canon_X <- as.matrix(X) %*% v1   # canonical variates of X
canon_Y <- as.matrix(Y) %*% v2   # canonical variates of Y
# correlation between the paired canonical variates
diag(cor(canon_X, canon_Y))
```
``` R
# Canonical correlation via the CCA package
library(CCA)
result <-cc(X,Y)
barplot(result$cor)   # canonical correlations per dimension
plt.cc(result)        # canonical variate plots
result
```
chi-test of correlation
``` R
n <- nrow(data)
dimx <- ncol(X)
dimy <- ncol(Y)
# Bartlett's test statistic as given by Equation 10.3:
#   -(n - (p + q + 3)/2) * log(Lambda),  Lambda = prod(1 - r_i^2).
# The canonical correlations must be SQUARED inside the log; the
# original note had log(1 - result$cor), omitting the square.
test.stat <- -( n - 0.5 * (dimx + dimy + 3) ) * sum(log(1 - result$cor^2))
test.stat
# Computing p-value: chi-squared with p*q degrees of freedom
P.value <- pchisq(test.stat, df = dimx * dimy, lower.tail = FALSE)
P.value
```
## cluster
### Tree
``` R
# Hierarchical clustering of the built-in mtcars data
# (fixed typo: `mtcears` -> `mtcars`)
d <- dist(mtcars)
# method: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA),
# "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
dc <- hclust(d, method = "complete")
plot(dc)                  # dendrogram
rect.hclust(dc, k = 3)    # outline a 3-cluster cut
```
### Kmeans
``` R
# Two Gaussian clouds (50 points each) clustered with k-means, k = 2
cloud_a <- matrix(rnorm(100, sd = 0.3), ncol = 2)
cloud_b <- matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2)
x <- rbind(cloud_a, cloud_b)
colnames(x) <- c("x", "y")
cl <- kmeans(x, 2)
# points coloured by assigned cluster; centres marked with stars
plot(x, col = cl$cluster)
points(cl$centers, col = 1:2, pch = 8, cex = 2)
```
### knn
``` R
# k-nearest-neighbour classification on two synthetic Gaussian clusters
x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
           matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(x) <- c("x", "y")
# 45 training rows and 5 test rows from each cluster
data_train <- rbind(x[1:45, ], x[51:95, ])
data_test <- rbind(x[46:50, ], x[96:100, ])
# library() fails loudly if the package is missing; require() only warns
library("class")
knn(data_train, data_test, cl = rep(c("male", "female"), each = 45), k = 2)
```
### Fisher LDA
``` R
library(MASS)
# Linear discriminant analysis: class ~ predictors, with equal priors
m1 <- lda(live ~ smoke + weight + gender, data, prior = c(0.5, 0.5))
# prior length = number of classes in live
m1$scaling                          # discriminant coefficients
p = predict(m1, method="plug-in")   # class, posterior, and LD scores
```
Manually
``` R
# Fisher's two-class LDA computed by hand from class matrices t1, t2
m1 <- colMeans(t1)                  # class mean vectors
m2 <- colMeans(t2)
d1 <- t(t(t1) - m1)                 # centre each class at its own mean
d2 <- t(t(t2) - m2)
sw <- t(d1) %*% d1 + t(d2) %*% d2   # within-class scatter matrix
# Between-class scatter is the outer product of the MEAN DIFFERENCE.
# The original `m1 %*% t(m2)` is not a scatter matrix.
sb <- (m1 - m2) %*% t(m1 - m2)
# discriminant direction: w proportional to Sw^-1 (m1 - m2)
w <- solve(sw) %*% (m1 - m2)
```
## MDS
``` R
# Classical MDS by hand: double-centre the squared distance matrix,
# then embed with the top-2 eigenvectors.
dist.MNZ <-as.dist(MNZ)
# NOTE(review): this line uses NZroads while the rest uses MNZ —
# presumably the same 13x13 road-distance matrix; confirm they match.
A=-0.5*as.matrix(NZroads)^2
# centring matrix H = I - (1/n) 11'  (n is hard-coded to 13 here)
H=diag(13)-1/13*matrix(1,13,13)
B=H %*% A %*% H                 # doubly-centred Gram matrix
eig<-eigen(B)
Q=as.matrix(eig$vectors)
# 2-D coordinates: top-2 eigenvectors scaled by sqrt of their eigenvalues
X=Q[,1:2]%*% sqrt(diag(eig$values[1:2]))
```
``` R
# Apply classic MDS
library(MASS)
# Apply classic MDS
library(MASS)
fit <-cmdscale(dist.MNZ,eig=TRUE, k=2) # k is the number of output dimensions
# non-metric (Kruskal) MDS in 2 dimensions, from the MASS package
fit_iso = isoMDS(dist.MNZ, k=2)
# calculate stress: normalized RMS mismatch between the input
# dissimilarities and the distances in the embedding
fit_stress = sqrt(sum((dist.MNZ - dist(fit$points)) ^ 2) / sum(dist.MNZ ^ 2))
```
plot
``` R
# plot dist to points
# plot dist to points — label each embedded point with its row name
x <-fit$points[,1]
y <-fit$points[,2]
# type="n" draws an empty frame; labels are added with text() below
plot(x, y, xlab="Coordinate 1", ylab="Coordinate 2",main="Metric MDS", type="n")
text(x, y, labels = row.names(MNZ), cex=1)
# plot shepard diagram: observed dissimilarity vs fitted distance
shep = Shepard(dist.MNZ, fit$points)
plot(shep,
pch = ".",
xlab = "Dissimilarity",
ylab = "Distance",
xlim = range(shep$x),
# NOTE(review): ylim reuses shep$x, presumably so both axes share a
# range — confirm shep$y was not intended
ylim = range(shep$x))
lines(shep$x, shep$yf, type = "S")  # monotone-regression step function
# NOTE(review): abline(1, 1) draws intercept 1, slope 1 —
# abline(0, 1) may be intended for the y = x reference line
abline(1, 1, col=2)
```
## Correspondence Analysis
``` R
library("FactoMineR")
# The fviz_* / get_* helpers below come from factoextra, which the
# original note never loaded
library("factoextra")
# Correspondence analysis of the housetasks contingency table
fit <- CA(housetasks, graph = FALSE)
print(fit)
# plot all: joint row/column biplot
fviz_ca_biplot(fit)
# scree plot of the inertia explained per dimension
fviz_screeplot(fit, addlabels = TRUE, ylim = c(0, 50))
fviz_eig(fit)
get_eigenvalue(fit)
# row / column coordinates, contributions and cos2
get_ca_row(fit)
get_ca_col(fit)
fviz_ca_row(fit)
fviz_ca_col(fit)
```
## MANOVA
``` R
# One-way MANOVA: two responses against the tree-number factor
root$Tree.Number <- as.factor(root$Tree.Number)
root.manova <- manova(cbind(root$Trunk.Girth, root$Ext.Growth) ~ Tree.Number,
data = root)
summary(root.manova, test='Wilks')
# test = c("Pillai", "Wilks", "Hotelling-Lawley", "Roy")
```
## SVM
``` R
# e1071 provides svm(); svmfit and dat come from an earlier fit (not shown)
library(e1071)
# decision-region plot of a fitted svm model over the training data
plot(svmfit, dat)
```
## TODO
* decision tree