# R in multivariate

###### tags: `multivariate`

Hackmd note URL: https://hackmd.io/Dn0bJ72JRQ2ikWdiXvWxVw

## Create and select vector and dataframe

``` R
chest  <- c(34,37,38,36,38,43,40,38,40,41,36,36,34,33,36,37,34,36,38,35)
waist  <- c(30,32,30,33,29,32,33,30,30,32,24,25,24,22,26,26,25,26,28,23)
hips   <- c(32,37,36,39,33,38,42,40,37,39,35,37,37,34,38,37,38,37,40,35)
gender <- rep(c("male", "female"), each=10)
measure <- data.frame(chest, waist, hips, gender)

data <- read.csv("T5_5_FBEETLES.DAT", header=FALSE, sep="")
# column names are a character vector, not a list
colnames(data) <- c("Number", "group", "y1", "y2", "y3", "y4")

subset(measure, gender == "female")[, 1:3]
subset(measure, gender == "female")[, c("chest", "waist", "hips")]

set.seed(1)
sample(1:40, 6, replace=FALSE)
rnorm(3, mean=10, sd=2)
factorial(3)

# confusion matrix
table(data_pred, data_truth)
```

## Plot

``` R
# show
X11()
title("Mileage / Price")
...(add=TRUE)   # add to an existing plot
while (names(dev.cur()) != 'null device') Sys.sleep(1)

# png
png("q2_ab.png")
dev.off()

# Line Plot
plot(mean1, type="l", col="red", ylim=c(100, 300))
lines(mean2, type="l", col="green")
# lty must be an integer 0-6 (or a name like "longdash"); 5.5 is invalid
abline(v=log(cv.lasso$lambda.min), col="blue", lty=5)

# Draftman plot
pairs(data[1:4], main="Draftman Plot of car dataset",
      pch=21, bg=c("red", "green3", "blue")[data[[5]]])

# boxplot
boxplot(CP~C, data=data, names=c("U.S.", "Japan", "Europe"),
        main="Mileage / Price", horizontal=TRUE)

# contour
contour(devx, devy, q1, levels=c(q1_crit), label=c("Q1"),
        col="blue", xlab="cr", ylab="st")

# text
points(.30, 10)
text(.30, 10, labels = " (.30, 10)", adj=0)

# qq
par(mfrow=c(1,2))
qqplot(qchisq(ppoints(500), df=p), dis,
       main = expression("Q-Q plot for" ~~ {chi^2}[nu == 2]),
       xlab="Chi-squared", ylab="sample", asp=1)
qqline(dis, distribution=function(x) qchisq(x, df=p), asp=1)

# histogram
hist(chest, prob=TRUE)
lines(density(chest, kernel=c("gaussian")))
```

## markdown

run it `rmarkdown::render("markdown.Rmd")`

``` markdown
---
title: HW5
output: html_document
---

> 1.
123 Setup

``` {r}
data = read.csv("pilot.dat", header=F, sep="")
colnames(data) = list("group", "y1", "y2", "y3", "y4", "y5", "y6")
```
```

## test

``` R
cols = c("y1", "y2", "y3", "y4")
p = length(cols)
data1 = subset(data, group==1)[, cols]
n1 = nrow(data1)
mean1 = colMeans(data1)
var1 = var(data1)
cor1 = cor(data1)
```

f-test

``` R
# pooled covariance, Hotelling T^2, and its F conversion
varpool_d = (var1_d * (n1 - 1) + var2_d * (n2 - 1)) / (n1 + n2 - 2)
t2_d = n1 * n2 / (n1 + n2) *
  (mean1_d - mean2_d) %*% solve(varpool_d) %*% (mean1_d - mean2_d)
f_d = (n1 + n2 - p_d - 1) / p_d / (n1 + n2 - 2) * t2_d
fcrit_d = qf(0.95, p_d, n1 + n2 - p_d - 1)
```

t-test

``` R
fcrit_sum = qt(1 - 0.05 / 2, n1 + n2 - 2)
```

``` R
library("ICSNP")
m1 <- with(measure, HotellingsT2(cbind(chest, waist, hips) ~ gender))
```

## Linear Regression

``` R
fit = lm(y~x, data=data)
summary(fit)
anova(fit)
predict(fit, data.frame(mileage=c(0)))

# one-hot
data$C = factor(data$C)

# standardize
norm_data = scale(data[,1:3])

forward_aic = step(lm(M ~ 1, data), ~ W + D + C, data=data, direction="forward")
forward_bic = step(lm(M ~ 1, data), ~ W + D + C, data=data, direction="forward", k=log(n))
backward_aic = step(fit, data=data, direction="backward")
backward_bic = step(fit, data=data, direction="backward", k=log(n))

# output all sets
library(leaps)
fit = regsubsets(M ~ ., data=data, intercept=TRUE, method="exhaustive")
summary(fit)

# Set dummy
norm_data <- model.matrix( ~ .-1, data)

## Lasso and ridge
library(glmnet)
lambdas <- 10^seq(2, -4, by = -.1)
fit_lasso = cv.glmnet(as.matrix(norm_data[, 2:6]), as.matrix(norm_data[, 1]), alpha=1, lambda=lambdas)
fit_ridge = cv.glmnet(as.matrix(norm_data[, 2:6]), as.matrix(norm_data[, 1]), alpha=0, lambda=lambdas)

# predict
test_x = as.matrix(norm_data[,2:6])
test_y = as.matrix(norm_data[,1])
pred_y = predict(fit_lasso, s=fit_lasso$lambda.min, newx=test_x)

# R2
sum((test_y - pred_y) ** 2) / sum(var(test_y) * (n - 1))
fit_lasso$glmnet.fit$dev.ratio[which(fit_lasso$glmnet.fit$lambda == fit_lasso$lambda.min)]

## Lars
library(lars)
fit_lar <- cv.lars(as.matrix(norm_data[, 2:6]), as.matrix(norm_data[, 1]), type="lar")
coef(fit_lar)
predict(fit_lar, norm_data[,2:6], 2, mode="step")$fit
```

By hands

``` R
# manual forward selection by AIC
fit = lm(M ~ 1, data)
avail_set = c("W", "D", "C")
best_set = c()
# best_r = summary(fit)$adj.r.squared
best_r = AIC(fit)
repeat {
  if (length(avail_set) == 0) break
  ok = FALSE
  for (i in seq_along(avail_set)) {
    fit = lm(paste("M ~", paste(c(best_set, avail_set[i]), collapse="+")), data)
    # now_r = summary(fit)$adj.r.squared
    now_r = AIC(fit)
    if (now_r < best_r) {
      ok = TRUE
      best_set = c(best_set, avail_set[i])
      avail_set = avail_set[-i]  # drop the variable just added (was [-1]: wrong element)
      best_r = now_r
      break
    }
  }
  if (!ok) break
}
```

other package

``` R
# alr3 will contain a "lack of fit" row
library("alr3")
pureErrorAnova(fit)

# by adj
library("SignifReg")
fit_adj = add1SignifReg(lm(M ~ 1, data), criterion="r-adj")
```

## SVD and pca and FA

``` R
s = svd(data)
# svd gives data = u %*% diag(d) %*% t(v); v..u' reconstructs the TRANSPOSE
restore_data = s$u %*% diag(s$d) %*% t(s$v)
pca_data_by_svd = t(t(s$v) %*% t(scale(data)))
```

``` R
pc_data = princomp(data, cor=TRUE)
summary(pc_data, loadings=TRUE)
# plot(pc_data)
# screeplot(pc_data, type="lines")
# biplot(pc_data, col=c("red","green"))
score = scale(data) %*% pc_data$loadings
var(score)
```

``` R
library(psych)
fa_data1 = principal(data_cor, nfactors=2, rotate='none')
fa_data2 = principal(data_cor, nfactors=2, rotate='varimax')
fa_data3 = principal(data_cor, nfactors=2, rotate='promax')
print(fa_data1$loading, cutoff=.3)  # cutoff needs a number (was a bare ".")
FA_1 <- fa(dat[, c(2:7)], nfactors=3, rotate="varimax", fm="pa", max.iter = 1)

library(rela)
paf1 <- paf(as.matrix(data), eigcrit=1, convcrit=0.0001)

library(psy)
fit1 <- factanal(data, 1, scores="regression", rotation="none")
scree.plot(fit1$correlation)
abline(h=1, col="red")
```

## Canonical correlation

``` R
r11 <- cor(X)
r22 <- cor(Y)
r12 <- cor(X, Y)
r21 <- cor(Y, X)
E1 <- solve(r11) %*% r12 %*% solve(r22) %*% r21
E2 <- solve(r22) %*% r21 %*% solve(r11) %*% r12

# X and Y's coeffs
e1 <- eigen(E1)
e2 <- eigen(E2)

# result vector
(v1 = Re(e1$vectors)[, 1:2])
(v2 = Re(e2$vectors)[, 1:2])
canon_X <- as.matrix(X) %*% v1
canon_Y <- as.matrix(Y) %*% v2

# correlation
diag(cor(canon_X, canon_Y))
```

``` R
library(CCA)
result <- cc(X, Y)
barplot(result$cor)
plt.cc(result)
result
```

chi-test of correlation

``` R
n <- nrow(data)
dimx <- ncol(X)
dimy <- ncol(Y)
# Bartlett's test statistic as given by Equation 10.3
# (uses SQUARED canonical correlations: log(1 - r^2), not log(1 - r))
test.stat <- -(n - 0.5 * (dimx + dimy + 3)) * sum(log(1 - result$cor^2))
test.stat
# Computing p-value
P.value <- pchisq(test.stat, df=dimx*dimy, lower.tail=FALSE)
P.value
```

## cluster

### Tree

``` R
d = dist(mtcars)  # typo fix: was "mtcears"
# "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
dc = hclust(d, method="complete")
plot(dc)
rect.hclust(dc, k=3)
```

### Kmeans

``` R
x <- rbind(matrix(rnorm(100, sd=0.3), ncol=2),
           matrix(rnorm(100, mean=1, sd=0.3), ncol=2))
colnames(x) = c("x", "y")
cl = kmeans(x, 2)
plot(x, col=cl$cluster)
points(cl$centers, col=1:2, pch=8, cex=2)
```

### knn

``` R
x <- rbind(matrix(rnorm(100, sd=0.3), ncol=2),
           matrix(rnorm(100, mean=1, sd=0.3), ncol=2))
colnames(x) = c("x", "y")
data_train = rbind(x[1:45,], x[51:95,])
data_test = rbind(x[46:50,], x[96:100,])
require("class")
knn(data_train, data_test, cl=rep(c("male","female"), each=45), k=2)
```

### Fisher LDA

``` R
library(MASS)
# number of priors = number of classes in `live`
m1 <- lda(live ~ smoke + weight + gender, data, prior = c(0.5, 0.5))
m1$scaling
p = predict(m1, method="plug-in")
```

manually

``` R
m1 = colMeans(t1)
m2 = colMeans(t2)
d1 = t(t(t1) - m1)
d2 = t(t(t2) - m2)
sw = t(d1) %*% d1 + t(d2) %*% d2
# two-class between-scatter is (m1-m2)(m1-m2)'  (was m1 %*% t(m2))
sb = (m1 - m2) %*% t(m1 - m2)
w = solve(sw) %*% (m1 - m2)
```

## MDS

``` R
dist.MNZ <- as.dist(MNZ)
A = -0.5 * as.matrix(NZroads)^2
H = diag(13) - 1/13 * matrix(1, 13, 13)
B = H %*% A %*% H
eig <- eigen(B)
Q = as.matrix(eig$vectors)
X = Q[, 1:2] %*% sqrt(diag(eig$values[1:2]))
```

``` R
# Apply classic MDS
library(MASS)
fit <- cmdscale(dist.MNZ, eig=TRUE, k=2)  # k is # of dimensions
fit_iso = isoMDS(dist.MNZ, k=2)
# calculate stress
fit_stress = sqrt(sum((dist.MNZ - dist(fit$points))^2) / sum(dist.MNZ^2))
```

plot

``` R
# plot dist to points
x <- fit$points[,1]
y <- fit$points[,2]
plot(x, y, xlab="Coordinate 1", ylab="Coordinate 2", main="Metric MDS", type="n")
text(x, y, labels = row.names(MNZ), cex=1)

# plot shepard diagram
shep = Shepard(dist.MNZ, fit$points)
plot(shep, pch = ".", xlab = "Dissimilarity", ylab = "Distance",
     xlim = range(shep$x), ylim = range(shep$x))
lines(shep$x, shep$yf, type = "S")
abline(1, 1, col=2)
```

## Correspondence Analysis

``` R
library("FactoMineR")
library("factoextra")  # fviz_* / get_* helpers come from factoextra, not FactoMineR
fit <- CA(housetasks, graph = FALSE)
print(fit)
# plot all
fviz_ca_biplot(fit)
# scree
fviz_screeplot(fit, addlabels = TRUE, ylim = c(0, 50))
fviz_eig(fit)
get_eigenvalue(fit)
get_ca_row(fit)
get_ca_col(fit)
fviz_ca_row(fit)
fviz_ca_col(fit)
```

## MANOVA

``` R
root$Tree.Number <- as.factor(root$Tree.Number)
# refer to columns by bare name inside the formula; data= supplies them
root.manova <- manova(cbind(Trunk.Girth, Ext.Growth) ~ Tree.Number, data = root)
summary(root.manova, test='Wilks')
# test = c("Pillai", "Wilks", "Hotelling-Lawley", "Roy")
```

## SVM

``` R
library(e1071)
plot(svmfit, dat)
```

## TODO

* decision tree