# HW4
```r
# Load the training table and build the numeric expression matrix.
# Column 1 is used as the row (probe) names; per the original note, columns 1
# and 2 hold the name and description, so both are dropped from the matrix.
MLL_train <- read.delim("MLL_train.txt")
rownames(MLL_train) <- MLL_train[, 1]
# Drop every probe whose name contains "AFFX". A logical mask is used instead
# of -grep(...): when nothing matches, grep() returns integer(0) and a
# negative zero-length index would select NO rows at all.
# drop = FALSE keeps a data frame even if only one sample column remains.
MLL_train_data <- as.matrix(
  MLL_train[!grepl("AFFX", rownames(MLL_train)), -c(1, 2), drop = FALSE]
)
# Min-max normalization: linearly rescale `x` so that its minimum maps to 0
# and its maximum maps to 1.
#
#   x  numeric vector.
#
# Returns a numeric vector of the same length as `x` (names are kept).
# A constant vector has zero range; it is returned as all zeros instead of
# the 0/0 = NaN the plain formula would give. If `x` contains NA, min()/max()
# are NA and the result is all NA.
nor <- function(x) {
  min.x <- min(x)
  max.x <- max(x)
  span <- max.x - min.x
  # Guard only the well-defined zero-range case; an NA span falls through so
  # missing values propagate.
  if (!is.na(span) && span == 0) {
    return(x - min.x)
  }
  (x - min.x) / span
}
# Sample names are the column names of the expression matrix.
samplename <- colnames(MLL_train_data)
# Normalize every gene (row) to [0, 1]. apply() over rows returns one column
# per gene, so t() restores the genes-by-samples layout.
MLL_train_data <- t(apply(MLL_train_data, 1, nor))
# Class tags looked up inside the sample names.
group_name <- c("ALL", "MLL", "AML")
# P-value table: one row per class, one column per gene. It is sized from the
# data rather than a hard-coded gene count, so it always matches the matrix
# that is actually loaded.
MLL_train_data_T_TEST <- matrix(
  0,
  nrow = length(group_name),
  ncol = nrow(MLL_train_data)
)
# Placeholder kept from the original script (not filled in the code shown).
top10COND <- matrix(0, nrow = 3, ncol = 10)
# One-sided two-sample t-tests (alternative = "greater"): for every class and
# every gene, test whether the mean of the samples whose name matches the
# class tag is greater than the mean of all remaining samples.
# Row j of MLL_train_data_T_TEST holds the p-values for group_name[j].
for (j in seq_along(group_name)) {
  # The class membership mask does not depend on the gene, so it is computed
  # once per class instead of once per (class, gene) pair.
  in_group <- grepl(group_name[j], samplename)
  # seq_len() yields an empty sequence for a zero-row matrix, unlike 1:nrow().
  for (i in seq_len(nrow(MLL_train_data))) {
    MLL_train_data_T_TEST[j, i] <- t.test(
      MLL_train_data[i, in_group],
      MLL_train_data[i, !in_group],
      alternative = "greater"
    )$p.value
  }
}
# Label each p-value column with its gene name.
colnames(MLL_train_data_T_TEST) <- rownames(MLL_train_data)
# For each condition: sort the gene p-values in ascending order (sort() keeps
# the gene names), take the first 10, store them as a one-column matrix
# labelled "p-value", and write that table to CSV.
# Alternative from the original notes, to get only the ranked gene names:
#   colnames(MLL_train_data_T_TEST)[order(MLL_train_data_T_TEST[1, ])[1:10]]

# Condition 1
top10COND1 <- sort(MLL_train_data_T_TEST[1, ])[seq_len(10)]
top10COND1 <- matrix(top10COND1, ncol = 1,
                     dimnames = list(names(top10COND1), "p-value"))
write.csv(top10COND1, file = "condition1_top10.csv")

# Condition 2
top10COND2 <- sort(MLL_train_data_T_TEST[2, ])[seq_len(10)]
top10COND2 <- matrix(top10COND2, ncol = 1,
                     dimnames = list(names(top10COND2), "p-value"))
write.csv(top10COND2, file = "condition2_top10.csv")

# Condition 3
top10COND3 <- sort(MLL_train_data_T_TEST[3, ])[seq_len(10)]
top10COND3 <- matrix(top10COND3, ncol = 1,
                     dimnames = list(names(top10COND3), "p-value"))
write.csv(top10COND3, file = "condition3_top10.csv")
# SVM: 3-class problem (ALL / MLL / AML) using all genes
library(e1071)
# Samples become rows and genes become columns.
Xtest <- t(MLL_train_data)
# Class labels are derived per row from the sample names, in the row order of
# Xtest, by replacing the "<CLASS>_<digits>" prefix with the class tag. This
# does not depend on how many samples each class has or where they sit.
Ytrain <- rownames(Xtest)
Ytrain <- sub("^ALL_\\d+", "ALL", Ytrain)
Ytrain <- sub("^MLL_\\d+|^MLL__\\d+", "MLL", Ytrain)
Ytrain <- sub("^AML_\\d+", "AML", Ytrain)
Ytrain <- as.factor(Ytrain)
the_3_classes <- svm(Xtest, Ytrain)
summary(the_3_classes)
# Confusion matrix on the same samples the model was fitted on
# (rows: predicted class, columns: true class).
pred <- predict(the_3_classes, Xtest)
ans <- table(pred, Ytrain)
ans
write.csv(ans, "3CLASSES.csv")
## Three 2-class classification problems, each case using all genes
# ALL vs Other
# Labels are derived per row from the sample names, in the row order of
# Xtest: "ALL_<digits>" becomes "ALL", MLL and AML samples become "MLL+AML".
Ytrain_ALL_vs_Other <- rownames(Xtest)
Ytrain_ALL_vs_Other <- sub("^ALL_\\d+", "ALL", Ytrain_ALL_vs_Other)
Ytrain_ALL_vs_Other <- sub("^MLL_\\d+|^MLL__\\d+|^AML_\\d+", "MLL+AML",
                           Ytrain_ALL_vs_Other)
Ytrain_ALL_vs_Other <- as.factor(Ytrain_ALL_vs_Other)
ALL_vs_Other_SVM <- svm(Xtest, Ytrain_ALL_vs_Other)
summary(ALL_vs_Other_SVM)
# Confusion matrix on the samples used for fitting
# (rows: predicted class, columns: true class).
pred_ALL_vs_Other_SVM <- predict(ALL_vs_Other_SVM, Xtest)
ans <- table(pred_ALL_vs_Other_SVM, Ytrain_ALL_vs_Other)
ans
write.csv(ans, "ALL_vs_MLL+AML.csv")
# MLL vs Other
# Labels must line up with the rows of Xtest. They are therefore derived per
# row from the sample names, in row order: "MLL_<digits>" / "MLL__<digits>"
# becomes "MLL", ALL and AML samples become "ALL+AML". (Concatenating the MLL
# labels first and the remaining labels afterwards would pair labels with the
# wrong samples, because the MLL samples are not the first rows of Xtest.)
Ytrain_MLL_vs_Other <- rownames(Xtest)
Ytrain_MLL_vs_Other <- sub("^MLL_\\d+|^MLL__\\d+", "MLL", Ytrain_MLL_vs_Other)
Ytrain_MLL_vs_Other <- sub("^ALL_\\d+|^AML_\\d+", "ALL+AML",
                           Ytrain_MLL_vs_Other)
Ytrain_MLL_vs_Other <- as.factor(Ytrain_MLL_vs_Other)
MLL_vs_Other_SVM <- svm(Xtest, Ytrain_MLL_vs_Other)
summary(MLL_vs_Other_SVM)
# Confusion matrix on the samples used for fitting
# (rows: predicted class, columns: true class).
pred_MLL_vs_Other_SVM <- predict(MLL_vs_Other_SVM, Xtest)
ans <- table(pred_MLL_vs_Other_SVM, Ytrain_MLL_vs_Other)
ans
write.csv(ans, "MLL_vs_ALL+AML.csv")
# AML vs Other
# Labels must line up with the rows of Xtest. They are therefore derived per
# row from the sample names, in row order: "AML_<digits>" becomes "AML", ALL
# and MLL samples become "ALL+MLL". (Concatenating the AML labels first and
# the remaining labels afterwards would pair labels with the wrong samples,
# because the AML samples are not the first rows of Xtest.)
Ytrain_AML_vs_Other <- rownames(Xtest)
Ytrain_AML_vs_Other <- sub("^AML_\\d+", "AML", Ytrain_AML_vs_Other)
Ytrain_AML_vs_Other <- sub("^ALL_\\d+|^MLL_\\d+|^MLL__\\d+", "ALL+MLL",
                           Ytrain_AML_vs_Other)
Ytrain_AML_vs_Other <- as.factor(Ytrain_AML_vs_Other)
AML_vs_Other_SVM <- svm(Xtest, Ytrain_AML_vs_Other)
summary(AML_vs_Other_SVM)
# Confusion matrix on the samples used for fitting
# (rows: predicted class, columns: true class).
pred_AML_vs_Other_SVM <- predict(AML_vs_Other_SVM, Xtest)
ans <- table(pred_AML_vs_Other_SVM, Ytrain_AML_vs_Other)
ans
write.csv(ans, "AML_vs_ALL+MLL.csv")
# Three 2-class classification problems, each case using 10 selected genes
# ALL vs MLL+AML, using the genes listed in top10COND1
# Select the 10 genes by name and put samples in rows; drop = FALSE keeps a
# matrix even if only one gene is selected.
Xtest_con1top10 <- t(MLL_train_data[rownames(top10COND1), , drop = FALSE])
# Labels are derived per row from the sample names, in the row order of the
# feature matrix: "ALL_<digits>" becomes "ALL", the rest become "MLL+AML".
Ytrain_ALLtop10 <- rownames(Xtest_con1top10)
Ytrain_ALLtop10 <- sub("^ALL_\\d+", "ALL", Ytrain_ALLtop10)
Ytrain_ALLtop10 <- sub("^MLL_\\d+|^MLL__\\d+|^AML_\\d+", "MLL+AML",
                       Ytrain_ALLtop10)
Ytrain_ALLtop10 <- as.factor(Ytrain_ALLtop10)
ALLtop10_vs_Other_SVM <- svm(Xtest_con1top10, Ytrain_ALLtop10)
summary(ALLtop10_vs_Other_SVM)
# Confusion matrix on the samples used for fitting
# (rows: predicted class, columns: true class).
pred_ALLtop10_vs_Other_SVM <- predict(ALLtop10_vs_Other_SVM, Xtest_con1top10)
ans <- table(pred_ALLtop10_vs_Other_SVM, Ytrain_ALLtop10)
ans
write.csv(ans, "ALLtop10_vs_MLL+AML.csv")
# MLL vs ALL+AML, using the genes listed in top10COND2
# Select the 10 genes by name and put samples in rows; drop = FALSE keeps a
# matrix even if only one gene is selected.
Xtest_con2top10 <- t(MLL_train_data[rownames(top10COND2), , drop = FALSE])
# Labels must line up with the rows of the feature matrix, so they are derived
# per row from the sample names, in row order: MLL samples become "MLL", ALL
# and AML samples become "ALL+AML". (Listing the MLL labels first and the
# others afterwards would attach labels to the wrong samples, because the MLL
# samples are not the first rows.)
Ytrain_MLLtop10 <- rownames(Xtest_con2top10)
Ytrain_MLLtop10 <- sub("^MLL_\\d+|^MLL__\\d+", "MLL", Ytrain_MLLtop10)
Ytrain_MLLtop10 <- sub("^ALL_\\d+|^AML_\\d+", "ALL+AML", Ytrain_MLLtop10)
Ytrain_MLLtop10 <- as.factor(Ytrain_MLLtop10)
MLLtop10_vs_Other_SVM <- svm(Xtest_con2top10, Ytrain_MLLtop10)
summary(MLLtop10_vs_Other_SVM)
# Confusion matrix on the samples used for fitting
# (rows: predicted class, columns: true class).
pred_MLLtop10_vs_Other_SVM <- predict(MLLtop10_vs_Other_SVM, Xtest_con2top10)
ans <- table(pred_MLLtop10_vs_Other_SVM, Ytrain_MLLtop10)
ans
write.csv(ans, "MLLtop10_vs_ALL+AML.csv")
# AML vs ALL+MLL, using the genes listed in top10COND3
# Select the 10 genes by name and put samples in rows; drop = FALSE keeps a
# matrix even if only one gene is selected.
Xtest_con3top10 <- t(MLL_train_data[rownames(top10COND3), , drop = FALSE])
# Labels must line up with the rows of the feature matrix, so they are derived
# per row from the sample names, in row order: AML samples become "AML", ALL
# and MLL samples become "ALL+MLL". (Listing the AML labels first and the
# others afterwards would attach labels to the wrong samples, because the AML
# samples are not the first rows.)
Ytrain_AMLtop10 <- rownames(Xtest_con3top10)
Ytrain_AMLtop10 <- sub("^AML_\\d+", "AML", Ytrain_AMLtop10)
Ytrain_AMLtop10 <- sub("^ALL_\\d+|^MLL_\\d+|^MLL__\\d+", "ALL+MLL",
                       Ytrain_AMLtop10)
Ytrain_AMLtop10 <- as.factor(Ytrain_AMLtop10)
AMLtop10_vs_Other_SVM <- svm(Xtest_con3top10, Ytrain_AMLtop10)
summary(AMLtop10_vs_Other_SVM)
# Confusion matrix on the samples used for fitting
# (rows: predicted class, columns: true class).
pred_AMLtop10_vs_Other_SVM <- predict(AMLtop10_vs_Other_SVM, Xtest_con3top10)
ans <- table(pred_AMLtop10_vs_Other_SVM, Ytrain_AMLtop10)
ans
write.csv(ans, "AMLtop10_vs_ALL+MLL.csv")
## A 3-class classification problem using the combination of 30 selected genes
# Feature matrix: the three 10-gene matrices side by side (samples in rows).
Xtest_30gene <- cbind(Xtest_con1top10, Xtest_con2top10, Xtest_con3top10)
# Class labels are derived per row from the sample names, in the row order of
# the feature matrix, by replacing the "<CLASS>_<digits>" prefix with the
# class tag. This does not depend on hard-coded row positions.
Ytrain_30gene <- rownames(Xtest_30gene)
Ytrain_30gene <- sub("^ALL_\\d+", "ALL", Ytrain_30gene)
Ytrain_30gene <- sub("^MLL_\\d+|^MLL__\\d+", "MLL", Ytrain_30gene)
Ytrain_30gene <- sub("^AML_\\d+", "AML", Ytrain_30gene)
Ytrain_30gene <- as.factor(Ytrain_30gene)
genes30_SVM <- svm(Xtest_30gene, Ytrain_30gene)
summary(genes30_SVM)
# Confusion matrix on the samples used for fitting
# (rows: predicted class, columns: true class).
pred_30gene <- predict(genes30_SVM, Xtest_30gene)
ans <- table(pred_30gene, Ytrain_30gene)
ans
write.csv(ans, "gene30_SVM.csv")
```