Lab 8 - HackMD

# Lab 8 'cp /courses/bi278/Course_Materials/lab_08/multivariate_data.txt ~/lab_08' to copy the data file so it can be loaded into R. x<-c("dendextend", "ggplot2", "vegan", "reshape2", "ggrepel") lapply(x, require, character.only = TRUE) to launch the necessary libraries. cdata <- read.table("lab_08", h=T) to read in the data (the file is called "lab_08" because I changed its name when copying it). head(cdata) to preview the data. rownames(cdata) <-cdata$sample to set the row names of cdata. 'ggplot(cdata, aes(x=gene1, y=gene2)) + geom_point() + geom_text(aes(label=sample), hjust=-.3, vjust=0)' to compare two chosen variables (gene1 and gene2) in a scatterplot. 6. Let's run a PCA on these data (must be all numerical) and save those results to a new dataframe (a data table format that R uses). cdata.pca <- prcomp(cdata[,c(3:10)], scale=T) cdata.pca.res <- as.data.frame(cdata.pca$x) Note: we have chosen to scale (~normalize) our data. 7. We can take a look at the results of the PCA here. summary(cdata.pca) Easy explanation lolz 8. Let's make a new scatterplot of these new composite variables using the PCA results we captured in the previous step: ggplot(cdata.pca.res, aes(x=PC1, y=PC2)) + geom_point() + geom_text(aes(label=rownames(cdata.pca.res)), hjust=1,vjust=- .5) It may seem lazy, but I actually think these are going to be helpful notes for me going forward. Use these to attribute taxon labels: 'cdata.pca.res$taxonomy <- cdata$taxonomy' 'ggplot(cdata.pca.res, aes(x=PC1, y=PC2, color=taxonomy)) + geom_point() + geom_text(aes(label=rownames(cdata.pca.res)), hjust=1,vjust=-.5)' Use these to see if the points are separated by the generated path grouping: path = c(rep("green",5), rep("blue",4)) cdata.pca.res$path <- path ggplot(cdata.pca.res, aes(x=PC1, y=PC2, color=path)) + geom_point() + geom_text(aes(label=rownames(cdata.pca.res)), hjust=1,vjust=-.5) To make overlapping points appear clearer, run: 'ggplot(cdata.pca.res, aes(x=PC1, y=PC2, color=path)) + geom_point() + geom_label_repel(aes(label=rownames(cdata.pca.res)))' To see how genes were reduced to these variables:
'cdata.pca$rotation' To turn the data into a dissimilarity matrix and run the NMDS ordination on it: 'cdata.nms <- metaMDS(cdata[,c(3:10)], distance="bray", k=3) cdata.nms' Examine the Shepard plot: 'stressplot(cdata.nms)' Plot using the vegan package, differentiating by groups: 'path = c(rep("green",5), rep("blue",4)) # same order as your rows' plot(cdata.nms, type="n") ordihull(cdata.nms, groups=path, draw="polygon", col="grey90", label=F) orditorp(cdata.nms, display="species", col="red") orditorp(cdata.nms, display="sites", col=c(rep("green",5),rep("blue",4)), cex=1.25) To make a Bray-Curtis distance matrix: 'd <- vegdist(cdata[,3:10], method="bray")' To run a hierarchical clustering analysis: 'cdata.clust <- hclust(d, method="ward.D2")' Visualize the dendrogram: plot(cdata.clust) Visualize the classification results thus far: cutree(cdata.clust, k=2) To convert the clustering results into a dendrogram and see it: dendro <- as.dendrogram(cdata.clust) plot(dendro, type = "rectangle", ylab = "Height") 24. We can manipulate the dendrogram further. For example, we can color the “leaves” based on the cluster identity that was shown with cutree(). dendro %>% set("leaves_pch", c(19)) %>% set("leaves_cex", c(2)) %>% set("leaves_col", as.numeric(cutree(cdata.clust, k=2)[cdata.clust$order])+1) %>% plot(type = "rectangle") Convert the data from wide to long format and check how it has been converted.
Also add the experimental variable (path) to the data: cdata <- read.table("multivariate_data.txt", h=T) path <- c(rep("green",5), rep("blue",4)) cdata$path <- path cluster.long <- melt(cluster[,1:11]) head(cluster.long) (Note to self: no object named cluster is created anywhere in these notes, so melt(cluster[,1:11]) probably should be melt(cdata[,1:11]) - confirm before re-running.) Plot into boxplots by taxonomy and by path: ggplot(cluster.long, aes(x=taxonomy, y=value, fill=taxonomy)) + geom_boxplot() + facet_wrap(.~variable, scales="free") ggplot(cluster.long, aes(x=path, y=value, fill=path)) + geom_boxplot() + facet_wrap(.~variable, scales="free") Subset the data by a chosen gene: gene2.long <- subset(cluster.long, variable=="gene2") ggplot(gene2.long, aes(x=taxonomy, y=value, fill=taxonomy)) + geom_boxplot() For a dotplot: ggplot(gene2.long, aes(x=path, y=value, fill=path)) + geom_dotplot(binaxis='y', stackdir='center')