# Lab 8
# Shell command (run in a terminal, not in R) to copy the data file to where R can read it:
#   cp /courses/bi278/Course_Materials/lab_08/multivariate_data.txt ~/lab_08
# Packages used in this lab.
x <- c("dendextend", "ggplot2", "vegan", "reshape2", "ggrepel")
# Launch the necessary libraries. library() stops with an error if a package
# is missing; require() would only return FALSE and let the script fail later.
invisible(lapply(x, library, character.only = TRUE))

# Read the data table. The file is called "lab_08" because I changed the name
# when copying it (see the cp command above).
# NOTE(review): if ~/lab_08 is actually a directory, point this at the file
# inside it instead -- confirm.
cdata <- read.table("lab_08", header = TRUE)
head(cdata)  # visualize the first rows of the data

# Set the row names of cdata to the sample names.
rownames(cdata) <- cdata$sample

# Compare a choice of two variables (gene1 and gene2) in a scatterplot.
ggplot(cdata, aes(x = gene1, y = gene2)) +
  geom_point() +
  geom_text(aes(label = sample), hjust = -0.3, vjust = 0)
# 6. Let's run a PCA on these data (must be all numerical) and save those
#    results to a new dataframe (a data table format that R uses).
#    Only columns 3-10 of cdata are passed to the PCA.
cdata.pca <- prcomp(cdata[, 3:10], scale. = TRUE)
cdata.pca.res <- as.data.frame(cdata.pca$x)
# Note: we have chosen to scale (~normalize) our data.

# 7. We can take a look at the results of the PCA here.
summary(cdata.pca)
# Easy explanation lolz

# 8. Let's make a new scatterplot of these new composite variables (PC1 and
#    PC2) using the PCA results we captured in the previous step.
ggplot(cdata.pca.res, aes(x = PC1, y = PC2)) +
  geom_point() +
  geom_text(aes(label = rownames(cdata.pca.res)), hjust = 1, vjust = -0.5)
# It may seem lazy but I actually think these are going to be helpful notes
# for me going forward.

# Use these to attribute taxon labels: copy the taxonomy column onto the PCA
# results and colour the points by it.
cdata.pca.res$taxonomy <- cdata$taxonomy
ggplot(cdata.pca.res, aes(x = PC1, y = PC2, color = taxonomy)) +
  geom_point() +
  geom_text(aes(label = rownames(cdata.pca.res)), hjust = 1, vjust = -0.5)

# Use these to see if the samples are separated by the generated path
# grouping. The labels are assigned by row position: the first 5 rows are
# "green" and the last 4 are "blue".
path <- c(rep("green", 5), rep("blue", 4))
cdata.pca.res$path <- path
ggplot(cdata.pca.res, aes(x = PC1, y = PC2, color = path)) +
  geom_point() +
  geom_text(aes(label = rownames(cdata.pca.res)), hjust = 1, vjust = -0.5)

# To make overlapping points appear clearer, use repelling labels instead.
ggplot(cdata.pca.res, aes(x = PC1, y = PC2, color = path)) +
  geom_point() +
  geom_label_repel(aes(label = rownames(cdata.pca.res)))

# To see how the genes were reduced to these variables.
cdata.pca$rotation
# To turn the data into a dissimilarity matrix (Bray-Curtis) and run an NMDS
# on it with k = 3 dimensions, using columns 3-10 of cdata.
cdata.nms <- metaMDS(cdata[, 3:10], distance = "bray", k = 3)
cdata.nms

# Examine the Shepard plot.
stressplot(cdata.nms)

# Plot using the vegan package, differentiating by groups.
path <- c(rep("green", 5), rep("blue", 4))  # same order as your rows
plot(cdata.nms, type = "n")
ordihull(
  cdata.nms,
  groups = path, draw = "polygon", col = "grey90", label = FALSE
)
orditorp(cdata.nms, display = "species", col = "red")
# Colour each site label by its group; path holds the same colour vector
# that was previously written out a second time here.
orditorp(cdata.nms, display = "sites", col = path, cex = 1.25)
# To make a Bray-Curtis distance matrix from columns 3-10 of cdata.
d <- vegdist(cdata[, 3:10], method = "bray")

# To run the hierarchical clustering analysis.
cdata.clust <- hclust(d, method = "ward.D2")

# Visualize the dendrogram.
plot(cdata.clust)

# Visualize classification results thus far (cluster membership for k = 2).
cutree(cdata.clust, k = 2)

# To convert the clustering results into a dendrogram and see it.
dendro <- as.dendrogram(cdata.clust)
plot(dendro, type = "rectangle", ylab = "Height")

# 24. We can manipulate the dendrogram further. For example, we can color the
#     "leaves" based on the cluster identity that was shown with cutree().
#     The cluster ids are reordered with cdata.clust$order before being used
#     as leaf colours, and + 1 shifts ids 1/2 to colour indices 2/3.
leaf_colours <- as.numeric(cutree(cdata.clust, k = 2)[cdata.clust$order]) + 1
dendro %>%
  set("leaves_pch", 19) %>%
  set("leaves_cex", 2) %>%
  set("leaves_col", leaf_colours) %>%
  plot(type = "rectangle")
# Convert the data from wide to long to see how it's been converted. Also add
# the experimental variable (path) before reshaping.
# NOTE(review): this reads "multivariate_data.txt" while the earlier read used
# "lab_08" -- confirm both names point at the same file.
cdata <- read.table("multivariate_data.txt", header = TRUE)
path <- c(rep("green", 5), rep("blue", 4))
cdata$path <- path
# Reshape columns 1-11 of cdata. The original referenced `cluster`, which is
# never defined in this script; the data frame built above is `cdata`.
cluster.long <- melt(cdata[, 1:11])
head(cluster.long)
# Plot into boxplots by taxonomy and by path, one panel per variable with
# free axis scales.
ggplot(cluster.long, aes(x = taxonomy, y = value, fill = taxonomy)) +
  geom_boxplot() +
  facet_wrap(. ~ variable, scales = "free")
ggplot(cluster.long, aes(x = path, y = value, fill = path)) +
  geom_boxplot() +
  facet_wrap(. ~ variable, scales = "free")

# Subset the data by a chosen gene (gene2 here).
gene2.long <- subset(cluster.long, variable == "gene2")
ggplot(gene2.long, aes(x = taxonomy, y = value, fill = taxonomy)) +
  geom_boxplot()

# For a dotplot:
ggplot(gene2.long, aes(x = path, y = value, fill = path)) +
  geom_dotplot(binaxis = "y", stackdir = "center")