# GUTMICRO day 1
## 1. Use metadata file and species module output to play around w/ species abundance distributions w/in and across hosts. E.g. you could try to calculate:
### - [ ] How many species have coverage >= 10x in average person? (repeat for 1x, 20x, ...) What genera do these species tend to come from?
Code:
```
library(tidyverse)
library(ggplot2)
library(ggpubr)
theme_set(theme_pubr())
file <- read_tsv("species_prevalence.txt", col_names=T)
dat <- as_tibble(file)
tens <- filter(arrange(dat, mean_coverage), mean_coverage >= 10)
ones <- filter(arrange(dat, mean_coverage), mean_coverage >=1)
twens <- filter(arrange(dat, mean_coverage), mean_coverage >=20)
#total number of species
length(dat$species_id)
#species with mean_coverage >=10
tens$species_id
length(tens$species_id)
(length(tens$species_id)/length(dat$species_id))*100
p <- ggplot(tens, aes(x = reorder(species_id, mean_coverage), y = mean_coverage)) +
geom_bar(fill = "#0073C2FF", stat = "identity") +
geom_text(aes(label = mean_coverage), angle = 90) +
ggtitle("Species with coverage >=10") +
xlab("Species") + ylab("Mean coverage") +
theme_pubclean()
p + theme(axis.text.x = element_text(angle = 90))
#species with mean_coverage >=1
ones$species_id
length(ones$species_id)
(length(ones$species_id)/length(dat$species_id))*100
q <- ggplot(ones, aes(x = reorder(species_id, mean_coverage), y = mean_coverage)) +
geom_bar(fill = "#0073C2FF", stat = "identity") +
geom_text(aes(label = mean_coverage), angle = 90) +
ggtitle("Species with coverage >=1") +
xlab("Species") + ylab("Mean coverage") +
theme_pubclean()
q + theme(axis.text.x = element_text(angle = 90))
#species with mean_coverage >=20
twens$species_id
length(twens$species_id)
(length(twens$species_id)/length(dat$species_id))*100
r <- ggplot(twens, aes(x = reorder(species_id, mean_coverage), y = mean_coverage)) +
geom_bar(fill = "#0073C2FF", stat = "identity") +
geom_text(aes(label = mean_coverage), angle = 90) +
ggtitle("Species with coverage >=20") +
xlab("Species") + ylab("Mean coverage") +
theme_pubclean()
r + theme(axis.text.x = element_text(angle = 90))
```
### - [ ] How many of these species does an average pair of people share? What about samples from the same person over time?
Code:
setwd("/Users/Zireael/Documents/KITP/Gutmicro_project/my_scripts") # replace with your working directory
#load data
meta<-fread("../kitp_2021_microbiome_data/sample_metadata.txt", data.table = F, stringsAsFactors = F)
abundance<-fread("../kitp_2021_microbiome_data/species/relative_abundance.txt", data.table = F, stringsAsFactors = F)
coverage<-fread("../kitp_2021_microbiome_data/species/species_prevalence.txt", data.table = F, stringsAsFactors = F)
species_subset<-coverage$species_id[which(coverage$mean_coverage>=10)]
#clean colnames
colnames(abundance)[2:470]<-gsub("c","",colnames(abundance)[2:470])
#check that abundance is fine
colSums(abundance[,2:ncol(abundance)])
#how many subjects
length(table(meta$`Subject/TwinID`))
#subset 10x species
abundance<-abundance[which(abundance$species_id%in%species_subset),]
#how many species are shared between 2 time points of the same person?
paired_meta<-meta[which(meta$`Subject/TwinID`%in%names(table(meta$`Subject/TwinID`))[which(table(meta$`Subject/TwinID`)>=2)]),]
size<-length(unique(paired_meta$`Subject/TwinID`))
within_person<-matrix(0, 1,size)
colnames(within_person)<-unique(paired_meta$`Subject/TwinID`)
k<-1
for (subj in unique(paired_meta$`Subject/TwinID`)){
#subj<-"158458797"
ids<-paired_meta$SampleID[which(paired_meta$`Subject/TwinID`==subj)]
sub_abund<-abundance[,which(colnames(abundance)%in%ids)]
if(ncol(sub_abund)==2){ # for strict pairs
within_person[1,k]<-length(intersect(which(sub_abund[,1]>0),which(sub_abund[,2]>0)))
}else{ # when we have more than 2 time points
within_person[1,k]<-length(intersect(which(sub_abund[,1]>0),which(sub_abund[,2]>0)))
}
k<-k+1
}
mean(within_person)
#how many species average 2 people share?
size<-length(unique(meta$`Subject/TwinID`))
between_person<-matrix(0, size,size)
dim(between_person)
people<-unique(meta$`Subject/TwinID`)
for(i in 1:(length(people)-1)){
#i<-1
subj1<-people[i]
subj1_id<-meta$SampleID[which(meta$`Subject/TwinID`==subj1)[1]]
for(j in ((i+1):length(people))){
#j<-289
subj2<-people[j]
subj2_id<-meta$SampleID[which(meta$`Subject/TwinID`==subj2)[1]]
sub_abund<-abundance[,which(colnames(abundance)%in%c(subj1_id, subj2_id))]
between_person[i,j]<-length(intersect(which(sub_abund[,1]>0),which(sub_abund[,2]>0)))
}
}
bp<-as.dist(t(between_person))
mean(as.vector(bp))
plot(density(within_person), col="royal blue")
lines(density(as.vector(bp), bw = 0.7), col="red")
wilcox.test(within_person, as.vector(bp))
t.test(within_person, as.vector(bp), alternative = "two.sided")
### - [ ] Are there particular species that are present and abundant in a large number of hosts?
Code:
library(dplyr) ##load dplyr
data<-read.csv("relative_abundance.csv", header=TRUE, check.names = FALSE) #read in data, I converted the file to a csv as r would not recognize the tab delimination when I used read.table()
df <- as.data.frame(data)
df[,-1] = (df[,-1] !=0)*1 # change everything over 0 to 1
df.sum<-rowSums(df[,2:922]) #sum rows, write to value
df.new <- df #new dataframe
df.new$total <-df.sum #add total values to new dataframe
df.final <-df.new[c("species_id","total")] #new dataframe with just the names and total count
df.final[order(-df.final$total),] #reorder, descending
---
## 2. Try running MIDAS on the example reads provided (OPTIONAL, but recommended)
### - [ ] Figure out how many species are “present” (>= 3x coverage) in the example sample.
Code:
### - [ ] Run the MIDAS SNPs module for these species using the example code provided. (Note how long it takes per million reads)
Code:
### - [ ] Download the IGV genome viewer and look at the resulting .bam file (example_output/snps/temp/genomes.bam)to get a firsthand sense of how messy the alignments are (we’ll start w/ this in our next meeting).
Code: