# GUTMICRO day 2
## 1. Using the provided snps_depth_small.txt.bz2 files, calculate what fraction of the ref genome is not present in a typical sample. How does this vary across species / species?
Code:
```
#strategy: count up 0s, count up > 0s, divide
library(data.table)
library(tidyverse)
bdata <- fread("snps_depth_small_bacteroides.txt", data.table=F, stringsAsFactors = F)
adata <- fread("snps_depth_small_alistapes.txt", data.table=F, stringsAsFactors = F)
#strategy: count up 0s, count up > 0s, divide
lapply(bdata, function(x){length(which(x==0))/length(x)})
lapply(adata, function(x){length(which(x==0))/length(x)})
#for B.vulgatus, fraction of genome not present ranges from 13% to 40% between samples.
#for A. putredinis, range is similar.
#alternate strategy: average or sum across rows, then count 0s
bdata.sum <- rowSums(bdata[,2:39]) #sum rows, write to value
bdata$total <- bdata.sum #add sum to dta
bdata.final <- bdata[c("site_id","total")] #new dataframe, just id and total
bdata.final[order(bdata.final$total),] #reorder, ascending
lapply(bdata.final, function(x){length(which(x==0))/length(x)})
adata.sum <- rowSums(adata[,2:39])
adata$total <- adata.sum
adata.final <- adata[c("site_id","total")]
lapply(adata.final, function(x){length(which(x==0))/length(x)})
#here the fraction of genome not present is 6% for B.vulg and 3% for A.putre across samples
```
## 2. Are some genes present in almost all samples?
Code:
## 3. Can you identify the “typical” coverage for each sample? By eye? By some automatic method (you can see what we did in the methods section of our PLoS Bio paper).
> Based on these raw alignments, MIDAS reports the total read coverage D for each site in the reference genome for each given sample [18]. We used this distribution of coverage across the genome to obtain a measure of the “typical” coverage, D, defined as the median of all protein coding sites with nonzero coverage. All samples with D < 5 were excluded from further analyses. Additional coverage requirements for QP samples are imposed below.
Code:
## 4. What would you do to filter out sites not “present” in a given sample? (you can see what we did in the methods section of our paper). What tradeoffs are you thinking about re: the criteria you chose?
Code:
## 5. Bonus: since individual sites are noisy, one could maybe do better by coarse-graining the coverage estimation / filter step at the gene level. Can you implement a good way to estimate the “typical” coverage of a gene and compare it to the whole genome?
Code:
## Veronika's code:
#### Data analysis and comparison
```
rm(list=ls(all=TRUE)) # clean environment
getwd()
#update.packages(ask = FALSE, dependencies = c('Suggests')) #update all installed R packages
library(data.table) # for fread function
library(RColorBrewer) # for nice color palettes
library(scales) # for points transparency on plots
library(gplots) # for heatmap.2 function
setwd("/Users/Zireael/Documents/KITP/Gutmicro_project/my_scripts") # replace with your working directory
# load data
snps<-fread("../kitp_2021_microbiome_data/snps/Bacteroides_vulgatus_57955/snps_depth_small.txt", data.table = F, stringsAsFactors = F)
snps<-fread("../kitp_2021_microbiome_data/snps/Alistipes_putredinis_61533/snps_depth_small.txt", data.table = F, stringsAsFactors = F)
rownames(snps)<-snps$site_id
snps<-snps[,-1]
str(snps)
head(snps)
# Using the provided snps_depth_small.txt.bz2 files,
# calculate what fraction of the ref genome is not present in a typical sample.
# How does this vary across species / species?
frac<-apply(snps, 2, function(x) length(which(x==0))/length(x))
median(frac)
hist(frac,breaks = 15)
mean(frac)
sd(frac)
# Are some genes present in almost all samples?
snps<-fread("../kitp_2021_microbiome_data/snps/Bacteroides_vulgatus_57955/snps_depth_small.txt",
data.table = F, stringsAsFactors = F)
genome<-fread("../kitp_2021_microbiome_data/MIDAS/midas_db/rep_genomes/Bacteroides_vulgatus_57955/genome.features",
data.table = F, stringsAsFactors = F)
rownames(snps)<-snps$site_id
snps<-snps[,-1]
rownames(snps)<-apply(as.data.frame(rownames(snps)), 1, function(x) strsplit(x,split = "\\|")[[1]][2])
rownames(snps)[nrow(snps)]
cov_all<-data.frame()
sum_all<-data.frame()
for(i in 1:nrow(genome)){
#i<-1
start<-genome$start[i]
end<-genome$end[i]
sub.cov<-snps[c(start:end),]
cov<-apply(sub.cov, 2, function(x) length(which(x>=3))) # what is the best threshold?
cov_all<-rbind(cov_all, cov)
summ<-apply(sub.cov, 2, function(x) sum(x)/length(x)) # what is the best threshold?
sum_all<-rbind(sum_all, summ)
}
rownames(cov_all)<-genome$gene_id
rownames(sum_all)<-genome$gene_id
colnames(cov_all)<-colnames(snps)
colnames(sum_all)<-colnames(snps)
gene_length<-genome$end-genome$start+1
pres<-vector()
for(i in 1:nrow(cov_all)){ # the gene is present if at least half of it's length covered x3
#i<-1
pres[i]<-length(which(cov_all[i,]>gene_length[i]/2))
#pres_all<-rbind(pres_all,pres)
}
hist(pres)
length(which(pres==38))
pres_cov<-vector()
for(i in 1:nrow(sum_all)){ # the gene is present if it's covered at least x5 on average
#i<-1
pres_cov[i]<-length(which(sum_all[i,]>=5))
}
hist(pres_cov)
length(which(pres_cov==38))
length(intersect(which(pres==38),which(pres_cov==38)))
hist(gene_length)
median(gene_length)
pres_full<-vector()
for(i in 1:nrow(cov_all)){ # the gene is present if it's every position is covered at least x3
#i<-1
pres_full[i]<-length(which(cov_all[i,]==gene_length[i]))
}
length(which(pres_full==38))
length(intersect(which(pres_cov==38),which(pres_full==38)))
sub_genome<-genome[which(pres_full==38),]
```