GUTMICRO day 1

# GUTMICRO day 1 ## 1. Use metadata file and species module output to play around w/ species abundance distributions w/in and across hosts. E.g. you could try to calculate: ### - [ ] How many species have coverage >= 10x in average person? (repeat for 1x, 20x, ...) What genera do these species tend to come from? Code: ``` library(tidyverse) library(ggplot2) library(ggpubr) theme_set(theme_pubr()) file <- read_tsv("species_prevalence.txt", col_names=T) dat <- as_tibble(file) tens <- filter(arrange(dat, mean_coverage), mean_coverage >= 10) ones <- filter(arrange(dat, mean_coverage), mean_coverage >=1) twens <- filter(arrange(dat, mean_coverage), mean_coverage >=20) #total number of species length(dat$species_id) #species with mean_coverage >=10 tens$species_id length(tens$species_id) (length(tens$species_id)/length(dat$species_id))*100 p <- ggplot(tens, aes(x = reorder(species_id, mean_coverage), y = mean_coverage)) + geom_bar(fill = "#0073C2FF", stat = "identity") + geom_text(aes(label = mean_coverage), angle = 90) + ggtitle("Species with coverage >=10") + xlab("Species") + ylab("Mean coverage") + theme_pubclean() p + theme(axis.text.x = element_text(angle = 90)) #species with mean_coverage >=1 ones$species_id length(ones$species_id) (length(ones$species_id)/length(dat$species_id))*100 q <- ggplot(ones, aes(x = reorder(species_id, mean_coverage), y = mean_coverage)) + geom_bar(fill = "#0073C2FF", stat = "identity") + geom_text(aes(label = mean_coverage), angle = 90) + ggtitle("Species with coverage >=1") + xlab("Species") + ylab("Mean coverage") + theme_pubclean() q + theme(axis.text.x = element_text(angle = 90)) #species with mean_coverage >=20 twens$species_id length(twens$species_id) (length(twens$species_id)/length(dat$species_id))*100 r <- ggplot(twens, aes(x = reorder(species_id, mean_coverage), y = mean_coverage)) + geom_bar(fill = "#0073C2FF", stat = "identity") + geom_text(aes(label = mean_coverage), angle = 90) + ggtitle("Species with coverage >=20") + xlab("Species") + ylab("Mean coverage") + theme_pubclean() r + theme(axis.text.x = element_text(angle = 90)) ``` ### - [ ] How many of these species does an average pair of people share? What about samples from the same person over time? Code: setwd("/Users/Zireael/Documents/KITP/Gutmicro_project/my_scripts") # replace with your working directory #load data meta<-fread("../kitp_2021_microbiome_data/sample_metadata.txt", data.table = F, stringsAsFactors = F) abundance<-fread("../kitp_2021_microbiome_data/species/relative_abundance.txt", data.table = F, stringsAsFactors = F) coverage<-fread("../kitp_2021_microbiome_data/species/species_prevalence.txt", data.table = F, stringsAsFactors = F) species_subset<-coverage$species_id[which(coverage$mean_coverage>=10)] #clean colnames colnames(abundance)[2:470]<-gsub("c","",colnames(abundance)[2:470]) #check that abundance is fine colSums(abundance[,2:ncol(abundance)]) #how many subjects length(table(meta$`Subject/TwinID`)) #subset 10x species abundance<-abundance[which(abundance$species_id%in%species_subset),] #how many species are shared between 2 time points of the same person? paired_meta<-meta[which(meta$`Subject/TwinID`%in%names(table(meta$`Subject/TwinID`))[which(table(meta$`Subject/TwinID`)>=2)]),] size<-length(unique(paired_meta$`Subject/TwinID`)) within_person<-matrix(0, 1,size) colnames(within_person)<-unique(paired_meta$`Subject/TwinID`) k<-1 for (subj in unique(paired_meta$`Subject/TwinID`)){ #subj<-"158458797" ids<-paired_meta$SampleID[which(paired_meta$`Subject/TwinID`==subj)] sub_abund<-abundance[,which(colnames(abundance)%in%ids)] if(ncol(sub_abund)==2){ # for strict pairs within_person[1,k]<-length(intersect(which(sub_abund[,1]>0),which(sub_abund[,2]>0))) }else{ # when we have more than 2 time points within_person[1,k]<-length(intersect(which(sub_abund[,1]>0),which(sub_abund[,2]>0))) } k<-k+1 } mean(within_person) #how many species average 2 people share? size<-length(unique(meta$`Subject/TwinID`)) between_person<-matrix(0, size,size) dim(between_person) people<-unique(meta$`Subject/TwinID`) for(i in 1:(length(people)-1)){ #i<-1 subj1<-people[i] subj1_id<-meta$SampleID[which(meta$`Subject/TwinID`==subj1)[1]] for(j in ((i+1):length(people))){ #j<-289 subj2<-people[j] subj2_id<-meta$SampleID[which(meta$`Subject/TwinID`==subj2)[1]] sub_abund<-abundance[,which(colnames(abundance)%in%c(subj1_id, subj2_id))] between_person[i,j]<-length(intersect(which(sub_abund[,1]>0),which(sub_abund[,2]>0))) } } bp<-as.dist(t(between_person)) mean(as.vector(bp)) plot(density(within_person), col="royal blue") lines(density(as.vector(bp), bw = 0.7), col="red") wilcox.test(within_person, as.vector(bp)) t.test(within_person, as.vector(bp), alternative = "two.sided") ### - [ ] Are there particular species that are present and abundant in a large number of hosts? Code: library(dplyr) ##load dplyr data<-read.csv("relative_abundance.csv", header=TRUE, check.names = FALSE) #read in data, I converted the file to a csv as r would not recognize the tab delimination when I used read.table() df <- as.data.frame(data) df[,-1] = (df[,-1] !=0)*1 # change everything over 0 to 1 df.sum<-rowSums(df[,2:922]) #sum rows, write to value df.new <- df #new dataframe df.new$total <-df.sum #add total values to new dataframe df.final <-df.new[c("species_id","total")] #new dataframe with just the names and total count df.final[order(-df.final$total),] #reorder, descending --- ## 2. Try running MIDAS on the example reads provided (OPTIONAL, but recommended) ### - [ ] Figure out how many species are “present” (>= 3x coverage) in the example sample. Code: ### - [ ] Run the MIDAS SNPs module for these species using the example code provided. (Note how long it takes per million reads) Code: ### - [ ] Download the IGV genome viewer and look at the resulting .bam file (example_output/snps/temp/genomes.bam)to get a firsthand sense of how messy the alignments are (we’ll start w/ this in our next meeting). Code:

Syntax	Example	Reference
# Header	Header	基本排版
- Unordered List	Unordered List
1. Ordered List	Ordered List
- [ ] Todo List	Todo List
> Blockquote	Blockquote
Bold font	Bold font
Italics font	Italics font
~~Strikethrough~~	~~Strikethrough~~
19^th^	19^th
H~2~O	H₂O
++Inserted text++	Inserted text
==Marked text==	Marked text
[link text](https:// "title")	Link
![image alt](https:// "title")	Image
`Code`	`Code`	在筆記中貼入程式碼
```javascript var i = 0; ```	`var i = 0;`
:smile:		Emoji list
{%youtube youtube_id %}	Externals
$L^aT_eX$	L^aT_eX
:::info This is a alert area. :::	This is a alert area.