# New data 16S rRNA - CRC WG3 cost
###### tags: `16S rRNA` `COST Action` `ML`
### Qiime2 objects to parse to phyloseq
**Metadata**
[Metadata all studies](https://drive.google.com/file/d/1Koz4Ft7amQdfc_c8jC4HXeVtVq6oHc58/view?usp=share_link)
**Single-end data**
[Taxonomy table.](https://drive.google.com/file/d/1T_HvmChQFamt7spqmcW8nAsloDTm3mMh/view?usp=share_link)
[ASV table.](https://drive.google.com/file/d/1lFLhJXcKfMAa_yUG__22J3_phQHac2Jr/view?usp=share_link)
[Stats.](https://drive.google.com/file/d/14hKnLxw1c5wAnnH-IkuXNB5cD3esmmPG/view?usp=share_link) You can visualize [here.](https://view.qiime2.org/)
stats table.
**Paired-end data**
[Taxonomy table.](https://drive.google.com/file/d/1NX9-91EkOtvBuH1b1W64HzSCZzKDgzix/view?usp=share_link)
[ASV table. ](https://drive.google.com/file/d/193wXGAt7PDyQ4IZ6-YnZM3gQj_fOerSA/view?usp=share_link)
[Stats.](https://drive.google.com/file/d/1zcH4pDoT-pEPy8QMfe9creVMooVUcXaD/view?usp=share_link) you can visualize [here. ](https://view.qiime2.org/)
**My code**
```R=1
library(tidyverse)
library(qiime2R)
library(phyloseq)
library(microbiome)
#ASV table####
ASV<-read_qza("DATA/table.qza")
#Taxonomy table
taxonomyq <-read_qza("DATA/taxonomy.qza")
#Tax table transformations
taxtableq <- taxonomyq$data %>% as.tibble() %>% separate(Taxon, sep=";", c("Kingdom","Phylum","Class","Order","Family","Genus","Species"))
taxtableq$Kingdom <- gsub("d__", "",taxtableq$Kingdom)
taxtableq$Phylum <- gsub("p__", "",taxtableq$Phylum)
taxtableq$Class <- gsub("c__", "",taxtableq$Class)
taxtableq$Order <- gsub("o__", "",taxtableq$Order)
taxtableq$Family <- gsub("f__", "",taxtableq$Family)
taxtableq$Genus <- gsub("g__", "",taxtableq$Genus)
taxtableq$Species <- gsub("s__", "",taxtableq$Species)
#Clean taxa: Remove Kingdom Unassigned & Eukaryota
taxtableq <- taxtableq[grep("Eukaryota|Unassigned", taxtableq$Kingdom, invert=T),]
#Tree
tree<-read_qza("DATA/rooted-tree.qza")
#Metadata
#Here you have to create a csv file with the metadata
metadata <-read_csv("metadata.csv")
#Construct phyloseq object
ASVs <- otu_table(ASV$data, taxa_are_rows = T)
tree <- phy_tree(tree$data)
TAXq <- tax_table(as.data.frame(taxtableq) %>% select_("-Confidence") %>% column_to_rownames("Feature.ID") %>% as.matrix()) #moving the taxonomy to the way phyloseq wants it
sample_metadata <- sample_data(metadata %>% as.data.frame())
sample_names(sample_metadata) <- paste(metadata$SampleID)
physeq<- merge_phyloseq(ASVs, tree, TAXq,sample_metadata)
#Remove samples with no diagnosis - Some extra samples were inlcuded by mistake-
physeq <- subset_samples(physeq, Diagnosis!="NA")
#Agglomerate at genus level
physeq.gen <- tax_glom(physeq, taxrank="Genus")
#Write data for sharing
data <- psmelt(physeq.gen)
ASV_tab <- data %>% select(SampleID,Abundance,Genus)
genus <- pivot_wider(ASV_tab, id_cols=SampleID, names_from=Genus, values_from=Abundance, values_fn = mean)
write_csv(genus,"genus.csv")
#Save phyloseq object for sharing
saveRDS(physeq.gen, "physeq.RDS")
```
:female-construction-worker: **Working Plan:**
1. Merge the new studies with the other studies in a phyloseq object.
2. Maybe we need to use some strategy to analyze all studies togeteher an reduce batch-effect. I have used [Mmuphin](https://www.bioconductor.org/packages/release/bioc/html/MMUPHin.html) in other cases. Other tools are welcome :wink:
3. Descriptive analysys of the data, including:
- Analytics of metadata
- Alfa and beta diversities of the microbiome
- Whatever you think would be useful :P
:::info
Regarding classification, this is a topic that we have discussed (with little succes) in some meetings in the WG. I think somebody suggest just to use Healthy / Cancer categories. We have an issue here because Adenoma is something inbetween. So, first with the descriptive analysis we will see the "dimension" of the problem i.e. how many samples are in each category, and then we can decide better on this. However, I will try to ask a collegue that works with CRC genomics to know her opinion.
But the "easy" solution will be to do two categories: Healthy(Control)/Cancer
:::
## Solving Point 1:
```r=1
library(tidyverse)
library(qiime2R)
library(phyloseq)
library(microbiome)
library(ape)
#Single
#ASV table####
ASV<-read_qza("DATA/table-single.qza")
#Taxonomy table
taxonomyq <-read_qza("DATA/taxonomy-single.qza")
#Tax table transformations
taxtableq <- taxonomyq$data %>% as.tibble() %>% separate(Taxon, sep=";", c("Kingdom","Phylum","Class","Order","Family","Genus","Species"))
taxtableq$Kingdom <- gsub("d__", "",taxtableq$Kingdom)
taxtableq$Phylum <- gsub("p__", "",taxtableq$Phylum)
taxtableq$Class <- gsub("c__", "",taxtableq$Class)
taxtableq$Order <- gsub("o__", "",taxtableq$Order)
taxtableq$Family <- gsub("f__", "",taxtableq$Family)
taxtableq$Genus <- gsub("g__", "",taxtableq$Genus)
taxtableq$Species <- gsub("s__", "",taxtableq$Species)
#Clean taxa: Remove Kingdom Unassigned & Eukaryota
taxtableq <- taxtableq[grep("Eukaryota|Unassigned", taxtableq$Kingdom, invert=T),]
#Metadata
#Here you have to create a csv file with the metadata
metadata <-read_tsv("DATA/metadata_single.csv")
#Construct phyloseq object
ASVs <- otu_table(ASV$data, taxa_are_rows = T)
#tree <- phy_tree(tree$data)
TAXq <- tax_table(as.data.frame(taxtableq) %>% select_("-Confidence") %>% column_to_rownames("Feature.ID") %>% as.matrix()) #moving the taxonomy to the way phyloseq wants it
sample_metadata <- sample_data(metadata %>% as.data.frame())
sample_names(sample_metadata) <- paste(metadata$SampleID)
sample_names(ASVs) <- paste(metadata$SampleID)
physeq.single <- merge_phyloseq(ASVs, TAXq,sample_metadata)
#Paired-end
#ASV table####
ASV<-read_qza("DATA/table-paired.qza")
#Taxonomy table
taxonomyq <-read_qza("DATA/taxonomy-paired.qza")
#Tax table transformations
taxtableq <- taxonomyq$data %>% as.tibble() %>% separate(Taxon, sep=";", c("Kingdom","Phylum","Class","Order","Family","Genus","Species"))
taxtableq$Kingdom <- gsub("d__", "",taxtableq$Kingdom)
taxtableq$Phylum <- gsub("p__", "",taxtableq$Phylum)
taxtableq$Class <- gsub("c__", "",taxtableq$Class)
taxtableq$Order <- gsub("o__", "",taxtableq$Order)
taxtableq$Family <- gsub("f__", "",taxtableq$Family)
taxtableq$Genus <- gsub("g__", "",taxtableq$Genus)
taxtableq$Species <- gsub("s__", "",taxtableq$Species)
#Clean taxa: Remove Kingdom Unassigned & Eukaryota
taxtableq <- taxtableq[grep("Eukaryota|Unassigned", taxtableq$Kingdom, invert=T),]
#Construct phyloseq object
ASVp <- otu_table(ASV$data, taxa_are_rows = T)
TAXp <- tax_table(as.data.frame(taxtableq) %>% select_("-Confidence") %>% column_to_rownames("Feature.ID") %>% as.matrix()) #moving the taxonomy to the way phyloseq wants it
#Metadata
names <- as.data.frame(sample_names(ASVp))
colnames(names) <- "SampleID"
metadataALL <- read_csv("DATA/metadataALL.csv")
metadata_paired <- metadataALL %>% right_join(names)
sample_metadatap <- sample_data(metadata_paired %>% as.data.frame())
sample_names(sample_metadatap) <- paste(metadata_paired$SampleID)
sample_names(ASVp) <- paste(metadata_paired$SampleID)
physeq.paired <- merge_phyloseq(ASVp, TAXp,sample_metadatap)
#Previous datasets #####
#ASV table####
ASV<-read_qza("DATA/table.qza")
#Taxonomy table
taxonomyq <-read_qza("DATA/taxonomy.qza")
#Tax table transformations
taxtableq <- taxonomyq$data %>% as.tibble() %>% separate(Taxon, sep=";", c("Kingdom","Phylum","Class","Order","Family","Genus","Species"))
taxtableq$Kingdom <- gsub("d__", "",taxtableq$Kingdom)
taxtableq$Phylum <- gsub("p__", "",taxtableq$Phylum)
taxtableq$Class <- gsub("c__", "",taxtableq$Class)
taxtableq$Order <- gsub("o__", "",taxtableq$Order)
taxtableq$Family <- gsub("f__", "",taxtableq$Family)
taxtableq$Genus <- gsub("g__", "",taxtableq$Genus)
taxtableq$Species <- gsub("s__", "",taxtableq$Species)
#Clean taxa: Remove Kingdom Unassigned & Eukaryota
taxtableq <- taxtableq[grep("Eukaryota|Unassigned", taxtableq$Kingdom, invert=T),]
#Metadata
metadata<-read_csv("DATA/metadata.csv")
#Construct phyloseq object
ASVpr <- otu_table(ASV$data, taxa_are_rows = T)
TAXpr <- tax_table(as.data.frame(taxtableq) %>% select_("-Confidence") %>% column_to_rownames("Feature.ID") %>% as.matrix()) #moving the taxonomy to the way phyloseq wants it
sample_metadata <- sample_data(metadata %>% as.data.frame())
sample_names(sample_metadata) <- paste(metadata$SampleID)
physeq.prev <- merge_phyloseq(ASVpr, TAXpr, sample_metadata)
#Remove samples with no diagnosis - Some extra samples were inlcuded by mistake-
physeq.prev <- subset_samples(physeq.prev, Diagnosis!="NA")
#Unir tres objetos phyloseq
physeq <- merge_phyloseq(physeq.single, physeq.paired,physeq.prev)
#Generar arbol
random_tree <-rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
physeq.tree <- merge_phyloseq(physeq,random_tree)
#Agglomerate at genus level
physeq.gen <- tax_glom(physeq.tree, taxrank="Genus")
#Write data for sharing
data <- psmelt(physeq.gen)
ASV_tab <- data %>% select(SampleID,Abundance,Genus)
genus <- pivot_wider(ASV_tab, id_cols=SampleID, names_from=Genus, values_from=Abundance, values_fn = mean)
write_csv(genus,"genusALL.csv")
#Save phyloseq object for sharing
saveRDS(physeq.gen, "physeqALL.RDS")
```
:warning: **Pending work for Rajesh:**
There is a problem with the metadata, when I added the new datasets the previous metadata was deleted. You need to create a new `sample_data` object including all the clean metadata :eyes: be sure to include the correct `sample_names` and then create a new phyloseq object with the new metadata.
You can use the previous code to see how to do it.
[Here ](https://drive.google.com/file/d/1bBohAatEjZIYROJY7wFTM1rynMrDRMej/view?usp=share_link)is the phyloseq object with all the datasets and the BAD metadata. :cry: