Datos CRC - antiguos

###### tags: `WGS` # Datos CRC - antiguos Primero desacrgué unos datos con SRA toolkit ``` fastq-dump --split-files DRR127485 DRR127491 DRR127519 DRR127536 DRR127552 DRR127557 DRR127559 DRR127580 DRR127583 DRR127596 DRR127597 DRR127616 DRR127647 DRR127674 DRR127683 DRR127685 DRR127707 DRR127754 DRR127763 DRR127777 DRR162775 DRR127481 DRR127488 DRR127514 DRR127515 DRR127517 DRR127524 DRR127532 DRR127535 DRR127567 DRR127579 DRR127603 DRR127613 DRR127617 DRR127619 DRR127625 DRR127649 DRR127672 DRR127700 DRR127713 DRR127718 DRR127720 DRR127728 DRR127731 DRR127737 DRR127752 DRR127755 DRR127756 DRR127762 DRR127774 DRR162777 DRR162778 DRR162779 DRR127546 DRR127724 DRR127751 DRR127767 ``` Al ver que pesabanmucho y que no podría manejarlos por limitaciones computacionales :computer: decidí buscar otra socpiones de conseguir datas, y fue cuando descubrí [*curatedMetagenomicData* ](https://https://waldronlab.io/curatedMetagenomicData/) :::info Currently, curatedMetagenomicData provides: 10199 samples from 57 datasets, primarily of the human gut but including body sites profiled in the Human Microbiome Project Processed data from whole-metagenome shotgun metagenomics, with manually-curated metadata, as integrated and documented Bioconductor ExpressionSet objects ~80 fields of specimen metadata from original papers, supplementary files, and websites, with manual curation to standardize annotations Processing of data through the MetaPhlAn2 pipeline for taxonomic abundance, and HUMAnN2 pipeline for metabolic analysis This effort required analyzing ~100T of raw sequencing data ::: ### Probando cosas en R ```r=1 #Tabla Otu Contaje Feng <- otu %>% filter(Rank=="Genus" & Study=="FengQ_2015") Feng <- Feng[grep("noname|unclassified", Feng$Taxon, invert=T),] Feng <- Feng %>% select_("sampleID","Taxon", "Count") %>% spread(Taxon,Count) %>% column_to_rownames(sampleID) %>% column_to_rownames("sampleID") #SparCC sparcc.matrix <- sparcc(Feng) sparcc.cutoff <- 0.3 sparcc.adj <- ifelse(abs(sparcc.matrix$Cor) >= sparcc.cutoff, 1, 0) #Add OTU names rownames(sparcc.adj) <- colnames(Feng) colnames(sparcc.adj) <- colnames(Feng) #Build network from adjacency sparcc.net <- graph.adjacency(sparcc.adj, mode="undirected", diag=FALSE) plot(sparcc.net) #SPIEC-EASI network matrixFeng <-as.matrix(Feng) SpiecEasi.matrix <- spiec.easi(matrixFeng, method= 'glasso', lambda.min.ratio= 1e-2, nlambda= 20, icov.select.params=list(rep.num=50)) #Add OTU names to row and columns rownames(SpiecEasi.matrix$refit) <- colnames(Feng) #Build network SpiecEasi.net <- graph.adjacency(SpiecEasi.matrix$refit, mode="undirected", diag=FALSE) ``` ```r=1 #Limpiar data desde Phyloseq CRCseq.cl <- subset_taxa(CRCseq, (Kingdom!="Viruses") | is.na(Kingdom)) CRCseq.cl <- subset_taxa(CRCseq.cl, (Kingdom!="Archaea") | is.na(Kingdom)) CRCseq.cl <- subset_taxa(CRCseq.cl, (Kingdom!="Eukaryota") | is.na(Kingdom)) CRCseq.cl <- subset_samples(CRCseq.cl, study_condition!="NA") # MG100197 = HanniganGD_2017_A18 ; MG100187 = HanniganGD_2017_A07 ; MG100142 = HanniganGD_2017_C19 Contajes raros vistos en modo Abundancias relativas! CRCseq.cl <- subset_samples(CRCseq.cl, subjectID!= "HanniganGD_2017_A18" & subjectID!="HanniganGD_2017_A07" & subjectID != "HanniganGD_2017_C19") #Crear Variable estudio Objeto phyloseq SN <- as.data.frame(sample_names(CRCseq.cl)) Study <- SN %>% separate("sample_names(CRCseq.cl)",into=c("Study","sampleID"), sep=":") Study$Study <- gsub(".metaphlan_bugs_list.stool", "",Study$Study) estudio <- sample_data(Study) sample_names(estudio) <- paste(sample_names(CRCseq.cl)) #Recuerda que si el sample_name es diferente no puedes juntarlos! CRCseq.cl <- merge_phyloseq(CRCseq.cl, estudio) #Quitar Noname and unassigned allTaxa <- taxa_names(CRCseq.cl) badTaxa <- as.data.frame(allTaxa) badTaxa <- badTaxa[grep("unclassified|noname", badTaxa$allTaxa, invert=F),] allTaxa <- allTaxa[!(allTaxa %in% badTaxa)] CRC.clean <- prune_taxa(allTaxa, CRCseq.cl) # Diversidad desde Phyloseq#### alphas = c("Shannon", "Simpson", "InvSimpson") plot_richness(x1, "study_condition", measures = alphas) pairs(estimate_richness(x1, measures = alphas)) #PCoA beta-diversidad Bray Curtis ordinated_taxa = ordinate(CRC.clean, method="PCoA", distance="bray") #Por estudio #Comp 1 y 2 plot_ordination(CRC.clean, ordinated_taxa, color="study_condition", title = "Bray-Curtis Principal Coordinates Analysis") + theme_bw() #Comp 1 y 3 plot_ordination(CRC.clean, ordinated_taxa, color="Study", title = "Bray-Curtis Principal Coordinates Analysis", axes=c(1,3)) + theme_bw() #Comp 2 y 3 plot_ordination(CRC.clean, ordinated_taxa, color="Study", title = "Bray-Curtis Principal Coordinates Analysis", axes=c(2,3)) + theme_bw() #Viendo solo Especies por unifrac Species <- tax_glom(CRC.clean, taxrank = "Species") random_tree <- rtree(ntaxa(Species), rooted=TRUE, tip.label=taxa_names(Species)) Species.tree <- merge_phyloseq(Species, random_tree) ordinated_taxa = ordinate(Species.tree, method="MDS", distance="unifrac",weighted=TRUE) #Por estudio #Comp 1 y 2 plot_ordination(Species.tree, ordinated_taxa, color="Study", shape = "study_condition", title = "Unifrac MDS Analysis- Species level") + theme_bw() #Viendo solo Generos por unifrac Genero <- tax_glom(CRC.clean, taxrank = "Genus") random_tree_gen <- rtree(ntaxa(Genero), rooted=TRUE, tip.label=taxa_names(Genero)) Genero.tree <- merge_phyloseq(Genero, random_tree_gen) ordinated_taxa_gen = ordinate(Genero.tree, method="MDS", distance="unifrac",weighted=TRUE) #Por estudio #Comp 1 y 2 plot_ordination(Genero.tree, ordinated_taxa_gen, color="Study", shape = "study_condition", title = "Unifrac MDS Analysis- Genus level") + theme_bw() #Viendo solo Familias por unifrac Family <- tax_glom(CRC.clean, taxrank = "Family") random_tree_fam <- rtree(ntaxa(Family), rooted=TRUE, tip.label=taxa_names(Family)) Family.tree <- merge_phyloseq(Family, random_tree_fam) ordinated_taxa = ordinate(Family.tree, method="MDS", distance="unifrac",weighted=TRUE) #Por estudio #Comp 1 y 2 plot_ordination(Family.tree, ordinated_taxa_fam, color="Study", shape = "study_condition", title = "Unifrac MDS Analysis- Family level") + theme_bw() ``` ### Figuras LEFSe para el poster ```bash=v plot_cladogram.py --format png −−background_color k −−colored_labels 0 lefse_NS_4.res cladogram1.png [−−max_lev MAX_LEV] [−−max_point_size MAX_POINT_SIZE] [−−min_point_size MIN_POINT_SIZE] [−−point_edge_width MARKEREDGEWIDTH] [−−siblings_connector_width SIBLINGS_CONNECTOR_WIDTH] [−−parents_connector_width PARENTS_CONNECTOR_WIDTH] [−−radial_start_lev RADIAL_START_LEV] [−−labeled_start_lev LABELED_START_LEV] [−−labeled_stop_lev LABELED_STOP_LEV] [−−abrv_start_lev ABRV_START_LEV] [−−abrv_stop_lev ABRV_STOP_LEV] [−−expand_void_lev EXPAND_VOID_LEV] [−−class_legend_vis CLASS_LEGEND_VIS] [−−colored_connector COLORED_CONNECTORS] [−−alpha ALPHA] [−−title TITLE] [−−sub_clade SUB_CLADE] [−−title_font_size TITLE_FONT_SIZE] [−−right_space_prop R_PROP] [−−left_space_prop L_PROP] [−−label_font_size LABEL_FONT_SIZE] [−−background_color {k,w}] [−−colored_labels {0,1}] [−−class_legend_font_size CLASS_LEGEND_FONT_SIZE] [−−dpi DPI] [−−format {png,svg,pdf}] [−−all_feats ALL_FEATS] INPUT_FILE OUTPUT_FILE plot_cladogram.py lefse_esp_4.res cladogram14.png --format png --dpi 300 --radial_start_lev 5 --clade_sep 0.12 --labeled_start_lev 4 --labeled_stop_lev 6 --expand_void_lev 20 plot_cladogram.py lefse_esp_4.res cladogram15.png --format png --dpi 300 --radial_start_lev 5 --clade_sep 0.12 --labeled_start_lev 3 --labeled_stop_lev 6 --expand_void_lev 10 --alpha 0.15 --class_legend_vis 0 --title "" --right_space_prop 0.5 final: plot_cladogram.py lefse_esp_4.res cladogram15.png --format png --dpi 300 --radial_start_lev 5 --clade_sep 0.12 --labeled_start_lev 0 --labeled_stop_lev 0 --expand_void_lev 10 --alpha 0.15 --class_legend_vis 0 --title "" --right_space_prop 0.3 --siblings_connector_width --parents_connector_width 0.25 --label_font_size 0 ```