# Moore Foundation Spartina Project
Sherlynette Pérez Castro, Elena López Peredo
Marine Biological Laboratory, Woods Hole, MA
### Download SRA files using prefetch and fastq-dump
> prefetch -f yes SRR1157608
> vdb-validate SRR1157608
> fastq-dump -W --split-files --read-filter pass --gzip --skip-technical SRR1157608/SRR1157608.sra
#### Create a text file (rawreadslist.txt) in the project directory containing one sample name per line (no file extensions, commas, or spaces), using GNU nano or another text editor.
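For example, using accessions that appear in this notebook, rawreadslist.txt might look like:
```
SRR1157608
SRR12659820
```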
## pre-FastQC
> module load fastqc
### Create script preqc.sh
> nano ./preqc.sh
```
#!/bin/bash
for sample in `cat rawreadslist.txt`
do
clusterize -n 5 -log logs/preqc_${sample}.log fastqc ${sample}_pass_1.fastq.gz ${sample}_pass_2.fastq.gz --outdir pre-QC_reports
done
```
> chmod +x preqc.sh
> ./preqc.sh
## trimmomatic
> module load trimmomatic
> nano ./trimmomatic.sh
```
#!/bin/bash
for sample in `cat rawreadslist.txt`
do
clusterize -n 10 -log logs/trimmomatic_${sample}.log trimmomatic PE -phred33 ${sample}_pass_1.fastq.gz ${sample}_pass_2.fastq.gz ${sample}_paired_R1.fastq ${sample}_unpaired_R1.fastq ${sample}_paired_R2.fastq ${sample}_unpaired_R2.fastq ILLUMINACLIP:/groups/spart2020/adapters.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
done
```
> chmod +x trimmomatic.sh
> ./trimmomatic.sh
> clusterize -n 40 -log trimmomatic_SRR12659820.log trimmomatic PE -phred33 SRR12659820_pass_1.fastq.gz SRR12659820_pass_2.fastq.gz SRR12659820_paired_R1.fastq SRR12659820_unpaired_R1.fastq SRR12659820_paired_R2.fastq SRR12659820_unpaired_R2.fastq ILLUMINACLIP:/groups/spart2020/adapters.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
## prinseq
> module load prinseq-lite/0.20.4
> nano ./prinseq.sh
```
#!/bin/bash
for file in `cat rawreadslist.txt`
do
clusterize -n 10 -log ${file}_prinseq.log prinseq-lite.pl -fastq ${file}_paired_R1.fastq -fastq2 ${file}_paired_R2.fastq -min_len 60 -min_qual_mean 20 -ns_max_n 0 -derep 12345 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_type min -trim_qual_rule lt -out_format 3 -out_good ${file}_good -out_bad null -verbose
done
```
> chmod +x prinseq.sh
> ./prinseq.sh
> clusterize -n 40 -log SRR12659820_prinseq.log prinseq-lite.pl -fastq SRR12659820_paired_R1.fastq -fastq2 SRR12659820_paired_R2.fastq -min_len 60 -min_qual_mean 20 -ns_max_n 0 -derep 12345 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_type min -trim_qual_rule lt -out_format 3 -out_good SRR12659820_good -out_bad null -verbose
## post-FastQC
> module load fastqc
### Create script postqc.sh
> nano ./postqc.sh
```
#!/bin/bash
for sample in `cat rawreadslist.txt`
do
clusterize -n 10 -log logs/postqc_${sample}.log fastqc ${sample}_good_1.fastq ${sample}_good_2.fastq --outdir post-QC_reports
done
```
> chmod +x postqc.sh
> ./postqc.sh
> clusterize -n 10 -log logs/postqc_SRR12659820.log fastqc SRR12659820_good_1.fastq SRR12659820_good_2.fastq --outdir post-QC_reports
## megahit
### Concatenate forward and reverse clean reads for co-assembly:
> cat *_good_1.fastq > concatenated_1.fastq
> cat *_good_2.fastq > concatenated_2.fastq
### MEGAHIT v1.2.9
> module load megahit
> clusterize -n 40 -log MEGAHIT_79_141.log megahit -1 concatenated_1.fastq -2 concatenated_2.fastq --k-list 79,99,119,141 -o MEGAHIT_79_141 -t 40 --min-contig-len 3000
## metawrap
> module load metawrap/1.3
> clusterize -n 40 -log binning.log metawrap binning -o INITIAL_BINNING -t 40 -m 920 -a MEGAHIT_79_141/final.contigs.fa --metabat2 --maxbin2 --concoct reads/*.fastq --run-checkm
> clusterize -n 40 -log refinement.log metawrap bin_refinement -o BIN_REFINEMENT -t 40 -A INITIAL_BINNING/metabat2_bins/ -B INITIAL_BINNING/maxbin2_bins/ -C INITIAL_BINNING/concoct_bins/ -c 90 -x 5
> clusterize -n 40 -log binning.log metawrap binning -o INITIAL_BINNINGnew -t 40 -m 920 -a MEGAHIT_79_141/final.contigs.fa --metabat2 --maxbin2 --concoct good/*.fastq --run-checkm
> clusterize -n 30 -log refinement.log metawrap bin_refinement -o BIN_REFINEMENTnew -t 30 -A INITIAL_BINNINGnew/metabat2_bins/ -B INITIAL_BINNINGnew/maxbin2_bins/ -C INITIAL_BINNINGnew/concoct_bins/ -c 90 -x 5
## DRAM
### The only host compatible with DRAM is Minnie
> module show dram
> module unload bioware
> module load dram/20200625-conda
> conda init bash
> . ~/.bashrc
> conda activate /bioware/dram-20200625-conda
> DRAM.py annotate -i 'Renamed_bins/*.fa' -o annotation_Renamed_bins
> DRAM.py distill -i annotation_Renamed_bins/annotations.tsv -o distill --trna_path annotation_Renamed_bins/trnas.tsv --rrna_path annotation_Renamed_bins/rrnas.tsv
> DRAM.py strainer --identifiers K11181 -i annotations.tsv -f genes.fna -o dsrB_genes.fna
https://github.com/shafferm/DRAM/wiki/4a.-Interpreting-the-Results-of-DRAM
For more detail on DRAM and how it works, please see the wiki:
https://github.com/shafferm/DRAM/wiki
## GTDB-Tk
> module load python/3.9.5-20210528-no-anvio
> gtdbtk classify_wf --genome_dir SPC38bins/ --out_dir identify_output1.5 --extension fa
> clusterize -n 40 -log gtdbtk.log gtdbtk classify_wf --genome_dir SPC38bins/ --out_dir identify_output_38bins --extension fa
> scp sperezcastro@evol5.mbl.edu:/groups/spartina/metagenomes_analysis/identify_output_34bins/gtdbtk.bac120.summary.tsv /Users/sherlynette/Downloads
## metawrap kraken
> module load metawrap/1.3
> clusterize -n <num_threads> -log <log> metawrap kraken -o <output_folder> -t <num_threads> -s <reads_subset> <clean_reads> <final_assembly>
> clusterize -n 10 -log kraken.log metawrap kraken -o KRAKEN -t 10 reads/*.fastq MEGAHIT_79_141/final.contigs.fa
> clusterize -n 10 -log kraken.log metawrap kraken -o KRAKEN -t 10 good/*.fastq MEGAHIT_79_141/final.contigs.fa
> clusterize -n 10 -log kraken.log metawrap kraken -o KRAKEN -t 10 MEGAHIT_79_141/Spartina.final.contigs.fa
### CheckM
https://github.com/Ecogenomics/CheckM/wiki/Genome-Quality-Commands
> module load python/2.7.15-201901021238
#### Step 1: Place bins in the reference genome tree
> checkm tree <bin folder> <output folder>
> clusterize -n 40 -log checkm.log checkm tree Renamed_bins tree_folder -x fa
#### Step 2: Assess phylogenetic markers found in each bin
> checkm tree_qa <tree folder>
> clusterize -n 40 -log checkm2.log checkm tree_qa tree_folder -o 2
1. brief summary of genome tree placement
2. detailed summary of genome tree placement including lineage-specific statistics
3. genome tree in Newick format decorated with IMG genome ids
4. genome tree in Newick format decorated with taxonomy strings
5. multiple sequence alignment of reference genomes and bins
#### Step 3: Infer lineage-specific marker sets for each bin
> checkm lineage_set <tree folder> <marker file>
> clusterize -n 40 -log checkm.log checkm lineage_set tree_folder marker_file
#### Step 4: List available taxonomic-specific marker sets
> checkm taxon_list
#### Step 5: Generate taxonomic-specific marker set
> checkm taxon_set <rank> <taxon> <marker file>
> clusterize -n 40 -log checkm.log checkm taxon_set domain Bacteria marker_file
#### Step 6: Identify marker genes in bins
> checkm analyze <marker file> <bin folder> <output folder>
> clusterize -n 40 -log checkm.log checkm analyze marker_file Renamed_bins -x fa output_folder
#### Step 7: Assess bins for contamination and completeness
> clusterize -n 40 -log checkm.log checkm qa marker_file output_folder -o 1
1. summary of bin completeness and contamination
2. extended summary of bin statistics (includes GC, genome size, ...)
3. summary of bin quality for increasingly basal lineage-specific marker sets
4. list of marker genes and their counts
5. list of bin id, marker gene id, gene id
6. list of marker genes present multiple times in a bin
7. list of marker genes present multiple times on the same scaffold
8. list indicating position of each marker gene within a bin
9. FASTA file of marker genes identified in each bin
https://www.biostars.org/p/10469/
> module load metawrap/1.3
## anvi'o
### clean up the crappy defline
> anvi-script-reformat-fasta Alabama.Jroemerianus.bin.15.fa --simplify-names -o ALJroemerianus.bin.15_anvio.fa
> anvi-gen-contigs-database -f ALJroemerianus.bin.15_anvio.fa -o ALJroemerianus.bin.15.db
> anvi-run-hmms -c ALJroemerianus.bin.15.db --num-threads 10
> anvi-get-sequences-for-hmm-hits -c ALJroemerianus.bin.28.db --hmm-source Ribosomal_RNAs
### Step 1: Generate an anvi’o genomes storage using the program anvi-gen-genomes-storage
#### Converting FASTA files into anvi’o contigs databases
> module load python/3.7.9-202101061157
> anvi-script-FASTA-to-contigs-db Rowley.Spatens.bin.1.fa
> clusterize -n 5 -log anvi-FA-to-contigs-db6.log anvi-script-FASTA-to-contigs-db Rowley.Spatens.bin.6.fa
### Create script anviodb.sh
> nano ./anviodb.sh
```
#!/bin/bash
for sample in `cat ALbins.txt`
do
clusterize -n 5 -log anviodb_${sample}.log anvi-script-FASTA-to-contigs-db ${sample}.fa
done
```
> chmod +x anviodb.sh
> ./anviodb.sh
#### Run anvi-run-ncbi-cogs on your contigs database to annotate your genes
> nano ./anvicogs.sh
```
#!/bin/bash
for sample in `cat ALLbins.txt`
do
clusterize -n 5 -log anvicogs_${sample}.log anvi-run-ncbi-cogs -c ${sample}.db --num-threads 5
done
```
> chmod +x anvicogs.sh
> ./anvicogs.sh
> clusterize -n 40 -log anvicogs_Alabama.Jroemerianus.bin.17.log anvi-run-ncbi-cogs -c Alabama.Jroemerianus.bin.17.db --num-threads 40
> anvi-gen-genomes-storage -e external-genomes.txt -o MOORE-GENOMES.db
#### Generate an anvi’o pan database using the program anvi-pan-genome (which requires a genomes storage)
> anvi-pan-genome -g MOORE-GENOMES.db -n MOORE_Pan
> anvi-pan-genome -g MOORE-GENOMES.db --project-name "MOORE_Pan" --output-dir MOORE_Pan --num-threads 6 --minbit 0.5 --mcl-inflation 10 --use-ncbi-blast
> anvi-import-misc-data layer-additional-data.txt -p MOORE_Pan/MOORE_Pan-PAN.db --target-data-table layers
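anvi-import-misc-data expects a tab-delimited file whose first column matches the layer (genome) names in the pan database. A minimal sketch of what layer-additional-data.txt could look like (column names and values here are illustrative):
```
layers	site	host_plant
Alabama.Jroemerianus.bin.15	Alabama	Juncus
Rowley.Spatens.bin.1	Rowley	Spatens
```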
> anvi-display-pan -p MOORE_Pan-PAN.db -g MOORE-GENOMES.db
#### SSH into the server of choice (not a cluster) using port 8008
> ssh -L 8008:localhost:8008 <username>@<server>
> ssh -L 8008:localhost:8008 sperezcastro@minnie
> anvi-interactive -p SAMPLES-MERGED/PROFILE.db -c RowleySpatensbin16newfixed.db --server-only -P 8008
Then open http://localhost:8008 in a browser (Chrome).
# Juncus 20200831
:::info
https://gold.jgi.doe.gov/study?id=Gs0135940
:::
> cd /groups/spartina/metagenomes_analysis/Alabama_JGI/MetadataAndDocs/
> cd /groups/spartina/metagenomes_analysis/Alabama_JGI/raw_reads_Juncus
> module load trimmomatic
> nano ./trimmomatic.sh
```
#!/bin/bash
for sample in `cat file.txt`
do
clusterize -n 10 -log trimmomatic.log trimmomatic PE -phred33 ${sample}_1.fastq ${sample}_2.fastq ${sample}_paired_R1.fastq ${sample}_unpaired_R1.fastq ${sample}_paired_R2.fastq ${sample}_unpaired_R2.fastq ILLUMINACLIP:/groups/spart2020/adapters.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
done
```
> chmod +x trimmomatic.sh
> ./trimmomatic.sh
> clusterize -n 10 -log trimmomatic.log trimmomatic PE -phred33 SRR9045297_1.fastq SRR9045297_2.fastq SRR9045297_paired_R1.fastq SRR9045297_unpaired_R1.fastq SRR9045297_paired_R2.fastq SRR9045297_unpaired_R2.fastq ILLUMINACLIP:/groups/spart2020/adapters.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
> for file in `cat juncus.txt`; do trimmomatic PE -phred33 -threads 12 ${file}_1.fastq ${file}_2.fastq ${file}_paired_R1.fastq ${file}_unpaired_R1.fastq ${file}_paired_R2.fastq ${file}_unpaired_R2.fastq ILLUMINACLIP:/groups/spart2020/adapters.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36; done
> trimmomatic PE -phred33 -threads 12 SRR11061153_1.fastq SRR11061153_2.fastq SRR11061153_paired_R1.fastq SRR11061153_unpaired_R1.fastq SRR11061153_paired_R2.fastq SRR11061153_unpaired_R2.fastq ILLUMINACLIP:/groups/spart2020/adapters.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
> module load prinseq-lite/0.20.4
> nano ./prinseq.sh
```
#!/bin/bash
for sample in `cat prinseq.txt`
do
clusterize -n 10 -log prinseq.log prinseq-lite.pl -fastq ${sample}_paired_R1.fastq -fastq2 ${sample}_paired_R2.fastq -min_len 60 -min_qual_mean 20 -ns_max_n 0 -derep 12345 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_type min -trim_qual_rule lt -out_format 3 -out_good ${sample}_good -out_bad null -verbose
done
```
> chmod +x prinseq.sh
> ./prinseq.sh
> nano ./prinseq.sh
```
#!/bin/bash
for file in `cat prinseq.txt`
do
clusterize -n 10 -log ${file}_prinseq.log prinseq-lite.pl -fastq ${file}_paired_R1.fastq -fastq2 ${file}_paired_R2.fastq -min_len 60 -min_qual_mean 20 -ns_max_n 0 -derep 12345 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_type min -trim_qual_rule lt -out_format 3 -out_good ${file}_good -out_bad null -verbose
done
```
> chmod +x prinseq.sh
> ./prinseq.sh
> clusterize -n 10 -log prinseq53.log prinseq-lite.pl -fastq SRR11061153_paired_R1.fastq -fastq2 SRR11061153_paired_R2.fastq -min_len 60 -min_qual_mean 20 -ns_max_n 0 -derep 12345 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_type min -trim_qual_rule lt -out_format 3 -out_good SRR11061153_good -out_bad null -verbose
> module load megahit
> clusterize -n 30 -log megahit.log megahit -1 SRR11061153_good_1.fastq,SRR11061154_good_1.fastq,SRR11567260_good_1.fastq,SRR11567261_good_1.fastq,SRR9045291_good_1.fastq,SRR9045294_good_1.fastq,SRR9045295_good_1.fastq,SRR9045297_good_1.fastq -2 SRR11061153_good_2.fastq,SRR11061154_good_2.fastq,SRR11567260_good_2.fastq,SRR11567261_good_2.fastq,SRR9045291_good_2.fastq,SRR9045294_good_2.fastq,SRR9045295_good_2.fastq,SRR9045297_good_2.fastq --k-list 79,99,119,141 -o MEGAHIT_79_141 -t 14 --min-contig-len 3000
> module purge
> module load jbpc
> module load metawrap/1.3
> clusterize -n 20 -log refinement.log metawrap bin_refinement -o BIN_REFINEMENT -t 20 -A INITIAL_BINNING/metabat2_bins/ -B INITIAL_BINNING/maxbin2_bins/ -c 50 -x 10
> metawrap bin_refinement -o /users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/ -t 18 -A /users/eperedo/Alabama_Spartina_Megahit/INITIAL_BINNING/metabat2_bins/ -B /users/eperedo/Alabama_Spartina_Megahit/INITIAL_BINNING/maxbin2_bins/ -C /users/eperedo/Alabama_Spartina_Megahit/INITIAL_BINNING/concoct_bins/ -c 50 -x 10
> module purge
> module load jbpc
> module load metawrap/1.3
> metawrap bin_refinement -o BIN_REFINEMENT -t 18 -A INITIAL_BINNING/metabat2_bins/ -B INITIAL_BINNING/maxbin2_bins/ -c 50 -x 10
> metawrap binning -o ~/Alabama_Spartina_Megahit/INITIAL_BINNING -t 16 -m 10 -a ~/Alabama_Spartina_Megahit/MEGAHIT_79_141/final.contigs.fa --metabat2 --maxbin2 --concoct ~/reads_spartina/*.fastq --run-checkm
> cp bin.83.fa /groups/spartina/metagenomes_analysis/Alabama_JGI/raw_reads_Juncus/paired/good/MEGAHIT_79_141/BIN_REFINEMENT/sherlysBins/
## GTDB-Tk
> module load gtdbtk
> clusterize -n 10 -log gtdbtk.log gtdbtk classify_wf --genome_dir sherlysBins/ --out_dir identify_output --extension fa
## DRAM
> DRAM.py annotate -i 'sherlysBins/*.fa' -o annotation --threads 12 --verbose
> DRAM.py annotate -i '/users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_50_10_bins/*.fa' -o /users/eperedo/Alabama_Spartina_Megahit/DRAM_annotation --threads 12 --verbose
> DRAM.py distill -i annotation/annotations.tsv -o distill --trna_path annotation/trnas.tsv --rrna_path annotation/rrnas.tsv
> DRAM.py annotate -i 'metawrap_bins/*.fa' -o annotation_IGERT
> DRAM.py distill -i annotation/annotations.tsv -o distill
> DRAM.py distill -i annotation_IGERT/annotations.tsv -o distill3 --trna_path annotation_IGERT/trnas.tsv --rrna_path annotation_IGERT/rrnas.tsv
> DRAM.py distill -i /users/eperedo/Rowley_metagenomes_subset/DRAM/annotation/annotations.tsv -o /users/eperedo/Rowley_metagenomes_subset/DRAM/distill --trna_path /users/eperedo/Rowley_metagenomes_subset/DRAM/annotation/trnas.tsv --rrna_path /users/eperedo/Rowley_metagenomes_subset/DRAM/annotation/rrnas.tsv
# old notes
## Assembly
### SPAdes
```
--12 <filename>    file with interlaced forward and reverse paired-end reads
-1 <filename>      file with forward paired-end reads
-2 <filename>      file with reverse paired-end reads
```
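A minimal metaSPAdes invocation using these flags (file names illustrative):
```
spades.py --meta -1 sample_1.fastq -2 sample_2.fastq -o SPAdes_assembly -t 16
```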
#### Command to filter contigs <1000 bp
> anvi-script-reformat-fasta contigs.fasta -o contigs-fixed.fa -l 1000 --simplify-names
### MEGAHIT
> module load megahit
> megahit -1 7E3_1.fastq -2 7E3_2.fastq -o MEGAHIT_assembly --min-contig-len 1000
## Assembly stats
### MetaQUAST
> clusterize -n 10 -log MetaQUAST_g.log metaquast.py contigs-fixed.fa -o metaQUAST --max-ref-number 0
## metawrap kraken
> module load metawrap
> clusterize -n <num_threads> -log <log> metawrap kraken -o <output_folder> -t <num_threads> -s <reads_subset> <clean_reads> <final_assembly>
> clusterize -n 10 -log kraken.log metawrap kraken -o KRAKEN -t 10 -s 1000000 /CLEAN_READS/*.fastq contigs-fixed.fa
## metawrap binning
> module load metawrap
> clusterize -n <num_threads> -log <log> metawrap binning -o <output_folder> -t <num_threads> -a <assembly> --<binning_program> --<binning_program> --<binning_program> <goodReads>
> clusterize -n 20 -log binning.log metawrap binning -o INITIAL_BINNING -t 20 -a final.contigs.fa --metabat2 --maxbin2 --concoct /groups/spartina/metagenomes_analysis/Alabama_JGI/raw_reads_Juncus/paired/good/*fastq
## IGERT
> cd /groups/spartina/IGERT_Yr1_From_Oscar/IGERT_Yr1_From_Oscar/Brown_sequenced/fastq_files/quality_filtered/renamed/IGERT_SPC
### SPAdes
> cat G*_1.fastq > concatenatedG_1.fastq
> cat G*_2.fastq > concatenatedG_2.fastq
> clusterize -n 40 -log SPAdes_G.log spades.py -m 920 -t 40 -1 concatenatedG_1.fastq -2 concatenatedG_2.fastq -o SPAdes --meta
#### Command to filter contigs <3000 bp
> anvi-script-reformat-fasta contigs.fasta -o contigs-fixed.fa -l 3000 --simplify-names
# Alabama Spartina 08/31/2020 ELP
:::info
https://gold.jgi.doe.gov/study?id=Gs0135940
:::
:::info
**Overview**
Main goals of the project
1. Build high-quality MAGs for sulfur-cycling microorganisms present in salt marshes, with special emphasis on those symbiotic with Spartina.
2. Identify those microorganisms with broad distribution.
3. Evaluate the variability of those organisms at a large biogeographical scale.
:::
Screen
---
**Are you running in a screened session?**
Check here: [screen instructions](https://hackmd.io/Y3AabwWoTpObumF4eYb37g?view#screen)
Navigate to the folder where you will run your analysis.
1. To give a name to a screen session: `screen -S sessionID`
2. To list sessions: `screen -ls`
3. To connect to a screen session: `screen -r sessionID`
4. To generate a file that logs specific parts of the session: once in a screened session, type `ctrl+a`, release, and press `:`; a line of text will appear at the bottom of the terminal. Type `logfile NAMEforlog.txt` and press enter (twice). When you want to log something, press `ctrl+a` followed by `H`; a line saying "creating file NAMEforlog.txt" will appear at the bottom of the terminal. Press enter, and everything you type from then on is recorded. To stop recording, press `ctrl+a` followed by `H` again. **Supercool!** `ctrl+a` followed by `H` appends (instead of overwriting!) to the log file, so you can toggle logging on and off as often as you like.
5. To close (detach from) a screen: `ctrl+a`, release, and press `d`
6. To kill a screen: `ctrl+a`, release, and press `k`
7. Finished? Type `exit`
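A condensed cheat sheet of the commands and keystrokes above:
```
screen -S sessionID     # start a named session
screen -ls              # list existing sessions
screen -r sessionID     # reattach to a session
# inside a session:
#   ctrl+a :  then type  logfile NAMEforlog.txt   # set the log file name
#   ctrl+a H  # toggle logging on/off (appends to the log file)
#   ctrl+a d  # detach
#   ctrl+a k  # kill the session
exit                    # finished? end the session
```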
## SRA download
---
The data files are over 50 GB.
fastq-dump and fasterq-dump do not really work for files this size.
Use prefetch instead; it downloads .sra files, which are ~5 GB (including F and R reads!).
### Prefetch
> More: http://www.metagenomics.wiki/tools/short-read/ncbi-sra-file-format/prefetch

You can download the compressed files in .sra format using `prefetch`.
Files will go to an ncbi folder under your user (unless you set another output folder).
Once the files have downloaded, use `fastq-dump` to convert the .sra files to fastq format.
The .sra format is highly compressed; do not gunzip it.
```
module load sratoolkit
for file in `cat samples.txt`; do prefetch -f yes ${file}; done
for file in `cat samples.txt`; do vdb-validate ${file}; done
```
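If you want the .sra files somewhere other than the default ncbi folder, prefetch accepts an output directory (a sketch; the path is illustrative and assumes a reasonably recent sratoolkit):
```
for file in `cat samples.txt`; do prefetch -f yes -O /groups/spart2020/sra ${file}; done
```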
### Fastq-dump
Now convert the .sra files into fastq, using fastq-dump with flags to remove low-quality reads.
```
for file in `cat samples.txt`; do fastq-dump -W --split-files --read-filter pass --skip-technical ${file}; done
```
Note: instead of fastq-dump you can also use fasterq-dump.
> Read more about downloading SRA data here: https://hackmd.io/@elperedoStudents/HJMjIq6aL
Read Quality control
---
We record the results; see the example file:
https://drive.google.com/file/d/1Hl1b7UyOnLYMZfDgpCYOWbl9Caf4MaQe/view?usp=sharing
**These files are HUGE; check disk space before running and remove unnecessary files once each step has finished.**
Load modules
```
module purge
module load bioware
module load fastqc
module load trimmomatic
module load prinseq-lite/0.20.4
```
### preQC stats
We will be using fastqc and prinseq to generate preliminary quality-control data
```
fastqc *pass*.fastq
```
### trimmomatic
*in minnie*
QC and remove adapters.
***Needs an adapter file.*** Double-check the initial fastqc reports, and update the adapter file if sequences not listed in it show up among the overrepresented sequences in the samples.
Our file is here: `/groups/spart2020/adapters.fa`
You can also find it here (updated 9/3/2020):
https://drive.google.com/file/d/1_T08Tl-0lMdoo66wzRu6h8G8ZT5LUUDA/view?usp=sharing
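For reference, the adapter file is a plain FASTA of adapter sequences. A minimal illustrative entry (this is the standard Illumina universal adapter prefix, not necessarily the full contents of our file):
```
>Illumina_Universal_Adapter
AGATCGGAAGAGC
```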
```
for file in `cat samples.txt`; do trimmomatic PE -phred33 -threads 12 ${file}_pass_1.fastq ${file}_pass_2.fastq ${file}_paired_R1.fastq ${file}_unpaired_R1.fastq ${file}_paired_R2.fastq ${file}_unpaired_R2.fastq ILLUMINACLIP:/groups/spart2020/adapters.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36; done
```
remove unpaired reads `rm *unpaired*`
### post-trimmomatic stats
We will be using fastqc and prinseq to generate quality-control data for the trimmed reads
```
fastqc *paired*.fastq
```
### prinseq-lite
*in cluster5*
length filtering and deduplication!!
```
module load prinseq-lite/0.20.4
for file in `cat samples.txt`; do clusterize -n 10 -m elperedo@mbl.edu -log ${file}_printseq.log prinseq-lite.pl -fastq ${file}_paired_R1.fastq -fastq2 ${file}_paired_R2.fastq -min_len 60 -min_qual_mean 20 -ns_max_n 0 -derep 12345 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_type min -trim_qual_rule lt -out_format 3 -out_good ${file}_good -out_bad null -verbose; done
```
The log file will have enough stats to complete the QC quality survey.
Clean your folder! Retain only the good-reads files; remove the files with singletons, paired, and pass in their names.
> Read more about QC and trimming here: https://hackmd.io/@elperedoStudents/B18MOJW1D
Assembly and assembly QC
---
Apparently, only megahit is efficient enough to deal with this number of reads
### megahit
1. megahit can run with a series of files; no need for concatenated files (again, saving ~400 GB of space). List files separated by `,` with no spaces; see the sketch after the command below for building the lists automatically.
2. `--k-list 79,99,119,141`: with this flag we don't run the small k-mer sizes (should be faster).
3. `--min-contig-len 3000`: only keep contigs over 3 kb.
```
module load megahit
megahit -1 ~/reads_spartina/SRR10854653_good_1.fastq,~/reads_spartina/SRR11567157_good_1.fastq,~/reads_spartina/SRR9045292_good_1.fastq,~/reads_spartina/SRR9045293_good_1.fastq,~/reads_spartina/SRR9045296_good_1.fastq,~/reads_spartina/SRR9045298_good_1.fastq -2 ~/reads_spartina/SRR10854653_good_2.fastq,~/reads_spartina/SRR11567157_good_2.fastq,~/reads_spartina/SRR9045292_good_2.fastq,~/reads_spartina/SRR9045293_good_2.fastq,~/reads_spartina/SRR9045296_good_2.fastq,~/reads_spartina/SRR9045298_good_2.fastq --k-list 79,99,119,141 -o MEGAHIT_79_141 -t 14 --min-contig-len 3000
```
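If the sample set changes, the comma-separated lists can be built automatically instead of typed by hand (a sketch; assumes the _good_ files follow the naming above):
```
# collect all forward and reverse read files into comma-separated lists
R1=$(ls ~/reads_spartina/*_good_1.fastq | paste -sd, -)
R2=$(ls ~/reads_spartina/*_good_2.fastq | paste -sd, -)
megahit -1 ${R1} -2 ${R2} --k-list 79,99,119,141 -o MEGAHIT_79_141 -t 14 --min-contig-len 3000
```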
View the first three contig headers:
`grep ">" ASSEMBLY/final_assembly.fasta | head -n3`
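Note that head -n3 shows whichever contigs come first, not the longest. To see the three longest, sort on the len= field that MEGAHIT writes into each header (a sketch, assuming the default MEGAHIT header format, e.g. `>k141_0 flag=1 multi=2.0000 len=387`):
```
# split headers on "=" so the length is the 4th field, sort numerically, descending
grep ">" ASSEMBLY/final_assembly.fasta | sort -t= -k4,4 -rn | head -n3
```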
> Read more about assemblies here: https://hackmd.io/@elperedoStudents/Sk3IDAHgP
### MetaQUAST
http://quast.sourceforge.net/metaquast
https://github.com/bxlab/metaWRAP/blob/master/Usage_tutorial.md#step-2-assembling-the-metagenomes-with-the-metawrap-assembly-module
```
metaquast.py ~/Alabama_Spartina_Megahit/MEGAHIT_79_141/*.fa -o metaQUAST --max-ref-number 0
```
### Kraken2
For more detail (including using Kraken instead of Kraken2), check here: https://hackmd.io/@elperedoStudents/BJBOB26gw
```
mkdir ~/kraken2/
cd ~/kraken2/
module load kraken2
kraken2 --threads 10 ~/Assembly/final.contigs.fa > ~/kraken2/Alabama_Spartina.assembly.kraken
## I would like to run this for each sample, including Juncus to see (maybe) diff. in abundance?
for file in `cat samples.txt`; do kraken2 --paired --threads 10 ~/reads/${file}_good_1.fastq ~/reads/${file}_good_2.fastq> ~/kraken2/Alabama.${file}.kraken; done
for file in `cat samples.txt`; do cat ~/kraken2/Alabama.${file}.kraken | cut -f 2,3 > ~/kraken2/Alabama.${file}.kraken.krona; done
for file in `cat samples.txt`; do ktImportTaxonomy ~/kraken2/Alabama.${file}.kraken.krona -o ~/kraken/Alabama.${file}.kraken.krona.html; done
firefox taxonomy.krona.html
```
**Extract taxa of interest from the assembly using Kraken2 taxonomy.**
Ask Sherly for her assembly.
```
mkdir ~/kraken2/SOB_sensuLato/
cd ~/kraken2/SOB_sensuLato/
```
```
kraken2 --use-names --threads 10 ~/Assembly/spartina.final.contigs.fa > ~/kraken2/SOB_sensuLato/Alabama.spartina.assembly.kraken
kraken2 --use-names --threads 10 /groups/spartina/metagenomes_analysis/Alabama_JGI/raw_reads_Juncus/paired/good/MEGAHIT_79_141/final.contig.fa > ~/Alabama_Spartina_Megahit/kraken2/SOB_sensuLato/Alabama.juncus.assembly.kraken
```
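SOB_sensulato.txt holds one grep -E pattern per line for the taxa of interest. An illustrative example (these names are placeholders, not the actual list used):
```
Chromatiales
Thiotrichales
Sulfurovum
```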
```
for file in `cat ~/kraken2/SOB_sensuLato/SOB_sensulato.txt`; do grep -E ${file} ~/kraken2/SOB_sensuLato/Alabama.spartina.assembly.kraken > ~/kraken2/SOB_sensuLato/Alabama.spartina.${file}.txt; done
cat ~/kraken2/SOB_sensuLato/Alabama.spartina.*.txt >~/kraken2/SOB_sensuLato/Alabama.spartina.SOB.txt
for file in `cat ~/kraken2/SOB_sensuLato/SOB_sensulato.txt`; do grep -E ${file} ~/kraken2/SOB_sensuLato/Alabama.juncus.assembly.kraken > ~/kraken2/SOB_sensuLato/Alabama.juncus.${file}.txt; done
cat ~/kraken2/SOB_sensuLato/Alabama.juncus.*.txt >~/kraken2/SOB_sensuLato/Alabama.juncus.SOB.txt
```
> Read more about assemblies QC here: https://hackmd.io/@elperedoStudents/BJBOB26gw
## Metawrap
example usage here: https://github.com/bxlab/metaWRAP/blob/master/Usage_tutorial.md
Sherly's pipeline here: https://hackmd.io/@uRX-8muUSMKhpJawtHIBMA/BkWTZN8W8
```
module load metawrap
module purge
module load jbpc
module load metawrap/1.3
```
return to normal
```
module purge
module load bioware
```
## Binning
We will be using Metawrap
https://github.com/bxlab/metaWRAP/blob/master/Usage_tutorial.md#step-4-bin-the-co-assembly-with-three-different-algorithms-with-the-binning-module
MetaWRAP offers a hybrid approach for extracting draft metagenomes (bins, MAGs) using metaBAT2, CONCOCT, and MaxBin2.
Notes: clusterize did not work for me; this needs Minnie to run because of memory constraints.
gzip -d the reads! They need to be .fastq :(
maxbin2 failing!
```
metawrap binning -o /groups/spart2020/metagenomics/Rowley_JGI/INITIAL_BINNING -t 16 -m 10 -a /groups/spart2020/metagenomics/Rowley_JGI/Rowley_final.contig_3000bp.fa --metabat2 --maxbin2 --concoct /groups/spart2020/rawData/metagenome/Rowley_Finished_Reads/*.fastq --run-checkm
```
## Refine those bins
MetaWRAP consolidates all 3 bin sets by picking the best version of each bin based on completion and redundancy (contamination) thresholds.
https://github.com/bxlab/metaWRAP/blob/master/Usage_tutorial.md#step-5-consolidate-bin-sets-with-the-bin_refinement-module
```
clusterize -n 12 -log refinement.log -m mail@mail.com metawrap bin_refinement -o ~/BIN_REFINEMENT/ -t 12 -A ~/metabat2_bins/ -B /groups/spart2020/metagenomics/Rowley_JGI/INITIAL_BINNING_MH_Rowley/maxbin2_bins/ -c 50 -x 10
```
In the output directory, you will see the three original bin folders we fed in, as well as a `metaWRAP` directory, which contains the final, consolidated bins.
**You will also see .stats files for each one of the bin directories.**
**How many "meh bins"** (based on our >50% comp., <10% cont. metric)?
```
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/concoct_bins.stats | awk '$2>50 && $3<10' | wc -l
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/maxbin2_bins.stats | awk '$2>50 && $3<10' | wc -l
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/metabat2_bins.stats | awk '$2>50 && $3<10' | wc -l
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_50_10_bins.stats | awk '$2>50 && $3<10' | wc -l
```
**how many "good bins"** (based on out >70% comp., <10% cont. metric) metaWRAP produced, we can run
(use which ever of these you need)
```
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/concoct_bins.stats | awk '$2>70 && $3<10' | wc -l
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/maxbin2_bins.stats | awk '$2>70 && $3<10' | wc -l
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/metabat2_bins.stats | awk '$2>70 && $3<10' | wc -l
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_50_10_bins.stats | awk '$2>70 && $3<10' | wc -l
```
**how many "high quality bins"** (based on out >90% comp., <5% cont. metric) metaWRAP produced, we can run
(use which ever of these you need)
```
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/concoct_bins.stats | awk '$2>90 && $3<5' | wc -l
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/maxbin2_bins.stats | awk '$2>90 && $3<5' | wc -l
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/metabat2_bins.stats | awk '$2>90 && $3<5' | wc -l
cat ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_50_10_bins.stats | awk '$2>90 && $3<5' | wc -l
```
## run DRAM for identification of sulfur-related taxa
Only for bins with completeness over **70%**.
Select the bins <70% and copy their names into a txt file (see the sketch below),
then move them to another folder:
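One way to build that list from the refinement stats (a sketch; assumes the .stats columns are bin name, completeness, and contamination, as in the awk filters above):
```
# skip the header row, print the names of bins below 70% completeness
awk 'NR>1 && $2<70 {print $1}' /users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_50_10_bins.stats > /users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_sample_lower_70.txt
```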
```
for file in `cat /users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_sample_lower_70.txt`; do mv ${file}.fa /users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_50_10_bins/low_Quality; done
```
activate and run DRAM
```
module show dram
module unload bioware
module load dram/20200625-conda
conda init bash
. ~/.bashrc
conda activate /bioware/dram-20200625-conda
```
Now, use DRAM to annotate the bins of interest.
*Needs the quotes around the `*.fa` wildcard so the shell passes it to DRAM unexpanded.*
```
DRAM.py annotate -i '/users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_50_10_bins/*.fa' -o /users/eperedo/Alabama_Spartina_Megahit/DRAM_annotation --threads 12 --verbose
```
```
DRAM.py distill -i /users/eperedo/Alabama_Spartina_Megahit/DRAM_annotation/annotations.tsv -o /users/eperedo/Alabama_Spartina_Megahit/DRAM_distill --trna_path /users/eperedo/Alabama_Spartina_Megahit/DRAM_annotation/trnas.tsv --rrna_path /users/eperedo/Alabama_Spartina_Megahit/DRAM_annotation/rrnas.tsv
```
46 bins have completeness over 70% and sulfur-oxidizing or -reducing pathways.
Let's see in which samples they are included. We will reassemble the MAGs in those samples where they are most frequent.
#### Copy the sulfur MAGs into a new folder.
```
for file in `cat ~/Alabama_Spartina_Megahit/MAGs_refined_sulfur.txt`; do cp ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_50_10_bins/${file}.fa ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_70_sulfur/. ; done
```
### Visualize the community and the extracted bins with the Blobology module
Let's use the Blobology module to project the entire assembly onto a **GC vs. Abundance plane** and annotate the contigs with taxonomy and bin information. This will not only give us an idea of how these microbial communities are structured, but will also show us our binning success in a more visual way.
To annotate the blobplot with bins via the --bins flag, you **MUST use the non-reassembled bins** produced by the Bin_refinement module, not the Reassemble_bins module.
```
metawrap blobology -a ~/Alabama_Spartina_Megahit/MEGAHIT_79_141/Spartina.final.contigs.fa -t 14 -o ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/BLOBOLOGY_sulfur_70/ --bins ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_70_sulfur/ ~/reads_spartina/*.fastq
```
You will find that the output has a number of blob plots of our communities, annotated with different levels of taxonomy or their bin membership. Note that to help with visualizing the bins, some of the plots only contain the contigs that were successfully binned (these files have .binned.blobplot. in their names).
Phyla taxonomy of the entire assembled community:

Bin membership of all the contigs:

### Find the abundances of the draft genomes (bins) across the samples
We would like to know how the extracted genomes are distributed across the samples, and in what abundance each bin is present in each sample. The Quant_bins module can give us this information. It uses Salmon (a tool conventionally used for transcript quantitation) to estimate the abundance of each scaffold in each sample, and then computes the average bin abundances.
NOTE: use the bins produced by the Bin_refinement module, not the Reassemble_bins module, and provide the entire non-binned assembly with the -a option. This will give more accurate bin abundances that are in context of the entire community.
Let's run the Quant_bins module:
```
metawrap quant_bins -b ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_70_sulfur -o ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/QUANT_BINS_sulfur_70/ -t 12 -a ~/Alabama_Spartina_Megahit/MEGAHIT_79_141/Spartina.final.contigs.fa ~/reads_spartina/*.fastq
```
The output contains several useful files. First, there is bin_abundance_heatmap.png, a quick heatmap made to visualize the bin abundances across the samples:

The raw data for this plot (as you will most likely want to make your own heatmaps to analyze) is in bin_abundance_table.tab. Note that the abundances are expressed as **genome copies per million reads** and are calculated with Salmon in a similar way to how TPM (transcripts per million) is calculated in RNA-seq analysis. As such, they should already be standardized to the individual sample size.
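To pull the most abundant bins for a given sample straight from that table (a sketch; column 2 is the first sample column, adjust -k as needed):
```
# keep the header, then sort the data rows by the first sample column, descending
(head -n1 bin_abundance_table.tab; tail -n +2 bin_abundance_table.tab | sort -t$'\t' -k2,2 -rn | head -n5)
```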
:::info
| Genomic bins | SRR9045298_good | SRR9045296_good | SRR11567157_good | SRR9045293_good | SRR10854653_good | SRR9045292_good |
|---|---|---|---|---|---|---|
| bin.134 | 10.66856 | 19.352524 | 12.968496 | 9.372385 | 8.817114 | 2.506744 |
| bin.113 | 2.104622 | 3.985442 | 3.03498 | 3.369774 | 3.335645 | 0.439683 |
| bin.183 | 5.938467 | 2.448832 | 2.211644 | 2.032508 | 0.722717 | 7.731953 |
| bin.20 | 24.097785 | 17.163352 | 21.060288 | 21.309368 | 6.118906 | 32.816795 |
| bin.105 | 1.893448 | 3.106351 | 5.303488 | 3.479289 | 6.287506 | 0.634898 |
| bin.136 | 1.298741 | 3.420994 | 2.0036105 | 1.65629 | 3.112868 | 1.2696605 |
| bin.211 | 1.915036 | 1.567263 | 1.742291 | 0.818652 | 0.325375 | 2.687242 |
| bin.168 | 3.546516 | 6.376698 | 3.339131 | 1.34176 | 4.302264 | 2.871652 |
| bin.99 | 6.847895 | 7.944122 | 6.748675 | 3.339518 | 3.754411 | 4.952922 |
:::
Finally, you can view the abundances of all the individual contigs in all the samples in the quant_files folder.
## Determine the taxonomy
Estimate the taxonomy of our refined bins with the Classify_bins module; we want the sulfur-related taxa:
```
metawrap classify_bins -b /users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_70_sulfur -o /users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_70_sulfur_BIN_CLASSIFICATION -t 12
```
We can view the final estimated taxonomy in BIN_CLASSIFICATION/bin_taxonomy.tab:
| bin | taxonomy |
|---|---|
| bin.1.orig.fa | Bacteria;Firmicutes;Clostridia;Clostridiales |
| bin.5.orig.fa | Archaea;Euryarchaeota;Methanobacteria;Methanobacteriales;Methanobacteriaceae;Methanobrevibacter;Methanobrevibacter smithii |
| bin.11.orig.fa | Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides |
| bin.2.permissive.fa | uncultured organism |
| bin.10.orig.fa | Bacteria;Firmicutes;Clostridia;Clostridiales;Clostridiaceae |
| bin.14.strict.fa | Bacteria;Firmicutes |
| bin.8.strict.fa | Bacteria |
| bin.9.permissive.fa | Bacteria |
| bin.6.permissive.fa | Bacteria;Firmicutes;Clostridia;Clostridiales |
| bin.3.permissive.fa | Bacteria;Firmicutes;Clostridia;Clostridiales;Clostridiaceae |
| bin.4.permissive.fa | Bacteria |
| bin.13.permissive.fa | Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae |
| bin.7.strict.fa | Bacteria |
As you can see, some of the bins are annotated very deeply, while others can only be classified as "Bacteria". This method is relatively trustworthy; however, it often fails to annotate organisms that are very distant from anything in the NCBI database. For these tricky bins, manually looking at marker genes (such as ribosomal proteins) can result in much more sensitive taxonomy assignment.
### Functionally annotate bins with the Annotate_bins module
Now that we have our finalized reassembled bins, we are ready to functionally annotate them for downstream functional analysis. This module simply annotates the genes with PROKKA - it cannot do the actual functional analysis for you.
Run the functional annotation module on the final, reassembled bins:
```
metawrap annotate_bins -b /users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_70_sulfur -o /users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_70_sulfur_FUNCT_ANNOT -t 10
```
You will find the functional annotations of each bin in GFF format in the FUNCT_ANNOT/bin_funct_annotations folder
> head FUNCT_ANNOT/bin_funct_annotations/bin.1.orig.gff
>
> NODE_75_length_31799_cov_0.983871 Prodigal:2.6 CDS 2866 3645 . - 0 ID=HMOHEJHL_00001;inference=ab initio prediction:Prodigal:2.6;locus_tag=HMOHEJHL_00001;product=hypothetical protein
>
> NODE_75_length_31799_cov_0.983871 Prodigal:2.6 CDS 3642 4478 . - 0 ID=HMOHEJHL_00002;inference=ab initio prediction:Prodigal:2.6;locus_tag=HMOHEJHL_00002;product=hypothetical protein
>
> NODE_75_length_31799_cov_0.983871 Prodigal:2.6 CDS 4606 5859 . - 0 ID=HMOHEJHL_00003;inference=ab initio prediction:Prodigal:2.6;locus_tag=HMOHEJHL_00003;product=hypothetical protein
>
> NODE_75_length_31799_cov_0.983871 Prodigal:2.6 CDS 5856 6575 . - 0 ID=HMOHEJHL_00004;Name=ypdB;gene=ypdB;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:P0AE39;locus_tag=HMOHEJHL_00004;product=Transcriptional regulatory protein YpdB
>
> NODE_75_length_31799_cov_0.983871 Prodigal:2.6 CDS 6603 7658 . - 0 ID=HMOHEJHL_00005;eC_number=1.1.1.261;Name=egsA;gene=egsA;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:P94527;locus_tag=HMOHEJHL_00005;product=Glycerol-1-phosphate dehydrogenase [NAD(P)+]
>
> NODE_75_length_31799_cov_0.983871 Prodigal:2.6 CDS 7843 11946 . - 0 ID=HMOHEJHL_00006;eC_number=3.2.1.78;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:A1A278;locus_tag=HMOHEJHL_00006;product=Mannan endo-1%2C4-beta-mannosidase
>
> NODE_75_length_31799_cov_0.983871 Prodigal:2.6 CDS 12597 13586 . - 0 ID=HMOHEJHL_00007;eC_number=3.2.1.89;Name=ganB_1;gene=ganB_1;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:Q65CX5;locus_tag=HMOHEJHL_00007;product=Arabinogalactan endo-beta-1%2C4-galactanase
>
> NODE_75_length_31799_cov_0.983871 Prodigal:2.6 CDS 13719 14564 . - 0 ID=HMOHEJHL_00008;Name=malG_1;gene=malG_1;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:P68183;locus_tag=HMOHEJHL_00008;product=Maltose transport system permease protein MalG
>
> NODE_75_length_31799_cov_0.983871 Prodigal:2.6 CDS 14565 15476 . - 0 ID=HMOHEJHL_00009;Name=malF_1;gene=malF_1;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:P02916;locus_tag=HMOHEJHL_00009;product=Maltose transport system permease protein MalF
>
> NODE_75_length_31799_cov_0.983871 Prodigal:2.6 CDS 15484 16704 . - 0 ID=HMOHEJHL_00010;Name=malE;gene=malE;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:P0AEY0;locus_tag=HMOHEJHL_00010;product=Maltose-binding periplasmic protein
>
You will also find the translated and untranslated predicted genes in fasta format in the FUNCT_ANNOT/bin_translated_genes and FUNCT_ANNOT/bin_untranslated_genes folders. Finally, you can find the raw PROKKA output files in FUNCT_ANNOT/prokka_out.
more: https://hackmd.io/@uRX-8muUSMKhpJawtHIBMA/BkWTZN8W8#Bins-QC
## CheckM
Load checkm
```
module load python/2.7.15-201901021238
```
[https://github.com/Ecogenomics/CheckM/wiki/Genome-Quality-Commands](https://github.com/Ecogenomics/CheckM/wiki/Genome-Quality-Commands)
### Step 1: Place bins in the reference genome tree
```
checkm tree /users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_70_sulfur /users/eperedo/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_70_sulfur/checkM/bins_70_sulfur_tree_folder -x fa -t 10
```
```
Required parameters:
bin_folder: folder containing bins (FASTA format)
out_folder: folder to write output files
Optional parameters:
-r, --reduced_tree: use reduced tree for determining lineage of each bin (suitable for 16 GB machines)
--ali: generate HMMER alignment file for each bin in ./<output folder>/bins/<bin id>/hmmer.tree.ali.txt
--nt: generate nucleotide gene sequences for each bin in ./<output folder>/bins/<bin id>/genes.fna
-g, --genes: bins contain genes as amino acids instead of nucleotide contigs
-x, --extension: extension of bins (other files in folder are ignored)
-t, --threads: number of threads
--pplacer_threads: number of threads used by pplacer (memory usage increases linearly with additional threads)
-q, --quiet: suppress console output
```
### Step 2: Assess phylogenetic markers found in each bin
`checkm tree_qa <tree folder>`
```
checkm tree_qa /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/checkM/binsAB_tree_folder -o 2 -f /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/checkM/checkM.Taxonomy.output --tab_table
```
```
Required parameters:
tree_folder: output folder specified during tree command
Optional parameters:
-o, --out_format: specifies desired output (1-5)
-f, --file: print results to file instead of the console
--tab_table: for tabular outputs, print a tab-separated values table instead of a table formatted for console output
-q, --quiet: suppress console output
```
### Step 3: Infer lineage-specific marker sets for each bin
`checkm lineage_set <tree folder> <marker file>`
```
checkm lineage_set /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/checkM/binsAB_tree_folder /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/checkM/checkM.MarkerFile.output
```
### Step 4: List available taxonomic-specific marker sets
```
checkm taxon_list > /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/checkM/checkM.taxon_list.output
```
### Step 5: Generate taxonomic-specific marker set
`checkm taxon_set <rank> <taxon> <marker file>`
```
checkm taxon_set domain Bacteria /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/checkM/checkM.taxon_set.output
```
### Step 6: Identify marker genes in bins
`checkm analyze <marker file> <bin folder> <output folder>`
```
checkm analyze /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/checkM/checkM.MarkerFile.output /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/work_files/binsAB /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/checkM/analyze -t 10 -x fa
```
### Step 7: Assess bins for contamination and completeness
`checkm qa marker_file out_folder -o 1`
```
checkm qa /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/checkM/checkM.MarkerFile.output /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/checkM/analyze -o 2 -f /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/checkM/checkM.Contamination.Completeness.output --tab_table
```
### Step 8: Genome Database Taxonomy
https://ecogenomics.github.io/GTDBTk/
`gtdbtk classify_wf --genome_dir <my_genomes> --out_dir <output_dir>`
```
module load gtdbtk
gtdbtk classify_wf --genome_dir /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/work_files/binsAB/ --out_dir /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/classify_wf --cpus 16 -x fa
```
## Appendix and FAQ
:::info
**Find this document incomplete?** Leave a comment!
:::
### Re-assemble the consolidated bin set with the Reassemble_bins module
Now that we have our final, consolidated bin set in BIN_REFINEMENT/metawrap_bins, we can try to further improve it with reassembly. The Reassemble_bins module will collect the reads belonging to each bin, and then reassemble them separately with a "permissive" and a "strict" algorithm. Only the bins that improved through reassembly will be altered in the final set.
Let us run the Reassemble_bins module with all the reads we have:
**You might need to concatenate all R1 reads and all R2 reads.**
(Or gunzip the concatenated_1 file, but this is slower; I would remove the fastq.gz files and make a new concatenated file, which takes seconds vs. how long it takes to gunzip stuff!)
```
cat /users/eperedo/Rowley_Finished_Reads/*_1.fastq> /users/eperedo/Rowley_Finished_Reads/Rowley_concatenated_1.fastq
cat /users/eperedo/Rowley_Finished_Reads/*_2.fastq> /users/eperedo/Rowley_Finished_Reads/Rowley_concatenated_2.fastq
```
now reassemble
```
metawrap reassemble_bins -o /users/eperedo/Rowley_metagenomes_subset/BIN_REASSEMBLY -1 /users/eperedo/Rowley_Finished_Reads/Rowley_concatenated_1.fastq -2 /users/eperedo/Rowley_Finished_Reads/Rowley_concatenated_2.fastq -t 16 -m 800 -c 50 -x 10 -b /users/eperedo/Rowley_metagenomes_subset/INITIAL_BINNING_MH_Rowley/BIN_REFINEMENT/metawrap_50_10_bins
```
#### individual reassemblies
*with -c 70*
```
for file in `cat samples.txt`; do metawrap reassemble_bins -o ~/Alabama_Spartina_Megahit/${file}.BIN_REASSEMBLY -1 ~/reads_spartina/${file}_good_1.fastq -2 ~/reads_spartina/${file}_good_2.fastq -t 16 -m 800 -c 70 -x 10 -b ~/Alabama_Spartina_Megahit/BIN_REFINEMENT/metawrap_50_10_bins --parallel; done
```
Looking at the output in BIN_REASSEMBLY/reassembled_bins.stats, we can see that 3 bins were improved through strict reassembly, 6 improved through permissive reassembly, and 4 bins could not be improved (.strict, .permissive, and .orig bin extensions, respectively):
| bin | completeness | contamination | GC | lineage | N50 | size | binner |
|---|---|---|---|---|---|---|---|
| bin.10.orig | 65.78 | 0.0 | 0.263 | Bacteria | 3045 | 1159966 | NA |
| bin.7.strict | 54.94 | 0.671 | 0.501 | Clostridiales | 3947 | 1474089 | NA |
| bin.4.permissive | 99.32 | 1.342 | 0.408 | Clostridiales | 72135 | 2088821 | NA |
| bin.2.permissive | 82.06 | 0.0 | 0.469 | Bacteria | 18989 | 3604843 | NA |
| bin.14.strict | 85.84 | 3.066 | 0.293 | Bacteria | 4576 | 2201824 | NA |
| bin.9.permissive | 76.74 | 2.554 | 0.425 | Selenomonadales | 3601 | 1802438 | NA |
| bin.13.permissive | 78.37 | 1.357 | 0.435 | Bacteroidales | 9887 | 3675176 | NA |
| bin.11.orig | 64.85 | 3.776 | 0.417 | Bacteroidales | 2086 | 3103352 | NA |
| bin.6.permissive | 88.04 | 1.006 | 0.371 | Clostridiales | 6288 | 2070146 | NA |
| bin.5.orig | 100.0 | 1.6 | 0.311 | Euryarchaeota | 12686 | 1705532 | NA |
| bin.3.permissive | 74.91 | 0.396 | 0.284 | Clostridiales | 16578 | 1243641 | NA |
| bin.1.orig | 57.36 | 0.0 | 0.430 | Bacteria | 4628 | 2673426 | NA |
| bin.8.strict | 83.89 | 1.342 | 0.446 | Clostridiales | 3870 | 1474833 | NA |
But how much did our bin set really improve? We can look at the BIN_REASSEMBLY/reassembly_results.png plot to compare the original and reassembled sets. We can see that the bin reassembly modestly improved the bin N50 and completion metrics, and significantly reduced contamination. Fantastic!

We can also view the CheckM plot of the final bins in `BIN_REASSEMBLY/reassembled_bins.png`:

**López Peredo, E; Pérez Castro, S**, Cardon Lab, Marine Biological Laboratory, Woods Hole, MA
Raw reads were processed with Trimmomatic and PReprocessing and INformation of SEQuence data (PRINSEQ) for adapter removal and quality filtering. Reads were then assembled using MEGAHIT. Contigs shorter than 3000 bp were removed. Binning was performed using MetaWRAP processing modules, with initial extraction (using MaxBin2, metaBAT2, and CONCOCT) and bin refinement. Bin completeness and contamination were evaluated using CheckM. Assembled genomes that were of substantial quality, defined as more than 90% genome completeness and less than 5% contamination, were further analyzed. MAG quality was reported based on Bowers et al. (2017). Taxonomies were assigned to each bin using GTDB-Tk. A maximum-likelihood phylogenetic tree of the genomes was generated using the PATRIC RAxML service. The tree was visualized using the Interactive Tree of Life (iTOL) web tool. Genomes were annotated using Distilled and Refined Annotation of Metabolism (DRAM) with default parameters.
## make a blastable database
### rename contigs
```
# prepend the bin name to each contig header (>k141_... becomes >BINNAME_k141_...)
for file in `cat list.txt`; do sed "s/>k/>${file}_k/" ${file}.fa > ${file}_new.fa ; done
```
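To spot-check the renaming before building the database (bin name illustrative, taken from the match list below):
```
# show the first 3 headers of a renamed bin
grep -m 3 ">" Alabama.Salterniflora.bin.42_new.fa
```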
Make folder for database
```
mkdir /groups/spartina/metagenomes_analysis/blastable_40bins
```
Concatenate all the renamed files:
```
cat /groups/spartina/metagenomes_analysis/*new.fa > /groups/spartina/metagenomes_analysis/blastable_40bins/40bin.fa
```
### make the blast database
In this case, I made one for all 40 bins. I took the contigs from each bin and renamed them so the bin name is included in each.
Needs a fasta file:
40bin.fa
```
makeblastdb -in 40bin.fa -dbtype nucl -out 40bins -parse_seqids
```
Test it:
```
blastn -db 40bins -query test_sequence.fasta -out test_sequence.txt.fasta.out
```
To retrieve the sequences of interest and extract the full contigs, copy the IDs with significant hits into a new file:
```
cat > test_sequence_matches.txt
Alabama.Salterniflora.bin.42_k141_9106372
Alabama.Salterniflora.bin.176_k141_347914
Alabama.Jroemerianus.bin.25_k141_5034948
Alabama.Salterniflora.bin.143_k141_640411
Alabama.Jroemerianus.bin.14_k141_4383668
Alabama.Jroemerianus.bin.32_k141_1859307
# press ctrl+d to end the cat input, then run:
blastdbcmd -db 40bins -entry_batch test_sequence_matches.txt -out test_sequence_matches.txt.fasta
```
## heatmaps
Alabama_spartina
```
module purge
module load jbpc
module load metawrap/1.3
gzip -d /groups/spart2020/paper_metagenomics_reads/Alabama/raw_reads_Spartina/*.fastq.gz
metawrap quant_bins -b /groups/spartina/metagenomes_analysis/Renamed_bins/40_bins/ -o /groups/spartina/metagenomes_analysis/heatmaps_40bins/Alabama_Spartina -t 20 -a /groups/spartina/metagenomes_analysis/Alabama_JGI/Alabama_Spartina_Megahit/MEGAHIT_79_141/Spartina.final.contigs.fa /groups/spart2020/paper_metagenomics_reads/Alabama/raw_reads_Spartina/*.fastq
gzip /groups/spart2020/paper_metagenomics_reads/Alabama/raw_reads_Spartina/*.fastq
```
Alabama_Juncus
```
module purge
module load jbpc
module load metawrap/1.3
metawrap quant_bins -b /groups/spartina/metagenomes_analysis/Renamed_bins/40_bins/ -o /groups/spartina/metagenomes_analysis/heatmaps_40bins/Alabama_Juncus -t 20 -a /groups/spartina/metagenomes_analysis/Alabama_JGI/Alabama_Juncus_Megahit/MEGAHIT_79_141/final.contigs.fa /groups/spart2020/paper_metagenomics_reads/Alabama/raw_reads_Juncus/paired/good/*.fastq
```
Rowley_Spatens
```
module purge
module load jbpc
module load metawrap/1.3
metawrap quant_bins -b /groups/spartina/metagenomes_analysis/Renamed_bins/40_bins/ -o /groups/spartina/metagenomes_analysis/heatmaps_40bins/Rowley_Spatens -t 20 -a /groups/spartina/metagenomes_analysis/Rowley_SPC/goodReads/MEGAHIT_79_141/final.contigs.fa /groups/spart2020/paper_metagenomics_reads/Rowley_Finished_Reads/Spatens_reads/*.fastq
```
Rowley_Salterniflora
```
module purge
module load jbpc
module load metawrap/1.3
metawrap quant_bins -b /groups/spartina/metagenomes_analysis/Renamed_bins/40_bins/ -o /groups/spartina/metagenomes_analysis/heatmaps_40bins/Rowley_Salterniflora -t 20 -a /workspace/eperedo/Rowley/MEGAHIT_79_141/final.contigs.fa /groups/spart2020/paper_metagenomics_reads/Rowley_Finished_Reads/Salterniflora_reads/*.fastq
```
##### updated heatmaps
screen 1
```
module purge
module load jbpc
module load metawrap/1.3
metawrap quant_bins -b /groups/spartina/metagenomes_analysis/Renamed_bins/AlJR -o /groups/spartina/metagenomes_analysis/40bin_heatmaps_new/Alabama_Juncus -t 15 -a /groups/spartina/metagenomes_analysis/Alabama_JGI/Alabama_Juncus_Megahit/MEGAHIT_79_141/final.contigs.fa /groups/spart2020/paper_metagenomics_reads/*.fastq
metawrap quant_bins -b /groups/spartina/metagenomes_analysis/Renamed_bins/RoSP -o /groups/spartina/metagenomes_analysis/40bin_heatmaps_new/Rowley_Spatens -t 15 -a /groups/spartina/metagenomes_analysis/Rowley_SPC/goodReads/MEGAHIT_79_141/final.contigs.fa /groups/spart2020/paper_metagenomics_reads/*.fastq
```
screen 2
```
module purge
module load jbpc
module load metawrap/1.3
metawrap quant_bins -b /groups/spartina/metagenomes_analysis/Renamed_bins/AlSA -o /groups/spartina/metagenomes_analysis/40bin_heatmaps_new/Alabama_Spartina -t 15 -a /groups/spartina/metagenomes_analysis/Alabama_JGI/Alabama_Spartina_Megahit/MEGAHIT_79_141/Spartina.final.contigs.fa /groups/spart2020/paper_metagenomics_reads/*.fastq
metawrap quant_bins -b /groups/spartina/metagenomes_analysis/Renamed_bins/RoSS -o /groups/spartina/metagenomes_analysis/40bin_heatmaps_new/Rowley_Salterniflora -t 15 -a /workspace/eperedo/Rowley/MEGAHIT_79_141/final.contigs.fa /groups/spart2020/paper_metagenomics_reads/*.fastq
```
# Prodigal
```
nano ./prodigalloop2.sh
```
```
#!/bin/bash
for sample in `cat mags16.txt`
do
clusterize -n 5 -log prodigal${sample}.log prodigal -i ${sample}.fa -o ${sample}.genes -a ${sample}.faa -p meta
done
```
```
chmod +x prodigalloop2.sh
./prodigalloop2.sh
```
> grep -E Deltaproteobacteria filename.kraken > Deltaproteobacteria.txt
> grep -E 'Rowley.Spatens.bin.6_' genes.faa > Rowley.Spatens.bin.6.faa
> module load aai
> aai.rb -1 bin.0.fa -2 bin.1.fa
> aai.rb -1 Alabama.Jroemerianus.bin.17.faa -2 Rowley.Salterniflora.bin.4.faa
> clusterize -n 5 -log prodigalBetaproteobacteriabacteriumSG8_39.log prodigal -i BetaproteobacteriabacteriumSG8_39.fna -o BetaproteobacteriabacteriumSG8_39.genes -a BetaproteobacteriabacteriumSG8_39.faa -p meta
> raxmlHPC-PTHREADS-SSE3 -f a -T 20 -s orthomcl16ed_p450_clans_ncbi.aln.phy -n orthomcl16ed_p450_clans_ncbi.aln_bt -m PROTGAMMABLOSUM62 -p 1234 -x 12345 -# 100
> cat *.faa > concatenated_faa.faa
> grep '^>' Alabama.Jroemerianus.bin.15.fa > AL.Jroemerianus.bin.15.txt
> ssh -L8080:localhost:8080 sperezcastro@evol5.mbl.edu
> anvi-interactive -p AlabamaJroemerianusbin15.db --server-only -P 8080