---
tags: BRAILLE
title: kraken2-bracken read classification
---

[toc]

# Conda env

```bash
conda create -n kraken22 -c conda-forge -c bioconda -c defaults -c astrobiomike kraken2=2.1.1 bracken=2.6.1 bit=1.8.28
```

# Getting diatom genomes to put in there too:

```bash
# Collect the assembly accessions of the representative Bacillariophyta genomes.
esearch -db assembly -query '"Bacillariophyta"[Organism] AND (latest[filter] AND "representative genome"[filter] AND all[filter] NOT anomalous[filter])' \
  | esummary \
  | xtract -pattern DocumentSummary -def "NA" -element AssemblyAccession > assembly-accs.txt

# Download the fasta file for each accession (5 parallel jobs).
bit-dl-ncbi-assemblies -f fasta -w assembly-accs.txt -j 5

# NOTE(review): the db-building script below reads uncompressed *.fa files from
# Bacillariophyta-NCBI-genomes-to-add-to-kraken2-db/ ; the step that puts the
# downloaded genomes there is not shown in these notes - confirm before re-running.
```

# Making db

```bash
cat make-kraken2-and-bracken-dbs.sh
```

```bash
#!/usr/bin/env bash
# Builds the kraken2 database (taxonomy + archaea, bacteria, viral, fungi and
# protozoa libraries + the locally downloaded diatom genomes), then the bracken
# database on top of it for 250-bp reads.

# Stop at the first failure so the database is never built from an incomplete
# set of downloads.
set -euo pipefail

# NOTE(review): the directory name mentions "plant", but no plant library is
# downloaded below - confirm whether that is intended.
DB="/data3/Data_Processing/mlee/ref-dbs/kraken2-arc-bac-vir-fungi-plant-protozoa-and-diatoms/"

mkdir -p "${DB}"

kraken2-build --download-taxonomy --db "${DB}" --threads 10

for library in archaea bacteria viral fungi protozoa
do
    kraken2-build --download-library "${library}" --db "${DB}" --threads 10
done

# Adding the diatom genomes.
for file in Bacillariophyta-NCBI-genomes-to-add-to-kraken2-db/*.fa
do
    # An unmatched glob stays as the literal pattern; fail loudly instead of
    # handing kraken2-build a file that does not exist.
    if [[ ! -e "${file}" ]]; then
        printf "No diatom fasta files found matching: %s\n" "${file}" >&2
        exit 1
    fi

    kraken2-build --add-to-library "${file}" --db "${DB}" --threads 10
done

kraken2-build --build --db "${DB}" --threads 10

# bracken-build has to run before the --clean step below.
bracken-build -d "${DB}" -t 10 -l 250

kraken2-build --clean --db "${DB}" --threads 10
```

# Running kraken2 and bracken

```bash
cat run-kraken2-and-bracken.sh
```

```bash
#!/usr/bin/env bash
# Runs kraken2 on the paired, trimmed reads of every sample listed in
# samples.txt (one sample ID per line), then bracken on each kraken2 report at
# every rank from domain (D) to species (S).
#
# `set -e` is deliberately not used here, so a failure on one sample/rank does
# not stop the remaining ones.

DB="/data3/Data_Processing/mlee/ref-dbs/kraken2-arc-bac-vir-fungi-plant-protozoa-and-diatoms/"

out_dir="kraken2-bracken-outputs-db-with-arc-bac-vir-fungi-protozoa-and-diatoms/"
kraken2_dir="${out_dir}kraken2-outputs"
bracken_dir="${out_dir}bracken-outputs"

mkdir -p "${kraken2_dir}" "${bracken_dir}"

for sample in $(cat samples.txt)
do

    # The sample name is passed as an argument, not as part of the format
    # string, so any "%" or "\" in it is printed literally.
    printf "\tOn sample: %s\n\n" "${sample}"

    time kraken2 --db "${DB}" --threads 5 \
        --output "${kraken2_dir}/${sample}-kraken2-out.txt" \
        --report "${kraken2_dir}/${sample}-kraken2-report.txt" \
        --paired "trimmed-data/${sample}_R1_trimmed.fq.gz" "trimmed-data/${sample}_R2_trimmed.fq.gz"

    for rank in D P C O F G S
    do

        # -r 250 matches the read length given to bracken-build (-l 250).
        time bracken -r 250 -d "${DB}" \
            -i "${kraken2_dir}/${sample}-kraken2-report.txt" \
            -o "${bracken_dir}/${sample}-bracken-out-${rank}.tsv" \
            -l "${rank}"

    done

done
```

# Formatting, summarizing, and merging kraken2 outputs

Not really going to use the bracken outputs, because I think bracken is being given an impossible task and the results can therefore end up wildly misleading, e.g. see [here](https://hackmd.io/@astrobiomike/tax-probing) if interested for one deep dive.

```bash
cat process-and-merge-kraken2-outputs.sh
```

```bash
#!/usr/bin/env bash
# Converts each sample's kraken2 output into a taxon-summary table, then
# combines all of the per-sample tables into one.

results_dir="kraken2-bracken-outputs-db-with-arc-bac-vir-fungi-protozoa-and-diatoms/kraken2-outputs"

# Built in the same loop so the input files and sample names handed to the
# combine step are always in the same order.
tax_files=()
sample_names=()

for sample in $(cat samples.txt)
do

    printf "\n\tWorking on sample: %s\n" "${sample}"

    tax_file="${results_dir}/${sample}-kraken2-tax.tsv"

    bit-kraken2-to-taxon-summaries -i "${results_dir}/${sample}-kraken2-out.txt" -o "${tax_file}"

    tax_files+=("${tax_file}")
    sample_names+=("${sample}")

done

# combining
# The per-sample tables live in ${results_dir}, so they are passed with that
# path rather than as bare file names.
names=$(IFS=,; printf '%s' "${sample_names[*]}")

bit-combine-kraken2-taxon-summaries -i "${tax_files[@]}" -n "${names}" -o All-samples-combined-kraken2-tax.tsv
```