Try   HackMD

Annotating plaac-positive proteins with InterProScan and KOs

Working with data from 1-Jun-2021, UniProt "Standard" proteomes only. The files used below are from this drive.


NOTE
The code below assumes the directory structure of the Google Drive, with commands being run from within the sub-directory "further-annotating-plaac-positive-proteins". If you are not working in that structure, the code below needs to be modified to point to the correct file paths.


Environment setup

# mamba is needed for the `mamba create` commands used for the environments below
conda install --channel conda-forge mamba

bit

# creates the "bit" environment with bit pinned to 1.8.42; channels are searched in the order listed
mamba create -n bit -c conda-forge -c bioconda -c defaults -c astrobiomike bit=1.8.42

Interproscan

# the environment is named "interproscan5" -- run-interproscan.sh activates it by that name
mamba create -n interproscan5 -c conda-forge -c bioconda -c defaults interproscan=5.54_87.0

5.54_87.0

# the environment was created as "interproscan5", so it has to be activated by that name
conda activate interproscan5

# setting up reference db
# (the conda package's data directory is swapped for the one shipped in the full release tarball)
wget http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.54_87.0/interproscan-5.54_87.0-64-bit.tar.gz

tar -pxvzf interproscan-5.54_87.0-64-bit.tar.gz

# ${CONDA_PREFIX:?} aborts the command if no environment is active, rather than
# letting the path expand to "/share/InterProScan/data/"
rm -rf "${CONDA_PREFIX:?}/share/InterProScan/data/"

mv interproscan-5.54_87.0/data/ "${CONDA_PREFIX:?}/share/InterProScan"

# the unpacked release and the tarball are no longer needed once data/ has been moved
rm -rf interproscan-5.54_87.0/ interproscan-5.54_87.0-64-bit.tar.gz

# test
interproscan.sh -i "${CONDA_PREFIX}/share/InterProScan/test_all_appl.fasta" -f tsv

# same test with -dp added (per the InterProScan docs this disables the pre-calculated
# match lookup, so everything is computed locally)
interproscan.sh -i "${CONDA_PREFIX}/share/InterProScan/test_all_appl.fasta" -f tsv -dp

Setting InterProScan to automatically delete its working files after finishing (note that this only deletes the files, not the temp directory it creates):

# doing in a way that works on typical darwin sed also
# (no in-place flag: write an edited copy next to the original, then swap it in)
ips_props="${CONDA_PREFIX:?}/share/InterProScan/interproscan.properties"
sed 's/delete.temporary.directory.on.completion=false/delete.temporary.directory.on.completion=true/' "${ips_props}" > "${ips_props}.tmp" && mv "${ips_props}.tmp" "${ips_props}"

KOFamScan

mamba create --yes --name kofamscan -c conda-forge -c bioconda -c defaults kofamscan=1.3.0 hmmer=3.3.0

# getting ref db, need to point to these when running
# (both downloads are saved into the current directory under their remote names)
curl --location --remote-name ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz
gunzip ko_list.gz

# the tarball is only removed if it unpacked successfully
curl --location --remote-name ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz
tar -x -z -v -f profiles.tar.gz && rm profiles.tar.gz

Getting fasta files of plaac-positive proteins

mkdir archaea bacteria eukarya

conda activate bit

# the same extraction is run for each domain: the accessions listed in that domain's
# plaac-positive file (-w) are looked up in its proteome fasta (-i) and written to -o
for domain in archaea bacteria eukarya
do
    bit-parse-fasta-by-headers -i "../${domain}/reference-genome-info/${domain}-proteomes.faa.gz" \
                               -w "../${domain}/plaac-core-score-0/${domain}-plaac-core-score-0-positive-protein-accs.txt" \
                               -o "${domain}/${domain}-plaac-positive-seqs.faa" --gz
done

Running annotations

Scripts are below and in the google drive in the "helper-scripts" subdirectory.

# KO annotation of archaea, bacteria, and eukarya (script contents are shown below)
bash helper-scripts/run-kofamscan.sh

# InterProScan annotation of archaea and bacteria, then eukarya in four parts (script contents are shown below)
bash helper-scripts/run-interproscan.sh

Scripts

run-kofamscan.sh

#!/usr/bin/env bash
#
# run-kofamscan.sh
#
# Annotates the plaac-positive proteins of each domain (archaea, bacteria, eukarya)
# with KOFamScan, then filters the raw output with bit-filter-KOFamScan-results.
#
# Expects to be run from a directory that holds:
#   - <domain>/<domain>-plaac-positive-seqs.faa for each domain
#   - the KOFamScan reference files "profiles/" and "ko_list"
#
# Writes <domain>/<domain>-ko-annotations/<domain>-plaac-positive-KO-annots.tsv

set -e

# makes `conda activate` available inside this non-interactive script
eval "$(conda shell.bash hook)"

for domain in archaea bacteria eukarya
do

    printf "\n\n\tDoing %s\n\n" "${domain}"

    out_dir="${domain}/${domain}-ko-annotations"
    raw_annots="${out_dir}/${domain}-ko-annots.tmp"

    mkdir -p "${out_dir}"

    conda activate kofamscan

    exec_annotation -p profiles/ -k ko_list --cpu 50 -f detail-tsv \
                    -o "${raw_annots}" \
                    "${domain}/${domain}-plaac-positive-seqs.faa"

    # presumably the working directory exec_annotation leaves in the current directory -- confirm
    rm -rf tmp

    conda deactivate

    conda activate bit

    bit-filter-KOFamScan-results -i "${raw_annots}" \
                                 -o "${out_dir}/${domain}-plaac-positive-KO-annots.tsv"

    # only the filtered table is kept
    rm "${raw_annots}"

done

run-interproscan.sh

#!/usr/bin/env bash
#
# run-interproscan.sh
#
# Runs InterProScan on the plaac-positive proteins of each domain. Archaea and
# bacteria are each run in one go; eukarya is split into four parts first.
#
# Expects to be run from a directory that holds
# <domain>/<domain>-plaac-positive-seqs.faa for archaea, bacteria, and eukarya.
#
# Writes tsv results and a stderr log per run into <domain>/<domain>-interproscan-out/

set -e

# makes `conda activate` available inside this non-interactive script
eval "$(conda shell.bash hook)"

conda activate interproscan5

for domain in archaea bacteria
do

    printf "\n\n\tDoing %s\n\n" "${domain}"

    out_dir="${domain}/${domain}-interproscan-out"

    mkdir -p "${out_dir}"

    interproscan.sh --cpu 20 --goterms --disable-residue-annot -f tsv \
                    -i "${domain}/${domain}-plaac-positive-seqs.faa" \
                    -o "${out_dir}/${domain}-interpro-out.tsv" \
                    2> "${out_dir}/${domain}-interpro-stderr.log"

    # clearing the temp directory left in the current directory after the run
    rm -rf temp/

done

# splitting euks because it's a lot of seqs
# NOTE(review): the split points are hard-coded line numbers for a 421,018-line file.
# They are all even, which keeps records whole only if every sequence sits on a
# single line (header + one sequence line) -- confirm before reusing on other input.
euk_seqs="eukarya/eukarya-plaac-positive-seqs"

head -n 110000 "${euk_seqs}.faa" > "${euk_seqs}-p1.faa"
sed -n '110001,210000p' "${euk_seqs}.faa" > "${euk_seqs}-p2.faa"
sed -n '210001,310000p' "${euk_seqs}.faa" > "${euk_seqs}-p3.faa"
sed -n '310001,421018p' "${euk_seqs}.faa" > "${euk_seqs}-p4.faa"

# the loop above only creates output directories for archaea and bacteria, so the
# eukarya one has to be made here or the stderr redirect below fails
euk_out_dir="eukarya/eukarya-interproscan-out"

mkdir -p "${euk_out_dir}"

for part in p1 p2 p3 p4
do

    printf "\n\n\tDoing eukarya %s\n\n" "${part}"

    interproscan.sh --cpu 20 --goterms --disable-residue-annot -f tsv \
                    -i "${euk_seqs}-${part}.faa" \
                    -o "${euk_out_dir}/eukarya-interpro-out-${part}.tsv" \
                    2> "${euk_out_dir}/eukarya-interpro-${part}-stderr.log"

    rm -rf temp

done