# Genome Assembly/Resequencing/Analysis Pipeline
##Adapted from https://github.com/SmithsonianWorkshops/SMSC_2023_Conservation_Genomics.git
####Building the Assembly
# Upload raw data files to run on supercomputer
scp -r /local/directory/path/to/files/ user@qb.loni.org:/path/to/folder
##scp securely copies files
##run this from your local machine; if you're logged into LONI, log out first
# Run FastQC on all samples to check quality of reads
##if FastQC is loaded from conda (or as a module), you can call fastqc directly and drop the /path/to/FastQC/ prefix used below
for i in /path/to/files/*.fq.gz; do /path/to/FastQC/fastqc $i; done
# FastQC creates fastqc.html files that need to be downloaded to computer to view
scp user@qb.loni.org:/path/to/files/*.html /local/directory/
##scp securely copies files
##run this from your local machine; if you're logged into LONI, log out first
**OR**
module load ffsend
ffsend upload /path/to/files/*.html
##This will create a url that you can copy into your browser and download selected files to your desktop
*** You would not trim HiFi or Hi-C data, because the goal is to keep the long reads intact and trimming would shorten them. FastQC in this instance is mainly informational, or grounds to complain to the sequencing facility if you get back really poor data. ***
# Hifiasm Assembly (with HiFi files)
module load hifiasm
hifiasm -t 32 -o file.asm read_1.fq.gz read_2.fq.gz
##fast haplotype-resolved de novo assembler for PacBio HiFi reads
##-t: number of threads, -o output prefix; uses about 300G total memory
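##A job this size should run through the scheduler rather than on a login node. Below is a minimal batch-script sketch, assuming a SLURM scheduler (as on QB3); the memory, time, and allocation values are placeholders to adjust:
#!/bin/bash
#SBATCH --job-name=hifiasm_asm
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=32
#SBATCH --mem=340G
#SBATCH --time=48:00:00
#SBATCH --account=your_allocation
module load hifiasm
hifiasm -t 32 -o file.asm read_1.fq.gz read_2.fq.gz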
# Hifiasm Assembly (with Hi-C files)
module load hifiasm
hifiasm -t 32 -o file.asm --h1 hic_read_1.fq.gz --h2 hic_read_2.fq.gz hifi_reads.fq.gz
##Hi-C-integrated haplotype-resolved assembly: --h1/--h2 take the Hi-C read pairs, and the HiFi reads are still supplied as the positional input
##-t: number of threads, -o output prefix; uses about 300G total memory
# Convert primary assembly to fasta file format
cd /path/to/assembly
awk '/^S/{print ">"$2;print $3}' assembly.p_ctg.gfa > assembly.p_ctg.fa
# Check finished assembly stats with assembly_stats
module load assembly_stats
assembly_stats /path/to/assembly.fa > assembly_stats.out
##gives info like mean contig size, GC content, longest and shortest contig size, N50, L50, scaffold stats etc.
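##For a quick independent cross-check of the N50 reported by assembly_stats, the contig lengths can be tallied with awk and sort (a sketch; works on wrapped or unwrapped FASTA):
awk '/^>/{if(l)print l; l=0; next}{l+=length($0)}END{print l}' /path/to/assembly.fa | sort -rn > contig_lengths.txt
total=$(awk '{s+=$1}END{print s}' contig_lengths.txt)
awk -v half=$((total/2)) '{s+=$1; if(s>=half){print "N50:", $1; exit}}' contig_lengths.txt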
####Assembly Quality Check
# Run BUSCO to assess completeness of genome assembly
module load busco
busco -i /path/to/assembly.fa --auto-lineage -c 10 -m genome -o /path/to/output
##if you have clade information for your assembly you can replace --auto-lineage with -l <lineage>, e.g. -l mammalia_odb10 (BUSCO will automatically download the lineage database from its website); --auto-lineage-prok or --auto-lineage-euk restrict the automatic lineage search to prokaryotes or eukaryotes; -m is the mode (genome, transcriptome, or proteins), -c number of CPUs, -i input, -o output
# Blast assembly with blastn to assess genome assembly quality (output needed to run BlobTools below)
module load blast
blastn -db /path/to/dv/ncbi/db/latest_v4/nt/nt -query /path/to/assembly.fa -outfmt "6 qseqid staxids bitscore std" -max_target_seqs 20 -max_hsps 1 -evalue 1e-20 -num_threads 10 -out assembly_blast.out
##-db database (the ncbi nt database must be downloaded first), -query genome you want to query, -outfmt output format, -max_target_seqs maximum number of target sequences to keep (the higher the number the longer the run), -max_hsps maximum number of alignments to keep per subject, -evalue e-value cutoff for a hit to be kept, -num_threads number of threads, -out output
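##The nt database itself can be fetched with the update_blastdb.pl script that ships with BLAST+ (a sketch; the download is very large, so point it at a scratch or shared database directory):
cd /path/to/db/directory
update_blastdb.pl --decompress nt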
# Map raw reads to genome assembly with minimap2
module load minimap2
minimap2 -ax map-hifi -t 20 /path/to/assembly.fa /path/to/read1.fq.gz /path/to/read2.fq.gz | samtools view -b | samtools sort -@20 -O BAM -o reads_sorted.bam
##-ax selects the preset used to map reads to the genome (map-hifi for HiFi reads); -t number of threads, -@ number of threads for samtools, -b convert sam to bam, -O output format, -o output file
##pipe to samtools view to convert the output from a sam to a bam file, then pipe to samtools sort to sort the reads and write the bam
# Run Blobtools to detect contamination
module load blobtools
blobtools create -i /path/to/assembly.fa -b /path/to/mapped.bam -t /path/to/blast.out -o blobplot
##blobtools create creates a blobtools database; -i input assembly -b mapped bam files to input assembly -t blast output -o output
# Run Blobtools plot to visualize coverage, GC proportion, and percentage of mapped assembly
module load blobtools
blobtools plot -i /path/to/blobplot.json
##-i input from blobplot create output
# Blobtools plot creates png files that need to be downloaded to computer to view
scp user@qb.loni.org:/path/to/files/*.png /local/directory/
##scp securely copies files
##run this from your local machine; if you're logged into LONI, log out first
**OR**
module load ffsend
ffsend upload /path/to/files/*.png
##This will create a url that you can copy into your browser and download selected files to your desktop
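##Once contaminant contigs show up in the blobplot, one way to remove them is to dump the per-contig table with blobtools view and keep only on-target contigs with seqtk. A sketch, assuming the phylum assignment sits in column 6 of the table (the column layout varies by blobtools version, so check the table header first):
blobtools view -i /path/to/blobplot.json -o blobplot
##writes a per-contig table (name, length, GC, coverage, taxonomy)
grep -v '^#' blobplot*.table.txt | awk '$6 == "Chordata" || $6 == "no-hit" {print $1}' > keep_contigs.txt
seqtk subseq /path/to/assembly.fa keep_contigs.txt > assembly.decontam.fa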
####Genome Annotation
# Identify repeats and build repeat database with RepeatModeler
module load bio/repeatmodeler
BuildDatabase -name database_name /path/to/assembly/fasta/file
##-name is the name you want to give your database, e.g. clouded_leopard
##followed by your newly assembled reference genome
# Run RepeatModeler using the repeat database
module load bio/repeatmodeler
RepeatModeler -database /path/to/repeat/database -pa 36 -engine ncbi > out.log
##database is the repeat database created in the previous step
##pa is the number of CPUs
##engine the name of the search engine (note: this may vary according to version of RepeatModeler)
##this is a high memory job; can take several days
##output is a consensi.fa.classified file (a complete database to be used in RepeatMasker)
# Run RepeatMasker
module load bio/repeatmasker
RepeatMasker -pa 30 -xsmall -gff -lib consensi.fa.classified -dir /path/to/output/directory /path/to/assembly/fasta/file
##pa is the number of CPUs
##xsmall stands for soft-masking
##gff is the output format of the annotated repeats
##lib is the database to base the repeat-masking on
##dir is directory to write the output to
##followed by your newly assembled reference genome
##there will be 3 outputs:
1. .fasta.tbl (summary information about the repetitive elements)
2. .fasta.masked (the masked assembly; in our case, soft-masked)
3. .fasta.out (detailed information about the repetitive elements, including coordinates, repeat type, and size)
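##To see what fraction of the assembly was masked, the summary line can be pulled straight from the .tbl output (assumes standard RepeatMasker table formatting):
grep "bases masked" /path/to/output/directory/*.tbl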
# Alternative to above step (also uses RepeatMasker)
module load bio/repeatmasker
RepeatMasker -species mammalia -pa $NSLOTS -xsmall -dir /path/to/output/directory /path/to/assembly/fasta/file
##species is the taxonomic database (options available here https://www.girinst.org/repbase/update/browse.php)
##pa is number of CPUs
##xsmall is soft-masking
##dir is directory to write the output to
##followed by your newly assembled reference genome
##same kind of outputs as above step
# Run GeMoMa to infer annotation of protein-coding genes (using annotated reference genomes)
##first step would be to download some reference genomes (taxa_assembly.fasta) and their associated annotation (taxa_annotation.gff) from GenBank
module load bio/gemoma/1.9
set maxHeapSize 400000
GeMoMa -Xmx400G GeMoMaPipeline threads=$NSLOTS outdir=/path/to/output/directory GeMoMa.Score=ReAlign AnnotationFinalizer.r=NO p=false o=true t=/path/to/assembly/fasta/file s=own i=cat a=/path/to/reference_annotation.gff g=/path/to/reference_genome.fasta
##can repeat flags s, i, a, and g for desired "reference genomes" that are self-created or downloaded from GenBank
##threads is number of CPUs
##outdir is the directory to write the output to
##t is the path to your newly assembled genome (the target to annotate)
##s specifies that you are providing your "own" reference genomes
##i lets you provide an ID for each reference organism
##a is the path to an individual reference annotation (GFF)
##g is the path to the matching reference genome (FASTA)
##this is a high memory job ~480GB
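##For example, with two downloaded references the s/i/a/g flags simply repeat once per reference (a sketch; the IDs and paths are placeholders):
GeMoMa -Xmx400G GeMoMaPipeline threads=$NSLOTS outdir=/path/to/output/directory GeMoMa.Score=ReAlign AnnotationFinalizer.r=NO p=false o=true t=/path/to/assembly/fasta/file s=own i=cat a=/path/to/cat_annotation.gff g=/path/to/cat_assembly.fasta s=own i=tiger a=/path/to/tiger_annotation.gff g=/path/to/tiger_assembly.fasta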
####Resequencing Analysis
####Assumes you have samples that have been quality-checked and trimmed (as needed)
# Map samples to Reference Genome and convert to .bam format
module load bio/minimap2
minimap2 -ax sr -t 20 /path/to/assembly/fasta/file /path/to/read_1/fastq.gz /path/to/read_2/fastq.gz | samtools sort -@20 -O BAM -o out.bam
##ax is the preset configuration to map illumina short reads (sr) to genomes
##t is number of threads to use
##followed by reference file
##read 1 of sample
##read 2 of sample
##minimap2 outputs a .sam file (one sam file per sample)
##samtools sort both converts sam to bam and sorts bam file
##@ is the number of threads to use
##O is output format
##o is name of output
##mapping and samtools will take time!!
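##With many samples, the mapping step is easy to loop (a sketch; assumes paired files named <sample>_1.fq.gz and <sample>_2.fq.gz):
for r1 in /path/to/reads/*_1.fq.gz; do
r2=${r1%_1.fq.gz}_2.fq.gz
sample=$(basename $r1 _1.fq.gz)
minimap2 -ax sr -t 20 /path/to/assembly/fasta/file $r1 $r2 | samtools sort -@20 -O BAM -o ${sample}_sorted.bam
done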
# Mark PCR Duplicates with PicardTools (GATK)
module load bio/gatk/4.1.3.0
rungatk MarkDuplicates -I=sample_sorted.bam -O=sample_duplMarked.bam -M=sorted_reads_duplMarked.metrics
##I is the input (sorted bam file)
##O is the output (duplicate-marked bam)
##M is the output for the metrics file
# Add or Replace Read Groups with GATK
module load bio/gatk/4.1.3.0
rungatk AddOrReplaceReadGroups -I=sample_duplMarked.bam -O=sample_duplMarked_SM.bam -ID=1 -LB=lib1 -PL=ILLUMINA -PU=unit1 -SM=sample_name
##I is the input (duplicate-marked bam)
##O is the output (SM.bam)
##ID is the Read Group ID (e.g. lane number)
##LB is library ID
##PL is sequencing platform
##PU is the platform unit (run/flowcell/lane identifier)
##SM is sample name
# Create index for reference genome and all .bam files
module load bio/gatk/4.1.3.0
module load bio/samtools/1.9
rungatk CreateSequenceDictionary -R=/path/to/assembly/fasta/file -O=output.dict
samtools faidx /path/to/assembly/fasta/file
samtools index sample_duplMarked_SM.bam
##R is the reference genome
##O is the output reference index dictionary
# Create gvcf files for each sample
module load bio/gatk/4.1.3.0
rungatk HaplotypeCaller -I sample_duplMarked_SM.bam -R /path/to/assembly/fasta/file -ERC GVCF -O sample.g.vcf.gz
##I is input
##R is reference genome
##-ERC GVCF emits per-site reference confidence (a gVCF); this is more efficient and recommended for large numbers of samples, and therefore more scalable
##O is output
##This is a memory- and time-intensive step; it can be broken up into groups of samples (see the sketch below)
##Try multi-threading
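##A per-sample loop is one way to break the job up; GATK4's HaplotypeCaller also accepts --native-pair-hmm-threads for modest multi-threading (a sketch; adjust the thread count to your allocation):
for bam in *_duplMarked_SM.bam; do
sample=$(basename $bam _duplMarked_SM.bam)
rungatk HaplotypeCaller -I $bam -R /path/to/assembly/fasta/file -ERC GVCF --native-pair-hmm-threads 4 -O ${sample}.g.vcf.gz
done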
# Index gvcf file
qrsh
module load bioinformatics/htslib/1.9
tabix -p vcf sample.g.vcf.gz
##qrsh switches to an interactive node (check whether LONI has this option)
# Combine gvcf files
module load bio/gatk/4.1.3.0
rungatk CombineGVCFs -R /path/to/assembly/fasta/file \
-V sample1.g.vcf.gz \
-V sample2.g.vcf.gz \
-V sample3.g.vcf.gz \
-O cohort.g.vcf.gz
##R is the reference genome
##V are your gvcf samples to combine
##O is output
##This is also a memory and time intensive step
##May need to ask for an extension on QB3
# Genotype combined gvcf file
module load bio/gatk/4.1.3.0
rungatk GenotypeGVCFs \
-R /path/to/assembly/fasta/file \
-V cohort.g.vcf.gz \
-O cohort.vcf.gz
##R is the reference genome
##V is the combined gvcf
##O is output (genotyped and combined vcf)
##Memory and time intensive step
# Filter variants
module load bio/gatk/4.1.3.0
rungatk VariantFiltration \
-R /path/to/assembly/fasta/file \
-V cohort.vcf.gz \
-O cohort_varfilter.vcf \
--filter-name "Low_depth10" \
--filter-expression "DP < 10"
##R is the reference genome
##V is the genotyped and combined vcf
##O is output (filtered vcf)
##Add in your preferred filtration parameters
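##As a starting point, the GATK-recommended hard filters for SNPs can sit alongside the depth filter (a sketch; tune the thresholds to your own data):
rungatk VariantFiltration \
-R /path/to/assembly/fasta/file \
-V cohort.vcf.gz \
-O cohort_varfilter.vcf \
--filter-name "Low_depth10" --filter-expression "DP < 10" \
--filter-name "QD2" --filter-expression "QD < 2.0" \
--filter-name "FS60" --filter-expression "FS > 60.0" \
--filter-name "MQ40" --filter-expression "MQ < 40.0"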
####Analyses to run with final, clean VCF
# PCA
In a nutshell...
1. Prune vcf
2. Create PCA
3. Plot PCA in R
1. Prune VCF
qrsh
module load bio/plink/
plink --vcf /path/to/vcf/file.vcf --double-id --allow-extra-chr --set-missing-var-ids @:# --indep-pairwise 50 10 0.1 --out output_name_plink
##qrsh takes you to an interactive node
##vcf path to your vcf file
##double-id tells plink to set both the family ID and the within-family ID as the sample ID
##allow-extra-chr allows additional chromosomes beyond the human chromosome set
##set-missing-var-ids sets a variant ID for our SNPs. Human data sets often have annotated SNP names and so plink will look for these. We do not have them so instead we set ours to default to chromosome:position which can be achieved in plink by setting the option @:#.
##indep-pairwise performs the linkage pruning. The first argument, 50, sets the window size to 50 variants (append "kb" for a kilobase window); the second argument, 10, is the window step size, i.e. the window shifts 10 variants each time linkage is calculated. Finally, we set an r2 threshold, the level of linkage we are willing to tolerate: here we prune any variant with an r2 greater than 0.1 with another variant in the window.
##outputs include a .prune.in file (variants to keep) and a .prune.out file (variants pruned)
2. Create PCA
plink --vcf /path/to/vcf/file.vcf --double-id --allow-extra-chr --set-missing-var-ids @:# \
--extract output_plink.prune.in \
--make-bed --pca --out output_pca
##extract tells plink to extract only these positions from our vcf
##make-bed creates a bed file and a bim file (useful for ADMIXTURE)
##pca tells plink to create a Principal Component Analysis (PCA)
##outputs will be .bed file, eigenval file, and eigenvec file
3. Plot PCA in R
library(tidyverse)
setwd("C:/Users/annac/Desktop/Grad School/Classes/Bioinformatics for Conservation Genomics/Files")
getwd()
##Reading in eigen files
pca<-read_table2("clouded_leopard_pca.eigenvec", col_names = FALSE)
eigenval<-scan('clouded_leopard_pca.eigenval')
##Cleaning up table
pca <- pca[,-1]
##Set names
names(pca)[1]<-"ind"
names(pca)[2:ncol(pca)] <- paste0("PC", 1:(ncol(pca)-1))
##Organize individuals by name and population
# name
nm <- rep(NA, length(pca$ind))
nm[grep("NN114296", pca$ind)] <- "pepe"
nm[grep("NN114297", pca$ind)] <- "rosa"
nm[grep("NN114393", pca$ind)] <- "shakiral"
nm[grep("NN115950", pca$ind)] <- "carlos"
nm[grep("NN190240", pca$ind)] <- "mike"
# population
pop <- rep(NA, length(pca$ind))
pop[grep("NN114296", pca$ind)] <- "cl"
pop[grep("NN114297", pca$ind)] <- "cl"
pop[grep("NN114393", pca$ind)] <- "cl"
pop[grep("NN115950", pca$ind)] <- "cl"
pop[grep("NN190240", pca$ind)] <- "cl"
##Create 3rd vector so we can plot with different colors
##and separated by both color and population
nm_pop <- paste0(nm, "_", pop)
##Reorganize dataframe with all the new variables
pca2 <- as_tibble(data.frame(pca, nm, pop, nm_pop))
##Convert eigenvalues into percentages
##i.e. variance explained by each principal component
pve <- data.frame(PC = 1:5, pve = eigenval/sum(eigenval)*100)
##Plot eigenvalues
plot_pve <- ggplot(pve, aes(PC, pve)) + geom_bar(stat = "identity")
plot_pve <- plot_pve + ylab("% of variance expl") +
theme_classic()
plot_pve
##Looks like PC1 and PC2 explain the most variance (the percentages sum to 100)
##Calculate cumulative variance explained
cumsum_cal<-cumsum(pve$pve)
cumsum_cal
##Plot PC1 and PC2
pca_plot <- ggplot(pca2, aes(PC1, PC2, col = nm, shape = pop)) + geom_point(size = 3)
pca_plot <- pca_plot + scale_colour_manual(values = c("red", "blue", "green", "purple","black"))
pca_plot <- pca_plot + coord_equal() + theme_light()
pca_plot <- pca_plot + xlab(paste0("PC1 (", signif(pve$pve[1], 3), "%)")) + ylab(paste0("PC2 (", signif(pve$pve[2], 3), "%)"))
pca_plot
# ADMIXTURE (calculates individual and population ancestries)
In a nutshell:
1. Clean up the bim file (from plink)
2. Run ADMIXTURE
3. Run a loop to test out different values of K
4. Identify best values of K
5. Extract a list of individuals used in the analysis for plotting
6. Plot ADMIXTURE results in R
1. Clean up the bim file (from plink)
awk '{$1=0;print $0}' sample.bim > sample.bim.tmp
mv sample.bim.tmp sample.bim
##Replaces the first column of the bim file with a 0 (zero)
##and renames the bim file to match the bed file
2. Run ADMIXTURE
qrsh
module load bio/admixture
admixture --cv sample.bed 2 > log2.out
##qrsh changes you to an interactive node
##above code tests ancestry assuming two genomic clusters
3. Run a loop to test out different values of K
#!/bin/sh
for i in {2..15}
do
admixture --cv sample.bed $i > log${i}.out
done
##output is 2 files: the .Q file contains cluster assignments for each individual,
##and the .P file contains population allele frequencies for each SNP
4. Identify best values of K
grep "CV" *out | awk '{print $3,$4}' | sed -e 's/(//;s/)//;s/://;s/K=//' > sample.cv.error
##the best value of K will be the lowest cross-validation error
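##The best K can then be read straight off the file (lowest error in the second column):
sort -k2 -n sample.cv.error | head -n 1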
5. Extract a list of individuals used in the analysis for plotting
awk '{split($1,name,"."); print $1,name[2]}' sample.nosex > sample.list
6. Plot ADMIXTURE results in R
#read .Q file
tbl<-read.table("efish.10.Q")
#Simple plot
barplot(t(as.matrix(tbl)), col=rainbow(10),
xlab="Individual #", ylab="Ancestry", border=NA)
# PSMC
##PSMC (Pairwise Sequentially Markovian Coalescent) is a widely used method for inferring the demographic history of a population from genomic data. It is grounded in coalescent theory, a framework for understanding how genetic diversity is shaped by population history. By modeling the coalescent process between the two alleles of a single diploid genome, PSMC provides insights into changes in effective population size over time.
In a nutshell:
1. Build a consensus sequence from the bam files
2. Convert to psmc format
3. Run PSMC
4. Plot results
1. Build a consensus sequence from the bam file (bcftools mpileup accepts multiple BAMs, but for PSMC use a single diploid individual per consensus)
module load bioinformatics/bcftools
bcftools mpileup -C50 -uf <reference_genome> <bam_file> | bcftools call -c | vcfutils.pl vcf2fq -d 10 -D 100 | gzip > <output.fq.gz>
##here we pipe mpileup, call, vcf2fq, and zipping
##bcftools mpileup generates a textual pileup format of the input bam file(s)
##C50 applies a coefficient to adjust the base alignment quality
##u flag outputs the results in uncompressed BCF format, which keeps the stream efficient when piping to bcftools call
##bcftools call performs variant calling on the input data received from the bcftools mpileup command
##c option uses the consensus caller, which is suitable for calling a diploid consensus sequence
##vcfutils.pl vcf2fq is part of the bcftools package and converts the vcf output from bcftools call to FastQ format
##d and D options set a minimum and maximum (respectively) depth thresholds for filtering variants
##gzip compresses the final output FastQ file to .fq.gz format
2. Convert to psmc format
module load bioinformatics/psmc
fq2psmcfa -q20 <input.fq.gz> > <output.psmcfa>
##converts the zipped FastQ file (.fq.gz) to .psmcfa format
##q will mask bases with scores below the indicated number
##The higher the quality threshold, the more stringent the filtering process, leading to more masked bases in the output.
3. Run PSMC
module load bioinformatics/psmc
psmc -N25 -t15 -r5 -p "4+25*2+4+6" -o <output.psmc> <input.psmcfa>
##N flag sets the maximum number of EM iterations
##t flag sets the maximum coalescent time, scaled in units of 2N0
##r flag sets the initial theta/rho ratio (the ratio of the scaled mutation rate to the scaled recombination rate)
##p flag sets the time intervals for the PSMC model.
->The specified pattern, "4+25*2+4+6", groups the atomic time intervals into free parameters: the first parameter spans 4 intervals, the next 25 parameters span 2 intervals each, then one parameter spans 4 intervals and the last spans 6. Grouping the intervals this way gives the model higher time resolution near the present and lower resolution in the more distant past.
##o flag specifies output in .psmc format
##input (no flag) is the .psmcfa output from the previous step (#2 Convert to PSMC format)
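##Confidence in the trajectory is usually assessed by bootstrapping: splitfa (bundled with PSMC) splits the .psmcfa into segments, and psmc is rerun with -b to resample them with replacement (a sketch; 100 rounds is typical, and the rounds are independent so they can run as separate jobs):
splitfa input.psmcfa > split.psmcfa
for i in {1..100}; do
psmc -N25 -t15 -r5 -b -p "4+25*2+4+6" -o round-${i}.psmc split.psmcfa
done
cat output.psmc round-*.psmc > combined.psmc
##psmc_plot.pl (next step) can then be run on combined.psmc to draw the main curve plus the bootstrap replicates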
4. Plot results
module load bioinformatics/psmc
psmc_plot.pl -g 7 -u 1e-8 -p <input_name> </path/to/file/psmc/input.psmc>
##g flag sets the generation time in years, used to convert scaled times into years.
##u flag sets the mutation rate per site per generation used in the analysis.
##p flag is used to save the generated plot in pdf format.
##input name specifies a prefix to be used for the output files generated by the script.
##input.psmc specifies the input file for the script (from step 3)
##selection of appropriate values for the generation time and mutation rate of the species being studied is crucial.
##download .pdf files to local computer for visualization
# Heterozygosity BCFtools
1. Average heterozygosity
#!/bin/bash
#Replace these with the actual file names
VCF_FILE="/pool/genomics/figueiroh/SMSC_2023_class/vcf/NN_6samples_HD_PASS_DP5.vcf.gz"
OUTPUT_FILE="NN_heterozygosity_v5.tsv"
GENOME_LENGTH=2468345093 #you can use the fasta index (.fai) to sum the total length of the genome
#Get a list of sample names from the VCF file
SAMPLES=$(bcftools query -l $VCF_FILE)
#Write a header line to the output file
echo -e "Sample\tHeterozygous_sites\tHeterozygosity" > $OUTPUT_FILE
#Loop through each sample and calculate the heterozygosity
for SAMPLE in $SAMPLES; do
HETEROZYGOUS=$(bcftools view -s $SAMPLE $VCF_FILE | grep -v "#" | grep -o "0/1" | wc -l)
HETEROZYGOSITY=$(echo "scale=7; $HETEROZYGOUS / $GENOME_LENGTH" | bc)
echo -e "$SAMPLE\t$HETEROZYGOUS\t$HETEROZYGOSITY" >> $OUTPUT_FILE
done
##The above bash script calculates the heterozygosity of each sample in a vcf file.
##bcftools query -l extracts the list of sample names; the loop then counts heterozygous ("0/1") genotype calls per sample (phased "0|1" calls would need an additional pattern).
##The results are written to a tab-separated file with three columns: "Sample", "Heterozygous_sites", and "Heterozygosity".
##Edit VCF_FILE, OUTPUT_FILE, and GENOME_LENGTH at the top of the script for your data.
2. Plot heterozygosity in R
library(ggplot2)
#Replace this with the actual file name
INPUT_FILE <- "heterozygosity.tsv"
#Read in the data from the input file
data <- read.table(INPUT_FILE, header=TRUE, sep="\t")
#Create a scatter plot of heterozygosity values
plot <- ggplot(data, aes(x=Sample, y=Heterozygosity)) +
geom_point(size=2) +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5)) +
labs(x="Sample", y="Heterozygosity")
#Save the plot to a file
ggsave("heterozygosity_plot.png", plot, width=10, height=5, dpi=300)
# Heterozygosity: ANGSD to calculate genome-wide heterozygosity
##this is a window-based approach
module load bioinformatics/angsd/0.921
angsd -P <threads> -i <input_bam_file> -anc <ancestral_fasta_file> -dosaf <dosaf_value> -gl <genotype_likelihood_method> -C <base_quality_adjustment> -minQ <min_base_quality> -minmapq <min_mapping_quality> -fold <fold_value> -out <output_file> -ref <reference_fasta_file> -r <region_of_interest>
realSFS -nsites <number_of_sites> <input_saf_idx_file> > <output_est_ml_file>
##P flag sets the number of threads to be used in parallel.
##i flag specifies the input file as a BAM file.
##anc flag specifies the ancestral fasta reference file (we used the regular reference file here too)
##dosaf flag computes the Site Frequency Spectrum (SFS) based on the genotype likelihoods.
##gl flag specifies the method used for calculating genotype likelihoods.
##C flag adjusts the base quality score by a specified value before using it.
##minQ flag sets the minimum base quality score required.
##minmapq flag sets the minimum mapping quality score required.
##fold flag indicates whether you are analyzing folded SFS or unfolded SFS.
##out flag specifies the output file path and name.
##ref flag specifies the reference fasta file.
##r flag specifies the region of interest.
##nsites flag specifies the number of sites to be considered for the estimation. Replace <number_of_sites> with the desired number.
##<input_saf_idx_file> specifies the input .saf.idx file, which is the output from the ANGSD program that contains information about the SFS.
##<output_est_ml_file> specifies the output file path and name for the maximum likelihood estimate of the SFS.
##ANGSD does not differentiate the chromosomes if you run the whole genome at once; that is why we use the 'region' variable (-r) to specify the chromosome/scaffold. This is useful for looking at heterozygosity throughout the genome.
1. Below script will loop through scaffolds 1 to 18 (you need to know how many scaffolds/chromosomes your species has), running the ANGSD command and then the realSFS command for each scaffold. The results will be saved in the specified output directory.
#!/bin/bash
#Set variables
input_bam_file="/pool/genomics/figueiroh/SMSC_2023/mapping/NN114296_cloud_leopard_sorted.bam"
ancestral_fasta_file="/pool/genomics/figueiroh/SMSC_2023/reference/mNeoNeb1.pri.cur.20220520.fasta"
reference_fasta_file="/pool/genomics/figueiroh/SMSC_2023/reference/mNeoNeb1.pri.cur.20220520.fasta"
output_directory="/pool/genomics/figueiroh/SMSC_2023/heterozygosity"
SAMPLE="NN114296"
#Loop through scaffolds 1 to 18
for i in {1..18}; do
#Run ANGSD command
angsd -P 10 -i ${input_bam_file} -anc ${ancestral_fasta_file} -dosaf 1 -gl 1 -C 50 -minQ 20 -minmapq 30 -fold 1 -out ${output_directory}/$SAMPLE.SUPER_${i} -ref ${reference_fasta_file} -r SUPER_${i}
#Run realSFS command
realSFS -nsites 200000 ${output_directory}/$SAMPLE.SUPER_${i}.saf.idx > ${output_directory}/$SAMPLE.SUPER_${i}.est.ml
done
2. Below script will add the sample name and scaffold number for each line in our output. This will make our work easier when we want to plot our results.
#!/bin/bash
output_directory="path/to/your/output_directory"
SAMPLE="your_sample_name"
#Loop through scaffolds 1 to 18
for i in {1..18}; do
#Calculate the number of lines in the output file
num_lines=$(wc -l < ${output_directory}/$SAMPLE.SUPER_${i}.est.ml)
#Add the number of lines, sample name, and scaffold number to each line of the output file
awk -v lines="$num_lines" -v sample="$SAMPLE" -v scaffold="$i" '{print lines, sample, "SUPER_" scaffold, $0}' ${output_directory}/$SAMPLE.SUPER_${i}.est.ml > ${output_directory}/$SAMPLE.SUPER_${i}.est.ml.annotated
#Optional: Move the annotated file to the original file
mv ${output_directory}/$SAMPLE.SUPER_${i}.est.ml.annotated ${output_directory}/$SAMPLE.SUPER_${i}.est.ml
done
3. Below script will concatenate files
#!/bin/bash
#Set the input and output directory
input_directory="path/to/your/input_directory"
output_directory="path/to/your/output_directory"
#Create a new output file
output_file="${output_directory}/all_est_ml_concatenated.txt"
#Remove the output file if it already exists
if [ -f "${output_file}" ]; then
rm "${output_file}"
fi
#Loop through all .est.ml files and concatenate them
for file in ${input_directory}/*.est.ml; do
cat "${file}" >> "${output_file}"
done
4. Plot the results for one chromosome in R
#Load required libraries
library(tidyverse) # Collection of packages for data manipulation and visualization
library(viridis) # Package for generating color palettes
library(scales) # Package for scaling and formatting axis labels
#Read data file and store it in the variable 'het_master'
het_master <- read.table("/path/to/file/NN_SUPER.est.ml")
#Manipulate the data
het_master %>%
rename(sample=V2) %>% # Rename V2 as 'sample'
rename(chromosome = V3) %>% # Rename V3 as 'chromosome'
mutate(heterozygosity = V5/(V4 + V5)) %>% # Calculate heterozygosity as V5 / (V4 + V5)
mutate(position = ((V1*200000)-200000)) %>% # Calculate position as (V1 * 200000) - 200000
filter(chromosome == "SUPER_2") %>% # Filter data to include only rows where 'chromosome' is 'SUPER_2'
#Create a ggplot2 plot
ggplot(aes(x=position, y=heterozygosity)) + # Set x-axis as 'position' and y-axis as 'heterozygosity'
geom_line(colour="grey",alpha=0.5) + # Add a line plot with grey color and 0.5 alpha (transparency)
geom_point(aes(colour=factor(chromosome))) + # Add points, color them based on 'chromosome' factor variable
scale_color_viridis(discrete = TRUE) + # Use viridis color palette for the points
facet_grid(sample ~ chromosome,scales = "free_x") + # Create a facet grid with 'sample' on the y-axis and 'chromosome' on the x-axis, set x-axis scales to be free
labs(x = NULL, y = "Heterozygosity\n") + # Remove x-axis labels and set y-axis label to "Heterozygosity\n"
scale_y_continuous(labels = comma) + # Format y-axis labels with commas
scale_x_continuous(labels = comma) + # Format x-axis labels with commas
theme_minimal() + # Apply a minimal theme to the plot
theme(legend.position = "none", # Remove legend
strip.text.x = element_text(face = "bold"), # Set strip text for x-axis to bold
strip.text.y = element_text(face = "bold"), # Set strip text for y-axis to bold
panel.grid.major.x = element_blank(), # Remove major x-axis gridlines
panel.grid.minor.x = element_blank(), # Remove minor x-axis gridlines
panel.spacing.x = unit(0, "line"), # Set panel spacing to zero
panel.border = element_rect(color = "black", fill = NA, size = 0.25)) # Add a black border around the panels
# SNP Density
In a nutshell...
1. Filter vcf for only heterozygous positions
2. Calculate SNP density with vcftools
3. Add sample names to the output density tables
4. Concatenate all samples (.snpden files) into a single file
5. Plot in R
1. Filter vcf for only heterozygous positions
vcftools --gzvcf path/to/vcf/file.vcf.gz --recode --out output_prefix --maf 0.1
##gzvcf is your filtered input vcf file (can be compressed)
##recode flag specifies that you are creating a new vcf file (not overwriting the existing one)
##out flag allows you to choose a prefix for your output file
##maf flag sets a minimum allele frequency filter; a maf of "0.1" means only variants with a minor allele frequency of at least 10% will be included in the output file
2. Calculate SNP density with vcftools
vcftools --vcf path/to/maf/filtered/vcf.recode.vcf --SNPdensity 1000000 --out output_prefix
##vcf is the filtered vcf from step 1 (filtered for heterozygous sites)
##SNPdensity flag calculates SNP density in non-overlapping windows of 1,000,000 base pairs (1 Mb) across the genome. SNP density is the number of SNPs per window.
##can also run this in a loop for multiple samples:
for file in *.vcf.gz
do
name=$(basename $file)
name=${name%.vcf.gz}
vcftools --gzvcf $file --SNPdensity 1000000 --out ${name}_density
done
3. Add sample names to the output density tables
awk -v sample="sample_name" 'NR==1{print $0"\tIndiv"} NR>1{print $0"\t"sample}' sample.snpden > sample_id.snpden
##v flag sets a variable called sample with the value "sample_name"
##'NR==1{print $0"\tIndiv"} NR>1{print $0"\t"sample}': This is an awk script that processes the input file line by line. It checks if the current line number (NR) is equal to 1, and if it is, it prints the entire line ($0) followed by a tab and the string "Indiv". For all other lines (NR > 1), it prints the entire line followed by a tab and the value of the sample variable.
##sample.snpden specifies the input file, which is the SNP density output file generated by VCFtools in your previous command.
##> sample_id.snpden redirects the output to a new file named "sample_id.snpden" in the current directory.
4. Concatenate all samples (.snpden files) into a single file
tail -q -n +2 *_id.snpden > total_hetsites.snpden
##Can combine steps 1-4 into a single bash script for efficiency:
#!/bin/bash
#create directories
mkdir -p snpden/plots
#set parameters
WINDOW_SIZE=1000000
MAF_THRESHOLD=0.1
SAMPLES=(NN114296 NN114297 NN114393 NN115950 NN190240)
#process input files
for sample in "${SAMPLES[@]}"; do
#extract heterozygous sites
vcftools --gzvcf filtered/${sample}_HD_PASS_DP5.vcf.gz --recode --out snpden/${sample}_hetsites --maf ${MAF_THRESHOLD}
#calculate SNP density
vcftools --vcf snpden/${sample}_hetsites.recode.vcf --SNPdensity ${WINDOW_SIZE} --out snpden/${sample}_hetsites
#add sample name to output
awk -v sample="${sample}" 'NR==1{print $0"\tIndiv"} NR>1{print $0"\t"sample}' snpden/${sample}_hetsites.snpden > snpden/${sample}_hetsites_id.snpden
done
#combine output files
tail -q -n +2 snpden/*_id.snpden > snpden/NN_hetsites.snpden
5. Plot in R
##download the .snpden file to your local machine for plotting
##can plot SNP density by chromosome or by sample
##SNP density by chromosome:
setwd("C:/Users/annac/Desktop/Grad School/Classes/Bioinformatics for Conservation Genomics/Files")
getwd()
library(tidyverse)
library(gdata)
# Read the SNP density data file
snpden <- read.table("NN_hetsites.snpden", header = T)
colnames(snpden)<-c("CHROM", "BIN_START", "SNP_COUNT", "VARIANTS/KB", "INDV")
# Define the order of the scaffolds to be used in the visualization
target <- c("CM051599.1", "CM051600.1", "CM051601.1", "CM051602.1", "CM051603.1", "CM051604.1",
"CM051605.1", "CM051606.1", "CM051607.1", "CM051608.1", "CM051609.1", "CM051610.1",
"CM051611.1", "CM051612.1", "CM051613.1", "CM051614.1", "CM051615.1", "CM051616.1",
"CM051617.1")
#Define the order of the chromosomes to be used in the visualization
chr <-c('chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10',
'chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chrX')
snpden.master <- snpden
# Reorder the chromosome column of the data frame according to the target order
snpden.master$CHROM <- reorder.factor(snpden.master$CHROM, new.order = target)
# Subset data from chromosomes that are not "NA"
snpden.master <-subset(snpden.master, snpden.master$CHROM!='NA')
snpden.master$groups <- cut(as.numeric(snpden.master$`VARIANTS/KB`),
c(0,0.05,0.1,0.15,0.20,0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,
3,3.25,3.5,3.75,4,4.25,4.5,4.75,5),
include.lowest = TRUE, labels = c("0", "0.05-0.1", "0.1-0.15", "0.15-0.2",
"0.2-0.25", "0.25-0.5", "0.5-0.75", "0.75-1",
"1-1.25", "1.25-1.5", "1.5-1.75", "1.75-2",
"2-2.25", "2.25-2.5", "2.5-2.75", "2.75-3",
"3-3.25", "3.25-3.5", "3.5-3.75", "3.75-4",
"4-4.25", "4.25-4.5", "4.5-4.75", "4.75-5"))
snpden.master$groups[snpden.master$`VARIANTS/KB` == 0] <- "0"
# Rename CHROM levels
levels(snpden.master$CHROM) <-c('chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10',
'chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chrX')
snpden.master$BIN_START <- as.numeric(as.character(snpden.master$BIN_START))
names_vec <- c("NN114296", "NN114297", "NN114393", "NN115950", "NN190240")
for (chromosome in unique(snpden.master$CHROM)) {
# Subset the data for the current chromosome
snpden.chr <- subset(snpden.master, snpden.master$CHROM == chromosome)
# Define title
title<-expression(paste(italic("Neofelis nebulosa")))
#Create ggplot object
snpden_plot <- snpden.chr %>%
mutate(INDV = factor(INDV, levels = c("NN114296", "NN114297", "NN114393", "NN115950", "NN190240"))) %>%
ggplot(aes(x=BIN_START, y=1)) +
geom_tile(aes(fill=groups)) +
facet_grid(INDV ~ ., switch='y') +
labs(x = 'Chromosome Length' ,
y = 'Scaffold Number' ,
title = expression(paste(italic("Neofelis nebulosa"))),
subtitle = paste0("Chromosome ", chromosome, " heterozygous SNP densities")) +
theme_minimal() +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
strip.text.y.left = element_text(angle = 0, size=8),
panel.spacing.y=unit(0.15, "lines"),
plot.title = element_text(hjust = .5, size = 15),
plot.subtitle = element_text(hjust = .5, size = 13, color = "gray")) +
scale_fill_manual(values = c("#000081", "#0000f3", "#004dff", "#00b3ff", "#29ffce",
"#7bff7b", "#ceff29", "#ffc600", "#ff6800", "#f30900", "brown","#800000","black"),
name = "Variants/kb",
labels = c("<0.05","0.05-0.10","0.10-0.15","0.15-0.20","0.20-0.25",
"0.25-0.50","0.50-0.75","0.75-1.0","1.0-1.25","1.25-1.5",
"1.5-1.75","1.75-2.0","2.0-2.25","2.25-2.5")) +
guides(fill = guide_legend(nrow = 2)) +
theme(legend.position = "bottom", legend.box = "horizontal") +
scale_x_continuous(name='Chromosome length', labels = c('0Mb',"50Mb", '100Mb', "150Mb", '200Mb','250Mb'),
breaks = c(0,50000000, 100000000, 150000000, 200000000,250000000), expand = c(0,0))
ggsave(filename = paste0('Neofelis_',chromosome, '.1Mb.snpden.png'), plot = snpden_plot, device = 'png',
dpi = 600, units = c('cm'), width = 28, height = 6, path = "plots/", bg = "white")
}
snpden_plot
##SNP density by sample:
setwd("C:/Users/annac/Desktop/Grad School/Classes/Bioinformatics for Conservation Genomics/Files")
getwd()
library(tidyverse)
library(gdata)
# Read the SNP density data file
snpden <- read.table("NN_hetsites.snpden", header = T)
colnames(snpden)<-c("CHROM", "BIN_START", "SNP_COUNT", "VARIANTS/KB", "INDV")
# Define the order of the scaffolds to be used in the visualization
target <- c("CM051599.1", "CM051600.1", "CM051601.1", "CM051602.1", "CM051603.1", "CM051604.1",
"CM051605.1", "CM051606.1", "CM051607.1", "CM051608.1", "CM051609.1", "CM051610.1",
"CM051611.1", "CM051612.1", "CM051613.1", "CM051614.1", "CM051615.1", "CM051616.1",
"CM051617.1")
#Define the order of the chromosomes to be used in the visualization
chr <-c('chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10',
'chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chrX')
snpden.master <- snpden
# Reorder the chromosome column of the data frame according to the target order
snpden.master$CHROM <- reorder.factor(snpden.master$CHROM, new.order = target)
# Subset data from chromosomes that are not "NA"
snpden.master <-subset(snpden.master, snpden.master$CHROM!='NA')
snpden.master$groups <- cut(as.numeric(snpden.master$`VARIANTS/KB`),
c(0,0.05,0.1,0.15,0.20,0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,
3,3.25,3.5,3.75,4,4.25,4.5,4.75,5),
include.lowest = TRUE, labels = c("0", "0.05-0.1", "0.1-0.15", "0.15-0.2",
"0.2-0.25", "0.25-0.5", "0.5-0.75", "0.75-1",
"1-1.25", "1.25-1.5", "1.5-1.75", "1.75-2",
"2-2.25", "2.25-2.5", "2.5-2.75", "2.75-3",
"3-3.25", "3.25-3.5", "3.5-3.75", "3.75-4",
"4-4.25", "4.25-4.5", "4.5-4.75", "4.75-5"))
snpden.master$groups[snpden.master$`VARIANTS/KB` == 0] <- "0"
# Rename CHROM levels
levels(snpden.master$CHROM) <-c('chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10',
'chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chrX')
snpden.master$BIN_START <- as.numeric(as.character(snpden.master$BIN_START))
names_vec <- c("NN114296", "NN114297", "NN114393", "NN115950", "NN190240")
for (individual in unique(snpden.master$INDV)) {
# Subset the data for the current individual
snpden.chr <- subset(snpden.master, snpden.master$INDV == individual)
# Define title
title<-expression(paste(italic("Neofelis nebulosa")))
#Create ggplot object
snpden_plot <- snpden.chr %>%
mutate(INDV = factor(INDV, levels = c("NN114296", "NN114297", "NN114393", "NN115950", "NN190240"))) %>%
ggplot(aes(x=BIN_START, y=1)) +
geom_tile(aes(fill=groups)) +
facet_grid(CHROM ~ ., switch='y') +
labs(x = 'Chromosome Length' ,
y = 'Scaffold Number' ,
title = expression(paste(italic("Neofelis nebulosa"))),
subtitle = paste0(individual, " heterozygous SNP densities")) +
theme_minimal() +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
strip.text.y.left = element_text(angle = 0, size=8),
panel.spacing.y=unit(0.15, "lines"),
plot.title = element_text(hjust = .5, size = 15),
plot.subtitle = element_text(hjust = .5, size = 13, color = "gray")) +
scale_fill_manual(values = c("#000081", "#0000f3", "#004dff", "#00b3ff", "#29ffce",
"#7bff7b", "#ceff29", "#ffc600", "#ff6800", "#f30900", "brown","#800000","black","pink","cyan","purple","magenta"),
name = "Variants/kb",
labels = c("<0.05","0.05-0.10","0.10-0.15","0.15-0.20","0.20-0.25",
"0.25-0.50","0.50-0.75","0.75-1.0","1.0-1.25","1.25-1.5",
"1.5-1.75","1.75-2.0","2.0-2.25","2.25-2.5")) +
scale_x_continuous(name='Chromosome length', labels = c('0Mb',"50Mb", '100Mb', "150Mb", '200Mb','250Mb'),
breaks = c(0,50000000, 100000000, 150000000, 200000000,250000000), expand = c(0,0))
ggsave(filename = paste0('Neofelis_',individual, '.1Mb.snpden.png'), plot = snpden_plot, device = 'png',
dpi = 600, units = c('cm'), width = 28, height = 18, path = "plots/", bg = "white")
}
snpden_plot
# Runs of Homozygosity (ROH)
In a nutshell...
1. Calculate ROH using bcftools
2. Edit the output file
3. Plot in R
1. Calculate ROH using bcftools
module load bioinformatics/bcftools
bcftools roh --AF-dflt 0.4 -I -G30 --rec-rate 1.4e-9 /path/to/vcf/file.vcf.gz > prefix.roh.txt
##roh specifies that we want to calculate runs of homozygosity
##--AF-dflt 0.4 sets the default allele frequency to 0.4 when the frequency is missing in the input file.
##-I skips indels, since their genotypes are enriched for errors.
##-G30 tells roh to use genotypes only (ignoring PLs), assigning a phred-scaled likelihood of 30 to the two least likely genotypes; this guards against genotype errors.
##--rec-rate 1.4e-9 sets the recombination rate to 1.4e-9.
2. Edit the output file
grep "RG" prefix.roh.txt | cut -f 2,3,6 > prefix.roh.edited.txt
##This processes the bcftools output to retain specific information.
##Here we keep only the RG (homozygous-block) lines and extract the sample, chromosome, and ROH length columns (fields 2, 3, and 6).
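##For a quick per-sample summary, long ROH can be counted with awk (a sketch; the 1 Mb cutoff is a common convention, and after the cut above column 1 is the sample and column 3 the ROH length):
awk '$3 >= 1000000 {n[$1]++} END {for (s in n) print s, n[s]}' prefix.roh.edited.txt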
3. Plot in R
##For the ROH plots, we are only interested in the “RG” portion of the output files, where it contains the homozygous blocks in the genome.
setwd("C:/Users/annac/Desktop/Grad School/Classes/Bioinformatics for Conservation Genomics/Files")
getwd()
# Load libraries and read data
library(tidyverse)
library(ggrepel)
# Read data with read_delim() for better control over input file parsing
clouded_roh <- read_delim("NN_6samples_HD_PASS_DP5.roh.edited.txt", delim = "\t", skip = 1, col_names = c("Sample", "Chromosome", "RoH_length"))
# Compute NROH and SROH
clouded_nroh <- clouded_roh %>%
group_by(Sample) %>%
summarize(NROH = n())
clouded_sroh <- clouded_roh %>%
group_by(Sample) %>%
summarize(SROH = sum(RoH_length))
# Compute FROH (the fraction of the genome in ROH: SROH divided by total genome length)
genome_length <- 2468345093 # sum of contig lengths from the .fai index (same value used in the heterozygosity script)
clouded_froh <- inner_join(clouded_nroh, clouded_sroh, by = "Sample") %>%
mutate(FROH = SROH / genome_length)
# Create a table with NROH, SROH, and FROH for each sample
summary_table <- clouded_froh
# Display the table
print(summary_table)
# Save the table to a CSV file
write_csv(summary_table, "summary_table.csv")
# Plot NROH vs. SROH and save the plot to a file
froh_plot <- inner_join(clouded_nroh, clouded_sroh, by = "Sample") %>%
ggplot(aes(x = SROH, y = NROH, color=Sample)) +
geom_point(size = 3, shape=2) +
geom_text_repel(aes(label = Sample)) + theme_minimal() + ##geom_text_repel separates points and labels so there's no overlap
theme(plot.title = element_text(hjust = 0.5)) +
labs(title = "NROH vs. SROH")
froh_plot
ggsave("froh_plot.png", froh_plot, width = 8, height = 6, dpi = 300)
# Create a boxplot of RoH lengths for each sample and save the plot to a file
roh_boxplot <- clouded_roh %>%
ggplot(aes(x = Sample, y = RoH_length, color=Sample)) +
geom_boxplot() +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(title = "RoH Lengths per Sample")
roh_boxplot
ggsave("roh_boxplot.png", roh_boxplot, width = 8, height = 6, dpi = 300)
# Annotate deleterious variants with SnpEff and SnpSift
##Before using SnpEff and SnpSift, you need to have a reference genome and annotation files. If you're working with non-model organisms, you might have to create a custom SnpEff database using the organism's reference genome and annotation files. The annotation files usually include gene models in GFF3, GTF, or Gencode format. Another option is to map your reads to a closely related species that already has its genome in the SnpEff database.
1. Creating a custom SnpEff database
a. Download the SnpEff software and set up the environment:
wget http://sourceforge.net/projects/snpeff/files/snpEff_latest_core.zip
unzip snpEff_latest_core.zip
cd snpEff
b. Create a new directory for your non-model organism in the data folder:
mkdir data/your_organism
c. Copy the reference genome (FASTA) and annotation (GFF3, GTF, or Gencode) files into the new directory:
cp /path/to/reference_genome.fasta data/your_organism/
cp /path/to/annotation_file.gff3 data/your_organism/
d. Modify the snpEff.config file to include the new genome:
echo "your_organism.genome : Your_Organism" >> snpEff.config
e. Build the SnpEff database:
java -jar snpEff.jar build -gff3 -v your_organism
##-gff3 specifies the annotation format and -v enables verbose output; the final argument is the genome name as registered in snpEff.config (its FASTA and annotation files must sit in data/your_organism/)
2. Mapping the genome against a reference with a SnpEff database
##To map the genome of your non-model organism against a reference with a SnpEff database, you will first need to align the reads to the reference genome using an aligner like BWA or Bowtie2. Once you have the alignments in BAM or SAM format, you can call variants using a tool like GATK, FreeBayes, or Samtools.
3. Annotating variants with SnpEff
With the custom SnpEff database created and the variants called, you can now annotate the variants using SnpEff:
java -jar snpEff.jar your_organism input.vcf > annotated_output.vcf
4. Analyzing deleterious variants with SnpSift
SnpSift is a collection of tools that can be used to filter, sort, and analyze the annotated VCF files. You can filter deleterious variants using SnpSift Filter:
java -jar SnpSift.jar filter "ANN[0].EFFECT has 'missense_variant' | ANN[0].EFFECT has 'frameshift_variant'" annotated_output.vcf > deleterious_variants.vcf
Some other examples of filters you can use with SnpSift:
a) Filtering by quality (QUAL) and depth (DP):
java -jar SnpSift.jar filter "(QUAL > 30) & (DP > 10)" input.vcf > filtered_output.vcf
b) Filtering by minor allele frequency (AF) for a specific population in a 1000 Genomes Project VCF file:
java -jar SnpSift.jar filter "AF < 0.05" input.vcf > filtered_output.vcf
c) Filtering by impact, retaining only HIGH impact variants:
java -jar SnpSift.jar filter "ANN[0].IMPACT has 'HIGH'" input.vcf > high_impact_output.vcf
d) Filtering by specific gene:
java -jar SnpSift.jar filter "ANN[0].GENE has 'BRCA1'" input.vcf > brca1_output.vcf
e) Filtering variants that are either missense, frameshift, or stop gained, and have a SIFT score below 0.05:
java -jar SnpSift.jar filter "(ANN[0].EFFECT has 'missense_variant' | ANN[0].EFFECT has 'frameshift_variant' | ANN[0].EFFECT has 'stop_gained') & (ANN[0].SIFT_SCORE < 0.05)" input.vcf > filtered_output.vcf
5. Enrichment analysis with g:Profiler
##Enrichment of a gene list (e.g. genes hit by deleterious variants) provides essential insights into the biological context and functional implications of these genes.
import argparse
import pandas as pd
from gprofiler import GProfiler
def read_gene_list(file_path):
    with open(file_path, "r") as file:
        genes = [line.strip() for line in file.readlines()]
    return genes

def run_gprofiler(genes, output_prefix):
    gp = GProfiler(return_dataframe=True)
    enrichment_results = gp.profile(organism='hsapiens', query=genes)
    # Save the raw results to a CSV file
    enrichment_results.to_csv(f"{output_prefix}_raw_enrichment_results.csv", index=False)
    return enrichment_results

def parse_and_save_results(enrichment_results, output_prefix):
    filtered_results = enrichment_results[
        (enrichment_results["p_value"] <= 0.05) &
        (enrichment_results["source"].isin(["GO:BP", "GO:MF", "GO:CC", "KEGG", "HP"]))
    ]
    sorted_results = filtered_results.sort_values("p_value")
    # Save all significant categories to a new CSV file
    sorted_results.to_csv(f"{output_prefix}_significant_enrichment_categories.csv", index=False)
    # Save all significant categories to a new Excel file
    sorted_results.to_excel(f"{output_prefix}_significant_enrichment_categories.xlsx", index=False)
    return sorted_results
# Set up command-line argument parsing
parser = argparse.ArgumentParser(description="Run g:Profiler and parse enrichment analysis results")
parser.add_argument("gene_list_file", help="Input file containing the list of genes")
parser.add_argument("output_prefix", help="Output file prefix for significant enrichment categories")
args = parser.parse_args()
# Read the gene list from the input file
genes = read_gene_list(args.gene_list_file)
# Run g:Profiler and save the raw results
enrichment_results = run_gprofiler(genes, args.output_prefix)
# Parse the results and save significant categories
significant_categories = parse_and_save_results(enrichment_results, args.output_prefix)
# Display significant categories
print(significant_categories)