# Genome Assembly/Resequencing/Analysis Pipeline
##Adapted from https://github.com/SmithsonianWorkshops/SMSC_2023_Conservation_Genomics.git
####Building the Assembly
# Upload raw data files to run on supercomputer
scp -r /local/directory/path/to/files/ user@qb.loni.org:/path/to/folder
##scp securely copies files
##run this from your local machine; if you're logged into LONI, log out first
# Run FastQC on all samples to check quality of reads
##if FastQC is loaded from conda (or as a module), you can call fastqc directly and drop the /path/to/FastQC/ prefix used below
for i in /path/to/files/*.fq.gz; do /path/to/FastQC/fastqc $i; done
# FastQC creates fastqc.html files that need to be downloaded to computer to view
scp user@qb.loni.org:/path/to/files/*.html /local/directory/
##scp securely copies files
##run this from your local machine; if you're logged into LONI, log out first
**OR**
module load ffsend
ffsend upload /path/to/files/*.html
##This will create a url that you can copy into your browser and download selected files to your desktop
*** You would not trim HiFi or Hi-C data, because the goal is to keep the long reads intact and trimming would shorten them. FastQC in this instance is mainly informational, or grounds to complain to the sequencing facility if you get back really poor data. ***
# Hifiasm Assembly (with HiFi files)
module load hifiasm
hifiasm -t 32 -o file.asm read_1.fq.gz read_2.fq.gz
##fast haplotype-resolved de novo assembler for PacBio HiFi reads
##-t: number of threads, -o output prefix; uses about 300G total memory
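##A job this size should run through the scheduler rather than on a login node. Below is a minimal batch-script sketch, assuming a SLURM scheduler (as on QB3); the memory, time, and allocation values are placeholders to adjust:
#!/bin/bash
#SBATCH --job-name=hifiasm_asm
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=32
#SBATCH --mem=340G
#SBATCH --time=48:00:00
#SBATCH --account=your_allocation
module load hifiasm
hifiasm -t 32 -o file.asm read_1.fq.gz read_2.fq.gz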
# Hifiasm Assembly (with Hi-C files)
module load hifiasm
hifiasm -t 32 -o file.asm --h1 hic_read_1.fq.gz --h2 hic_read_2.fq.gz hifi_reads.fq.gz
##Hi-C-integrated haplotype-resolved assembly: --h1/--h2 take the Hi-C read pairs, and the HiFi reads are still supplied as the positional input
##-t: number of threads, -o output prefix; uses about 300G total memory
# Convert primary assembly to fasta file format
cd /path/to/assembly
awk '/^S/{print ">"$2;print $3}' assembly.p_ctg.gfa > assembly.p_ctg.fa
# Check finished assembly stats with assembly_stats
module load assembly_stats
assembly_stats /path/to/assembly.fa > assembly_stats.out
##gives info like mean contig size, GC content, longest and shortest contig size, N50, L50, scaffold stats etc.
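##For a quick independent cross-check of the N50 reported by assembly_stats, the contig lengths can be tallied with awk and sort (a sketch; works on wrapped or unwrapped FASTA):
awk '/^>/{if(l)print l; l=0; next}{l+=length($0)}END{print l}' /path/to/assembly.fa | sort -rn > contig_lengths.txt
total=$(awk '{s+=$1}END{print s}' contig_lengths.txt)
awk -v half=$((total/2)) '{s+=$1; if(s>=half){print "N50:", $1; exit}}' contig_lengths.txt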
####Assembly Quality Check
# Run BUSCO to assess completeness of genome assembly
module load busco
busco -i /path/to/assembly.fa --auto-lineage -c 10 -m genome -o /path/to/output
##if you have clade information for your assembly you can replace --auto-lineage with -l <lineage>, e.g. -l mammalia_odb10 (BUSCO will automatically download the lineage database from its website); --auto-lineage-prok or --auto-lineage-euk restrict the automatic lineage search to prokaryotes or eukaryotes; -m is the mode (genome, transcriptome, or proteins), -c number of CPUs, -i input, -o output
# Blast assembly with blastn to assess genome assembly quality (output needed to run BlobTools below)
module load blast
blastn -db /path/to/dv/ncbi/db/latest_v4/nt/nt -query /path/to/assembly.fa -outfmt "6 qseqid staxids bitscore std" -max_target_seqs 20 -max_hsps 1 -evalue 1e-20 -num_threads 10 -out assembly_blast.out
##-db database (the ncbi nt database must be downloaded first), -query genome you want to query, -outfmt output format, -max_target_seqs maximum number of target sequences to keep (the higher the number the longer the run), -max_hsps maximum number of alignments to keep per subject, -evalue e-value cutoff for a hit to be kept, -num_threads number of threads, -out output
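##The nt database itself can be fetched with the update_blastdb.pl script that ships with BLAST+ (a sketch; the download is very large, so point it at a scratch or shared database directory):
cd /path/to/db/directory
update_blastdb.pl --decompress nt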
# Map raw reads to genome assembly with minimap2
module load minimap2
minimap2 -ax map-hifi -t 20 /path/to/assembly.fa /path/to/read1.fq.gz /path/to/read2.fq.gz | samtools view -b | samtools sort -@20 -O BAM -o reads_sorted.bam
##-ax selects the preset used to map reads to the genome (map-hifi for HiFi reads); -t number of threads, -@ number of threads for samtools, -b convert sam to bam, -O output format, -o output file
##pipe to samtools view to convert the output from a sam to a bam file, then pipe to samtools sort to sort the reads and write the bam
# Run Blobtools to detect contamination
module load blobtools
blobtools create -i /path/to/assembly.fa -b /path/to/mapped.bam -t /path/to/blast.out -o blobplot
##blobtools create creates a blobtools database; -i input assembly -b mapped bam files to input assembly -t blast output -o output
# Run Blobtools plot to visualize coverage, GC proportion, and percentage of mapped assembly
module load blobtools
blobtools plot -i /path/to/blobplot.json
##-i input from blobplot create output
# Blobtools plot creates png files that need to be downloaded to computer to view
scp user@qb.loni.org:/path/to/files/*.png /local/directory/
##scp securely copies files
##run this from your local machine; if you're logged into LONI, log out first
**OR**
module load ffsend
ffsend upload /path/to/files/*.png
##This will create a url that you can copy into your browser and download selected files to your desktop
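##Once contaminant contigs show up in the blobplot, one way to remove them is to dump the per-contig table with blobtools view and keep only on-target contigs with seqtk. A sketch, assuming the phylum assignment sits in column 6 of the table (the column layout varies by blobtools version, so check the table header first):
blobtools view -i /path/to/blobplot.json -o blobplot
##writes a per-contig table (name, length, GC, coverage, taxonomy)
grep -v '^#' blobplot*.table.txt | awk '$6 == "Chordata" || $6 == "no-hit" {print $1}' > keep_contigs.txt
seqtk subseq /path/to/assembly.fa keep_contigs.txt > assembly.decontam.fa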
####Genome Annotation
# Identify repeats and build repeat database with RepeatModeler
module load bio/repeatmodeler
BuildDatabase -name database_name /path/to/assembly/fasta/file
##-name is the name you want to give your database, e.g. clouded_leopard
##followed by your newly assembled reference genome
# Run RepeatModeler using the repeat database
module load bio/repeatmodeler
RepeatModeler -database /path/to/repeat/database -pa 36 -engine ncbi > out.log
##database is the repeat database created in the previous step
##pa is the number of CPUs
##engine the name of the search engine (note: this may vary according to version of RepeatModeler)
##this is a high memory job; can take several days
##output is a consensi.fa.classified file (a complete database to be used in RepeatMasker)
# Run RepeatMasker
module load bio/repeatmasker
RepeatMasker -pa 30 -xsmall -gff -lib consensi.fa.classified -dir /path/to/output/directory /path/to/assembly/fasta/file
##pa is the number of CPUs
##xsmall stands for soft-masking
##gff is the output format of the annotated repeats
##lib is the database to base the repeat-masking on
##dir is directory to write the output to
##followed by your newly assembled reference genome
##there will be 3 outputs:
1. .fasta.tbl (summary information about the repetitive elements)
2. .fasta.masked (the masked assembly; in our case, soft-masked)
3. .fasta.out (detailed information about the repetitive elements, including coordinates, repeat type, and size)
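##To see what fraction of the assembly was masked, the summary line can be pulled straight from the .tbl output (assumes standard RepeatMasker table formatting):
grep "bases masked" /path/to/output/directory/*.tbl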
# Alternative to above step (also uses RepeatMasker)
module load bio/repeatmasker
RepeatMasker -species mammalia -pa $NSLOTS -xsmall -dir /path/to/output/directory /path/to/assembly/fasta/file
##species is the taxonomic database (options available here https://www.girinst.org/repbase/update/browse.php)
##pa is number of CPUs
##xsmall is soft-masking
##dir is directory to write the output to
##followed by your newly assembled reference genome
##same kind of outputs as above step
# Run GeMoMa to infer annotation of protein-coding genes (using annotated reference genomes)
##first step would be to download some reference genomes (taxa_assembly.fasta) and their associated annotation (taxa_annotation.gff) from GenBank
module load bio/gemoma/1.9
set maxHeapSize 400000
GeMoMa -Xmx400G GeMoMaPipeline threads=$NSLOTS outdir=/path/to/output/directory GeMoMa.Score=ReAlign AnnotationFinalizer.r=NO p=false o=true t=/path/to/assembly/fasta/file s=own i=cat a=/path/to/reference_annotation.gff g=/path/to/reference_genome.fasta
##can repeat flags s, i, a, and g for desired "reference genomes" that are self-created or downloaded from GenBank
##threads is number of CPUs
##outdir is the directory to write the output to
##t is the path to your newly assembled genome (the target to annotate)
##s specifies that you are providing your "own" reference genomes
##i lets you provide an ID for each reference organism
##a is the path to an individual reference annotation (GFF)
##g is the path to the matching reference genome (FASTA)
##this is a high memory job ~480GB
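##For example, with two downloaded references the s/i/a/g flags simply repeat once per reference (a sketch; the IDs and paths are placeholders):
GeMoMa -Xmx400G GeMoMaPipeline threads=$NSLOTS outdir=/path/to/output/directory GeMoMa.Score=ReAlign AnnotationFinalizer.r=NO p=false o=true t=/path/to/assembly/fasta/file s=own i=cat a=/path/to/cat_annotation.gff g=/path/to/cat_assembly.fasta s=own i=tiger a=/path/to/tiger_annotation.gff g=/path/to/tiger_assembly.fasta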
####Resequencing Analysis
####Assumes you have samples that have been quality-checked and trimmed (as needed)
# Map samples to Reference Genome and convert to .bam format
module load bio/minimap2
minimap2 -ax sr -t 20 /path/to/assembly/fasta/file /path/to/read_1/fastq.gz /path/to/read_2/fastq.gz | samtools sort -@20 -O BAM -o out.bam
##ax is the preset configuration to map illumina short reads (sr) to genomes
##t is number of threads to use
##followed by reference file
##read 1 of sample
##read 2 of sample
##minimap2 outputs a .sam file (one sam file per sample)
##samtools sort both converts sam to bam and sorts bam file
##@ is the number of threads to use
##O is output format
##o is name of output
##mapping and samtools will take time!!
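##With many samples, the mapping step is easy to loop (a sketch; assumes paired files named <sample>_1.fq.gz and <sample>_2.fq.gz):
for r1 in /path/to/reads/*_1.fq.gz; do
r2=${r1%_1.fq.gz}_2.fq.gz
sample=$(basename $r1 _1.fq.gz)
minimap2 -ax sr -t 20 /path/to/assembly/fasta/file $r1 $r2 | samtools sort -@20 -O BAM -o ${sample}_sorted.bam
done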
# Mark PCR Duplicates with PicardTools (GATK)
module load bio/gatk/4.1.3.0
rungatk MarkDuplicates -I=sample_sorted.bam -O=sample_duplMarked.bam -M=sorted_reads_duplMarked.metrics
##I is the input (sorted bam file)
##O is the output (duplicate-marked bam)
##M is the output for the metrics file
# Add or Replace Read Groups with GATK
module load bio/gatk/4.1.3.0
rungatk AddOrReplaceReadGroups -I=sample_duplMarked.bam -O=sample_duplMarked_SM.bam -ID=1 -LB=lib1 -PL=ILLUMINA -PU=unit1 -SM=sample_name
##I is the input (duplicate-marked bam)
##O is the output (SM.bam)
##ID is the Read Group ID (e.g. lane number)
##LB is library ID
##PL is sequencing platform
##PU is the platform unit (run/flowcell/lane identifier)
##SM is sample name
# Create index for reference genome and all .bam files
module load bio/gatk/4.1.3.0
module load bio/samtools/1.9
rungatk CreateSequenceDictionary -R=/path/to/assembly/fasta/file -O=output.dict
samtools faidx /path/to/assembly/fasta/file
samtools index sample_duplMarked_SM.bam
##R is the reference genome
##O is the output reference index dictionary
# Create gvcf files for each sample
module load bio/gatk/4.1.3.0
rungatk HaplotypeCaller -I sample_duplMarked_SM.bam -R /path/to/assembly/fasta/file -ERC GVCF -O sample.g.vcf.gz
##I is input
##R is reference genome
##-ERC GVCF emits per-site reference confidence (a gVCF); this is more efficient and recommended for large numbers of samples, and therefore more scalable
##O is output
##This is a memory- and time-intensive step; it can be broken up into groups of samples (see the sketch below)
##Try multi-threading
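##A per-sample loop is one way to break the job up; GATK4's HaplotypeCaller also accepts --native-pair-hmm-threads for modest multi-threading (a sketch; adjust the thread count to your allocation):
for bam in *_duplMarked_SM.bam; do
sample=$(basename $bam _duplMarked_SM.bam)
rungatk HaplotypeCaller -I $bam -R /path/to/assembly/fasta/file -ERC GVCF --native-pair-hmm-threads 4 -O ${sample}.g.vcf.gz
done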
# Index gvcf file
qrsh
module load bioinformatics/htslib/1.9
tabix -p vcf sample.g.vcf.gz
##qrsh switches to an interactive node (check whether LONI has this option)
# Combine gvcf files
module load bio/gatk/4.1.3.0
rungatk CombineGVCFs -R /path/to/assembly/fasta/file \
-V sample1.g.vcf.gz \
-V sample2.g.vcf.gz \
-V sample3.g.vcf.gz \
-O cohort.g.vcf.gz
##R is the reference genome
##V are your gvcf samples to combine
##O is output
##This is also a memory and time intensive step
##May need to ask for an extension on QB3
# Genotype combined gvcf file
module load bio/gatk/4.1.3.0
rungatk GenotypeGVCFs \
-R /path/to/assembly/fasta/file \
-V cohort.g.vcf.gz \
-O cohort.vcf.gz
##R is the reference genome
##V is the combined gvcf
##O is output (genotyped and combined vcf)
##Memory and time intensive step
# Filter variants
module load bio/gatk/4.1.3.0
rungatk VariantFiltration \
-R /path/to/assembly/fasta/file \
-V cohort.vcf.gz \
-O cohort_varfilter.vcf \
--filter-name "Low_depth10" \
--filter-expression "DP < 10"
##R is the reference genome
##V is the genotyped and combined vcf
##O is output (filtered vcf)
##Add in your preferred filtration parameters
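##As a starting point, the GATK-recommended hard filters for SNPs can sit alongside the depth filter (a sketch; tune the thresholds to your own data):
rungatk VariantFiltration \
-R /path/to/assembly/fasta/file \
-V cohort.vcf.gz \
-O cohort_varfilter.vcf \
--filter-name "Low_depth10" --filter-expression "DP < 10" \
--filter-name "QD2" --filter-expression "QD < 2.0" \
--filter-name "FS60" --filter-expression "FS > 60.0" \
--filter-name "MQ40" --filter-expression "MQ < 40.0"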
####Analyses to run with final, clean VCF
# PCA
In a nutshell...
1. Prune vcf
2. Create PCA
3. Plot PCA in R
1. Prune VCF
qrsh
module load bio/plink/
plink --vcf /path/to/vcf/file.vcf --double-id --allow-extra-chr --set-missing-var-ids @:# --indep-pairwise 50 10 0.1 --out output_name_plink
##qrsh takes you to an interactive node
##vcf path to your vcf file
##double-id tells plink to set both the family ID and the within-family ID as the sample ID
##allow-extra-chr allows additional chromosomes beyond the human chromosome set
##set-missing-var-ids sets a variant ID for our SNPs. Human data sets often have annotated SNP names and so plink will look for these. We do not have them so instead we set ours to default to chromosome:position which can be achieved in plink by setting the option @:#.
##indep-pairwise performs the linkage pruning. The first argument, 50, sets the window size to 50 variants (append "kb" for a kilobase window); the second argument, 10, is the window step size, i.e. the window shifts 10 variants each time linkage is calculated. Finally, we set an r2 threshold, the level of linkage we are willing to tolerate: here we prune any variant with an r2 greater than 0.1 with another variant in the window.
##outputs include a .prune.in file (variants to keep) and a .prune.out file (variants pruned)
2. Create PCA
plink --vcf /path/to/vcf/file.vcf --double-id --allow-extra-chr --set-missing-var-ids @:# \
--extract output_plink.prune.in \
--make-bed --pca --out output_pca
##extract tells plink to extract only these positions from our vcf
##make-bed creates a bed file and a bim file (useful for ADMIXTURE)
##pca tells plink to create a Principal Component Analysis (PCA)
##outputs will be .bed file, eigenval file, and eigenvec file
3. Plot PCA in R
library(tidyverse)
setwd("C:/Users/annac/Desktop/Grad School/Classes/Bioinformatics for Conservation Genomics/Files")
getwd()
##Reading in eigen files
pca<-read_table2("clouded_leopard_pca.eigenvec", col_names = FALSE)
eigenval<-scan('clouded_leopard_pca.eigenval')
##Cleaning up table
pca <- pca[,-1]
##Set names
names(pca)[1]<-"ind"
names(pca)[2:ncol(pca)] <- paste0("PC", 1:(ncol(pca)-1))
##Organize individuals by name and population
# name
nm <- rep(NA, length(pca$ind))
nm[grep("NN114296", pca$ind)] <- "pepe"
nm[grep("NN114297", pca$ind)] <- "rosa"
nm[grep("NN114393", pca$ind)] <- "shakiral"
nm[grep("NN115950", pca$ind)] <- "carlos"
nm[grep("NN190240", pca$ind)] <- "mike"
# population
pop <- rep(NA, length(pca$ind))
pop[grep("NN114296", pca$ind)] <- "cl"
pop[grep("NN114297", pca$ind)] <- "cl"
pop[grep("NN114393", pca$ind)] <- "cl"
pop[grep("NN115950", pca$ind)] <- "cl"
pop[grep("NN190240", pca$ind)] <- "cl"
##Create 3rd vector so we can plot with different colors
##and separated by both color and population
nm_pop <- paste0(nm, "_", pop)
##Reorganize dataframe with all the new variables
pca2 <- as_tibble(data.frame(pca, nm, pop, nm_pop))
##Convert eigenvalues into percentages
##i.e. variance explained by each principal component
pve <- data.frame(PC = 1:5, pve = eigenval/sum(eigenval)*100)
##Plot eigenvalues
plot_pve <- ggplot(pve, aes(PC, pve)) + geom_bar(stat = "identity")
plot_pve <- plot_pve + ylab("% of variance expl") +
theme_classic()
plot_pve
##Looks like PC1 and PC2 explain the most variance (the percentages sum to 100)
##Calculate cumulative variance explained
cumsum_cal<-cumsum(pve$pve)
cumsum_cal
##Plot PC1 and PC2
pca_plot <- ggplot(pca2, aes(PC1, PC2, col = nm, shape = pop)) + geom_point(size = 3)
pca_plot <- pca_plot + scale_colour_manual(values = c("red", "blue", "green", "purple","black"))
pca_plot <- pca_plot + coord_equal() + theme_light()
pca_plot <- pca_plot + xlab(paste0("PC1 (", signif(pve$pve[1], 3), "%)")) + ylab(paste0("PC2 (", signif(pve$pve[2], 3), "%)"))
pca_plot
# ADMIXTURE (calculates individual and population ancestries)
In a nutshell:
1. Clean up the bim file (from plink)
2. Run ADMIXTURE
3. Run a loop to test out different values of K
4. Identify best values of K
5. Extract a list of individuals used in the analysis for plotting
6. Plot ADMIXTURE results in R
1. Clean up the bim file (from plink)
awk '{$1=0;print $0}' sample.bim > sample.bim.tmp
mv sample.bim.tmp sample.bim
##Replaces the first column of the bim file with a 0 (zero)
##and renames the bim file to match the bed file
2. Run ADMIXTURE
qrsh
module load bio/admixture
admixture --cv sample.bed 2 > log2.out
##qrsh changes you to an interactive node
##above code tests ancestry assuming two genomic clusters
3. Run a loop to test out different values of K
#!/bin/sh
for i in {2..15}
do
admixture --cv sample.bed $i > log${i}.out
done
##output is 2 files: the .Q file contains cluster assignments for each individual,
##and the .P file contains population allele frequencies for each SNP
4. Identify best values of K
grep "CV" *out | awk '{print $3,$4}' | sed -e 's/(//;s/)//;s/://;s/K=//' > sample.cv.error
##the best value of K will be the lowest cross-validation error
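##The best K can then be read straight off the file (lowest error in the second column):
sort -k2 -n sample.cv.error | head -n 1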
5. Extract a list of individuals used in the analysis for plotting
awk '{split($1,name,"."); print $1,name[2]}' sample.nosex > sample.list
6. Plot ADMIXTURE results in R
#read .Q file
tbl<-read.table("efish.10.Q")
#Simple plot
barplot(t(as.matrix(tbl)), col=rainbow(10),
xlab="Individual #", ylab="Ancestry", border=NA)
# PSMC
##PSMC (Pairwise Sequentially Markovian Coalescent) is a widely used method for inferring the demographic history of a population from genomic data. It is grounded in coalescent theory, a framework for understanding how genetic diversity is shaped by population history. By modeling the coalescent process between the two alleles of a single diploid genome, PSMC provides insights into changes in effective population size over time.
In a nutshell:
1. Build a consensus sequence from the bam files
2. Convert to psmc format
3. Run PSMC
4. Plot results
1. Build a consensus sequence from the bam file (bcftools mpileup accepts multiple BAMs, but for PSMC use a single diploid individual per consensus)
module load bioinformatics/bcftools
bcftools mpileup -C50 -uf <reference_genome> <bam_file> | bcftools call -c | vcfutils.pl vcf2fq -d 10 -D 100 | gzip > <output.fq.gz>
##here we pipe mpileup, call, vcf2fq, and zipping
##bcftools mpileup generates a textual pileup format of the input bam file(s)
##C50 applies a coefficient to adjust the base alignment quality
##u flag outputs the results in uncompressed BCF format, which keeps the stream efficient when piping to bcftools call
##bcftools call performs variant calling on the input data received from the bcftools mpileup command
##c option uses the consensus caller, which is suitable for calling a diploid consensus sequence
##vcfutils.pl vcf2fq is part of the bcftools package and converts the vcf output from bcftools call to FastQ format
##d and D options set a minimum and maximum (respectively) depth thresholds for filtering variants
##gzip compresses the final output FastQ file to .fq.gz format
2. Convert to psmc format
module load bioinformatics/psmc
fq2psmcfa -q20 <input.fq.gz> > <output.psmcfa>
##converts the zipped FastQ file (.fq.gz) to .psmcfa format
##q will mask bases with scores below the indicated number
##The higher the quality threshold, the more stringent the filtering process, leading to more masked bases in the output.
3. Run PSMC
module load bioinformatics/psmc
psmc -N25 -t15 -r5 -p "4+25*2+4+6" -o <output.psmc> <input.psmcfa>
##N flag sets the maximum number of EM iterations
##t flag sets the maximum coalescent time, scaled in units of 2N0
##r flag sets the initial theta/rho ratio (the ratio of the scaled mutation rate to the scaled recombination rate)
##p flag sets the time intervals for the PSMC model.
->The specified pattern, "4+25*2+4+6", groups the atomic time intervals into free parameters: the first parameter spans 4 intervals, the next 25 parameters span 2 intervals each, then one parameter spans 4 intervals and the last spans 6. Grouping the intervals this way gives the model higher time resolution near the present and lower resolution in the more distant past.
##o flag specifies output in .psmc format
##input (no flag) is the .psmcfa output from the previous step (#2 Convert to PSMC format)
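##Confidence in the trajectory is usually assessed by bootstrapping: splitfa (bundled with PSMC) splits the .psmcfa into segments, and psmc is rerun with -b to resample them with replacement (a sketch; 100 rounds is typical, and the rounds are independent so they can run as separate jobs):
splitfa input.psmcfa > split.psmcfa
for i in {1..100}; do
psmc -N25 -t15 -r5 -b -p "4+25*2+4+6" -o round-${i}.psmc split.psmcfa
done
cat output.psmc round-*.psmc > combined.psmc
##psmc_plot.pl (next step) can then be run on combined.psmc to draw the main curve plus the bootstrap replicates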
4. Plot results
module load bioinformatics/psmc
psmc_plot.pl -g 7 -u 1e-8 -p <input_name> </path/to/file/psmc/input.psmc>
##g flag sets the generation time in years, used to convert scaled times into years.
##u flag sets the mutation rate per site per generation used in the analysis.
##p flag is used to save the generated plot in pdf format.
##input name specifies a prefix to be used for the output files generated by the script.
##input.psmc specifies the input file for the script (from step 3)
##selection of appropriate values for the generation time and mutation rate of the species being studied is crucial.
##download .pdf files to local computer for visualization
# Heterozygosity BCFtools
1. Average heterozygosity
#!/bin/bash
#Replace these with the actual file names
VCF_FILE="/pool/genomics/figueiroh/SMSC_2023_class/vcf/NN_6samples_HD_PASS_DP5.vcf.gz"
OUTPUT_FILE="NN_heterozygosity_v5.tsv"
GENOME_LENGTH=2468345093 #you can use the fasta index (.fai) to sum the total length of the genome
#Get a list of sample names from the VCF file
SAMPLES=$(bcftools query -l $VCF_FILE)
#Write a header line to the output file
echo -e "Sample\tHeterozygous_sites\tHeterozygosity" > $OUTPUT_FILE
#Loop through each sample and calculate the heterozygosity
for SAMPLE in $SAMPLES; do
HETEROZYGOUS=$(bcftools view -s $SAMPLE $VCF_FILE | grep -v "#" | grep -o "0/1" | wc -l)
HETEROZYGOSITY=$(echo "scale=7; $HETEROZYGOUS / $GENOME_LENGTH" | bc)
echo -e "$SAMPLE\t$HETEROZYGOUS\t$HETEROZYGOSITY" >> $OUTPUT_FILE
done
##The above bash script calculates the heterozygosity of each sample in a vcf file.
##bcftools query -l extracts the list of sample names; the loop then counts heterozygous ("0/1") genotype calls per sample (phased "0|1" calls would need an additional pattern).
##The results are written to a tab-separated file with three columns: "Sample", "Heterozygous_sites", and "Heterozygosity".
##Edit VCF_FILE, OUTPUT_FILE, and GENOME_LENGTH at the top of the script for your data.
2. Plot heterozygosity in R
library(ggplot2)
#Replace this with the actual file name
INPUT_FILE <- "heterozygosity.tsv"
#Read in the data from the input file
data <- read.table(INPUT_FILE, header=TRUE, sep="\t")
#Create a scatter plot of heterozygosity values
plot <- ggplot(data, aes(x=Sample, y=Heterozygosity)) +
geom_point(size=2) +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5)) +
labs(x="Sample", y="Heterozygosity")
#Save the plot to a file
ggsave("heterozygosity_plot.png", plot, width=10, height=5, dpi=300)
# Heterozygosity: ANGSD to calculate genome-wide heterozygosity
##this is a window-based approach
module load bioinformatics/angsd/0.921
angsd -P <threads> -i <input_bam_file> -anc <ancestral_fasta_file> -dosaf <dosaf_value> -gl <genotype_likelihood_method> -C <base_quality_adjustment> -minQ <min_base_quality> -minmapq <min_mapping_quality> -fold <fold_value> -out <output_file> -ref <reference_fasta_file> -r <region_of_interest>
realSFS -nsites <number_of_sites> <input_saf_idx_file> > <output_est_ml_file>
##P flag sets the number of threads to be used in parallel.
##i flag specifies the input file as a BAM file.
##anc flag specifies the ancestral fasta reference file (we used the regular reference file here too)
##dosaf flag computes the Site Frequency Spectrum (SFS) based on the genotype likelihoods.
##gl flag specifies the method used for calculating genotype likelihoods.
##C flag adjusts the base quality score by a specified value before using it.
##minQ flag sets the minimum base quality score required.
##minmapq flag sets the minimum mapping quality score required.
##fold flag indicates whether you are analyzing folded SFS or unfolded SFS.
##out flag specifies the output file path and name.
##ref flag specifies the reference fasta file.
##r flag specifies the region of interest.
##nsites flag specifies the number of sites to be considered for the estimation. Replace <number_of_sites> with the desired number.
##<input_saf_idx_file> specifies the input .saf.idx file, which is the output from the ANGSD program that contains information about the SFS.
##<output_est_ml_file> specifies the output file path and name for the maximum likelihood estimate of the SFS.
##ANGSD does not differentiate the chromosomes if you run the whole genome at once; that is why we use the 'region' variable (-r) to specify the chromosome/scaffold. This is useful for looking at heterozygosity throughout the genome.
1. Below script will loop through scaffolds 1 to 18 (you need to know how many scaffolds/chromosomes your species has), running the ANGSD command and then the realSFS command for each scaffold. The results will be saved in the specified output directory.
#!/bin/bash
#Set variables
input_bam_file="/pool/genomics/figueiroh/SMSC_2023/mapping/NN114296_cloud_leopard_sorted.bam"
ancestral_fasta_file="/pool/genomics/figueiroh/SMSC_2023/reference/mNeoNeb1.pri.cur.20220520.fasta"
reference_fasta_file="/pool/genomics/figueiroh/SMSC_2023/reference/mNeoNeb1.pri.cur.20220520.fasta"
output_directory="/pool/genomics/figueiroh/SMSC_2023/heterozygosity"
SAMPLE="NN114296"
#Loop through scaffolds 1 to 18
for i in {1..18}; do
#Run ANGSD command
angsd -P 10 -i ${input_bam_file} -anc ${ancestral_fasta_file} -dosaf 1 -gl 1 -C 50 -minQ 20 -minmapq 30 -fold 1 -out ${output_directory}/$SAMPLE.SUPER_${i} -ref ${reference_fasta_file} -r SUPER_${i}
#Run realSFS command
realSFS -nsites 200000 ${output_directory}/$SAMPLE.SUPER_${i}.saf.idx > ${output_directory}/$SAMPLE.SUPER_${i}.est.ml
done
2. Below script will add the sample name and scaffold number for each line in our output. This will make our work easier when we want to plot our results.
#!/bin/bash
output_directory="path/to/your/output_directory"
SAMPLE="your_sample_name"
#Loop through scaffolds 1 to 18
for i in {1..18}; do
#Calculate the number of lines in the output file
num_lines=$(wc -l < ${output_directory}/$SAMPLE.SUPER_${i}.est.ml)
#Add the number of lines, sample name, and scaffold number to each line of the output file
awk -v lines="$num_lines" -v sample="$SAMPLE" -v scaffold="$i" '{print lines, sample, "SUPER_" scaffold, $0}' ${output_directory}/$SAMPLE.SUPER_${i}.est.ml > ${output_directory}/$SAMPLE.SUPER_${i}.est.ml.annotated
#Optional: Move the annotated file to the original file
mv ${output_directory}/$SAMPLE.SUPER_${i}.est.ml.annotated ${output_directory}/$SAMPLE.SUPER_${i}.est.ml
done
3. Below script will concatenate files
#!/bin/bash
#Set the input and output directory
input_directory="path/to/your/input_directory"
output_directory="path/to/your/output_directory"
#Create a new output file
output_file="${output_directory}/all_est_ml_concatenated.txt"
#Remove the output file if it already exists
if [ -f "${output_file}" ]; then
rm "${output_file}"
fi
#Loop through all .est.ml files and concatenate them
for file in ${input_directory}/*.est.ml; do
cat "${file}" >> "${output_file}"
done
4. Plot the results for one chromosome in R
#Load required libraries
library(tidyverse) # Collection of packages for data manipulation and visualization
library(viridis) # Package for generating color palettes
library(scales) # Package for scaling and formatting axis labels
#Read data file and store it in the variable 'het_master'
het_master <- read.table("/path/to/file/NN_SUPER.est.ml")
#Manipulate the data
het_master %>%
rename(sample=V2) %>% # Rename V2 as 'sample'
rename(chromosome = V3) %>% # Rename V3 as 'chromosome'
mutate(heterozygosity = V5/(V4 + V5)) %>% # Calculate heterozygosity as V5 / (V4 + V5)
mutate(position = ((V1*200000)-200000)) %>% # Calculate position as (V1 * 200000) - 200000
filter(chromosome == "SUPER_2") %>% # Filter data to include only rows where 'chromosome' is 'SUPER_2'
#Create a ggplot2 plot
ggplot(aes(x=position, y=heterozygosity)) + # Set x-axis as 'position' and y-axis as 'heterozygosity'
geom_line(colour="grey",alpha=0.5) + # Add a line plot with grey color and 0.5 alpha (transparency)
geom_point(aes(colour=factor(chromosome))) + # Add points, color them based on 'chromosome' factor variable
scale_color_viridis(discrete = TRUE) + # Use viridis color palette for the points
facet_grid(sample ~ chromosome,scales = "free_x") + # Create a facet grid with 'sample' on the y-axis and 'chromosome' on the x-axis, set x-axis scales to be free
labs(x = NULL, y = "Heterozygosity\n") + # Remove x-axis labels and set y-axis label to "Heterozygosity\n"
scale_y_continuous(labels = comma) + # Format y-axis labels with commas
scale_x_continuous(labels = comma) + # Format x-axis labels with commas
theme_minimal() + # Apply a minimal theme to the plot
theme(legend.position = "none", # Remove legend
strip.text.x = element_text(face = "bold"), # Set strip text for x-axis to bold
strip.text.y = element_text(face = "bold"), # Set strip text for y-axis to bold
panel.grid.major.x = element_blank(), # Remove major x-axis gridlines
panel.grid.minor.x = element_blank(), # Remove minor x-axis gridlines
panel.spacing.x = unit(0, "line"), # Set panel spacing to zero
panel.border = element_rect(color = "black", fill = NA, size = 0.25)) # Add a black border around the panels
# SNP Density
In a nutshell...
1. Filter vcf for only heterozygous positions
2. Calculate SNP density with vcftools
3. Add sample names to the output density tables
4. Concatenate all samples (.snpden files) into a single file
5. Plot in R
1. Filter vcf for only heterozygous positions
vcftools --gzvcf path/to/vcf/file.vcf.gz --recode --out output_prefix --maf 0.1
##gzvcf is your filtered input vcf file (can be compressed)
##recode flag specifies that you are creating a new vcf file (not overwriting the existing one)
##out flag allows you to choose a prefix for your output file
##maf flag sets a minimum allele frequency filter; a maf of "0.1" means only variants with a minor allele frequency of at least 10% will be included in the output file
2. Calculate SNP density with vcftools
vcftools --vcf path/to/maf/filtered/vcf.recode.vcf --SNPdensity 1000000 --out output_prefix
##vcf is the filtered vcf from step 1 (filtered for heterozygous sites)
##SNPdensity flag calculates SNP density in non-overlapping windows of 1,000,000 base pairs (1 Mb) across the genome. SNP density is the number of SNPs per window.
##can also run this in a loop for multiple samples:
for file in *.vcf.gz
do
name=$(basename $file)
name=${name%.vcf.gz}
vcftools --gzvcf $file --SNPdensity 1000000 --out ${name}_density
done
3. Add sample names to the output density tables
awk -v sample="sample_name" 'NR==1{print $0"\tIndiv"} NR>1{print $0"\t"sample}' sample.snpden > sample_id.snpden
##v flag sets a variable called sample with the value "sample_name"
##'NR==1{print $0"\tIndiv"} NR>1{print $0"\t"sample}': This is an awk script that processes the input file line by line. It checks if the current line number (NR) is equal to 1, and if it is, it prints the entire line ($0) followed by a tab and the string "Indiv". For all other lines (NR > 1), it prints the entire line followed by a tab and the value of the sample variable.
##sample.snpden specifies the input file, which is the SNP density output file generated by VCFtools in your previous command.
##> sample_id.snpden redirects the output to a new file named "sample_id.snpden" in the current directory.
4. Concatenate all samples (.snpden files) into a single file
tail -q -n +2 *_id.snpden > total_hetsites.snpden
##Can combine steps 1-4 into a single bash script for efficiency:
#!/bin/bash
#create directories
mkdir -p snpden/plots
#set parameters
WINDOW_SIZE=1000000
MAF_THRESHOLD=0.1
SAMPLES=(NN114296 NN114297 NN114393 NN115950 NN190240)
#process input files
for sample in "${SAMPLES[@]}"; do
#extract heterozygous sites
vcftools --gzvcf filtered/${sample}_HD_PASS_DP5.vcf.gz --recode --out snpden/${sample}_hetsites --maf ${MAF_THRESHOLD}
#calculate SNP density
vcftools --vcf snpden/${sample}_hetsites.recode.vcf --SNPdensity ${WINDOW_SIZE} --out snpden/${sample}_hetsites
#add sample name to output
awk -v sample="${sample}" 'NR==1{print $0"\tIndiv"} NR>1{print $0"\t"sample}' snpden/${sample}_hetsites.snpden > snpden/${sample}_hetsites_id.snpden
done
#combine output files
tail -q -n +2 snpden/*_id.snpden > snpden/NN_hetsites.snpden
5. Plot in R
##download the .snpden file to your local machine for plotting
##can plot SNP density by chromosome or by sample
##SNP density by chromosome:
setwd("C:/Users/annac/Desktop/Grad School/Classes/Bioinformatics for Conservation Genomics/Files")
getwd()
library(tidyverse)
library(gdata)
# Read the SNP density data file
snpden <- read.table("NN_hetsites.snpden", header = T)
colnames(snpden)<-c("CHROM", "BIN_START", "SNP_COUNT", "VARIANTS/KB", "INDV")
# Define the order of the scaffolds to be used in the visualization
target <- c("CM051599.1", "CM051600.1", "CM051601.1", "CM051602.1", "CM051603.1", "CM051604.1",
"CM051605.1", "CM051606.1", "CM051607.1", "CM051608.1", "CM051609.1", "CM051610.1",
"CM051611.1", "CM051612.1", "CM051613.1", "CM051614.1", "CM051615.1", "CM051616.1",
"CM051617.1")
#Define the order of the chromosomes to be used in the visualization
chr <-c('chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10',
'chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chrX')
snpden.master <- snpden
# Reorder the chromosome column of the data frame according to the target order
snpden.master$CHROM <- reorder.factor(snpden.master$CHROM, new.order = target)
# Subset data from chromosomes that are not "NA"
snpden.master <-subset(snpden.master, snpden.master$CHROM!='NA')
snpden.master$groups <- cut(as.numeric(snpden.master$`VARIANTS/KB`),
c(0,0.05,0.1,0.15,0.20,0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,
3,3.25,3.5,3.75,4,4.25,4.5,4.75,5),
include.lowest = TRUE, labels = c("0", "0.05-0.1", "0.1-0.15", "0.15-0.2",
"0.2-0.25", "0.25-0.5", "0.5-0.75", "0.75-1",
"1-1.25", "1.25-1.5", "1.5-1.75", "1.75-2",
"2-2.25", "2.25-2.5", "2.5-2.75", "2.75-3",
"3-3.25", "3.25-3.5", "3.5-3.75", "3.75-4",
"4-4.25", "4.25-4.5", "4.5-4.75", "4.75-5"))
snpden.master$groups[snpden.master$`VARIANTS/KB` == 0] <- "0"
# Rename CHROM levels
levels(snpden.master$CHROM) <-c('chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10',
'chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chrX')
snpden.master$BIN_START <- as.numeric(as.character(snpden.master$BIN_START))
names_vec <- c("NN114296", "NN114297", "NN114393", "NN115950", "NN190240")
for (chromosome in unique(snpden.master$CHROM)) {
# Subset the data for the current chromosome
snpden.chr <- subset(snpden.master, snpden.master$CHROM == chromosome)
# Define title
title<-expression(paste(italic("Neofelis nebulosa")))
#Create ggplot object
snpden_plot <- snpden.chr %>%
mutate(INDV = factor(INDV, levels = c("NN114296", "NN114297", "NN114393", "NN115950", "NN190240"))) %>%
ggplot(aes(x=BIN_START, y=1)) +
geom_tile(aes(fill=groups)) +
facet_grid(INDV ~ ., switch='y') +
labs(x = 'Chromosome Length' ,
y = 'Scaffold Number' ,
title = expression(paste(italic("Neofelis nebulosa"))),
subtitle = paste0("Chromosome ", chromosome, " heterozygous SNP densities")) +
theme_minimal() +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
strip.text.y.left = element_text(angle = 0, size=8),
panel.spacing.y=unit(0.15, "lines"),
plot.title = element_text(hjust = .5, size = 15),
plot.subtitle = element_text(hjust = .5, size = 13, color = "gray")) +
scale_fill_manual(values = c("#000081", "#0000f3", "#004dff", "#00b3ff", "#29ffce",
"#7bff7b", "#ceff29", "#ffc600", "#ff6800", "#f30900", "brown","#800000","black"),
name = "Variants/kb",
labels = c("<0.05","0.05-0.10","0.10-0.15","0.15-0.20","0.20-0.25",
"0.25-0.50","0.50-0.75","0.75-1.0","1.0-1.25","1.25-1.5",
"1.5-1.75","1.75-2.0","2.0-2.25","2.25-2.5")) +
guides(fill = guide_legend(nrow = 2)) +
theme(legend.position = "bottom", legend.box = "horizontal") +
scale_x_continuous(name='Chromosome length', labels = c('0Mb',"50Mb", '100Mb', "150Mb", '200Mb','250Mb'),
breaks = c(0,50000000, 100000000, 150000000, 200000000,250000000), expand = c(0,0))
ggsave(filename = paste0('Neofelis_',chromosome, '.1Mb.snpden.png'), plot = snpden_plot, device = 'png',
dpi = 600, units = c('cm'), width = 28, height = 6, path = "plots/", bg = "white")
}
snpden_plot
##SNP density by sample:
setwd("C:/Users/annac/Desktop/Grad School/Classes/Bioinformatics for Conservation Genomics/Files")
getwd()
library(tidyverse)
library(gdata)
# Read the SNP density data file
snpden <- read.table("NN_hetsites.snpden", header = T)
colnames(snpden)<-c("CHROM", "BIN_START", "SNP_COUNT", "VARIANTS/KB", "INDV")
# Define the order of the scaffolds to be used in the visualization
target <- c("CM051599.1", "CM051600.1", "CM051601.1", "CM051602.1", "CM051603.1", "CM051604.1",
"CM051605.1", "CM051606.1", "CM051607.1", "CM051608.1", "CM051609.1", "CM051610.1",
"CM051611.1", "CM051612.1", "CM051613.1", "CM051614.1", "CM051615.1", "CM051616.1",
"CM051617.1")
#Define the order of the chromosomes to be used in the visualization
chr <-c('chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10',
'chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chrX')
snpden.master <- snpden
# Reorder the chromosome column of the data frame according to the target order
snpden.master$CHROM <- reorder.factor(snpden.master$CHROM, new.order = target)
# Subset data from chromosomes that are not "NA"
snpden.master <-subset(snpden.master, snpden.master$CHROM!='NA')
snpden.master$groups <- cut(as.numeric(snpden.master$`VARIANTS/KB`),
c(0,0.05,0.1,0.15,0.20,0.25,0.5,0.75,1,1.25,1.5,1.75,2,2.25,2.5,2.75,
3,3.25,3.5,3.75,4,4.25,4.5,4.75,5),
include.lowest = TRUE, labels = c("0", "0.05-0.1", "0.1-0.15", "0.15-0.2",
"0.2-0.25", "0.25-0.5", "0.5-0.75", "0.75-1",
"1-1.25", "1.25-1.5", "1.5-1.75", "1.75-2",
"2-2.25", "2.25-2.5", "2.5-2.75", "2.75-3",
"3-3.25", "3.25-3.5", "3.5-3.75", "3.75-4",
"4-4.25", "4.25-4.5", "4.5-4.75", "4.75-5"))
snpden.master$groups[snpden.master$`VARIANTS/KB` == 0] <- "0"
# Rename CHROM levels
levels(snpden.master$CHROM) <-c('chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10',
'chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chrX')
snpden.master$BIN_START <- as.numeric(as.character(snpden.master$BIN_START))
names_vec <- c("NN114296", "NN114297", "NN114393", "NN115950", "NN190240")
for (individual in unique(snpden.master$INDV)) {
# Subset the data for the current individual
snpden.chr <- subset(snpden.master, snpden.master$INDV == individual)
# Define title
title<-expression(paste(italic("Neofelis nebulosa")))
#Create ggplot object
snpden_plot <- snpden.chr %>%
mutate(INDV = factor(INDV, levels = c("NN114296", "NN114297", "NN114393", "NN115950", "NN190240"))) %>%
ggplot(aes(x=BIN_START, y=1)) +
geom_tile(aes(fill=groups)) +
facet_grid(CHROM ~ ., switch='y') +
labs(x = 'Chromosome Length' ,
y = 'Scaffold Number' ,
title = expression(paste(italic("Neofelis nebulosa"))),
subtitle = paste0(individual, " heterozygous SNP densities")) +
theme_minimal() +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
strip.text.y.left = element_text(angle = 0, size=8),
panel.spacing.y=unit(0.15, "lines"),
plot.title = element_text(hjust = .5, size = 15),
plot.subtitle = element_text(hjust = .5, size = 13, color = "gray")) +
scale_fill_manual(values = c("#000081", "#0000f3", "#004dff", "#00b3ff", "#29ffce",
"#7bff7b", "#ceff29", "#ffc600", "#ff6800", "#f30900", "brown","#800000","black","pink","cyan","purple","magenta"),
name = "Variants/kb",
labels = c("<0.05","0.05-0.10","0.10-0.15","0.15-0.20","0.20-0.25",
"0.25-0.50","0.50-0.75","0.75-1.0","1.0-1.25","1.25-1.5",
"1.5-1.75","1.75-2.0","2.0-2.25","2.25-2.5")) +
scale_x_continuous(name='Chromosome length', labels = c('0Mb',"50Mb", '100Mb', "150Mb", '200Mb','250Mb'),
breaks = c(0,50000000, 100000000, 150000000, 200000000,250000000), expand = c(0,0))
ggsave(filename = paste0('Neofelis_',individual, '.1Mb.snpden.png'), plot = snpden_plot, device = 'png',
dpi = 600, units = c('cm'), width = 28, height = 18, path = "plots/", bg = "white")
}
snpden_plot
# Runs of Homozygosity (ROH)
In a nutshell...
1. Calculate ROH using bcftools
2. Edit the output file
3. Plot in R
1. Calculate ROH using bcftools
module load bioinformatics/bcftools
bcftools roh --AF-dflt 0.4 -I -G30 --rec-rate 1.4e-9 /path/to/vcf/file.vcf.gz > prefix.roh.txt
##roh specifies that we want to calculate runs of homozygosity
##--AF-dflt 0.4 sets the default allele frequency to 0.4 when the frequency is missing in the input file.
##-I skips indels, since their genotypes are enriched for errors.
##-G30 tells roh to use genotypes only (ignoring PLs), assigning a phred-scaled likelihood of 30 to the two least likely genotypes; this guards against genotype errors.
##--rec-rate 1.4e-9 sets the recombination rate to 1.4e-9.
2. Edit the output file
grep "RG" prefix.roh.txt | cut -f 2,3,6 > prefix.roh.edited.txt
##This processes the bcftools output to retain specific information.
##Here we keep only the RG (homozygous-block) lines and extract the sample, chromosome, and ROH length columns (fields 2, 3, and 6).
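##For a quick per-sample summary, long ROH can be counted with awk (a sketch; the 1 Mb cutoff is a common convention, and after the cut above column 1 is the sample and column 3 the ROH length):
awk '$3 >= 1000000 {n[$1]++} END {for (s in n) print s, n[s]}' prefix.roh.edited.txt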
3. Plot in R
##For the ROH plots, we are only interested in the “RG” portion of the output files, where it contains the homozygous blocks in the genome.
setwd("C:/Users/annac/Desktop/Grad School/Classes/Bioinformatics for Conservation Genomics/Files")
getwd()
# Load libraries and read data
library(tidyverse)
library(ggrepel)
# Read data with read_delim() for better control over input file parsing
clouded_roh <- read_delim("NN_6samples_HD_PASS_DP5.roh.edited.txt", delim = "\t", skip = 1, col_names = c("Sample", "Chromosome", "RoH_length"))
# Compute NROH and SROH
clouded_nroh <- clouded_roh %>%
group_by(Sample) %>%
summarize(NROH = n())
clouded_sroh <- clouded_roh %>%
group_by(Sample) %>%
summarize(SROH = sum(RoH_length))
# Compute FROH (the fraction of the genome in ROH: SROH divided by total genome length)
genome_length <- 2468345093 # sum of contig lengths from the .fai index (same value used in the heterozygosity script)
clouded_froh <- inner_join(clouded_nroh, clouded_sroh, by = "Sample") %>%
mutate(FROH = SROH / genome_length)
# Create a table with NROH, SROH, and FROH for each sample
summary_table <- clouded_froh
# Display the table
print(summary_table)
# Save the table to a CSV file
write_csv(summary_table, "summary_table.csv")
# Plot NROH vs. SROH and save the plot to a file
froh_plot <- inner_join(clouded_nroh, clouded_sroh, by = "Sample") %>%
ggplot(aes(x = SROH, y = NROH, color=Sample)) +
geom_point(size = 3, shape=2) +
geom_text_repel(aes(label = Sample)) + theme_minimal() + ##geom_text_repel separates points and labels so there's no overlap
theme(plot.title = element_text(hjust = 0.5)) +
labs(title = "NROH vs. SROH")
froh_plot
ggsave("froh_plot.png", froh_plot, width = 8, height = 6, dpi = 300)
# Create a boxplot of RoH lengths for each sample and save the plot to a file
roh_boxplot <- clouded_roh %>%
ggplot(aes(x = Sample, y = RoH_length, color=Sample)) +
geom_boxplot() +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(title = "RoH Lengths per Sample")
roh_boxplot
ggsave("roh_boxplot.png", roh_boxplot, width = 8, height = 6, dpi = 300)
# Annotate deleterious variants with SnpEff and SnpSift
##Before using SnpEff and SnpSift, you need to have a reference genome and annotation files. If you're working with non-model organisms, you might have to create a custom SnpEff database using the organism's reference genome and annotation files. The annotation files usually include gene models in GFF3, GTF, or Gencode format. Another option is to map your reads to a closely related species that already has its genome in the SnpEff database.
1. Creating a custom SnpEff database
a. Download the SnpEff software and set up the environment:
wget http://sourceforge.net/projects/snpeff/files/snpEff_latest_core.zip
unzip snpEff_latest_core.zip
cd snpEff
b. Create a new directory for your non-model organism in the data folder:
mkdir data/your_organism
c. Copy the reference genome (FASTA) and annotation (GFF3, GTF, or Gencode) files into the new directory:
cp /path/to/reference_genome.fasta data/your_organism/
cp /path/to/annotation_file.gff3 data/your_organism/
d. Modify the snpEff.config file to include the new genome:
echo "your_organism.genome : Your_Organism" >> snpEff.config
e. Build the SnpEff database:
java -jar snpEff.jar build -gff3 -v your_organism
##-gff3 specifies the annotation format and -v enables verbose output; the final argument is the genome name as registered in snpEff.config (its FASTA and annotation files must sit in data/your_organism/)
2. Mapping the genome against a reference with a SnpEff database
##To map the genome of your non-model organism against a reference with a SnpEff database, you will first need to align the reads to the reference genome using an aligner like BWA or Bowtie2. Once you have the alignments in BAM or SAM format, you can call variants using a tool like GATK, FreeBayes, or Samtools.
3. Annotating variants with SnpEff
With the custom SnpEff database created and the variants called, you can now annotate the variants using SnpEff:
java -jar snpEff.jar your_organism input.vcf > annotated_output.vcf
4. Analyzing deleterious variants with SnpSift
SnpSift is a collection of tools that can be used to filter, sort, and analyze the annotated VCF files. You can filter deleterious variants using SnpSift Filter:
java -jar SnpSift.jar filter "ANN[0].EFFECT has 'missense_variant' | ANN[0].EFFECT has 'frameshift_variant'" annotated_output.vcf > deleterious_variants.vcf
Some other examples of filters you can use with SnpSift:
a) Filtering by quality (QUAL) and depth (DP):
java -jar SnpSift.jar filter "(QUAL > 30) & (DP > 10)" input.vcf > filtered_output.vcf
b) Filtering by minor allele frequency (AF) for a specific population in a 1000 Genomes Project VCF file:
java -jar SnpSift.jar filter "AF < 0.05" input.vcf > filtered_output.vcf
c) Filtering by impact, retaining only HIGH impact variants:
java -jar SnpSift.jar filter "ANN[0].IMPACT has 'HIGH'" input.vcf > high_impact_output.vcf
d) Filtering by specific gene:
java -jar SnpSift.jar filter "ANN[0].GENE has 'BRCA1'" input.vcf > brca1_output.vcf
e) Filtering variants that are either missense, frameshift, or stop gained, and have a SIFT score below 0.05:
java -jar SnpSift.jar filter "(ANN[0].EFFECT has 'missense_variant' | ANN[0].EFFECT has 'frameshift_variant' | ANN[0].EFFECT has 'stop_gained') & (ANN[0].SIFT_SCORE < 0.05)" input.vcf > filtered_output.vcf
5. Enrichment analysis with g:Profiler
##Enrichment of a gene list (e.g. genes hit by deleterious variants) provides essential insights into the biological context and functional implications of these genes.
import argparse
import pandas as pd
from gprofiler import GProfiler
def read_gene_list(file_path):
    with open(file_path, "r") as file:
        genes = [line.strip() for line in file.readlines()]
    return genes

def run_gprofiler(genes, output_prefix):
    gp = GProfiler(return_dataframe=True)
    enrichment_results = gp.profile(organism='hsapiens', query=genes)
    # Save the raw results to a CSV file
    enrichment_results.to_csv(f"{output_prefix}_raw_enrichment_results.csv", index=False)
    return enrichment_results

def parse_and_save_results(enrichment_results, output_prefix):
    filtered_results = enrichment_results[
        (enrichment_results["p_value"] <= 0.05) &
        (enrichment_results["source"].isin(["GO:BP", "GO:MF", "GO:CC", "KEGG", "HP"]))
    ]
    sorted_results = filtered_results.sort_values("p_value")
    # Save all significant categories to a new CSV file
    sorted_results.to_csv(f"{output_prefix}_significant_enrichment_categories.csv", index=False)
    # Save all significant categories to a new Excel file
    sorted_results.to_excel(f"{output_prefix}_significant_enrichment_categories.xlsx", index=False)
    return sorted_results
# Set up command-line argument parsing
parser = argparse.ArgumentParser(description="Run g:Profiler and parse enrichment analysis results")
parser.add_argument("gene_list_file", help="Input file containing the list of genes")
parser.add_argument("output_prefix", help="Output file prefix for significant enrichment categories")
args = parser.parse_args()
# Read the gene list from the input file
genes = read_gene_list(args.gene_list_file)
# Run g:Profiler and save the raw results
enrichment_results = run_gprofiler(genes, args.output_prefix)
# Parse the results and save significant categories
significant_categories = parse_and_save_results(enrichment_results, args.output_prefix)
# Display significant categories
print(significant_categories)