SnpEff/SnpSift for all lethal lines

# SnpEff/SnpSift for all lethal lines To filter VCFs for high impact mutations, I first tried using snpEff directly in the terminal. L19 - DF(2R)ED1725 ``` #!/bin/bash #SBATCH -e errs/L19_SnpSift_ed1725.err #SBATCH -o outs/L19_SnpSift_ed1725.out #SBATCH --mem-per-cpu=20G #SBATCH -p scavenger #SBATCH --mail-type=ALL --mail-user=sbm34@duke.edu module load Java/1.8.0_60 java -Xmx8g -jar /work/sbm34/lethal/snpEff/SnpSift.jar filter \ "( CHROM = '2R' ) & \ ( POS > 7613924 ) & ( POS < 8156045 ) & \ ((ANN[*].IMPACT = 'HIGH'))" \ /work/sbm34/lethal/L19/dmel_L19_chr2.ann.vcf > /work/sbm34/lethal/L19/dmel_L19_highimp_ed1725.vcf ``` This seems to work okay, but Kieran showed me some example R code (that I've added to/tweaked) that used vcfR to filter the annotated VCF based on Quality score, chromosome position and predicted effect size. This seems to be a better filter. Code below: install.packages("tidyverse") install.packages("vcfR") library("tidyverse") library("vcfR") # the snpeff VCF for chromosome 2R vcf_file <- "/Users/sarah/Downloads/dmel_L73_chr2.ann.vcf" vcf_df <- read.vcfR(file = vcf_file) # extract the fixed fields\ #From my understanding "fix_names" will 1) Return the full INFO field if Base Quality is reported #2) If Base Quality it not reported but Mapping Quality rank IS reported, then set Base Quality Rank, Mapping Quality rank and ReadPosRankSum to NA #3) If neither Base Quality nor Mapping Quality rank is reported but Read position is reported, set Base Quality Rank and ReadPosRankSum to NA #4) If BaseQ, MQRank and ReadPos are all NOT reported, then just need to set BaseQRankSum to be NA #fix_names(tmp) ``` fix_names <- function(x){ if(grepl("BaseQ", x)){ return(x) } else{ if (!grepl("MQRank", x)){ gsub("\\;DP", ";BaseQRankSum=NA\\;DP", x) %>% gsub("\\;QD", ";MQRankSum=NA\\;QD", .) %>% gsub("\\;SOR", ";ReadPosRankSum=NA\\;SOR", .) } else{ if(!grepl("ReadPos", x)){ gsub("\\;DP", ";BaseQRankSum=NA\\;DP", x) %>% gsub("\\;SOR", ";ReadPosRankSum=NA\\;SOR", .) } else{ gsub("\\;DP", ";BaseQRankSum=NA\\;DP", x) } } } } ``` #### This converts the vcf to a data frame, makes the INFO field into a character variable. #### Rowwise "allows you to compute on a data frame a row-at-a-time" #### Then applies the 'fix_names' function to the INFO field #### The next line uses a regular expression to replace any string of characters in between a semicolon and an equals sign with just a semicolon. #### I think this is to make everything in the INFO column into values that can be separated into their own column/variable! #### I'm not sure what ungroup does here #### Then seperate the values that you just cleaned up in the INFO field into their own columns (specify they are separated by semicolons) #### Type_convert I think is used frequently after converting a field to a character field (using regular expressions to clean up) and then to re-convert the character fields back to what they were ``` vcf_fix <- vcf_df@fix %>% data.frame %>% mutate(INFO = as.character(INFO)) %>% rowwise %>% mutate(INFO = fix_names(INFO)) %>% mutate(INFO = gsub(";[A-Za-z]+=", ";", INFO)) %>% ungroup %>% separate(INFO, into = c("AC", "AF", "AN", "BaseQRankSum", "DP", "ExcessHet", "FS", "MLEAC", "MLEAF", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "EFF"), sep = ";") %>% type_convert ``` #### apply site-level filters to SNPs and indels #### just using the FIXED fields here, and will following with a left (filtering) join #### GATK hard filter suggestions SNPs "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || SOR > 3" #### GATK hard filter suggestions for INDELs "QD < 2.0 || FS > 200.0" #### (these the criteria for *exclusion*, so are inverted below) ``` vcf_fix <- vcf_fix %>% mutate(QUAL = as.numeric(QUAL)) %>% filter(QD > 2.0, QUAL > 30) ``` #### apply the variant-class specific filtering #### want to be careful not to toss the potential causal variant here #### so applying only the SNP filters (INDELs are more stringent) vcf_fix <- vcf_fix %>% filter(FS < 60.0, MQ > 40.0, MQRankSum > -12.5, SOR < 3, ReadPosRankSum > -8.0) #indels <- vcf_fix %>% ``` # filter(grepl("[A-Z\\*]{2,}", ALT) | grepl("[A-Z\\*]{2,}", REF)) %>% # filter(FS < 200.0) ``` #### filtering join to the genotypes ``` gt_df <- vcf_df@gt %>% data.frame %>% gather(key = "ind", value = "format", -FORMAT) chrom_pos <- vcf_df@fix %>% data.frame %>% select(CHROM, POS) gt_df <- data.frame(chrom_pos, gt_df) %>% separate(format, into = c("GT", "AD", "DP_geno", "GQ", "PL"), sep = "\\:") %>% select(-FORMAT) %>% select(c("CHROM", "POS", "ind", "GT", "AD", "DP_geno", "GQ")) gt_df$POS <- as.numeric(gt_df$POS ) is.numeric(gt_df$POS) ``` # filtering join gt_df <- left_join(vcf_fix, gt_df, by = c("POS")) #Change EFF variable into it's own classes so I can filter by effect impact etc gt_df_test <- gt_df %>% mutate(EFF = gsub("\\|+", "|", EFF)) %>% mutate(EFF = gsub("\\(|\\|",";", EFF)) %>% separate(EFF, into = c("effect", "eff_impact", "functional_class", "codon_change", "aa_change", "aa_length", "gene_name", "transcript_biotype", "gene_coding", "transcript_id", "exon_rank", "genotype_number"), sep = ";") %>% mutate(annotation = paste(functional_class, codon_change, aa_change, aa_length, gene_name, transcript_biotype, gene_coding, transcript_id, exon_rank, genotype_number), sep = ";") %>% select(CHROM.x, POS, REF, ALT, effect, eff_impact, annotation, ind, GT) gt_df_filter <- rename(gt_df_test, CHROM = CHROM.x, LineID = ind, Reference_Allele = REF, Alternate_Allele = ALT, Predicted_Effect = eff_impact) %>% filter(POS > 21806350 & POS< 22592996 & Predicted_Effect=="HIGH" & CHROM=="2R") write.csv(gt_df_filter, "/Users/sarah/Downloads/dmel_L73_high_impact_bsc597.csv")