# SnpEff/SnpSift for all lethal lines
To filter VCFs for high impact mutations, I first tried using snpEff directly in the terminal.
L19 - DF(2R)ED1725
```
#!/bin/bash
#SBATCH -e errs/L19_SnpSift_ed1725.err
#SBATCH -o outs/L19_SnpSift_ed1725.out
#SBATCH --mem-per-cpu=20G
#SBATCH -p scavenger
#SBATCH --mail-type=ALL --mail-user=sbm34@duke.edu
module load Java/1.8.0_60
java -Xmx8g -jar /work/sbm34/lethal/snpEff/SnpSift.jar filter \
"( CHROM = '2R' ) & \
( POS > 7613924 ) & ( POS < 8156045 ) & \
((ANN[*].IMPACT = 'HIGH'))" \
/work/sbm34/lethal/L19/dmel_L19_chr2.ann.vcf > /work/sbm34/lethal/L19/dmel_L19_highimp_ed1725.vcf
```
This seems to work okay, but Kieran showed me some example R code (that I've added to/tweaked) that used vcfR to filter the annotated VCF based on Quality score, chromosome position and predicted effect size. This seems to be a better filter. Code below:
install.packages("tidyverse")
install.packages("vcfR")
library("tidyverse")
library("vcfR")
# the snpeff VCF for chromosome 2R
vcf_file <- "/Users/sarah/Downloads/dmel_L73_chr2.ann.vcf"
vcf_df <- read.vcfR(file = vcf_file)
# extract the fixed fields\
#From my understanding "fix_names" will 1) Return the full INFO field if Base Quality is reported
#2) If Base Quality it not reported but Mapping Quality rank IS reported, then set Base Quality Rank, Mapping Quality rank and ReadPosRankSum to NA
#3) If neither Base Quality nor Mapping Quality rank is reported but Read position is reported, set Base Quality Rank and ReadPosRankSum to NA
#4) If BaseQ, MQRank and ReadPos are all NOT reported, then just need to set BaseQRankSum to be NA
#fix_names(tmp)
```
fix_names <- function(x){
if(grepl("BaseQ", x)){
return(x)
} else{
if (!grepl("MQRank", x)){
gsub("\\;DP", ";BaseQRankSum=NA\\;DP", x) %>%
gsub("\\;QD", ";MQRankSum=NA\\;QD", .) %>%
gsub("\\;SOR", ";ReadPosRankSum=NA\\;SOR", .)
} else{
if(!grepl("ReadPos", x)){
gsub("\\;DP", ";BaseQRankSum=NA\\;DP", x) %>%
gsub("\\;SOR", ";ReadPosRankSum=NA\\;SOR", .)
} else{
gsub("\\;DP", ";BaseQRankSum=NA\\;DP", x)
}
}
}
}
```
#### This converts the vcf to a data frame, makes the INFO field into a character variable.
#### Rowwise "allows you to compute on a data frame a row-at-a-time"
#### Then applies the 'fix_names' function to the INFO field
#### The next line uses a regular expression to replace any string of characters in between a semicolon and an equals sign with just a semicolon.
#### I think this is to make everything in the INFO column into values that can be separated into their own column/variable!
#### I'm not sure what ungroup does here
#### Then seperate the values that you just cleaned up in the INFO field into their own columns (specify they are separated by semicolons)
#### Type_convert I think is used frequently after converting a field to a character field (using regular expressions to clean up) and then to re-convert the character fields back to what they were
```
vcf_fix <- vcf_df@fix %>%
data.frame %>%
mutate(INFO = as.character(INFO)) %>%
rowwise %>%
mutate(INFO = fix_names(INFO)) %>%
mutate(INFO = gsub(";[A-Za-z]+=", ";", INFO)) %>%
ungroup %>%
separate(INFO, into = c("AC", "AF", "AN", "BaseQRankSum", "DP", "ExcessHet",
"FS", "MLEAC", "MLEAF", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "EFF"), sep = ";") %>%
type_convert
```
#### apply site-level filters to SNPs and indels
#### just using the FIXED fields here, and will following with a left (filtering) join
#### GATK hard filter suggestions SNPs "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || SOR > 3"
#### GATK hard filter suggestions for INDELs "QD < 2.0 || FS > 200.0"
#### (these the criteria for *exclusion*, so are inverted below)
```
vcf_fix <- vcf_fix %>%
mutate(QUAL = as.numeric(QUAL)) %>%
filter(QD > 2.0, QUAL > 30)
```
#### apply the variant-class specific filtering
#### want to be careful not to toss the potential causal variant here
#### so applying only the SNP filters (INDELs are more stringent)
vcf_fix <- vcf_fix %>%
filter(FS < 60.0, MQ > 40.0, MQRankSum > -12.5, SOR < 3, ReadPosRankSum > -8.0)
#indels <- vcf_fix %>%
```
# filter(grepl("[A-Z\\*]{2,}", ALT) | grepl("[A-Z\\*]{2,}", REF)) %>%
# filter(FS < 200.0)
```
#### filtering join to the genotypes
```
gt_df <- vcf_df@gt %>%
data.frame %>%
gather(key = "ind", value = "format", -FORMAT)
chrom_pos <- vcf_df@fix %>%
data.frame %>%
select(CHROM, POS)
gt_df <- data.frame(chrom_pos, gt_df) %>%
separate(format, into = c("GT", "AD", "DP_geno", "GQ", "PL"), sep = "\\:") %>%
select(-FORMAT) %>%
select(c("CHROM", "POS", "ind", "GT", "AD", "DP_geno", "GQ"))
gt_df$POS <- as.numeric(gt_df$POS )
is.numeric(gt_df$POS)
```
# filtering join
gt_df <- left_join(vcf_fix, gt_df, by = c("POS"))
#Change EFF variable into it's own classes so I can filter by effect impact etc
gt_df_test <- gt_df %>%
mutate(EFF = gsub("\\|+", "|", EFF)) %>%
mutate(EFF = gsub("\\(|\\|",";", EFF)) %>%
separate(EFF, into = c("effect", "eff_impact", "functional_class", "codon_change",
"aa_change", "aa_length", "gene_name", "transcript_biotype",
"gene_coding", "transcript_id", "exon_rank", "genotype_number"), sep = ";") %>%
mutate(annotation = paste(functional_class, codon_change,
aa_change, aa_length, gene_name, transcript_biotype,
gene_coding, transcript_id, exon_rank, genotype_number), sep = ";") %>%
select(CHROM.x, POS, REF, ALT, effect, eff_impact, annotation, ind, GT)
gt_df_filter <- rename(gt_df_test, CHROM = CHROM.x,
LineID = ind,
Reference_Allele = REF,
Alternate_Allele = ALT,
Predicted_Effect = eff_impact) %>%
filter(POS > 21806350 & POS< 22592996 & Predicted_Effect=="HIGH" & CHROM=="2R")
write.csv(gt_df_filter, "/Users/sarah/Downloads/dmel_L73_high_impact_bsc597.csv")