# Using SNPfiltR and vcfR to filter SNPs
###### tags: `RADSeq` `STACKS`
---
author: Emma Fetterly
---
**Resources:**
1. Link to STACKS website:
https://catchenlab.life.illinois.edu/stacks/
*This guide is meant to help users with the basic workflow and to produce output files for population genetic analyses.*
2. Bioinformatics guide by Eric C. Anderson
https://eriqande.github.io/eca-bioinf-handbook/shell-programming.html
3. Speciation & Population Genomics: a how-to-guide
https://speciationgenomics.github.io/mapping_reference/
4. Source of workflow code:
https://devonderaad.github.io/SNPfiltR/
```
####VcfR/SNPfiltR SNP Filtering CACO ###
#Emma Fetterly
#CACO ddRADseq data analysis
#following the workflow from https://devonderaad.github.io/SNPfiltR/
#clear environment
rm(list = ls())
#install packages
install.packages(c("vcfR", "SNPfiltR", "adegenet", "hierfstat", "diveRsity", "vegan", "reshape2"))
#load libraries
library("vcfR")
library("SNPfiltR")
library("adegenet")
library("hierfstat")
library("diveRsity")
library("vegan")
library("reshape2")
#### Data ####
#load in the minimal filtering file
vcf <-read.vcfR("populations.snps.vcf")
#read in popmap file
#Note about popmaps: A popmap file helps the code understand which population each sample belongs to - it should have two columns,
#one column with the population ID and one with the sample ID
popmap <- data.frame(
id = colnames(vcf@gt)[2:length(colnames(vcf@gt))],
pop = substr(colnames(vcf@gt)[2:length(colnames(vcf@gt))], 1, 2)
)
#this messsed up IB1 and IB 2 - need to correct them to include 3 characters
# in pop ID
rows_to_correct <- popmap$pop %in% c("IB")
popmap$pop[rows_to_correct] <- substr(popmap$id[rows_to_correct], 1, 3)
vcf
#check metadata in dataset
# ***** Object of Class vcfR *****
# 45 samples
# 38 CHROMs
# 53,297 variants
# Object size: 90.3 Mb
# 57.5 percent missing data #this is strange...there is definitely missing data in this file
#visualize distributions
hard_filter(vcf=vcf)
#> no depth cutoff provided, exploratory visualization will be generated.
# note about this code: when no parameters are listed in SNPfiltR it generates "exploratory visuals" to explore the data
# View these in the "plots" window
#The guide suggests to implement filters that DO NOT look at missing data first and then move on to missing by SNP and missing by sample
#### Min Depth and Genotype Quality ####
#this filtering is based on the recommendations from the workflow guide, we can modify to suit our own data
# the "minimal filtering" dataset should already be filtered to min DP of 3
#hard filter to a min depth of 5 and genotype quality of 30
vcf<-hard_filter(vcf=vcf, depth = 5, gq = 30)
# 25.53% of genotypes fall below a read depth of 5 and were converted to NA
# 7.02% of genotypes fall below a genotype quality of 30 and were converted to NA
#### Allele balance ####
# What is allele balance?
#“Allele balance: a number between 0 and 1 representing the ratio of reads showing
# the reference allele to all reads, considering only reads from individuals called as
# heterozygous, we expect that the allele balance in our data (for real loci)
# should be close to 0.5”
#execute allele balance filter
vcf<-filter_allele_balance(vcf)
# 15.88% of het genotypes (2.97% of all genotypes)
# fall outside of 0.25 - 0.75 allele balance ratio and were converted to NA
#### Max Depth ####
# This filter is applied ‘per SNP’ rather than ‘per genotype’
# otherwise we would simply be removing most of the genotypes from our
# deepest sequenced samples (because sequencing depth is so variable between samples).
# By filtering per SNP, we remove the SNPs with outlier depth values, which are most
# likely to be spuriously mapped/built paralagous loci.
#visualize and pick appropriate max depth cutoff
max_depth(vcf)
#cutoff is not specified, exploratory visualization will be generated.
#dashed line indicates a mean depth across all SNPs of 11.6
#a good rule of thumb for max depth is mean depth x 2 (https://speciationgenomics.github.io/filtering_vcfs/),
#so max depth = 25 in this case, which might be too low
#filter vcf by the max depth cutoff you chose
vcf<-max_depth(vcf, maxdepth = 50)
#8.17% of SNPs were above a mean depth of 50 and were removed from the vcf
#check vcf to see how many SNPs we have left
vcf
# ***** Object of Class vcfR *****
# 45 samples
# 35 CHROMs
# 48,927 variants
# Object size: 66.3 Mb
# 73.54 percent missing data
#### Missing by samples ####
#run function to visualize missing by samples
missing_by_sample(vcf=vcf, popmap=popmap)
# Drop .9 missing data by sample
vcf<-missing_by_sample(vcfR=vcf, cutoff = .9)
#11 samples are above a 0.9 missing data cutoff, and were removed from VCF
#subset popmap to only include retained individuals
popmap<-popmap[popmap$id %in% colnames(vcf@gt),]
#remove invariant sites generated by dropping individuals
vcf<-min_mac(vcf, min.mac = 1)
# 27.64% of SNPs fell below a minor allele count of 1 and were removed from the VCF
#verify that missing data is not driving clustering patterns among the retained samples
miss<-assess_missing_data_pca(vcfR=vcf, popmap = popmap, thresholds = .8, clustering = FALSE)
#> cutoff is specified, filtered vcf object will be returned
# 88% of SNPs fell below a completeness cutoff of 0.8 and were removed from the VCF
#### Missing by SNP ####
#Run function to visualize missing by SNPs
missing_by_snp(vcf)
# filt missingness snps.retained
# 1 0.30 0.37057890 19532
# 2 0.50 0.27785586 13979
# 3 0.60 0.21946626 10525
# 4 0.65 0.18741031 8649
# 5 0.70 0.17071346 7704
# 6 0.75 0.13667732 5873
# 7 0.80 0.10432970 4247
# 8 0.85 0.08656966 3408
# 9 0.90 0.05446186 1996
# 10 0.95 0.01428445 663
# 11 1.00 0.00000000 341
#verify that missing data is not driving clustering patterns among the retained samples at some reasonable thresholds
miss<-assess_missing_data_pca(vcfR=vcf, popmap = popmap, thresholds = c(.7,.8,.85), clustering = FALSE)
# cutoff is specified, filtered vcfR object will be returned
# 58.43% of SNPs fell below a completeness cutoff of 0.7 and were removed from the VCF
# cutoff is specified, filtered vcfR object will be returned
# 78.24% of SNPs fell below a completeness cutoff of 0.7 and were removed from the VCF
# cutoff is specified, filtered vcfR object will be returned
# 88% of SNPs fell below a completeness cutoff of 0.8 and were removed from the VCF
# cutoff is specified, filtered vcfR object will be returned
# 90.37% of SNPs fell below a completeness cutoff of 0.85 and were removed from the VCF
# choose a value that retains an acceptable amount of missing data in each sample,
# and maximizes SNPs retained while minimizing overall missing data, and filter vcf
vcf <-missing_by_snp(vcf, cutoff = .7)
# 78.24% of SNPs fell below a completeness cutoff of 0.7 and were removed from the VCF
vcf
# ***** Object of Class vcfR *****
# 34 samples
# 19 CHROMs
# 7,704 variants
# Object size: 19.2 Mb
# 17.07 percent missing data
## MAC cutoff
#investigate clustering patterns with and without a minor allele cutoff
#use min.mac() to investigate the effect of multiple cutoffs
vcf.mac<-min_mac(vcfR = vcf, min.mac = 2)
#> 39.21% of SNPs fell below a minor allele count of 3 and were removed from the VCF
#assess clustering without MAC cutoff
miss<-assess_missing_data_tsne(vcf, popmap, clustering = FALSE)
#assess clustering with MAC cutoff
miss<-assess_missing_data_tsne(vcf.mac, popmap, clustering = FALSE)
#Thinning this file
vcfR.thin <-distance_thin(vcf_50sample_75SNP_mac, min.distance = 500)
#based on these patterns we might need to use a MAC - seems like it is affecting clustering
vcf.mac<-min_mac(vcfR = vcf, min.mac = 2)
vcf.mac
#***** Object of Class vcfR *****
# 34 samples
# 15 CHROMs
# 4,927 variants
# Object size: 12.5 Mb
# 17.71 percent missing data
#plot depth per snp and per sample
dp <- extract.gt(vcf.mac, element = "DP", as.numeric=TRUE)
heatmap.bp(dp, rlabels = FALSE)
#plot genotype quality per snp and per sample
gq <- extract.gt(vcfR, element = "GQ", as.numeric=TRUE)
heatmap.bp(gq, rlabels = FALSE)
# DP830 and DP831 seem to have some issues...maybe remove?
#linkage filter vcf to thin SNPs to one per 500bp
vcfR.thin<-distance_thin(vcf.mac, min.distance = 500)
vcfR.thin
#***** Object of Class vcfR *****
# 34 samples
# 15 CHROMs
# 1,920 variants
# Object size: 5 Mb
# 18.55 percent missing data
# ***** ***** *****
#write out thinned vcf
vcfR::write.vcf(vcfR.thin, "~/CACO.filtered.thinned.vcf.gz")
# Final dataset "vcfR.thin" :
# Min depth of 5
# Min genotype quality of 30
# Allele balance between 0.25 and 0.75
# Max depth of 50
# Missing by sample cuoff 90%
# Missing by SNP 70%
# Min MAC 2
# thinned to 1 SNP every 500 bp
```
# Running admixture...
used plink output from this: data are filtered for LD,
```
plink --vcf CACO.filtered.vcf.gz --double-id --allow-extra-chr --set-missing-var-ids @:# --extract plink_CACO_filter.5.prune.in --make-bed --pca --out caco
```
ADMIXTURE does not accept chromosome names that are not human chromosomes. We will thus just exchange the first column by 0
```
awk '{$1="0";print $0}' caco.bim > caco.bim.tmp
mv caco.bim.tmp caco.bim
awk '{$1="0";print $0}' caco_2.bim > caco_2.bim.tmp
mv caco_2.bim.tmp caco_2.bim
```
practice with k=2
```
admixture --cv caco.bed 2 > log2.out
```
make a loop for k 3-10 with 10 fold cross validation
```
for i in {1..10}
do
admixture -s time --cv=10 caco.bed $i > log${i}_REP6.out
done
for i in {1..10}
do
admixture -s time --cv=10 caco_2.bed $i > log${i}_r2.2.out
done
```
Change to make set seed different for each run?
```
admixture -s time --cv file.ped K -jN
```
```
for K in {1..11}; do
for REP in {1..20}; do
admixture -cv caco.bed $K > CACO_admix_K${K}_rep${REP}.out
done
done
```
Make a combined list of the CV
```
awk '/CV/ {print $3,$4}' *out | cut -c 4,7-20 > caco.cv.error
```
2 0.69832
3 0.83582
4 0.91816
5 1.01791
6 1.23774
7 1.29736
8 1.43955
9 1.32922
10 1.39969
11 1.43884
```
awk '{split($1,name,"."); print $1,name[2]}' caco.nosex > caco.list
```
```
wget https://github.com/speciationgenomics/scripts/raw/master/plotADMIXTURE.r
chmod +x plotADMIXTURE.r
Rscript plotADMIXTURE.r -p caco -i pops.txt -k 2 -l DP,NR,SR,MC,HP,IB1,IB2,SP,PS,MW
Rscript plotADMIXTURE.r -p caco -i pops.txt -k 10 -l DP,NR,SR,MC,HP,IB1,IB2,SP,PS,MW
```