---
title: 'taper'
disqus: hackmd
---
Correcting errors in alignments with TAPER
===
By Tanya Lama
### Objective:
TAPER is a tool that looks for errors in small species-specific stretches of the multiple sequence alignments. TAPER uses a two-dimensional outlier detection algorithm that seeks to find what parts of a sequence are outliers given the overall level of divergence of the sequence to others and the level of the divergence of the region in the alignment. Importantly, TAPER adjusts its null expectations per site and species, and in doing so, it attempts to distinguish the real heterogeneity (signal) from errors (noise).
Zhang, C., Zhao, Y., Braun, E. L., & Mirarab, S. (2021). TAPER: Pinpointing errors in multiple sequence alignments despite varying rates of evolution. Methods in Ecology and Evolution, 00, 1– 14. https://doi.org/10.1111/2041-210X.13696
### Acknowledgements
Thanks to Bill Thomas 5ever
## Table of Contents
[TOC]
# Installing julia/ TAPER
see the full directions [on github](https://github.com/chaoszhang/TAPER)
move to our project /bin and download and unzip Julia.
curl -fsSL https://install.julialang.org | sh
# Where is juliaup?
which juliaup
/home/tlama_umass_edu/.juliaup/bin/juliaup
# Create the correction_multi.jl file
```
global R = [Dict("k"=>5, "p"=>0.25, "q"=>0.1, "L"=>30),
Dict("k"=>9, "p"=>0.25, "q"=>0.25, "L"=>54),
Dict("k"=>17, "p"=>0.1, "q"=>0.5, "L"=>Inf)]
PROGRAM_VERSION = v"1.0.0"
try
using ArgParse
catch
import Pkg
Pkg.add("ArgParse")
using ArgParse
end
import Statistics.median
function correction(c, output, k, X, MASK, pvalue, qvalue, threshold)
upperx = ('a' <= X && X <= 'z') ? X - 'a' + 'A' : X
upperc = [uppercase(str) for str in c]
arrc = [Array{Char, 1}(str) for str in c]
n = length(c[1])
m = length(c)
wo = zeros(n, m)
# read sequences and compute per column profiles.
for i in 1:n
cnt = zeros(128)
# read a column, internally represent in upper case, and count letters
for j in 1:m
cnt[UInt8(upperc[j][i])] += 1
end
cnt[UInt8(upperx)] = 0
cnt[UInt8('-')] = 0
unq = length([1 for t in cnt if t > 0])
total = sum(cnt)
for j in 1:m
wo[i, j] = total == 0 ? 0 : total / (unq * cnt[UInt8(upperc[j][i])])
end
end
w1 = [wo[:,j][(arrc[j] .!= '-') .& (arrc[j] .!= X)] for j in 1:m]
w = [[median(arr[i:i+k-1]) for i in 1:length(arr)-k+1] for arr in w1]
ws = [[sum(arr[i:i+k-1]) for i in 1:length(arr)-k+1] for arr in w1]
wsorted = [sort(arr) for arr in w]
wsum = [accumulate(+, arr) for arr in wsorted]
f(x, y, n, m) = x^2 / n + (n == m ? 0 : (y - x)^2 / (m - n))
var = [length(arr) > 0 ? f.(arr, arr[end], 1:length(arr), length(arr)) : [] for arr in wsum]
wCutoff = [length(var[j]) > 0 ? wsorted[j][findmax(var[j])[2]] : 0 for j in 1:m]
cutoffSorted = sort([wCutoff[j] for j in 1:m if length(var[j]) > 0])
cutoffFloor = (pvalue >= 1) ? 0 : cutoffSorted[end - floor(Int, length(cutoffSorted) * pvalue)]
s = zeros(n - k + 1, m, 2)
tiebreaker = zeros(n - k + 1, m, 2)
bt = zeros(Int64, n - k + 1, m, 2)
for j in 1:m
wj = w[j]
wsj = ws[j]
L = length(wj)
if L <= k
push!(output, c[j])
continue
end
s = zeros(L, 2)
tiebreaker = zeros(L, 2)
bt = zeros(Int64, L, 2)
cutoff = max(wCutoff[j], cutoffFloor, (qvalue >= 1) ? 0 : wsorted[j][end - floor(Int, length(wsorted[j]) * qvalue)], threshold)
for i in 1:L
v = (wj[i] > cutoff ? 0 : 1)
if i == 1
s[i, 1] = v
s[i, 2] = 1 - v
tiebreaker[i, 1] = 0
tiebreaker[i, 2] = wsj[i]
else
s[i, 1] = s[i - 1, 1] + v
s[i, 2] = s[i - 1, 2] + 1 - v
tiebreaker[i, 1] = tiebreaker[i - 1, 1]
tiebreaker[i, 2] = tiebreaker[i - 1, 2] + wsj[i]
bt[i, 1] = 1
bt[i, 2] = 2
end
if i > k && (s[i, 1], tiebreaker[i, 1]) < (s[i - k, 2] + v, tiebreaker[i - k, 2])
s[i, 1] = s[i - k, 2] + v
tiebreaker[i, 1] = tiebreaker[i - k, 2]
bt[i, 1] = 2
end
if i > k && (s[i, 2], tiebreaker[i, 2]) < (s[i - k, 1] + 1 - v, tiebreaker[i - k, 1])
s[i, 2] = s[i - k, 1] + 1 - v
tiebreaker[i, 2] = tiebreaker[i - k, 1] + wsj[i]
bt[i, 2] = 1
end
end
str = arrc[j][(arrc[j] .!= '-') .& (arrc[j] .!= X)]
icur = L
if s[L, 1] < s[L, 2]
str[L:L+k-1] .= MASK
bcur = 2
else
bcur = 1
end
while true
if bcur == 1 && bt[icur, bcur] == 1
icur -= 1
bcur = 1
elseif bcur == 1 && bt[icur, bcur] == 2
icur -= k
bcur = 2
str[icur : icur + k - 1] .= MASK
elseif bcur == 2 && bt[icur, bcur] == 1
icur -= k
bcur = 1
elseif bcur == 2 && bt[icur, bcur] == 2
icur -= 1
bcur = 2
str[icur] = MASK
elseif bcur == 1
break
else
str[1:icur - 1] .= MASK
break
end
end
strout = ""
i = 1
for t in 1:length(c[j])
if c[j][t] == X || c[j][t] == '-'
strout *= c[j][t]
else
strout *= str[i]
i += 1
end
end
push!(output, strout)
end
end
function ArgParse.parse_item(::Type{Char}, x::AbstractString)
return x[1]
end
function parse_commandline()
s = ArgParseSettings()
@add_arg_table! s begin
"--list", "-l"
help = "running on a list of inputs; for every two lines of the list file, the first one should be the path to the input and the second should be the path to its output"
action = :store_true
"--mask", "-m"
help = "the character to mask erroneous regions"
arg_type = Char
default = 'X'
"--any", "-a"
help = "the character to denote ambiguous positions or character to denote ANY in the input files"
arg_type = Char
default = 'X'
"--cutoff", "-c"
help = "set score cutoff to control the minimum aggressiveness of masking (should be > 1)"
arg_type = Float64
default = 3.0
"--parameter", "-p"
help = "(Advanced) load the list of k, p, q, and L from the input parameter file"
arg_type = String
"input"
help = "a fasta file as input"
required = true
end
return parse_args(s)
end
function correction_multi(args, fin, fout)
inputText = read(fin, String)
temp = split(inputText, ">")
temp = temp[length.(temp) .> 0]
temp = [split(arr, "\n") for arr in temp]
header = [arr[1] for arr in temp]
c = [string(arr[2:end]...) for arr in temp]
arrc = [Array{Char, 1}(str) for str in c]
n = length(c[1])
m = length(c)
MASK = args["mask"]
for r in R
output = []
correction(c, output, get(r, "k", 9), args["any"], '*', get(r, "p", 0), get(r, "q", 0), args["cutoff"])
L = get(r, "L", Inf)
for j in 1:m
start = 0
cnt = 0
for i in 1:n
if start == 0 && output[j][i] == '*'
start = i
cnt = 1
end
if start != 0 && output[j][i] == '*'
cnt += 1
end
if start != 0 && output[j][i] != '*' && output[j][i] != '-'
if cnt < L
for k in start:i-1
arrc[j][k] = (output[j][k] == '-') ? '-' : MASK
end
end
start = 0
cnt = 0
end
end
if start != 0 && cnt < L
for k in start:n
arrc[j][k] = (output[j][k] == '-') ? '-' : MASK
end
end
end
end
for j in 1:m
println(fout, ">" * header[j])
println(fout, String(arrc[j]))
end
end
function main()
println(stderr, "Version " * string(PROGRAM_VERSION))
args = parse_commandline()
if isnothing(args["parameter"]) == false
global R = eval(Meta.parse(read(open(args["parameter"], "r"), String)))
end
if args["list"] == false
correction_multi(args, open(args["input"], "r"), stdout)
else
temp = split(read(open(args["input"], "r"), String), "\n")
temp = temp[length.(temp) .> 0]
for i = 2:2:length(temp)
try
println(stderr, "Processing " * temp[i - 1] * "...")
correction_multi(args, open(temp[i - 1], "r"), open(temp[i], "w"))
catch
println(stderr, "Error happened when processing " * temp[i - 1] * ".")
println(stderr)
end
end
end
end
main()
```
# Run TAPER
Usage is
/home/tlama_umass_edu/.juliaup/bin/julia correction_multi.jl -m N input > output
```
/home/tlama_umass_edu/.juliaup/bin/julia correction_multi.jl -m N ENST00000678030.PANO1.cleanLb_hmm_manual.fasta > julia-ENST00000678030.PANO1.cleanLb_hmm_manual.fasta
```
# Run aBSREL on our cleanLb_hmm_manual alignment
hyphy -i [work through the questions in interactive mode]
# Run aBSREL on TAPER corrected alignment
hyphy -i
# Compare results
### Original:
### Adaptive branch site random effects likelihood test
Likelihood ratio test for episodic diversifying positive selection at Holm-Bonferroni corrected _p = 0.0500_ found **13** branches under selection among **18** tested.
* HLsacLep1, p-value = 0.00000
* HLrhiTri2, p-value = 0.00000
* HLrhiSed2, p-value = 0.00000
* HLphyDis3, p-value = 0.00000
* HLaseSto2, p-value = 0.00000
* HLartJam3, p-value = 0.00000
* HLrhiLuc4, p-value = 0.00001
* HLdipEca1, p-value = 0.00001
* HLpipKuh2, p-value = 0.00002
* felCat9, p-value = 0.00006
* hg38, p-value = 0.00282
* HLphyHas1, p-value = 0.00287
* HLrouAeg4, p-value = 0.04412
### TAPER corrected
### Adaptive branch site random effects likelihood test
Likelihood ratio test for episodic diversifying positive selection at Holm-Bonferroni corrected _p = 0.0500_ found **12** branches under selection among **18** tested.
* HLrhiTri2, p-value = 0.00000
* HLrhiSed2, p-value = 0.00000
* HLphyDis3, p-value = 0.00000
* HLsacLep1, p-value = 0.00000
* HLartJam3, p-value = 0.00000
* HLaseSto2, p-value = 0.00000
* HLrhiLuc4, p-value = 0.00001
* HLdipEca1, p-value = 0.00002
* HLpipKuh2, p-value = 0.00002
* felCat9, p-value = 0.00005
* hg38, p-value = 0.00307
* HLphyHas1, p-value = 0.00291
looks like we lost the result from HLrouAeg4, however this site likely would have fallen out with FDR correction (above are BH corrected pvalues but we do an additional correction in post processing)
# Compare results in AliView
# Compare the results using MEME
### Original:
>Please choose an option (or press q to cancel selection):1
* Log(L) = -7306.01
* non-synonymous/synonymous rate ratio for *background* = 0.7622
* non-synonymous/synonymous rate ratio for *test* = 0.7565
### For partition 1 these sites are significant at p <=0.05
| Codon | Partition | alpha | beta+ | p+ | LRT |Episodic selection detected?| # branches |Most common codon substitutions at this site|
|:----------:|:----------:|:----------:|:----------:|:----------:|:----------:|:--------------------------:|:----------:|:------------------------------------------:|
| 13 | 1 | 0.050 | 170.501 | 0.079 | 5.995 | Yes, p = 0.0226 | 1 | [1]CAT>ATC,GCT>CAT,GTT>GCT,GTT>TGC |
| 15 | 1 | 0.830 | 9.465 | 0.495 | 4.687 | Yes, p = 0.0443 | 7 |[1]CAC>ACC,CAT>CAC,CAT>CCC,CAT>CCT,CAT>CG...|
| 27 | 1 | 0.172 | 176.094 | 0.102 | 8.495 | Yes, p = 0.0063 | 1 | [2]GCG>GTG|[1]ACG>GCG,GCG>CGC |
| 41 | 1 | 0.118 |10000.000...| 0.304 | 15.866 | Yes, p = 0.0002 | 2 |[2]GGA>GGC|[1]ACA>CAA,ACA>GAC,GGA>AAG,GGA...|
| 46 | 1 | 1.857 | 75.370 | 0.642 | 4.849 | Yes, p = 0.0408 | 3 |[1]GTG>AGG,GTG>ATG,GTG>CCG,GTG>CGC,GTG>GC...|
| 54 | 1 | 0.000 | 42.346 | 0.274 | 5.795 | Yes, p = 0.0250 | 3 |[2]GGG>GCG|[1]GAG>CGG,GAG>GGA,GGG>GAG,GGG...|
| 56 | 1 | 1.001 | 29.788 | 0.245 | 10.044 | Yes, p = 0.0029 | 4 |[2]GGC>CGC,GGC>GGT|[1]GGC>CAC,GGC>CAG,GGC...|
| 60 | 1 | 1.268 | 23.823 | 0.335 | 9.442 | Yes, p = 0.0039 | 4 |[3]TGT>TGC|[2]TGT>GTC|[1]AGT>TGT,GGT>AGT,...|
| 65 | 1 | 0.000 | 239.277 | 0.481 | 13.944 | Yes, p = 0.0004 | 7 |[2]GTC>CCA|[1]GTC>ATG,GTC>CAG,GTC>GAT,GTC...|
| 67 | 1 | 0.000 | 15.404 | 0.266 | 13.229 | Yes, p = 0.0006 | 3 |[2]ACC>GAC|[1]AAC>CCA,ACC>AAC,ACC>ATG,ATG...|
| 68 | 1 | 1.052 | 114.888 | 0.290 | 7.537 | Yes, p = 0.0103 | 4 |[2]GGG>GGC,GGG>TGG,TGG>GTC|[1]GGG>GCT,TGG...|
| 71 | 1 | 0.860 | 427.396 | 0.386 | 8.859 | Yes, p = 0.0052 | 5 |[1]ACG>ACA,ACG>AGG,ACG>CAG,ACG>CCA,ACG>GG...|
| 73 | 1 | 0.000 |10000.000...| 0.143 | 6.481 | Yes, p = 0.0176 | 2 |[2]ACG>ATG|[1]AAG>ACG,ACG>CCG,ACG>GTC,ATA...|
| 79 | 1 | 0.000 | 474.307 | 0.131 | 21.851 | Yes, p = 0.0000 | 2 | [1]CGA>GAT,CGA>GCC |
| 91 | 1 | 0.000 | 433.397 | 0.111 | 9.688 | Yes, p = 0.0034 | 2 | [1]CCC>TCC,TCC>ACC,TCC>CGT |
| 106 | 1 | 1.073 | 672.590 | 0.260 | 8.261 | Yes, p = 0.0071 | 2 |[1]CGT>GCC,TCT>TAC,TGC>CGT,TGC>GCA,TGC>GC...|
| 108 | 1 | 0.000 | 506.817 | 0.154 | 12.511 | Yes, p = 0.0008 | 2 |[1]CCG>CAG,CCG>GCA,CCG>GGG,CCG>TCG,CTG>GC...|
| 109 | 1 | 0.000 | 2077.696 | 0.347 | 15.398 | Yes, p = 0.0002 | 6 |[2]GGC>GCT|[1]AGC>CCC,GGC>ACG,GGC>AGC,GGC...|
| 131 | 1 | 0.000 | 22.126 | 0.196 | 5.471 | Yes, p = 0.0296 | 3 | [2]CGG>CCG|[1]CGG>CTC |
| 154 | 1 | 0.000 | 97.984 | 0.230 | 6.529 | Yes, p = 0.0172 | 3 | [1]CAG>GAG,CAG>GCC,GAG>ACC,GAG>CTC |
| 196 | 1 | 0.000 | 120.736 | 0.089 | 6.833 | Yes, p = 0.0147 | 1 | [1]CTC>CGC,CTC>GCT |
| 203 | 1 | 0.321 | 212.486 | 0.137 | 7.771 | Yes, p = 0.0091 | 2 | [1]CTC>CCG,CTC>CTG,CTC>GCA |
### ** Found _22_ sites under episodic diversifying positive selection at p <= 0.05**
### TAPER corrected:
>Please choose an option (or press q to cancel selection):1
* Log(L) = -7299.71
* non-synonymous/synonymous rate ratio for *background* = 0.7620
* non-synonymous/synonymous rate ratio for *test* = 0.7541
### For partition 1 these sites are significant at p <=0.05
| Codon | Partition | alpha | beta+ | p+ | LRT |Episodic selection detected?| # branches |Most common codon substitutions at this site|
|:----------:|:----------:|:----------:|:----------:|:----------:|:----------:|:--------------------------:|:----------:|:------------------------------------------:|
| 13 | 1 | 0.053 | 149.763 | 0.080 | 5.983 | Yes, p = 0.0227 | 1 | [1]CAT>ATC,GCT>CAT,GTT>GCT,GTT>TGC |
| 15 | 1 | 0.726 | 15.691 | 0.394 | 4.587 | Yes, p = 0.0467 | 1 |[1]CAC>ACC,CAT>CAC,CAT>CCC,CAT>CCT,CAT>CG...|
| 27 | 1 | 0.172 | 176.094 | 0.102 | 8.466 | Yes, p = 0.0064 | 1 | [2]GCG>GTG|[1]ACG>GCG,GCG>CGC |
| 41 | 1 | 0.119 |10000.000...| 0.302 | 15.942 | Yes, p = 0.0001 | 2 |[2]GGA>GGC|[1]ACA>CAA,ACA>GAC,GGA>AAG,GGA...|
| 46 | 1 | 1.855 | 78.131 | 0.635 | 4.829 | Yes, p = 0.0412 | 3 |[1]GTG>AGG,GTG>ATG,GTG>CCG,GTG>CGC,GTG>GC...|
| 54 | 1 | 0.000 | 41.961 | 0.272 | 5.756 | Yes, p = 0.0256 | 3 |[2]GGG>GCG|[1]GAG>CGG,GAG>GGA,GGG>GAG,GGG...|
| 56 | 1 | 0.976 | 29.084 | 0.247 | 10.023 | Yes, p = 0.0029 | 4 |[2]GGC>CGC,GGC>GGT|[1]GGC>CAC,GGC>CAG,GGC...|
| **57** | 1 | 1.208 | 1617.563 | 0.180 | 6.760 | Yes, p = 0.0153 | 3 |[2]TCA>TCC,TCA>TCT|[1]TCA>CGC,TCA>CTC,TCA...|
| 60 | 1 | 1.270 | 22.526 | 0.342 | 9.401 | Yes, p = 0.0040 | 4 |[3]TGT>TGC|[2]TGT>GTC|[1]AGT>TGT,GGT>AGT,...|
| 65 | 1 | 0.000 | 243.982 | 0.472 | 14.029 | Yes, p = 0.0004 | 7 |[2]GTC>CCA|[1]GTC>ATG,GTC>CAG,GTC>GAT,GTC...|
| 67 | 1 | 0.000 | 15.101 | 0.274 | 13.312 | Yes, p = 0.0006 | 3 |[2]ACC>GAC|[1]AAC>CCA,ACC>AAC,ACC>ATG,ATG...|
| 68 | 1 | 1.035 | 115.300 | 0.285 | 7.534 | Yes, p = 0.0103 | 4 |[2]GGG>GGC,GGG>TGG,TGG>GTC|[1]GGG>GCT,TGG...|
| 71 | 1 | 0.821 | 458.703 | 0.384 | 8.918 | Yes, p = 0.0051 | 5 |[1]ACG>ACA,ACG>AGG,ACG>CAG,ACG>CCA,ACG>GG...|
| 73 | 1 | 0.000 | 552.454 | 0.134 | 6.578 | Yes, p = 0.0168 | 2 |[2]ACG>ATG|[1]AAG>ACG,ACG>CCG,ACG>GTC,ATA...|
| 79 | 1 | 0.000 | 501.160 | 0.132 | 21.818 | Yes, p = 0.0000 | 2 | [1]CGA>GAT,CGA>GCC |
| 91 | 1 | 0.000 | 350.914 | 0.113 | 9.669 | Yes, p = 0.0035 | 2 | [1]CCC>TCC,TCC>ACC,TCC>CGT |
| 106 | 1 | 1.032 | 878.243 | 0.250 | 5.719 | Yes, p = 0.0260 | 2 |[1]CGT>GCC,TCT>TAC,TGC>CGT,TGC>GCA,TGC>GC...|
| 108 | 1 | 0.000 | 509.649 | 0.154 | 12.489 | Yes, p = 0.0008 | 2 |[1]CCG>CAG,CCG>GCA,CCG>GGG,CCG>TCG,CTG>GC...|
| 109 | 1 | 0.000 | 1481.559 | 0.344 | 18.773 | Yes, p = 0.0000 | 6 |[2]GGC>GCT|[1]AGC>CCC,GGC>ACG,GGC>AGC,GGC...|
| **114** | 1 | 0.000 | 99.765 | 0.389 | 5.620 | Yes, p = 0.0274 | 1 |[1]CAC>CAA,CAC>CTC,CAC>GAG,CTC>GGC,CTC>TG...|
| **116** | 1 | 0.283 | 456.492 | 0.343 | 5.746 | Yes, p = 0.0257 | 1 |[3]CAC>CCA|[2]CAC>CCC|[1]CAC>TAC,TAC>CCT,...|
| 131 | 1 | 0.000 | 21.031 | 0.193 | 5.454 | Yes, p = 0.0298 | 3 | [2]CGG>CCG|[1]CGG>CTC |
| 154 | 1 | 0.000 | 87.358 | 0.231 | 6.526 | Yes, p = 0.0172 | 3 | [1]CAG>GAG,CAG>GCC,GAG>ACC,GAG>CTC |
| 196 | 1 | 0.000 | 138.744 | 0.088 | 6.817 | Yes, p = 0.0148 | 1 | [1]CTC>CGC,CTC>GCT |
| 203 | 1 | 0.322 | 192.632 | 0.133 | 7.883 | Yes, p = 0.0086 | 2 | [1]CTC>CCG,CTC>CTG,CTC>GCA |
### ** Found _25_ sites under episodic diversifying positive selection at p <= 0.05**
# Do these differences in the MEME results really matter after we screen for EBF>100?