TAPER - HackMD

--- title: 'taper' disqus: hackmd --- Correcting errors in alignments with TAPER === By Tanya Lama ### Objective: TAPER is a tool that looks for errors in small species-specific stretches of the multiple sequence alignments. TAPER uses a two-dimensional outlier detection algorithm that seeks to find what parts of a sequence are outliers given the overall level of divergence of the sequence to others and the level of the divergence of the region in the alignment. Importantly, TAPER adjusts its null expectations per site and species, and in doing so, it attempts to distinguish the real heterogeneity (signal) from errors (noise). Zhang, C., Zhao, Y., Braun, E. L., & Mirarab, S. (2021). TAPER: Pinpointing errors in multiple sequence alignments despite varying rates of evolution. Methods in Ecology and Evolution, 00, 1– 14. https://doi.org/10.1111/2041-210X.13696 ### Acknowledgements Thanks to Bill Thomas 5ever ## Table of Contents [TOC] # Installing julia/ TAPER see the full directions [on github](https://github.com/chaoszhang/TAPER) move to our project /bin and download and unzip Julia. curl -fsSL https://install.julialang.org | sh # Where is juliaup? which juliaup /home/tlama_umass_edu/.juliaup/bin/juliaup # Create the correction_multi.jl file ``` global R = [Dict("k"=>5, "p"=>0.25, "q"=>0.1, "L"=>30), Dict("k"=>9, "p"=>0.25, "q"=>0.25, "L"=>54), Dict("k"=>17, "p"=>0.1, "q"=>0.5, "L"=>Inf)] PROGRAM_VERSION = v"1.0.0" try using ArgParse catch import Pkg Pkg.add("ArgParse") using ArgParse end import Statistics.median function correction(c, output, k, X, MASK, pvalue, qvalue, threshold) upperx = ('a' <= X && X <= 'z') ? X - 'a' + 'A' : X upperc = [uppercase(str) for str in c] arrc = [Array{Char, 1}(str) for str in c] n = length(c[1]) m = length(c) wo = zeros(n, m) # read sequences and compute per column profiles. for i in 1:n cnt = zeros(128) # read a column, internally represent in upper case, and count letters for j in 1:m cnt[UInt8(upperc[j][i])] += 1 end cnt[UInt8(upperx)] = 0 cnt[UInt8('-')] = 0 unq = length([1 for t in cnt if t > 0]) total = sum(cnt) for j in 1:m wo[i, j] = total == 0 ? 0 : total / (unq * cnt[UInt8(upperc[j][i])]) end end w1 = [wo[:,j][(arrc[j] .!= '-') .& (arrc[j] .!= X)] for j in 1:m] w = [[median(arr[i:i+k-1]) for i in 1:length(arr)-k+1] for arr in w1] ws = [[sum(arr[i:i+k-1]) for i in 1:length(arr)-k+1] for arr in w1] wsorted = [sort(arr) for arr in w] wsum = [accumulate(+, arr) for arr in wsorted] f(x, y, n, m) = x^2 / n + (n == m ? 0 : (y - x)^2 / (m - n)) var = [length(arr) > 0 ? f.(arr, arr[end], 1:length(arr), length(arr)) : [] for arr in wsum] wCutoff = [length(var[j]) > 0 ? wsorted[j][findmax(var[j])[2]] : 0 for j in 1:m] cutoffSorted = sort([wCutoff[j] for j in 1:m if length(var[j]) > 0]) cutoffFloor = (pvalue >= 1) ? 0 : cutoffSorted[end - floor(Int, length(cutoffSorted) * pvalue)] s = zeros(n - k + 1, m, 2) tiebreaker = zeros(n - k + 1, m, 2) bt = zeros(Int64, n - k + 1, m, 2) for j in 1:m wj = w[j] wsj = ws[j] L = length(wj) if L <= k push!(output, c[j]) continue end s = zeros(L, 2) tiebreaker = zeros(L, 2) bt = zeros(Int64, L, 2) cutoff = max(wCutoff[j], cutoffFloor, (qvalue >= 1) ? 0 : wsorted[j][end - floor(Int, length(wsorted[j]) * qvalue)], threshold) for i in 1:L v = (wj[i] > cutoff ? 0 : 1) if i == 1 s[i, 1] = v s[i, 2] = 1 - v tiebreaker[i, 1] = 0 tiebreaker[i, 2] = wsj[i] else s[i, 1] = s[i - 1, 1] + v s[i, 2] = s[i - 1, 2] + 1 - v tiebreaker[i, 1] = tiebreaker[i - 1, 1] tiebreaker[i, 2] = tiebreaker[i - 1, 2] + wsj[i] bt[i, 1] = 1 bt[i, 2] = 2 end if i > k && (s[i, 1], tiebreaker[i, 1]) < (s[i - k, 2] + v, tiebreaker[i - k, 2]) s[i, 1] = s[i - k, 2] + v tiebreaker[i, 1] = tiebreaker[i - k, 2] bt[i, 1] = 2 end if i > k && (s[i, 2], tiebreaker[i, 2]) < (s[i - k, 1] + 1 - v, tiebreaker[i - k, 1]) s[i, 2] = s[i - k, 1] + 1 - v tiebreaker[i, 2] = tiebreaker[i - k, 1] + wsj[i] bt[i, 2] = 1 end end str = arrc[j][(arrc[j] .!= '-') .& (arrc[j] .!= X)] icur = L if s[L, 1] < s[L, 2] str[L:L+k-1] .= MASK bcur = 2 else bcur = 1 end while true if bcur == 1 && bt[icur, bcur] == 1 icur -= 1 bcur = 1 elseif bcur == 1 && bt[icur, bcur] == 2 icur -= k bcur = 2 str[icur : icur + k - 1] .= MASK elseif bcur == 2 && bt[icur, bcur] == 1 icur -= k bcur = 1 elseif bcur == 2 && bt[icur, bcur] == 2 icur -= 1 bcur = 2 str[icur] = MASK elseif bcur == 1 break else str[1:icur - 1] .= MASK break end end strout = "" i = 1 for t in 1:length(c[j]) if c[j][t] == X || c[j][t] == '-' strout *= c[j][t] else strout *= str[i] i += 1 end end push!(output, strout) end end function ArgParse.parse_item(::Type{Char}, x::AbstractString) return x[1] end function parse_commandline() s = ArgParseSettings() @add_arg_table! s begin "--list", "-l" help = "running on a list of inputs; for every two lines of the list file, the first one should be the path to the input and the second should be the path to its output" action = :store_true "--mask", "-m" help = "the character to mask erroneous regions" arg_type = Char default = 'X' "--any", "-a" help = "the character to denote ambiguous positions or character to denote ANY in the input files" arg_type = Char default = 'X' "--cutoff", "-c" help = "set score cutoff to control the minimum aggressiveness of masking (should be > 1)" arg_type = Float64 default = 3.0 "--parameter", "-p" help = "(Advanced) load the list of k, p, q, and L from the input parameter file" arg_type = String "input" help = "a fasta file as input" required = true end return parse_args(s) end function correction_multi(args, fin, fout) inputText = read(fin, String) temp = split(inputText, ">") temp = temp[length.(temp) .> 0] temp = [split(arr, "\n") for arr in temp] header = [arr[1] for arr in temp] c = [string(arr[2:end]...) for arr in temp] arrc = [Array{Char, 1}(str) for str in c] n = length(c[1]) m = length(c) MASK = args["mask"] for r in R output = [] correction(c, output, get(r, "k", 9), args["any"], '*', get(r, "p", 0), get(r, "q", 0), args["cutoff"]) L = get(r, "L", Inf) for j in 1:m start = 0 cnt = 0 for i in 1:n if start == 0 && output[j][i] == '*' start = i cnt = 1 end if start != 0 && output[j][i] == '*' cnt += 1 end if start != 0 && output[j][i] != '*' && output[j][i] != '-' if cnt < L for k in start:i-1 arrc[j][k] = (output[j][k] == '-') ? '-' : MASK end end start = 0 cnt = 0 end end if start != 0 && cnt < L for k in start:n arrc[j][k] = (output[j][k] == '-') ? '-' : MASK end end end end for j in 1:m println(fout, ">" * header[j]) println(fout, String(arrc[j])) end end function main() println(stderr, "Version " * string(PROGRAM_VERSION)) args = parse_commandline() if isnothing(args["parameter"]) == false global R = eval(Meta.parse(read(open(args["parameter"], "r"), String))) end if args["list"] == false correction_multi(args, open(args["input"], "r"), stdout) else temp = split(read(open(args["input"], "r"), String), "\n") temp = temp[length.(temp) .> 0] for i = 2:2:length(temp) try println(stderr, "Processing " * temp[i - 1] * "...") correction_multi(args, open(temp[i - 1], "r"), open(temp[i], "w")) catch println(stderr, "Error happened when processing " * temp[i - 1] * ".") println(stderr) end end end end main() ``` # Run TAPER Usage is /home/tlama_umass_edu/.juliaup/bin/julia correction_multi.jl -m N input > output ``` /home/tlama_umass_edu/.juliaup/bin/julia correction_multi.jl -m N ENST00000678030.PANO1.cleanLb_hmm_manual.fasta > julia-ENST00000678030.PANO1.cleanLb_hmm_manual.fasta ``` # Run aBSREL on our cleanLb_hmm_manual alignment hyphy -i [work through the questions in interactive mode] # Run aBSREL on TAPER corrected alignment hyphy -i # Compare results ### Original: ### Adaptive branch site random effects likelihood test Likelihood ratio test for episodic diversifying positive selection at Holm-Bonferroni corrected _p = 0.0500_ found **13** branches under selection among **18** tested. * HLsacLep1, p-value = 0.00000 * HLrhiTri2, p-value = 0.00000 * HLrhiSed2, p-value = 0.00000 * HLphyDis3, p-value = 0.00000 * HLaseSto2, p-value = 0.00000 * HLartJam3, p-value = 0.00000 * HLrhiLuc4, p-value = 0.00001 * HLdipEca1, p-value = 0.00001 * HLpipKuh2, p-value = 0.00002 * felCat9, p-value = 0.00006 * hg38, p-value = 0.00282 * HLphyHas1, p-value = 0.00287 * HLrouAeg4, p-value = 0.04412 ### TAPER corrected ### Adaptive branch site random effects likelihood test Likelihood ratio test for episodic diversifying positive selection at Holm-Bonferroni corrected _p = 0.0500_ found **12** branches under selection among **18** tested. * HLrhiTri2, p-value = 0.00000 * HLrhiSed2, p-value = 0.00000 * HLphyDis3, p-value = 0.00000 * HLsacLep1, p-value = 0.00000 * HLartJam3, p-value = 0.00000 * HLaseSto2, p-value = 0.00000 * HLrhiLuc4, p-value = 0.00001 * HLdipEca1, p-value = 0.00002 * HLpipKuh2, p-value = 0.00002 * felCat9, p-value = 0.00005 * hg38, p-value = 0.00307 * HLphyHas1, p-value = 0.00291 looks like we lost the result from HLrouAeg4, however this site likely would have fallen out with FDR correction (above are BH corrected pvalues but we do an additional correction in post processing) # Compare results in AliView # Compare the results using MEME ### Original: >Please choose an option (or press q to cancel selection):1 * Log(L) = -7306.01 * non-synonymous/synonymous rate ratio for *background* = 0.7622 * non-synonymous/synonymous rate ratio for *test* = 0.7565 ### For partition 1 these sites are significant at p <=0.05 | Codon | Partition | alpha | beta+ | p+ | LRT |Episodic selection detected?| # branches |Most common codon substitutions at this site| |:----------:|:----------:|:----------:|:----------:|:----------:|:----------:|:--------------------------:|:----------:|:------------------------------------------:| | 13 | 1 | 0.050 | 170.501 | 0.079 | 5.995 | Yes, p = 0.0226 | 1 | [1]CAT>ATC,GCT>CAT,GTT>GCT,GTT>TGC | | 15 | 1 | 0.830 | 9.465 | 0.495 | 4.687 | Yes, p = 0.0443 | 7 |[1]CAC>ACC,CAT>CAC,CAT>CCC,CAT>CCT,CAT>CG...| | 27 | 1 | 0.172 | 176.094 | 0.102 | 8.495 | Yes, p = 0.0063 | 1 | [2]GCG>GTG|[1]ACG>GCG,GCG>CGC | | 41 | 1 | 0.118 |10000.000...| 0.304 | 15.866 | Yes, p = 0.0002 | 2 |[2]GGA>GGC|[1]ACA>CAA,ACA>GAC,GGA>AAG,GGA...| | 46 | 1 | 1.857 | 75.370 | 0.642 | 4.849 | Yes, p = 0.0408 | 3 |[1]GTG>AGG,GTG>ATG,GTG>CCG,GTG>CGC,GTG>GC...| | 54 | 1 | 0.000 | 42.346 | 0.274 | 5.795 | Yes, p = 0.0250 | 3 |[2]GGG>GCG|[1]GAG>CGG,GAG>GGA,GGG>GAG,GGG...| | 56 | 1 | 1.001 | 29.788 | 0.245 | 10.044 | Yes, p = 0.0029 | 4 |[2]GGC>CGC,GGC>GGT|[1]GGC>CAC,GGC>CAG,GGC...| | 60 | 1 | 1.268 | 23.823 | 0.335 | 9.442 | Yes, p = 0.0039 | 4 |[3]TGT>TGC|[2]TGT>GTC|[1]AGT>TGT,GGT>AGT,...| | 65 | 1 | 0.000 | 239.277 | 0.481 | 13.944 | Yes, p = 0.0004 | 7 |[2]GTC>CCA|[1]GTC>ATG,GTC>CAG,GTC>GAT,GTC...| | 67 | 1 | 0.000 | 15.404 | 0.266 | 13.229 | Yes, p = 0.0006 | 3 |[2]ACC>GAC|[1]AAC>CCA,ACC>AAC,ACC>ATG,ATG...| | 68 | 1 | 1.052 | 114.888 | 0.290 | 7.537 | Yes, p = 0.0103 | 4 |[2]GGG>GGC,GGG>TGG,TGG>GTC|[1]GGG>GCT,TGG...| | 71 | 1 | 0.860 | 427.396 | 0.386 | 8.859 | Yes, p = 0.0052 | 5 |[1]ACG>ACA,ACG>AGG,ACG>CAG,ACG>CCA,ACG>GG...| | 73 | 1 | 0.000 |10000.000...| 0.143 | 6.481 | Yes, p = 0.0176 | 2 |[2]ACG>ATG|[1]AAG>ACG,ACG>CCG,ACG>GTC,ATA...| | 79 | 1 | 0.000 | 474.307 | 0.131 | 21.851 | Yes, p = 0.0000 | 2 | [1]CGA>GAT,CGA>GCC | | 91 | 1 | 0.000 | 433.397 | 0.111 | 9.688 | Yes, p = 0.0034 | 2 | [1]CCC>TCC,TCC>ACC,TCC>CGT | | 106 | 1 | 1.073 | 672.590 | 0.260 | 8.261 | Yes, p = 0.0071 | 2 |[1]CGT>GCC,TCT>TAC,TGC>CGT,TGC>GCA,TGC>GC...| | 108 | 1 | 0.000 | 506.817 | 0.154 | 12.511 | Yes, p = 0.0008 | 2 |[1]CCG>CAG,CCG>GCA,CCG>GGG,CCG>TCG,CTG>GC...| | 109 | 1 | 0.000 | 2077.696 | 0.347 | 15.398 | Yes, p = 0.0002 | 6 |[2]GGC>GCT|[1]AGC>CCC,GGC>ACG,GGC>AGC,GGC...| | 131 | 1 | 0.000 | 22.126 | 0.196 | 5.471 | Yes, p = 0.0296 | 3 | [2]CGG>CCG|[1]CGG>CTC | | 154 | 1 | 0.000 | 97.984 | 0.230 | 6.529 | Yes, p = 0.0172 | 3 | [1]CAG>GAG,CAG>GCC,GAG>ACC,GAG>CTC | | 196 | 1 | 0.000 | 120.736 | 0.089 | 6.833 | Yes, p = 0.0147 | 1 | [1]CTC>CGC,CTC>GCT | | 203 | 1 | 0.321 | 212.486 | 0.137 | 7.771 | Yes, p = 0.0091 | 2 | [1]CTC>CCG,CTC>CTG,CTC>GCA | ### ** Found _22_ sites under episodic diversifying positive selection at p <= 0.05** ### TAPER corrected: >Please choose an option (or press q to cancel selection):1 * Log(L) = -7299.71 * non-synonymous/synonymous rate ratio for *background* = 0.7620 * non-synonymous/synonymous rate ratio for *test* = 0.7541 ### For partition 1 these sites are significant at p <=0.05 | Codon | Partition | alpha | beta+ | p+ | LRT |Episodic selection detected?| # branches |Most common codon substitutions at this site| |:----------:|:----------:|:----------:|:----------:|:----------:|:----------:|:--------------------------:|:----------:|:------------------------------------------:| | 13 | 1 | 0.053 | 149.763 | 0.080 | 5.983 | Yes, p = 0.0227 | 1 | [1]CAT>ATC,GCT>CAT,GTT>GCT,GTT>TGC | | 15 | 1 | 0.726 | 15.691 | 0.394 | 4.587 | Yes, p = 0.0467 | 1 |[1]CAC>ACC,CAT>CAC,CAT>CCC,CAT>CCT,CAT>CG...| | 27 | 1 | 0.172 | 176.094 | 0.102 | 8.466 | Yes, p = 0.0064 | 1 | [2]GCG>GTG|[1]ACG>GCG,GCG>CGC | | 41 | 1 | 0.119 |10000.000...| 0.302 | 15.942 | Yes, p = 0.0001 | 2 |[2]GGA>GGC|[1]ACA>CAA,ACA>GAC,GGA>AAG,GGA...| | 46 | 1 | 1.855 | 78.131 | 0.635 | 4.829 | Yes, p = 0.0412 | 3 |[1]GTG>AGG,GTG>ATG,GTG>CCG,GTG>CGC,GTG>GC...| | 54 | 1 | 0.000 | 41.961 | 0.272 | 5.756 | Yes, p = 0.0256 | 3 |[2]GGG>GCG|[1]GAG>CGG,GAG>GGA,GGG>GAG,GGG...| | 56 | 1 | 0.976 | 29.084 | 0.247 | 10.023 | Yes, p = 0.0029 | 4 |[2]GGC>CGC,GGC>GGT|[1]GGC>CAC,GGC>CAG,GGC...| | **57** | 1 | 1.208 | 1617.563 | 0.180 | 6.760 | Yes, p = 0.0153 | 3 |[2]TCA>TCC,TCA>TCT|[1]TCA>CGC,TCA>CTC,TCA...| | 60 | 1 | 1.270 | 22.526 | 0.342 | 9.401 | Yes, p = 0.0040 | 4 |[3]TGT>TGC|[2]TGT>GTC|[1]AGT>TGT,GGT>AGT,...| | 65 | 1 | 0.000 | 243.982 | 0.472 | 14.029 | Yes, p = 0.0004 | 7 |[2]GTC>CCA|[1]GTC>ATG,GTC>CAG,GTC>GAT,GTC...| | 67 | 1 | 0.000 | 15.101 | 0.274 | 13.312 | Yes, p = 0.0006 | 3 |[2]ACC>GAC|[1]AAC>CCA,ACC>AAC,ACC>ATG,ATG...| | 68 | 1 | 1.035 | 115.300 | 0.285 | 7.534 | Yes, p = 0.0103 | 4 |[2]GGG>GGC,GGG>TGG,TGG>GTC|[1]GGG>GCT,TGG...| | 71 | 1 | 0.821 | 458.703 | 0.384 | 8.918 | Yes, p = 0.0051 | 5 |[1]ACG>ACA,ACG>AGG,ACG>CAG,ACG>CCA,ACG>GG...| | 73 | 1 | 0.000 | 552.454 | 0.134 | 6.578 | Yes, p = 0.0168 | 2 |[2]ACG>ATG|[1]AAG>ACG,ACG>CCG,ACG>GTC,ATA...| | 79 | 1 | 0.000 | 501.160 | 0.132 | 21.818 | Yes, p = 0.0000 | 2 | [1]CGA>GAT,CGA>GCC | | 91 | 1 | 0.000 | 350.914 | 0.113 | 9.669 | Yes, p = 0.0035 | 2 | [1]CCC>TCC,TCC>ACC,TCC>CGT | | 106 | 1 | 1.032 | 878.243 | 0.250 | 5.719 | Yes, p = 0.0260 | 2 |[1]CGT>GCC,TCT>TAC,TGC>CGT,TGC>GCA,TGC>GC...| | 108 | 1 | 0.000 | 509.649 | 0.154 | 12.489 | Yes, p = 0.0008 | 2 |[1]CCG>CAG,CCG>GCA,CCG>GGG,CCG>TCG,CTG>GC...| | 109 | 1 | 0.000 | 1481.559 | 0.344 | 18.773 | Yes, p = 0.0000 | 6 |[2]GGC>GCT|[1]AGC>CCC,GGC>ACG,GGC>AGC,GGC...| | **114** | 1 | 0.000 | 99.765 | 0.389 | 5.620 | Yes, p = 0.0274 | 1 |[1]CAC>CAA,CAC>CTC,CAC>GAG,CTC>GGC,CTC>TG...| | **116** | 1 | 0.283 | 456.492 | 0.343 | 5.746 | Yes, p = 0.0257 | 1 |[3]CAC>CCA|[2]CAC>CCC|[1]CAC>TAC,TAC>CCT,...| | 131 | 1 | 0.000 | 21.031 | 0.193 | 5.454 | Yes, p = 0.0298 | 3 | [2]CGG>CCG|[1]CGG>CTC | | 154 | 1 | 0.000 | 87.358 | 0.231 | 6.526 | Yes, p = 0.0172 | 3 | [1]CAG>GAG,CAG>GCC,GAG>ACC,GAG>CTC | | 196 | 1 | 0.000 | 138.744 | 0.088 | 6.817 | Yes, p = 0.0148 | 1 | [1]CTC>CGC,CTC>GCT | | 203 | 1 | 0.322 | 192.632 | 0.133 | 7.883 | Yes, p = 0.0086 | 2 | [1]CTC>CCG,CTC>CTG,CTC>GCA | ### ** Found _25_ sites under episodic diversifying positive selection at p <= 0.05** # Do these differences in the MEME results really matter after we screen for EBF>100?