# Mutational target size and
Given a set of replicates from a single experimental condition estimate the chances of parallel mutations occuring by chance.
## The overall design
Suppose you are adopting *E. coli* to a new environment (high salinity, particular antibiotic and so on). You have several replicates (parallel experiments) for treatment (blue) and control (red):

After *N* generations you sequence last timepoint of your experiment. This gives you the following information:
- mutation rate per site per generation
- Tn/Ts ratio
- genes that contain mutations (mutations and how many replicates share this mutation)
Some genes will be mutated more than others, and these likely contain adaptive changes.
:::danger
**The question**: how do we know that a particular gene is mutated in parallel (in multiple lines) more than by chance.
:::
To answer this question we need to obtain the distribution of mutations per gene: e.g., for a given gene what are the counts of seeing a given mutation in 1, 2, 3, .... *N* lines. This can be done with the following simulation:
1. Given mutation late $\mu$ and transition/transversion ratio $t$ introduce mutations to a genome for $M$ generation across $N$ independent lines. This simply means that:
```
for m in [0;M]:
for n in [0;N]:
do: introduce mu*genome_size mutations
```
2. Mutations should weighted by $t$: to ensure that transitions are less likely than transversions.
3. Nonsense mutations (introducing STOPs) are not allowed.
To approach this you should start with something simple initially: e.g., a 1000 bp genome with 2 genes. You can basically create a lookup table where for each base in the genome you know if:
- it is coding or non-coding
- if it is coding which mutations are non-sense, which are non-synonymous and which are synonymous
:::info
Let's discuss this further if anything is not clear.
:::
## The current approach
- Estimate mutation rate from observed data (sequencing of the last time point)
- For each gene and intergenic region compute the number of mutations:
```r=
{rbinom(100000,popmut_genintergenC[m,3],prob_popC[i,2])->aC
#conversion of the vector of number of mutations per simulation in a vector of presence/absence of mutation
for (j in 1:100000)
if (aC[j]>=1) {bC[(i),j]<-1} else {bC[(i),j]<-0}}
```
Here```popmut_genintergenC[m,3]``` is length and ```prob_popC[i,2]``` is mutation rate estimated from sequencing.

Mixed approach:
https://observablehq.com/@spond/usual-vs-unusual-mutation-analyses
```R=
## for populations of the control group
popmut_genintergenC<-popmut_genintergen_LB
prob_popC<-prob_pos_LB
nbgeneC<-dim(popmut_genintergenC)[1]
nbpopC<-dim(prob_popC)[1]
#for each gene, makes 100000 simulations of mutation in each of the populations characterized by its mutation rate
probC<-matrix(nrow=nbgeneC, ncol=nbpopC-1)
for (m in 1:nbgeneC)
{bC<-matrix(nrow=nbpopC, ncol=100000)
for (i in 1:nbpopC)
{rbinom(100000,popmut_genintergenC[m,3],prob_popC[i,2])->aC
#conversion of the vector of number of mutations per simulation in a vector of presence/absence of mutation
for (j in 1:100000)
if (aC[j]>=1) {bC[(i),j]<-1} else {bC[(i),j]<-0}}
#calculation of the number of populations in which the gene is mutated
dC<-vector()
for (i in 1:100000)
{dC[i]<-sum(bC[1:nbpopC,i])}
# calculation of the probability that a gene is mutated in more than 1, 2, 3,... populations
x<-table(dC)
if (min(dC)==0)
{for (l in 3:length(table(dC)))
{probC[m,l-2]<-(sum(table(dC)[l:length(table(dC))])/100000)}}}
{if (min(dC)==1)
{for (l in 2:length(table(dC)))
{probC[m,l-1]<-(sum(table(dC)[l:length(table(dC))])/100000)}}}
probC[is.na(probC)] <- 0.000009
# association of a p-value of being mutated by chance to each gene/intergene
pbC<-vector()
for (i in 1:nbgeneC)
+ {pbC[i]<-probC[i,popmut_genintergenC[i,2]-1]}
resC<-cbind(popmut_genintergenC,pbC)
# extract the list of genes/intergenes mutated in // in adapating pop (p<0.001)
mut_parC<-vector
for (i in 1:nbgeneC)
if (as.numeric(resC[i,4])<0.001) {mut_parC<-c(mut_parC,resC[i,1])}
## for populations of the G group
npop<-dim(prob_pos_G)[1]
ngene<-dim(mutgen.pop.matrix.G)[1]
nbpop<-round(0.9*npop)
for (u in 1:100)
#sorting of 90% of the populations of the group
{print(u)
v<-sample (prob_pos_G[,1], size=nbpop, replace =F)
#count the number of sampled populations mutated in each gene
t<-replicate(ngene,0)
for (j in 1:ngene)
for (i in 2:npop+1)
if(colnames(mutgen.pop.matrix.G)[i] %in% v){t[j]<-t[j]+as.numeric(mutgen.pop.matrix.G[j,i])}
popmut_genintergen<-cbind(mutgen.pop.matrix.G[,1],t)
#remove the genes mutated in only one pop
popmut_genintergen<-subset(popmut_genintergen,popmut_genintergen[,2]>1)
nbgene<-dim(popmut_genintergen)[1]
#add to the gene table the length of the gene/intergene mutated
for (i in 1:dim(Table_gene_intergenic_length_2)[1])
{Table_gene_intergenic_length_2[i,3]<-gsub(" ", "",Table_gene_intergenic_length_2[i,3])}
w<-vector()
for (i in 1:nbgene)
for (j in 1:dim(Table_gene_intergenic_length_2)[1])
if (popmut_genintergen[i,1]==Table_gene_intergenic_length_2[j,3])
w[i]<-Table_gene_intergenic_length_2[j,6]
popmut_genintergen<-cbind(popmut_genintergen,w)
#build the change probability table for the sampled populations
prob_pop<-subset(prob_pos_G, prob_pos_G[,1] %in% v)
print("resampled tables done")
#for each gene, makes 100000 simulations of mutation in each of the populations characterized by its mutation rate
prob<-matrix(nrow=nbgene, ncol=nbpop-1)
for (m in 1:nbgene)
{print(m)
b<-matrix(nrow=nbpop, ncol=100000)
for (i in 1:nbpop)
{rbinom(100000,as.numeric(popmut_genintergen[m,3]),prob_pop[i,2])->a
#conversion of the vector of number of mutations per simulation in a vector of presence/absence of mutation
for (j in 1:100000)
if (a[j]>=1) {b[(i),j]<-1} else {b[(i),j]<-0}}
#calculation of the number of populations in which the gene is mutated
d<-vector()
for (i in 1:100000)
{d[i]<-sum(b[1:nbpop,i])}
# calculation of the probability that a gene is mutated in more than 1, 2, 3,... populations
x<-table(d)
if (min(d)==0)
{for (l in 3:length(table(d)))
{prob[m,l-2]<-(sum(table(d)[l:length(table(d))])/100000)}}}
{if (min(d)==1)
{for (l in 2:length(table(d)))
{prob[m,l-1]<-(sum(table(d)[l:length(table(d))])/100000)}}}
prob[is.na(prob)] <- 0.000009
# association of a p-value of being mutated by chance to each gene/intergene
pb<-vector()
for (i in 1:nbgene)
+ {pb[i]<-prob[i,as.numeric(popmut_genintergen[i,2])-1]}
res<-cbind(popmut_genintergen,pb)
# extract the list of genes/intergenes mutated in // in adapating pop (p<0.001)
mut_par<-vector
for (i in 1:nbgene)
if (as.numeric(res[i,4])<0.001) {mut_par<-c(mut_par,res[i,1])}
#definition of mutational target (=substraction of the control list from the adapting list)
MT<-setdiff(mut_par, mut_parC)
f<-vector()
for (i in 1:ngene)
{if (mutgen.pop.matrix.G[i,1] %in% MT) {f[i]<-1} else {f[i]<-0}}
mutgen.pop.matrix.G<-cbind(mutgen.pop.matrix.G,f)
rm(pb, popmut_genintergen)}
## for populations of the S group
npop<-dim(prob_pos_S)[1]
ngene<-dim(mutgen.pop.matrix.S)[1]
nbpop<-round(0.9*npop)
for (u in 1:100)
#sorting of 90% of the populations of the group
{print(u)
v<-sample (prob_pos_S[,1], size=nbpop, replace =F)
#count the number of sampled populations mutated in each gene
t<-replicate(ngene,0)
for (j in 1:ngene)
for (i in 2:npop+1)
if(colnames(mutgen.pop.matrix.S)[i] %in% v){t[j]<-t[j]+as.numeric(mutgen.pop.matrix.S[j,i])}
popmut_genintergen<-cbind(mutgen.pop.matrix.S[,1],t)
#remove the genes mutated in only one pop
popmut_genintergen<-subset(popmut_genintergen,popmut_genintergen[,2]>1)
nbgene<-dim(popmut_genintergen)[1]
#add to the gene table the length of the gene/intergene mutated
for (i in 1:dim(Table_gene_intergenic_length_2)[1])
{Table_gene_intergenic_length_2[i,3]<-gsub(" ", "",Table_gene_intergenic_length_2[i,3])}
w<-vector()
for (i in 1:nbgene)
for (j in 1:dim(Table_gene_intergenic_length_2)[1])
if (popmut_genintergen[i,1]==Table_gene_intergenic_length_2[j,3])
w[i]<-Table_gene_intergenic_length_2[j,6]
popmut_genintergen<-cbind(popmut_genintergen,w)
#build the change probability table for the sampled populations
prob_pop<-subset(prob_pos_S, prob_pos_S[,1] %in% v)
print("resampled tables done")
#for each gene, makes 100000 simulations of mutation in each of the populations characterized by its mutation rate
prob<-matrix(nrow=nbgene, ncol=nbpop-1)
for (m in 1:nbgene)
{print(m)
b<-matrix(nrow=nbpop, ncol=100000)
for (i in 1:nbpop)
{rbinom(100000,as.numeric(popmut_genintergen[m,3]),prob_pop[i,2])->a
#conversion of the vector of number of mutations per simulation in a vector of presence/absence of mutation
for (j in 1:100000)
if (a[j]>=1) {b[(i),j]<-1} else {b[(i),j]<-0}}
#calculation of the number of populations in which the gene is mutated
d<-vector()
for (i in 1:100000)
{d[i]<-sum(b[1:nbpop,i])}
# calculation of the probability that a gene is mutated in more than 1, 2, 3,... populations
x<-table(d)
if (min(d)==0)
{for (l in 3:length(table(d)))
{prob[m,l-2]<-(sum(table(d)[l:length(table(d))])/100000)}}}
{if (min(d)==1)
{for (l in 2:length(table(d)))
{prob[m,l-1]<-(sum(table(d)[l:length(table(d))])/100000)}}}
prob[is.na(prob)] <- 0.000009
# association of a p-value of being mutated by chance to each gene/intergene
pb<-vector()
for (i in 1:nbgene)
+ {pb[i]<-prob[i,as.numeric(popmut_genintergen[i,2])-1]}
res<-cbind(popmut_genintergen,pb)
# extract the list of genes/intergenes mutated in // in adapating pop (p<0.001)
mut_par<-vector
for (i in 1:nbgene)
if (as.numeric(res[i,4])<0.001) {mut_par<-c(mut_par,res[i,1])}
#definition of mutational target (=substraction of the control list from the adapting list)
MT<-setdiff(mut_par, mut_parC)
f<-vector()
for (i in 1:ngene)
{if (mutgen.pop.matrix.S[i,1] %in% MT) {f[i]<-1} else {f[i]<-0}}
mutgen.pop.matrix.S<-cbind(mutgen.pop.matrix.S,f)
rm(pb, popmut_genintergen)}
```