Mutational target size and

# Mutational target size and Given a set of replicates from a single experimental condition estimate the chances of parallel mutations occuring by chance. ## The overall design Suppose you are adopting *E. coli* to a new environment (high salinity, particular antibiotic and so on). You have several replicates (parallel experiments) for treatment (blue) and control (red): ![](https://i.imgur.com/vCzUc7C.png) After *N* generations you sequence last timepoint of your experiment. This gives you the following information: - mutation rate per site per generation - Tn/Ts ratio - genes that contain mutations (mutations and how many replicates share this mutation) Some genes will be mutated more than others, and these likely contain adaptive changes. :::danger **The question**: how do we know that a particular gene is mutated in parallel (in multiple lines) more than by chance. ::: To answer this question we need to obtain the distribution of mutations per gene: e.g., for a given gene what are the counts of seeing a given mutation in 1, 2, 3, .... *N* lines. This can be done with the following simulation: 1. Given mutation late $\mu$ and transition/transversion ratio $t$ introduce mutations to a genome for $M$ generation across $N$ independent lines. This simply means that: ``` for m in [0;M]: for n in [0;N]: do: introduce mu*genome_size mutations ``` 2. Mutations should weighted by $t$: to ensure that transitions are less likely than transversions. 3. Nonsense mutations (introducing STOPs) are not allowed. To approach this you should start with something simple initially: e.g., a 1000 bp genome with 2 genes. You can basically create a lookup table where for each base in the genome you know if: - it is coding or non-coding - if it is coding which mutations are non-sense, which are non-synonymous and which are synonymous :::info Let's discuss this further if anything is not clear. ::: ## The current approach - Estimate mutation rate from observed data (sequencing of the last time point) - For each gene and intergenic region compute the number of mutations: ```r= {rbinom(100000,popmut_genintergenC[m,3],prob_popC[i,2])->aC #conversion of the vector of number of mutations per simulation in a vector of presence/absence of mutation for (j in 1:100000) if (aC[j]>=1) {bC[(i),j]<-1} else {bC[(i),j]<-0}} ``` Here```popmut_genintergenC[m,3]``` is length and ```prob_popC[i,2]``` is mutation rate estimated from sequencing. ![](https://i.imgur.com/JmPu409.png) Mixed approach: https://observablehq.com/@spond/usual-vs-unusual-mutation-analyses ```R= ## for populations of the control group popmut_genintergenC<-popmut_genintergen_LB prob_popC<-prob_pos_LB nbgeneC<-dim(popmut_genintergenC)[1] nbpopC<-dim(prob_popC)[1] #for each gene, makes 100000 simulations of mutation in each of the populations characterized by its mutation rate probC<-matrix(nrow=nbgeneC, ncol=nbpopC-1) for (m in 1:nbgeneC) {bC<-matrix(nrow=nbpopC, ncol=100000) for (i in 1:nbpopC) {rbinom(100000,popmut_genintergenC[m,3],prob_popC[i,2])->aC #conversion of the vector of number of mutations per simulation in a vector of presence/absence of mutation for (j in 1:100000) if (aC[j]>=1) {bC[(i),j]<-1} else {bC[(i),j]<-0}} #calculation of the number of populations in which the gene is mutated dC<-vector() for (i in 1:100000) {dC[i]<-sum(bC[1:nbpopC,i])} # calculation of the probability that a gene is mutated in more than 1, 2, 3,... populations x<-table(dC) if (min(dC)==0) {for (l in 3:length(table(dC))) {probC[m,l-2]<-(sum(table(dC)[l:length(table(dC))])/100000)}}} {if (min(dC)==1) {for (l in 2:length(table(dC))) {probC[m,l-1]<-(sum(table(dC)[l:length(table(dC))])/100000)}}} probC[is.na(probC)] <- 0.000009 # association of a p-value of being mutated by chance to each gene/intergene pbC<-vector() for (i in 1:nbgeneC) + {pbC[i]<-probC[i,popmut_genintergenC[i,2]-1]} resC<-cbind(popmut_genintergenC,pbC) # extract the list of genes/intergenes mutated in // in adapating pop (p<0.001) mut_parC<-vector for (i in 1:nbgeneC) if (as.numeric(resC[i,4])<0.001) {mut_parC<-c(mut_parC,resC[i,1])} ## for populations of the G group npop<-dim(prob_pos_G)[1] ngene<-dim(mutgen.pop.matrix.G)[1] nbpop<-round(0.9*npop) for (u in 1:100) #sorting of 90% of the populations of the group {print(u) v<-sample (prob_pos_G[,1], size=nbpop, replace =F) #count the number of sampled populations mutated in each gene t<-replicate(ngene,0) for (j in 1:ngene) for (i in 2:npop+1) if(colnames(mutgen.pop.matrix.G)[i] %in% v){t[j]<-t[j]+as.numeric(mutgen.pop.matrix.G[j,i])} popmut_genintergen<-cbind(mutgen.pop.matrix.G[,1],t) #remove the genes mutated in only one pop popmut_genintergen<-subset(popmut_genintergen,popmut_genintergen[,2]>1) nbgene<-dim(popmut_genintergen)[1] #add to the gene table the length of the gene/intergene mutated for (i in 1:dim(Table_gene_intergenic_length_2)[1]) {Table_gene_intergenic_length_2[i,3]<-gsub(" ", "",Table_gene_intergenic_length_2[i,3])} w<-vector() for (i in 1:nbgene) for (j in 1:dim(Table_gene_intergenic_length_2)[1]) if (popmut_genintergen[i,1]==Table_gene_intergenic_length_2[j,3]) w[i]<-Table_gene_intergenic_length_2[j,6] popmut_genintergen<-cbind(popmut_genintergen,w) #build the change probability table for the sampled populations prob_pop<-subset(prob_pos_G, prob_pos_G[,1] %in% v) print("resampled tables done") #for each gene, makes 100000 simulations of mutation in each of the populations characterized by its mutation rate prob<-matrix(nrow=nbgene, ncol=nbpop-1) for (m in 1:nbgene) {print(m) b<-matrix(nrow=nbpop, ncol=100000) for (i in 1:nbpop) {rbinom(100000,as.numeric(popmut_genintergen[m,3]),prob_pop[i,2])->a #conversion of the vector of number of mutations per simulation in a vector of presence/absence of mutation for (j in 1:100000) if (a[j]>=1) {b[(i),j]<-1} else {b[(i),j]<-0}} #calculation of the number of populations in which the gene is mutated d<-vector() for (i in 1:100000) {d[i]<-sum(b[1:nbpop,i])} # calculation of the probability that a gene is mutated in more than 1, 2, 3,... populations x<-table(d) if (min(d)==0) {for (l in 3:length(table(d))) {prob[m,l-2]<-(sum(table(d)[l:length(table(d))])/100000)}}} {if (min(d)==1) {for (l in 2:length(table(d))) {prob[m,l-1]<-(sum(table(d)[l:length(table(d))])/100000)}}} prob[is.na(prob)] <- 0.000009 # association of a p-value of being mutated by chance to each gene/intergene pb<-vector() for (i in 1:nbgene) + {pb[i]<-prob[i,as.numeric(popmut_genintergen[i,2])-1]} res<-cbind(popmut_genintergen,pb) # extract the list of genes/intergenes mutated in // in adapating pop (p<0.001) mut_par<-vector for (i in 1:nbgene) if (as.numeric(res[i,4])<0.001) {mut_par<-c(mut_par,res[i,1])} #definition of mutational target (=substraction of the control list from the adapting list) MT<-setdiff(mut_par, mut_parC) f<-vector() for (i in 1:ngene) {if (mutgen.pop.matrix.G[i,1] %in% MT) {f[i]<-1} else {f[i]<-0}} mutgen.pop.matrix.G<-cbind(mutgen.pop.matrix.G,f) rm(pb, popmut_genintergen)} ## for populations of the S group npop<-dim(prob_pos_S)[1] ngene<-dim(mutgen.pop.matrix.S)[1] nbpop<-round(0.9*npop) for (u in 1:100) #sorting of 90% of the populations of the group {print(u) v<-sample (prob_pos_S[,1], size=nbpop, replace =F) #count the number of sampled populations mutated in each gene t<-replicate(ngene,0) for (j in 1:ngene) for (i in 2:npop+1) if(colnames(mutgen.pop.matrix.S)[i] %in% v){t[j]<-t[j]+as.numeric(mutgen.pop.matrix.S[j,i])} popmut_genintergen<-cbind(mutgen.pop.matrix.S[,1],t) #remove the genes mutated in only one pop popmut_genintergen<-subset(popmut_genintergen,popmut_genintergen[,2]>1) nbgene<-dim(popmut_genintergen)[1] #add to the gene table the length of the gene/intergene mutated for (i in 1:dim(Table_gene_intergenic_length_2)[1]) {Table_gene_intergenic_length_2[i,3]<-gsub(" ", "",Table_gene_intergenic_length_2[i,3])} w<-vector() for (i in 1:nbgene) for (j in 1:dim(Table_gene_intergenic_length_2)[1]) if (popmut_genintergen[i,1]==Table_gene_intergenic_length_2[j,3]) w[i]<-Table_gene_intergenic_length_2[j,6] popmut_genintergen<-cbind(popmut_genintergen,w) #build the change probability table for the sampled populations prob_pop<-subset(prob_pos_S, prob_pos_S[,1] %in% v) print("resampled tables done") #for each gene, makes 100000 simulations of mutation in each of the populations characterized by its mutation rate prob<-matrix(nrow=nbgene, ncol=nbpop-1) for (m in 1:nbgene) {print(m) b<-matrix(nrow=nbpop, ncol=100000) for (i in 1:nbpop) {rbinom(100000,as.numeric(popmut_genintergen[m,3]),prob_pop[i,2])->a #conversion of the vector of number of mutations per simulation in a vector of presence/absence of mutation for (j in 1:100000) if (a[j]>=1) {b[(i),j]<-1} else {b[(i),j]<-0}} #calculation of the number of populations in which the gene is mutated d<-vector() for (i in 1:100000) {d[i]<-sum(b[1:nbpop,i])} # calculation of the probability that a gene is mutated in more than 1, 2, 3,... populations x<-table(d) if (min(d)==0) {for (l in 3:length(table(d))) {prob[m,l-2]<-(sum(table(d)[l:length(table(d))])/100000)}}} {if (min(d)==1) {for (l in 2:length(table(d))) {prob[m,l-1]<-(sum(table(d)[l:length(table(d))])/100000)}}} prob[is.na(prob)] <- 0.000009 # association of a p-value of being mutated by chance to each gene/intergene pb<-vector() for (i in 1:nbgene) + {pb[i]<-prob[i,as.numeric(popmut_genintergen[i,2])-1]} res<-cbind(popmut_genintergen,pb) # extract the list of genes/intergenes mutated in // in adapating pop (p<0.001) mut_par<-vector for (i in 1:nbgene) if (as.numeric(res[i,4])<0.001) {mut_par<-c(mut_par,res[i,1])} #definition of mutational target (=substraction of the control list from the adapting list) MT<-setdiff(mut_par, mut_parC) f<-vector() for (i in 1:ngene) {if (mutgen.pop.matrix.S[i,1] %in% MT) {f[i]<-1} else {f[i]<-0}} mutgen.pop.matrix.S<-cbind(mutgen.pop.matrix.S,f) rm(pb, popmut_genintergen)} ```