---
author: Benjamin Brachi
date: 20/05/2020
title: Running xcms analysis with Vincent
output:
html_document:
toc: true
toc_float: true
theme: yeti
---
# analyse xcms:
Changement
- noise=0 dans CentWaveParam
- Dans Camera: `E <- groupCorr(D, calcCiS=T, calcCaS=T, calcIso=T)`
- `peaklist=getPeaklist(E)` this was getPeaklist(D) which would have been wrong I imagine.
- PeakDensityParam () `sampleGroups = xdata$sample_group`
# Analyse Tableau peak après xcms
```{r eval=F}
data=readRDS("./data/xsannotated_bourran.rds")
d=as.matrix(data[,8:(ncol(data)-3)])
peaks=data[, c(1:8, (ncol(data)-3):ncol(data))]
dim(d)
dim(peaks)
length(unique(peaks$pcgroup))
##count na in rows...
nar=apply(d, 1, function(x)(sum(is.na(x))))
##count na in cols
nac=apply(d, 2, function(x)(sum(is.na(x))))
##I think NA values should be 0.
d[is.na(d)]=0
##see about intensities per row
intmaxr=apply(d, 1,max)
hist(intmaxr)
hist(intmaxr, breaks=10000, xlim=c(0, 5000))
```
Je vois pas de raison de mettre un threshold sur intmaxr pour filtrer les pics.
```{r}
##look for peaks with too many 0
nzr=apply(d, 1, function(x, N){sum(x==0)/N}, N=ncol(d))
hist(nzr)
```
Je vois pas de raison de filtré à ce niveau non plus.
Now we can filter using the experimental design.
let's filter for peaks to be stable in QCs. The expected error in the machine coefficient of variation is 20%.
```{r eval=T, echo=T}
samples=readRDS("../../../res/bourran_samples.rds")
##reorder according to data
I=intersect(samples$id, colnames(d))
samples=droplevels(samples[match(I, samples$id),])
d=d[,I]
all(colnames(d)==samples$id)
##what do we have in terms of samples in the data.
table(paste(samples$type))
##the rows with type NA should be removed.
s=(samples$type%in%c("NA", "parents","timeserie")==F)
samples <- samples[s,]
d <- d[,s]
```
```{r}
##investigate coefficient of variation in QCs
dqc=d[,samples$type=="QC"]
##check that all QC are correct
irqc=apply(dqc, 2, sum)
hist(irqc)
```
There are QCs that look bad. let's remove them.
```{r}
##remove bad QCs
w=names(which(irqc<5e6))
d=d[, colnames(d)%in%w==F]
samples=samples[samples$id%in%w==F,]
all(samples$id==colnames(d))
```
```{r}
#resubset to keep only QCs
dqc=d[,samples$type=="QC"]
rs=rowMeans(dqc)
cv=apply(dqc, 1, function(x){sd(x)/mean(x)})
plot(log10(rs), cv)
##add the point for the internal standard.
mzstd = 303.0504
winmz = 0.005
w=(peaks$mz >= mzstd - winmz & peaks$mz <= mzstd + winmz & peaks$rt > 450)
points(log10(rs)[w], cv[w], pch=16, cex=2, col="gold")
cv[w]
abline(0.2, 0)
dqc=dqc[cv<=0.2,]
d=d[cv<=0.2,]
peaks=peaks[cv<=0.2,]
dim(d)
dim(peaks)
length(unique(peaks$pcgroup))
```
We're getting closer. Now we need to keep peaks that are higher in QCs than in blancks
```{r}
dbl=d[,samples$type=="blank"]
##are all the blanks, blanks..
ib=apply(dbl, 2,sum)
hist(ib)
##no.. there are a few contaminations
w=names(which(ib>5e6))
##remove those.
d=d[,colnames(d)%in%w==F]
samples=samples[(samples$id%in%w)==F,]
all(samples$id==colnames(d))
dbl=d[,samples$type=="blank"]
dqc=d[,samples$type=="QC"]
##mesure noise in blanks and compare to qc. Use to filter.
ib=apply(dbl, 1, quantile, 0.99) ##quantile à 99% dans les blancs
iqc=apply(dqc, 1,mean)
w=(iqc>=ib*10) ##signal to noise ratio 100X
##for peaks only, but we can remove them.
d=d[w, ]
peaks=peaks[w,]
##check we have the same number of peaks and count the number of groups
dim(d)
dim(peaks)
dim(samples)
length(unique(peaks$pcgroup))
```
```{r}
```
Now we have 101 pseudo-molecules. We can investigate those.
```{r}
saveRDS(d, "./data/intensities_filt.rds")
saveRDS(peaks, "./data/peaks_filt.rds")
saveRDS(samples, "./data/samples_filt.rds")
```
## Analyses de données
## technical replicability
```{r}
d=readRDS("data/intensities_filt.rds")
samples=readRDS("data/samples_filt.rds")
peaks=readRDS("data/peaks_filt.rds")
##subset dat to keep only techreps
samples$techrep[samples$type=="blank"]=NA
samples$techrep=as.integer(samples$techrep)
u=unique(samples$sample[samples$techrep>1 & is.na(samples$techrep)==F])
drep=d[,samples$sample%in%u]
s=droplevels(samples$sample[samples$sample%in%u])
estim_repet=function(trait, s){
repet=c(Fstat=NA, pv=NA)
s=as.factor(s)
trait=as.numeric(trait)
f=formula(paste("trait~ s"))
m=lm(f)
a=anova(m)
as.numeric(a["F value"][1,1])->repet[1]
as.numeric(a["Pr(>F)"][1,1])->repet[2]
return(repet)
}
##estimate replicability for each fragment
rep=t(apply(log(drep+1), 1, estim_repet, s=s))
rep=as.data.frame(rep)
rep$pv_adj=p.adjust(rep$pv, method="fdr")
cols=rep("dodgerblue", nrow(rep))
cols[rep$pv_adj<=0.05]="gold"
barplot(rep$Fstat, col=cols, xlab="Pseudo-molecules", ylab="Fstab", border=cols)
#hist(rep$Fstat, breaks=100)
##filter avec pv<=0.05
##filter avec pvalue corrigées <=0.05
s=(rep$pv_adj<=0.05)
d=d[s, ]
peaks=cbind(peaks, rep)
peaks=peaks[s,]
saveRDS(d, "./data/intensities_filt_rep.rds")
saveRDS(peaks, "./data/peaks_filt_rep.rds")
saveRDS(samples, "./data/samples_filt_rep.rds")
```
### PCA
```{r}
library(plyr)
d=readRDS("data/intensities_filt_rep.rds")
samples=readRDS("data/samples_filt_rep.rds")
peaks=readRDS("data/peaks_filt_rep.rds")
##take highest peak per pcgroup as the representative peak.
#Intmax = rowSums(d,na.rm = TRUE)
funmax=function(x){return(x[which.max(x$Fstat)[1],])}
data=data.frame(peak=paste("peak_", peaks$X1, sep=""), pcgroup=paste("group_", peaks$pcgroup, sep=""),Intmax=Intmax,d)
res <-ddply(data,"pcgroup",funmax) ##
dpm=as.matrix(res[, -1:-3])
row.names(dpm)=res$pcgroup
dpm=t(dpm)
ppm=peaks[match(res$peak, paste("peak_", peaks$X1, sep="")),]
##preparer pour l'ACP
library(FactoMineR)
library(factoextra)
dpmsq = sqrt(dpm + 0.001)
dpmsq = scale(dpmsq)
res.pca <- PCA(dpmsq,ncp = 3,scale.unit = FALSE, graph=F)
fviz_eig(res.pca)
fviz_pca_var(res.pca, col.var = "black")
fviz_contrib(res.pca, choice = "var", axes = 1, top = 50)
fviz_contrib(res.pca, choice = "var", axes = 2, top = 50)
fviz_contrib(res.pca, choice = "var", axes = 3, top = 50)
fviz_contrib(res.pca, choice = "var", axes = 1:2, top = 10)
fviz_pca_ind(res.pca,
geom.ind = c("point"),
col.ind = samples$type, # colorer by groups
addEllipses = FALSE, # Ellipses de concentration
legend.title = "Type", title= "representation des individus en fonction du type d'echantillons"
)
```
The diffence between the blanks and the other samples is well captured by the first axis explaining over 80\% of the variance.
The problem is that the cloud of points looking like blanks doesn't only include blanks. There are bad samples left in the data!
Let's remove them.
```{r}
samples=droplevels(samples)
rs=rowSums(dpm)
boxplot(rs~samples$type)
stripchart(rs~samples$type,add=T, pch=16, col="Dodgerblue" , vertical=T, method="jitter")
abline(5e4, 0)
```
Some samples look like blanks, which suggest injection didn't work.
We can remove all the blanks and all of these by removing all rows with rs<3e6
```{r}
## now without the blanks!
unique(samples$type)
s=(samples$type%in%c("allgeno", "reps", "QC") & rs>=3e6)
dpm=dpm[s,]
samples=samples[s,]
all(samples$id==row.names(dpm))
dpmsq = sqrt(dpm + 0.001)
dpmsq = scale(dpmsq)
all(samples$id==row.names(dpmsq))
res.pca <- PCA(dpmsq,ncp = 3,scale.unit = FALSE, graph=F)
fviz_eig(res.pca)
fviz_contrib(res.pca, choice = "var", axes = 1, top = 50)
fviz_contrib(res.pca, choice = "var", axes = 2, top = 50)
fviz_contrib(res.pca, choice = "var", axes = 3, top = 50)
fviz_pca_ind(res.pca,
geom.ind = c("point"),
col.ind = samples$type, # colorer by groups
addEllipses = T, # Ellipses de concentration
legend.title = "Type", title= "representation des individus en fonction du type d'echantillons"
)
samples$height2=paste(samples$height)
samples$height2[samples$type=="QC"]<-"QC"
fviz_pca_ind(res.pca,
geom.ind = c("point"),
col.ind = samples$height2, # colorer by groups
addEllipses = FALSE, # Ellipses de concentration
legend.title = "Type", title= "representation des individus en fonction du type d'echantillons"
)
fviz_pca_ind(res.pca,
geom.ind = c("point"),
col.ind = samples$height2, # colorer by groups
addEllipses = TRUE, # Ellipses de concentration
legend.title = "Type", title= "representation des individus en fonction du type d'echantillons"
)
```
## test de l'effet genotype.
```{r}
dpm=readRDS("./data/d_filt_rep_pm.rds")
samples=readRDS("./data/samples_filt_rep_pm.rds")
ppm=readRDS("./data/peaks_filt_rep_pm.rds")
##garder unique les genotype "allgeno"
## s=(samples$type=="allgeno")
## dpm=dpm[s, ]
## samples=samples[s,]
##identifier les genotype repliqué.
tab=table(samples$genotype)
u=names(tab[tab>=7])
s=(samples$genotype%in%u & samples$techrep==1)
dat=dpm[s,]
sam=droplevels(samples[s,])
table(sam$genotype)
table(sam$techrep)
```
```
estim_geno=function(trait, height, genotype){
trait=as.numeric(trait)
height=as.factor(height)
genotype=as.factor(genotype)
f=formula(paste("trait~ height+genotype+height*genotype"))
m=lm(f)
a=anova(m)
return(a)
}
##estimate replicability for each fragment
res=apply(log(dat+1), 2, estim_geno, height=sam$height, genotype=sam$genotype)
library(plyr)
effgeno=ldply(res, function(x){data.frame(Fstat=x["F value"]["genotype",], pv=x["Pr(>F)"]["genotype",])})
effgeno$pv_adj=p.adjust(effgeno$pv, method="fdr")
cols=rep("dodgerblue", nrow(effgeno))
cols[effgeno$pv_adj<=0.05]="gold"
barplot(effgeno$Fstat, col=cols, xlab="Pseudo-molecules", ylab="Fstab", border=cols, names.arg=effgeno[,1])