# Statistics - Part 2 with Bastien LANGE
###### tags: emPLANT+, 第一年 法國 UniLaSalle, 上學期
**Use R Classical**
R程式 轉換語言
https://metalanguageinuse.com/how-to-change-language-setting-to-english-in-r-rstudio-a-temporary-measure/
課程時間表

the idea is to understand then in the future when we incounter problems that we can deal with it, so the exam can open R script and trying to slove it like this way.

## <font color="blue"><font size=5>Decision tree </font></font><br>
quantitative (定量的)
qualitative (定性的)


Paired: measure on the same people/ population/ individuals
Unpaired: measure on different people/ population/ individuals


---
## <font color="red"><font size=5>Code in class</font></font><br>
### **1. class 1 & practical session 1**

```
# 2021-2022 Univariate statistics Practical session 1 – emPLANT & 4A Plant Breeding
--------------------------------------------
# Question 3
--------------------------------------------
# first step set and change the working directory. It been done of the R console. Or you can also use fuction
setwd("C:/Users/XPS/Desktop/emPLANT+ ULS France/Statistics/Part 2 - Bastein")
# remeber you should add "" of the directory in the setwd() fuction
# remeber you should change the "\" copy from the folder to "/" because the computer doesn't recongnize it
# to read the documents that you want to put and read
DATA <- read.csv("SunflowerWO.csv", dec=".", sep=",", header=T)
# remember that is dec=".", sep="," in my dell computer
fix(DATA)
------------------------------------------------------
# Question 4 - verify the structure of the dataframe
------------------------------------------------------
str(DATA)
# str function is to vertify the nature of the variables
DATA$Label <- as.character(DATA$Label)
DATA$Year <- as.factor(DATA$Year)
DATA$Soil.type <- as.factor(DATA$Soil.type)
summary(DATA)
----------------------------------------------------------------
# Question 5 - Linear relationship between each couple of trait
----------------------------------------------------------------
#considering two variables at a times
#linear relationship between Oil and Campesterol
cor.test(x = DATA$Oil, y = DATA$Campesterol, method = "pearson")
#linear relationship between Oil and Tocopherol
cor.test(x = DATA$Oil, y = DATA$Tocopherol, method = "pearson")
#linear relationship between Campesterol and Tocopherol
cor.test(x = DATA$Campesterol, y = DATA$Tocopherol, method = "pearson")
#check the p-value < 0.05, p-value < 0.01, p-value < 0.001 means that have significant difference (also can see throgh the graph)
#all data are significant, 2 are negative, 1 is positive
#consider all the variable at the same time -> matrix of correlation
#install a package called "PerformaceAnalytics"
#packages -> install a package -> France(montpeiller)
#load the package
library(PerformanceAnalytics)
#extract only the quantitative data, we need to select the quantitative traits only in a new dataframe
names(DATA)#know about the column number
DATAcorr <- DATA[c(5,6,7)] #select the quantitative traits
fix(DATAcorr) #verify the selection
#run the "chart.Correlation" function
chart.Correlation(DATAcorr, histogram = TRUE)
```

* Correlation (相關係數)

* linear regression (簡單直線回歸分析)

* 無母數統計法 (線性相關係數)

### **2. class 2 & practical session 1**
```
# first step set and change the working directory. It been done of the R console. Or you can also use fuction
setwd("C:/Users/XPS/Desktop/emPLANT+ ULS France/Statistics/Part 2 - Bastein")
# remeber you should add "" of the directory in the setwd() fuction
# remeber you should change the "\" copy from the folder to "/" because the computer doesn't recongnize it
# to read the documents that you want to put and read
DATA <- read.csv("SunflowerWO.csv", dec=".", sep=",", header=T)
# remember that is dec=".", sep="," in my dell computer
fix(DATA)
str(DATA)
# str function is to vertify the nature of the variables
DATA$Label <- as.character(DATA$Label)
DATA$Year <- as.factor(DATA$Year)
DATA$Soil.type <- as.factor(DATA$Soil.type)
summary(DATA)
-----------------------------------------
# Question 6 - simple regression
-----------------------------------------
SR1 <- lm(DATA$Tocopherols ~ DATA$Campesterol)
SR1
# Calculate the value of Tocopherols considering a Campesterol of 35 mg.kg-1
# Intercept is 405.443
# Coefficient is 4.691
Tocopherol <- 4.691*(35) + 405.443
Tocopherol
summary(SR1)
# to provide the R2(look at the Adjusted R-squared)
# R2 = 0.06149 -> very low !
# The model is not predictive
plot(DATA$Tocopherols ~ DATA$Campesterol)
# graphical representation
abline(SR1)
# create the predictive linear line
# export the graph you've been creating, remeber to let the graph open to let computer to catch it
savePlot(filename = "SR1", type = c("pdf"))
-----------------------------------------------
# Practice of question 6 - simple regression
-----------------------------------------------
# Predict the oil content of oil content for Tocopherol = 335 mg.kg-1?
SR2 <- lm(DATA$Oil ~ DATA$Tocopherols)
SR2
Oil <- -0.0001562*(335) + 0.4308520
Oil
# It is equal to 0.378525
summary(SR2)
plot(DATA$Oil ~ DATA$Tocopherols)
# graphical representation
abline(SR2)
#create the predictive linear line
# export the graph you've been creating, remeber to close the graph first to let computer to catch it
savePlot(filename = "SR2", type = c("pdf"))
-----------------------------------------------------------------------------
# Question 7 - Is there effect of genotype on oil/Tocopherols/Campesterol ?
-----------------------------------------------------------------------------
DATAQ7 <- read.csv("SunflowerQ7.csv", dec=".", sep=",", header=T)
fix(DATAQ7)
str(DATAQ7)
DATAQ7$Label <- as.character(DATAQ7$Label)
DATAQ7$Year <- as.factor(DATAQ7$Year)
str(DATAQ7)
# Oil content, normal distribution ?
shapiro.test(DATAQ7$Oil)
# in normal distribution, H0 -> data are normally distributed
# in normal distribution, H1 -> data are not normally distributed
# p-value > 0.05 -> normal distribution
hist(DATAQ7$Oil)
# One-way ANOVA using the "aov" function
AOV1 <- aov(DATAQ7$Oil ~ DATAQ7$Genotype, data=DATAQ7)
summary(AOV1)
# There is no effect of the genotype on the oil content (look at Pr > 0.05)
# H0: the mean are all equal
# H1: the mean are not all equal
# Campesterol, normal distribution ?
shapiro.test(DATAQ7$Campesterol)
# p-value < 0.05 -> not normal distribution
# transform the data into log10 by creating a new variable called "LOGCampesterol"
DATAQ7$LOGCampesterol <- log10(DATAQ7$Campesterol)
str(DATAQ7)
# check the normal distribution of the log-trasformed Campesterol
shapiro.test(DATAQ7$LOGCampesterol)
# p<0.05 -> not normal distribution, move to non-parametric KW test
KW1 <- kruskal.test(DATAQ7$Campesterol ~ DATAQ7$Genotype, data = DATAQ7)
KW1
# there is an effect of the genotype on the Campesterol content (H = 11.031, p-value < 0.05)
# Tocopherols, normal distribution ?
shapiro.test(DATAQ7$Tocopherols)
# p-value > 0.05 -> normal distribution
hist(DATAQ7$Tocopherols)
AOV2 <- aov(DATAQ7$Tocopherols ~ DATAQ7$Genotype, data = DATAQ7)
summary(AOV2)
# There is an effect of the genotype on the Tocopherols content of the plants(F = 3.028, p < 0.05) (because Pr < 0.05, also look at the star)
----------------------------------------------------------------------
# Question 8 - See the difference between genotype for Tocopherols
----------------------------------------------------------------------
# make an AOV post-hoc test -> HSD (Honestly Significant Difference)
#install a package called "agricolae"
#packages -> install a package -> France(montpeiller)
library("agricolae")
HSD1 <- HSD.test(AOV2, "DATAQ7$Genotype", alpha = 0.05, group = TRUE)
HSD1
# AA in group a have difference compare to AB in group b. But don't have difference compare to BA and BB in group ab.
# AB in group b have difference compare to AA in group a. But don't have difference compare to BA and BB in group ab.
boxplot(DATAQ7$Tocopherols ~ DATAQ7$Genotype, xlab = "Genotype", ylab = "Tocopherols mg.kg-1")
plot(HSD1)
```
### **3. class 3 & practical session 1 & 2**
```
# first step set and change the working directory. It been done of the R console. Or you can also use fuction
setwd("C:/Users/XPS/Desktop/emPLANT+ ULS France/Statistics/Part 2 - Bastein")
# remeber you should add "" of the directory in the setwd() fuction
# remeber you should change the "\" copy from the folder to "/" because the computer doesn't recongnize it
# to read the documents that you want to put and read
DATA <- read.csv("SunflowerWO.csv", dec=".", sep=",", header=T)
# remember that is dec=".", sep="," in my dell computer
fix(DATA)
str(DATA)
# str function is to vertify the nature of the variables
DATA$Label <- as.character(DATA$Label)
DATA$Year <- as.factor(DATA$Year)
DATA$Soil.type <- as.factor(DATA$Soil.type)
summary(DATA)
str(DATA)
-----------------------------------------------------
# Question 9 - effect on soil type of Tocopherols
-----------------------------------------------------
# use form 2 in the decision tree, soil type have more than 2 modalities (be careful)
# normal distribution of the Tocopherols, use the initial dataset/ dataframe
# check the normal distribution
shapiro.test(DATA$Tocopherols)
# Tocopherols is not normally distributed beacause p-value < 0.05
# transform all the quantitative traits at the same time and join it to the dataframe
# select the quantitative traits
names(DATA)
DATALOG <- DATA[c(5, 6, 7)]
# trasform all the quantitative data
DATALOG <- log10(DATALOG)
fix(DATALOG)
# join the dataframe to DATALOG to the dataframe called DATA
DATA <- data.frame(DATA, DATALOG)
str(DATA)
fix(DATA)
# remember to go to the "fix(DATA)" array to change the name of the label
# check the normal distribution on the log-trasformed Tocopherols
shapiro.test(DATA$TocopherolsLOG)
# p-value close to 0.05, so do both test, KW and AOV
# Kruskal test on initial values of Tocopherols
KW3 <- kruskal.test(DATA$Tocopherols ~ DATA$Soil, data=DATA)
KW3
# post-hoc comparision using "Dunn" test for dataset that is not equllibrate
# install the package "FSA"
library(FSA)
# identify the signiificant differences (p-value < 0.05), loamy - Sandy and loamy - Silty have difference, but Sandy - Silty don't have difference
# loamy is group a, Sandy is group b, Silty is group b
dunnTest(DATA$Tocopherols ~ DATA$Soil, method = "bonferroni", data = DATA)
# plot to look at the data
boxplot(DATA$Tocopherols ~ DATA$Soil, xlab="Soil Type", ylab="Tocopherols mg.kg-1")
# post-hoc comparision using "Mann-Withney" test for dataset that is equllibrate (not much missing value NA)
pairwise.wilcox.test(DATA$Tocopherols, DATA$Soil, p.adjust.method="bonferroni")
fix(DATA)
# AOV on log-transformed Tocopherols
# remember to check "str(DATA)" and "fix(DATA)" the label name to run the function
AOV3 <- aov(DATA$TocopherolsLOG ~ DATA$Soil, data=DATA)
summary(AOV3)
library("agricolae")
HSD2 <- HSD.test(AOV3, "DATA$Soil", alpha=0.05, group=TRUE)
HSD2
plot(HSD2)
-------------------------------------------------
# Question 10 - effect on year of Tocopherols
-------------------------------------------------
shapiro.test(DATA$TocopherolsLOG)
# If it is normally distributed then move to student t.test
str(DATA)
ST1 <- t.test(DATA$TocopherolsLOG ~ DATA$Year, conf.level = 0.95, var.equal = TRUE, data = DATA)
ST1
# year is affecting Tocopherols content, beacause p-value < 0.05
# If it is not normally distributed then move to Mann-Whithney U-test (non-parametric test)
MW1 <- wilcox.test(DATA$Tocopherols ~ DATA$Year)
MW1
# year is affecting Tocopherols content, beacause p-value < 0.05
-------------------------------------------------
# try to compare two traits in the same graph
#packages -> install a package "lattice" -> France(montpeiller)
#load the package
library(lattice)
histogram( ~ Tocopherols | DATA$Year, data = DATA, layout = c(1,2))
```
* Plot(作圖)









* 同時多組數據作圖

```
x = 1:12
y = matrix(AirPassengers,nrow=12)
matplot(x,y,type="b")
```

```
------------------------------------------------
Self practice - practical session 2
------------------------------------------------
# 2. import dataset and verify structure
------------------------------------------------
# first step set and change the working directory. It been done of the R console. Or you can also use fuction
setwd("C:/Users/XPS/Desktop/emPLANT+ ULS France/Statistics/Part 2 - Bastein")
# remeber you should add "" of the directory in the setwd() fuction
# remeber you should change the "\" copy from the folder to "/" because the computer doesn't recongnize it
# to read the documents that you want to put and read
DATA <- read.csv("WheatWO.csv", dec=".", sep=",", header=T)
# remember that is dec=".", sep="," in my dell computer
fix(DATA)
str(DATA)
# str function is to vertify the nature of the variables
DATA$Sample <- as.character(DATA$Sample)
DATA$Nincom <- as.factor(DATA$Nincom)
summary(DATA)
-----------------------------------------------------------------------
# 2.5 examine the linear relationship between the traits of interest
-----------------------------------------------------------------------
#considering two variables at a times
#linear relationship between Biomass and FoliarSurface
cor.test(x = DATA$Biomass, y = DATA$FoliarSurface, method = "pearson")
#linear relationship between Biomass and GrainPerEar
cor.test(x = DATA$Biomass, y = DATA$GrainPerEar, method = "pearson")
#linear relationship between Biomass and Yield
cor.test(x = DATA$Biomass, y = DATA$Yield, method = "pearson")
#linear relationship between FoliarSurface and GrainPerEar
cor.test(x = DATA$FoliarSurface, y = DATA$GrainPerEar, method = "pearson")
#linear relationship between GrainPerEar and Yield
cor.test(x = DATA$GrainPerEar, y = DATA$Yield, method = "pearson")
#check the p-value < 0.05, p-value < 0.01, p-value < 0.001 means that have significant difference (also can see throgh the graph)
#data are significant: Biomass-FoliarSurface, GrainPerEar-Yield
#data are not significant: Biomass-GrainPerEar, Biomass-Yield, FoliarSurface-GrainPerEar
#consider all the variable at the same time -> matrix of correlation
#load the package
library(PerformanceAnalytics)
#extract only the quantitative data, we need to select the quantitative traits only in a new dataframe
names(DATA)#know about the column number
DATAcorr <- DATA[c(5,6,7,8)]
#select the quantitative traits
fix(DATAcorr)
#verify the selection
#run the "chart.Correlation" function
chart.Correlation(DATAcorr, histogram = TRUE)
#Biomass and FoliarSurface (x=0.82, p-value < 0.001)
#GrainPerEar and Yield (x=0.57, p-value < 0.001)
---------------------------------------------------------------------------
# 2.8 predict the biomass of plants that have foliar surface of 203 mm^2
---------------------------------------------------------------------------
SR1 <- lm(DATA$Biomass ~ DATA$FoliarSurface)
SR1
# Calculate the value of biomass considering a foliar surface of 203 mg.kg-1
# Intercept is -0.0333496
# Coefficient is 0.0005566
biomass <- 0.0005566*(203) - 0.0333496
biomass
summary(SR1)
# to provide the R2(look at the Adjusted R-squared)
# R2 = 0.6711 -> quite great
# The model is predictive
plot(DATA$Biomass ~ DATA$FoliarSurface)
# graphical representation
abline(SR1)
# create the predictive linear line
```
### **4. class 4 & practical session 2**
*Principal Component Analysis (PCA): to draw a plot that is more than 3 dimension data (to test and see possible correlation), 將資料降維度並且作圖,讓讀者更容易閱讀的資料分析方法 (如果同時處理多個維度的資料,很難畫圖 & 統整資料)
*draw ineilia graph (code "posieig" while drawing the graph): make decision of which data you are gonna take, if there is a gap between data, only choose the data that don't have a lot of differences
```
# first step set and change the working directory. It been done of the R console. Or you can also use fuction
setwd("C:/Users/XPS/Desktop/emPLANT+ ULS France/Statistics/Part 2 - Bastein")
# remeber you should add "" of the directory in the setwd() fuction
# remeber you should change the "\" copy from the folder to "/" because the computer doesn't recongnize it
# to read the documents that you want to put and read
DATA <- read.csv("WheatWO.csv", dec=".", sep=",", header=T)
# remember that is dec=".", sep="," in my dell computer
fix(DATA)
str(DATA)
# str function is to vertify the nature of the variables
DATA$Sample <- as.character(DATA$Sample)
DATA$Nincom <- as.factor(DATA$Nincom)
summary(DATA)
-------------------------------------------------------
Question 3 - Multivarite analysis descriptive -> PCA
-------------------------------------------------------
# use form 3 in the decision tree Multivarite analysis descriptive PCA
# must delete all the NA data, if you have lots of NA data don't run this analysis because it will delete a lot of information
DATA <- na.omit(DATA)
fix(DATA)
# create a dataframe with quantitative data only
names(DATA)
DATAPCA <- DATA[c(5,6,7,8)]
fix(DATAPCA)
# packages -> install a package "ade4" -> France(montpeiller)
# load the package "ade4"
library(ade4)
PCA1 <- dudi.pca(DATAPCA, scale=TRUE, center=TRUE)
# select the number of axes, then you can close the plot
# select just only two because there is a huge gap of the rest of two data
# plot the PCA
scatter(PCA1, clab.row=0, posieig="topleft")
# FoliarSurface and Biomass might have correlation because they are really close to each other
# GrainPerEar and Yield might have correlation because they are close to each other
# "FoliarSurface & Biomass" and "GrainPerEar & Yield" groups are not correlated
# on the topright d=1 is somthing that we don't care
summary(PCA1)
# axes 1 in the graph represent 48.169% of the dataset (look at "Projected inertia")
# axes 2 in the graph represent 36.739% of the dataset (look at "Projected inertia")
# consider "Cumulative projected inertia" if it is 60% it means quite really well
# export the graph you've been creating, remeber to let the graph open to let computer to catch it
# type = c("pdf") can be change to type = c("jpe") or type = c("jpg")
savePlot(filename = "PCAWheat", type = c("pdf"))
s.corcircle(PCA1$co)
# plot in correlation circle
-----------------------------------
Question 4 - Individual map
-----------------------------------
PCA1$li
# to give the new position in the graph (axes 1 and axes 2)
# "pch" is to give the symbol in the graph (can be found on the internet)
str(DATA)
s.class(PCA1$li, add.plot=FALSE, pch = c(5, 10, 11)[DATA$Genotypes], fac = factor(DATA$ExpositionFungi), col = c("blue", "red"), clabel = 0.7, cstar = 0, cellipse = 1)
summary(DATA$Genotypes)
# export the graph you've been creating, remeber to let the graph open to let computer to catch it
# type = c("pdf") can be change to type = c("jpe") or type = c("jpg")
savePlot(filename = "DATA Genotypes", type = c("pdf"))
x11()
# open a new empty window to have both two graph at the same time
# just for position can give explaination. admisor on the left top might have negative correlation of GrainPerEar and Yield
------------------------------------
Question 5 - Multi-factorial AOV
------------------------------------
DATA <- read.csv("WheatWO.csv", dec=".", sep=",", header=T)
# remember that is dec=".", sep="," in my dell computer
fix(DATA)
str(DATA)
# test the normal distribution of the response trait -> yield
shapiro.test(DATA$Yield)
hist(DATA$Yield)
# p-value > 0.05, accepted the hypothesis of H0. this is a normal distribution
DATA$YieldSQRT <- sqrt(DATA$Yield)
str(DATA)
DATA$YieldLOG <- log10(DATA$Yield)
str(DATA)
shapiro.test(DATA$YieldSQRT)
shapiro.test(DATA$YieldLOG)
AOV1 <- aov(DATA$YieldSQRT ~ DATA$Genotypes *DATA$Nincom* DATA$ExpositionFungi)
summary(AOV1)
# there is only Genotypes and ExpositionFungi will affect yield (have significant effect)
# Genotypes and Yield will affect by ExpositionFungi
# packages -> install a package "ggplot2" -> France(montpeiller)
# load the package "ggplot2"
library(ggplot2)
Graph1 <- ggplot(DATA, aes(x=Genotypes, y=Yield, fill=ExpositionFungi))+ geom_boxplot(position=position_dodge(1))
Graph1
```
Exam's answer should be like this (and give some explaination)
