--- title: TP Classification tags: Ensai - public description: Mars 2022 --- # TP Classification TP2 exo2 ```r rm(list = ls()) install.packages("mclust") require(cluster) require(pgmm) require(FactoMineR) require(explor) require(VarSelLCM) require(mclust) data("coffee") ?coffee summary(coffee) coffee$Variety <- as.factor(coffee$Variety) out.pca <- PCA(coffee, quali.sup = 1:2, graph = F) explor(out.pca) plot(out.pca$ind$coord[,1], out.pca$ind$coord[,2], col=coffee[,1]) ?VarSelCluster res.diago.bic <- VarSelCluster(coffee[, -c(1, 2)], 1:8, vbleSelec = F) summary(res.diago.bic) res.diago.icl <- VarSelCluster(coffee[, -c(1, 2)], 1:8, vbleSelec = F, crit.varsel = "ICL") summary(res.diago.icl) res.diago.bic@partitions table(coffee[, 1], res.diago.bic@partitions@zMAP) table(coffee[, 2], res.diago.bic@partitions@zMAP) par(mfrow=c(1,1)) plot(out.pca$ind$coord[, 1], out.pca$ind$coord[, 2], col = res.diago.bic@partitions@zMAP) sil <- silhouette(res.diago.bic@partitions@zMAP, daisy(coffee[, -c(1, 2)])) sil <- silhouette(res.diago.icl@partitions@zMAP, daisy(coffee[, -c(1, 2)])) plot(sil) summary(sil) data.with.partitions <- cbind(coffee, factor(res.diago.bic@partitions@zMAP)) ?catdes catdes(data.with.partitions, ncol(data.with.partitions)) by(coffee, res.diago.icl@partitions@zMAP, summary) res.diago.icl@criteria@discrim cor(coffee[res.diago.bic@partitions@zMAP == 1, -c(1, 2)]) cor(coffee[res.diago.bic@partitions@zMAP == 2, -c(1, 2)]) ?Mclust mclust.options("emModelNames") resfull <- Mclust(coffee[, -c(1, 2)], 1:8) summary(resfull) resfull$parameters table(resfull$classification, res.diago.bic@partitions@zMAP) ``` TP2 exo1 ```r ############################################################################################################## # Mélange de distributions de poissons ############################################################################################################## rm(list = ls()) rdata <- function(n, prop, lambda) { g <- length(prop) z <- sample(1:g, n, replace = TRUE, prob = prop) x <- rpois(n, lambda[z]) list(z = z, x = x) } oneEM <- function(x, g, tol) { # Init prop <- runif(g) prop <- prop / sum(prop) lambda <- sample(x, g) # Calcul loglike masse.cond <- sapply(1:g, function(k) dpois(x, lambda[k]) * prop[k]) loglike <- sum(log(rowSums(masse.cond))) prec <- -Inf while ((loglike - prec) > tol) { # Estep tik <- masse.cond / rowSums(masse.cond) # Mstep prop <- colSums(tik) / sum(tik) lambda <- as.numeric(t(tik) %*% x) / colSums(tik) # Calcul loglike masse.cond <- sapply(1:g, function(k) dpois(x, lambda[k]) * prop[k]) prec <- loglike loglike <- sum(log(rowSums(masse.cond))) } list( prop = prop, lambda = lambda, loglike = loglike, tik = tik, check = loglike > prec, masse.cond = masse.cond ) } multi.EM <- function(x, g, nbinit = 20, tol = 0.01) { res <- replicate(nbinit, oneEM(x, g, tol), simplify = FALSE) res <- res[[which.max(sapply(res, function(u) u$loglike))]] res } MAPassign <- function(prob) { as.factor(apply(prob, 1, which.max)) } estim.mixture <- function(x, gmax, criterion = "BIC", nbinit = 20, tol = 0.01) { results <- lapply(1:gmax, multi.EM, x = x, nbinit = nbinit, tol = tol) for (g in 1:gmax) { results[[g]]$partition <- MAPassign(results[[g]]$tik) results[[g]]$BIC <- results[[g]]$loglike - (2 * g - 1) * 0.5 * log(length(x)) results[[g]]$ICL <- sum(log(apply(results[[g]]$masse.cond, 1, max))) - (2 * g - 1) * 0.5 * log(length(x)) } if (criterion == "BIC") { results <- results[[which.max(sapply(results, function(u) u$BIC))]] } else if (criterion == "ICL") { results <- results[[which.max(sapply(results, function(u) u$ICL))]] } results } set.seed(123) ech <- rdata(100, c(1 / 2, 1 / 2), c(5, 20)) solution <- estim.mixture(ech$x, 8) solution$prop solution$lambda table(solution$partition, ech$z) ``` TP2 exo3 ```r ############################################################################################################## # Sélection de variable et gestion de données manquantes ############################################################################################################## rm(list = ls()) require(VarSelLCM) require(missMDA) generdata <- function(n, r, d, epsilon) { z <- sample(1:2, n, replace = TRUE) x <- matrix(rnorm(n * r), n, r) x[which(z == 1), ] <- x[which(z == 1), ] + epsilon x[which(z == 2), ] <- x[which(z == 2), ] - epsilon x <- cbind(x, matrix(rnorm(n * (d - r), sd = sqrt(2)), n, d - r)) list(z = z, x = x) } giveARI <- function(ech) { res.with <- VarSelCluster(ech$x, 2, vbleSelec = TRUE, crit.varsel = "BIC") res.without <- VarSelCluster(ech$x, 2, vbleSelec = FALSE) c(ARI(ech$z, res.with@partitions@zMAP), ARI(ech$z, res.without@partitions@zMAP)) } set.seed(123) all.ech <- replicate(20, generdata(100, 3, 20, 1), simplify = FALSE) res <- sapply(all.ech, giveARI) summary(t(res)) generdata2 <- function(n, r, d, epsilon, tau) { z <- sample(1:2, n, replace = TRUE) x <- matrix(rnorm(n * r), n, r) x[which(z == 1), ] <- x[which(z == 1), ] + epsilon x[which(z == 2), ] <- x[which(z == 2), ] - epsilon x <- cbind(x, matrix(rnorm(n * (d - r), sd = sqrt(2)), n, d - r)) x.notna <- x for (j in 1:ncol(x)) { naloc <- which(runif(n) < tau) if (length(naloc) > 0) { x[naloc, j] <- NA } } list(z = z, x = x, x.notna = x.notna) } giveARI.na <- function(ech) { x.impute <- imputePCA(ech$x, scale = FALSE)$completeObs ari.geo <- ARI(ech$z, kmeans(x.impute, 2)$cluster) ari.mixture <- ARI(ech$z, VarSelCluster(ech$x, 2, vbleSelec = TRUE)@partitions@zMAP) c(ari.geo, ari.mixture) } set.seed(123) all.ech <- replicate(20, generdata2(101, 3, 6, 1, .2), simplify = FALSE) res <- sapply(all.ech, giveARI.na) summary(t(res)) generdata3 <- function(n, r, d, epsilon, tau) { z <- sample(1:2, n, replace = TRUE) x <- matrix(rnorm(n * r), n, r) x[which(z == 1), ] <- x[which(z == 1), ] + epsilon x[which(z == 2), ] <- x[which(z == 2), ] - epsilon x <- cbind(x, matrix(rnorm(n * (d - r), sd = sqrt(2)), n, d - r)) x.notna <- x for (j in 1:ncol(x)) { naloc <- which(x[, j] < tau) if (length(naloc) > 0) { x[naloc, j] <- NA } } list(z = z, x = x, x.notna = x.notna) } set.seed(123) all.ech <- replicate(20, generdata3(101, 3, 6, 1, 1), simplify = FALSE) res <- sapply(all.ech, giveARI.na) summary(t(res)) ```