```
# Small example table kept as space-separated text (header row: y, P, N).
tab_src <- "y P N
37.41667 petit facile
34.41667 petit dur
27.16667 petit impossible
34.08333 petit facile
29.83333 grand dur
24.25000 grand impossible
28.41667 grand facile
23.16667 grand dur
26.00000 grand impossible"
dat <- read.table(text = tab_src, sep = " ", header = TRUE)
# Split by P alone first, then (overwriting z) by every P x N combination;
# the pieces of the second split are named "<P>_<N>".
z <- split(x = dat, f = dat$P)
z <- split(x = dat, f = list(dat$P, dat$N), sep = "_")
# one data frame from the 6 elements, written out by hand ----
rbind(
  z[[1]], z[[2]], z[[3]],
  z[[4]], z[[5]], z[[6]]
)
# one data frame from the 6 elements, built with a loop ----
out <- data.frame()
for (elt in z) {
  out <- rbind(out, elt)
}
out
# Calling a function directly, then through do.call() with its arguments
# gathered in a list.
mean(x = rnorm(100), trim = 0, na.rm = TRUE)
args <- list(x = rnorm(100), trim = 0, na.rm = TRUE)
do.call(what = mean, args = args)
# the NICE base R approach ----
# stacks the same pieces as
#   rbind(z[[1]], z[[2]], z[[3]], z[[4]], z[[5]], z[[6]])
# without enumerating the elements of z; the row names are reset afterwards
w <- do.call(what = rbind, args = z)
rownames(w) <- NULL
w
# the truly NICEST approach: data.table ----
# rbindlist() and setDF() are data.table functions; the package is attached
# here so that this snippet runs on its own, like the other data.table
# snippets of the file do.
library(data.table)
# stack all the elements of z, matching the columns by name
w <- rbindlist(z, use.names = TRUE)
# turn the result back into a plain data.frame, by reference
setDF(w)
```
### Apply
```
aq <- airquality
# column means, ignoring missing values
apply(X = aq, MARGIN = 2, FUN = mean, na.rm = TRUE)
# two statistics per column: the result holds one column per variable, so it
# is transposed to show one row per variable
z <- apply(X = aq, MARGIN = 2, FUN = function(x) {
  c(mean = mean(x, na.rm = TRUE), sd = sd(x, na.rm = TRUE))
})
t(z)
# one sum per row, ignoring missing values
z <- apply(X = aq, MARGIN = 1, FUN = function(x) {
  sum(x, na.rm = TRUE)
})
# NOTE(review): z is a plain vector at this point, so t() only turns it into
# a one-row matrix - confirm this is intended
t(z)
# count the NA values by row or by column -----
is_na <- is.na(aq)
# so-so -----
apply(X = is_na, MARGIN = 1, FUN = sum)
# better than so-so -----
rowSums(is_na)
colSums(is_na)
```
### mapply
```
# Summarise one vector as a one-row data frame.
#   x            : vector handed to mean() and sd(), NA values being removed
#   nom_variable : label stored in the `variable` column
# Returns a data.frame with the columns variable, mean and sd.
ma_fun <- function(x, nom_variable) {
  moyenne <- mean(x, na.rm = TRUE)
  ecart_type <- sd(x, na.rm = TRUE)
  data.frame(
    variable = nom_variable,
    mean = moyenne,
    sd = ecart_type,
    stringsAsFactors = FALSE
  )
}
# with mapply: ma_fun is called once per column of mtcars, pairing each
# column with its name; SIMPLIFY = FALSE keeps the results as a list
w <- mapply(ma_fun, x = mtcars, nom_variable = colnames(mtcars), SIMPLIFY = FALSE)
# with a for loop: the list is created at its final length up front instead
# of being grown at every iteration
w <- vector("list", length(mtcars))
for (j in seq_along(mtcars)) {
  x <- mtcars[[j]]
  nom_variable <- colnames(mtcars)[j]
  w[[j]] <- ma_fun(x = x, nom_variable = nom_variable)
}
# for every solution: stack the one-row data frames and reset the row names
w <- do.call(rbind, w)
row.names(w) <- NULL
w
```
### Exo regex
```
# Attach the packages and read the tweets once (the package load and the
# file read were previously repeated).
library(data.table)
library(stringr)
tweets <- fread("data/coffeeTweets.csv")
# Keep the rows whose screen_name contains "cafe" or "coffee", whatever the case
tweets[grepl("cafe|coffee", screen_name, ignore.case = TRUE), ]
# How many tweets have more than 8 hashtags, including "coffeeaddict"?
# The hashtags are separated by spaces, so str_count() on " " gives the number
# of hashtags minus one: more than 8 hashtags means more than 7 spaces.
tweets[grepl("coffeeaddict", hashtags, ignore.case = TRUE), ][
  str_count(hashtags, " ") > 7,
]
# Same question answered from the tweet text, counting the "#" characters
tweets[grepl("#coffeeaddict", text, ignore.case = TRUE), ][
  str_count(text, "#") > 8,
]
# Among the screen names, how many contain at least three consecutive z?
tweets[grepl("[z]{3,}", screen_name, ignore.case = TRUE), ]
# NOTE(review): the pattern keeps the last "digits:digits" run that ends
# created_at; whether that is really the hour depends on the timestamp
# format - confirm
tweets[, hour := str_extract(created_at, "[[:digit:]]+:[[:digit:]]+$")]
```
```
library(data.table)
library(stringi)
# Character vector of fruit names used as example data for the string
# exercises that follow.
fruits <- c("apple", "apricot", "avocado", "banana", "bell pepper", "bilberry",
"blackberry", "blackcurrant", "blood orange", "blueberry", "boysenberry",
"breadfruit", "canary melon", "cantaloupe", "cherimoya", "cherry",
"chili pepper", "clementine", "cloudberry", "coconut", "cranberry",
"cucumber", "currant", "damson", "date", "dragonfruit", "durian",
"eggplant", "elderberry", "feijoa", "fig", "goji berry", "gooseberry",
"grape", "grapefruit", "guava", "honeydew", "huckleberry", "jackfruit",
"jambul", "jujube", "kiwi fruit", "kumquat", "lemon", "lime",
"loquat", "lychee", "mandarine", "mango", "mulberry", "nectarine",
"nut", "olive", "orange", "pamelo", "papaya", "passionfruit",
"peach", "pear", "persimmon", "physalis", "pineapple", "plum",
"pomegranate", "pomelo", "purple mangosteen", "quince", "raisin",
"rambutan", "raspberry", "redcurrant", "rock melon", "salal berry",
"satsuma", "star fruit", "strawberry", "tamarillo", "tangerine",
"ugli fruit", "watermelon")
# Store the names in a one-column data.table (column `nom`).
data_fruits <- data.table(nom = fruits, stringsAsFactors = FALSE)
# Drop the standalone vector; the names remain available in data_fruits$nom.
rm(fruits)
# Display the table.
data_fruits
# > For each element, flag whether the string "an" is detected; store the
# > result in a new column.
data_fruits[, has_an := grepl(pattern = "an", x = nom)]
# > Flag in a new column whether the fruit name starts with the letter b.
data_fruits[, start_with_b := grepl(pattern = "^b", x = nom)]
# number of occurrences of "an" in each name
data_fruits[, ct_an := stri_count_regex(nom, "an")]
# start and end positions of the first "an" in each name
data_fruits[
  ,
  c("pos_start_an", "pos_end_an") :=
    as.data.frame(stri_locate_first_regex(nom, "an"))
]
# first three characters of each name
data_fruits[, tri := substr(nom, start = 1, stop = 3)]
# three successive ways of filling `berry` from the names ending in "berry";
# each statement overwrites or updates what the previous one left
# 1. stringi extraction of the match
data_fruits[, berry := stri_extract_first_regex(nom, "(.*)(berry)$")]
# 2. base gsub() on the matching rows only; the replacement puts both
#    captured groups back
data_fruits[grepl("berry$", nom), berry := gsub("(.*)(berry)$", "\\1\\2", nom)]
# 3. stri_extract_all_regex() returns a list, so `berry` presumably ends up
#    as a list column - TODO confirm this is wanted
data_fruits[, berry := stri_extract_all_regex(nom, "(.*)(berry)$")]
# two ways of replacing the names ending in "berry" with "chuck"
data_fruits[stri_detect_regex(nom, "berry$"), nom := "chuck"]
data_fruits[, nom := gsub("(.*)(berry)$", "chuck", nom)]
```
## LE CODE
```
library(data.table)
library(magrittr)
library(stringi)
library(readxl)
# Read one results workbook and reshape it to long format.
#   x : passed as is to read_xls() (presumably the path of an .xls file)
# Returns the melted table: the identifier columns, plus `candidat` (name of
# the melted column, kept as character) and `score` (its value).
# The NB_* count columns are neither identifiers nor measures, so they are
# left out of the melted columns.
read_munic_dat <- function(x) {
  colonnes_id <- c(
    "ID_BVOTE", "SCRUTIN", "ANNEE", "TOUR",
    "DATE", "NUM_CIRC", "NUM_QUARTIER", "NUM_ARROND",
    "NUM_BUREAU"
  )
  colonnes_nb <- c(
    "NB_PROCU", "NB_INSCR", "NB_EMARG",
    "NB_VOTANT", "NB_BLANC", "NB_NUL", "NB_EXPRIM"
  )
  resultats <- read_xls(x)
  setDT(resultats)
  # every remaining column is treated as a measure to melt
  colonnes_exclues <- c(colonnes_id, colonnes_nb)
  colonnes_candidats <- setdiff(colnames(resultats), colonnes_exclues)
  melt(
    resultats,
    id.vars = colonnes_id,
    measure.vars = colonnes_candidats,
    variable.name = "candidat",
    value.name = "score",
    variable.factor = FALSE
  )
}
# Import every file found under a directory and stack the results.
#   repertoire : directory searched recursively for files
# Stops when the directory does not exist or when it holds no file.
# Returns a plain data.frame made of the row-bound outputs of
# read_munic_dat(), one call per file.
read_municipales <- function(repertoire) {
  if (!dir.exists(repertoire)) {
    stop("le repertoire ", shQuote(repertoire), " n'existe pas.")
  }
  chemins <- list.files(path = repertoire, full.names = TRUE, recursive = TRUE)
  if (length(chemins) == 0L) {
    stop("le repertoire ne contient pas de fichier a importer.")
  }
  fichiers <- data.table(
    nom_fichier = basename(chemins),
    nom_repertoire = dirname(chemins),
    nom_complet = chemins
  )
  # first 8-digit run of the file name stored as `date`, and first
  # "ardt_" + two digits stored as `ar`
  # NOTE(review): these two columns are not used further down - confirm
  # whether they were meant to be attached to the imported rows
  fichiers[, date := stri_extract_first_regex(nom_fichier, pattern = "[[:digit:]]{8}")]
  fichiers[, ar := stri_extract_first_regex(nom_fichier, pattern = "ardt_[[:digit:]]{2}")]
  # columns are matched by name; fill = FALSE means no missing column is
  # padded while binding
  resultats <- rbindlist(
    lapply(fichiers$nom_complet, read_munic_dat),
    use.names = TRUE,
    fill = FALSE
  )
  setDF(resultats)
  resultats
}
# Import two result directories in turn; the second call overwrites the
# result of the first one in z.
z <- read_municipales(repertoire = "data/leg-paris-2020")
z <- read_municipales(repertoire = "data/leg-paris-2021")
```