# tidyverse
https://dplyr.tidyverse.org/reference/index.html#data-frame-verbs
## 1. rows
### arrange()
Order rows using column values
arrange(data, var1, var2, ...)
### distinct()
Keep only unique/distinct rows from a data frame
distinct(data)
This is similar to unique.data.frame(data)
### filter()
Keep rows that match a condition
filter(data, condition1, condition2, ...)
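For example, with the built-in mtcars data (multiple conditions are combined with AND):
```
mtcars %>% filter(cyl == 4, mpg > 30)
```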
### slice()
Subset rows using their positions
slice(data, rows)
slice_head(data, n = number)
slice_tail(data, n = number)
slice_min(data, order_by = var, n = number)
slice_max(data, order_by = var, n = number)
slice_sample(data, n = number)
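A few quick sketches with the built-in mtcars data:
```
mtcars %>% slice(1:3)             # rows 1 to 3
mtcars %>% slice_max(mpg, n = 2)  # the 2 rows with the highest mpg
mtcars %>% slice_sample(n = 5)    # 5 random rows
```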
## 2. columns
### glimpse()
Get a glimpse of your data
glimpse(data)
### select()
Keep or drop columns using their names and types
select(data, var1, var2)
### mutate()
Create, modify, and delete columns
mutate(data, var1 = expr1, var2 = expr2, .keep = "none")
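A small sketch on mtcars showing all three operations (kpl is a hypothetical derived column):
```
mtcars %>%
  mutate(
    kpl = mpg * 0.425, # create a new column (rough km-per-litre conversion)
    wt = wt * 1000,    # modify an existing column
    carb = NULL        # delete a column
  )
```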
### pull()
Extract a single column
mtcars %>% pull(1)
### relocate()
Change column order
relocate(data, vars, .before = var1)
relocate(data, vars, .after = var2)
Supply at most one of .before/.after; with neither, the columns move to the front.
### rename() rename_with()
Rename columns
rename(data, var1=var3)
rename_with(data, toupper)
rename_with(data, tolower)
## 3. groups
Verbs that principally operate on groups of rows.
### count() tally() add_count() add_tally()
Count the observations in each group
starwars %>% count(species)         # rows per species
starwars %>% tally()                # total number of rows
df %>% add_count(gender, wt = runs) # adds the per-gender total as a new column
df %>% add_tally(wt = runs)         # adds the grand total as a new column
### group_by() ungroup()
Group by one or more variables
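For example, grouping mtcars by cylinder count before summarising:
```
mtcars %>%
  group_by(cyl) %>%
  summarise(mean_mpg = mean(mpg)) %>%
  ungroup() # drop any remaining grouping
```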
### dplyr_by()
Per-operation grouping with .by/by
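Since dplyr 1.1, grouping can be supplied inline for a single operation instead of with group_by():
```
mtcars %>% summarise(mean_mpg = mean(mpg), .by = cyl)
```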
### rowwise()
Group input by rows, compute things one row at a time
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
df %>% rowwise() %>% mutate(m = mean(c(x, y, z)))
### summarise() summarize()
Summarise each group down to one row
mtcars %>%
summarise(mean = mean(disp), n = n())
### reframe()
Transform each group to an arbitrary number of rows
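A minimal sketch: range() returns two values per group, so the result has two rows per group:
```
df <- tibble(g = c(1, 1, 2), x = c(1, 3, 5))
df %>% reframe(x_range = range(x), .by = g)
```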
### n() cur_group() cur_group_id() cur_group_rows() cur_column()
Information about the "current" group or variable
n() gives the current group size.
cur_group() gives the group keys, a tibble with one row and one column for each grouping variable.
cur_group_id() gives a unique numeric identifier for the current group.
cur_group_rows() gives the row indices for the current group.
cur_column() gives the name of the current column (in across() only).
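A quick illustration of two of these helpers inside mutate():
```
df <- tibble(g = c("a", "a", "b"), x = 1:3)
df %>%
  group_by(g) %>%
  mutate(group_id = cur_group_id(), group_size = n())
```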
## 4. data frames
Verbs that principally operate on pairs of data frames.
### bind_cols()
Bind multiple data frames by column
bind_cols(dataframe1, dataframe2)
### bind_rows()
Bind multiple data frames by row
bind_rows(dataframe1, dataframe2)
### intersect() union() union_all() setdiff() setequal() symdiff()
Set operations
intersect(df1, df2)
union(df1, df2)
union_all(df1, df2)
setdiff(df1, df2)
setdiff(df2, df1)
symdiff(df1, df2)
setequal(df1, df2)
setequal(df1, df1[3:1, ])
### inner_join() left_join() right_join() full_join()
Mutating joins
band_members %>% inner_join(band_instruments)
band_members %>% left_join(band_instruments)
band_members %>% right_join(band_instruments)
band_members %>% full_join(band_instruments)
### nest_join()
Nest join
A nest join leaves x almost unchanged, except that it adds a new list-column, where each element contains the rows from y that match the corresponding row in x.
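For example, with the demo tables used above:
```
band_members %>% nest_join(band_instruments)
```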
### semi_join() anti_join()
Filtering joins
band_members %>% semi_join(band_instruments)
band_members %>% anti_join(band_instruments)
### cross_join()
Cross join: all combinations of rows from both tables
cross_join(band_instruments, band_members)
### join_by()
Join specifications
sales <- tibble(
id = c(1L, 1L, 1L, 2L, 2L),
sale_date = as.Date(c("2018-12-31", "2019-01-02", "2019-01-05", "2019-01-04", "2019-01-01"))
)
sales
promos <- tibble(
id = c(1L, 1L, 2L),
promo_date = as.Date(c("2019-01-01", "2019-01-05", "2019-01-02"))
)
promos
Match `id` to `id`, and `sale_date` to `promo_date`
by <- join_by(id, sale_date == promo_date)
left_join(sales, promos, by)
### rows_insert() rows_append() rows_update() rows_patch() rows_upsert() rows_delete()
Manipulate individual rows
```
data <- tibble(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
data
# Insert
rows_insert(data, tibble(a = 4, b = "z"))
```
By default, if a key in `y` matches a key in `x`, the row can't be inserted
and an error is thrown. Alternatively, you can ignore rows in `y` whose keys
conflict with keys in `x` using `conflict = "ignore"`, or use `rows_append()`
to ignore keys entirely.
```
try(rows_insert(data, tibble(a = 3, b = "z")))
rows_insert(data, tibble(a = 3, b = "z"), conflict = "ignore")
rows_append(data, tibble(a = 3, b = "z"))
# Update
rows_update(data, tibble(a = 2:3, b = "z"))
rows_update(data, tibble(b = "z", a = 2:3), by = "a")
# Variants: patch and upsert
rows_patch(data, tibble(a = 2:3, b = "z"))
rows_upsert(data, tibble(a = 2:4, b = "z"))
# Delete
rows_delete(data, tibble(a = 2:3))
rows_delete(data, tibble(a = 2:3, b = "b"))
```
### across() if_any() if_all()
Apply a function (or functions) across multiple columns
```
iris %>%
mutate(across(c(1, 2), round))
iris %>%
filter(if_any(ends_with("Width"), ~ . > 4))
iris %>%
filter(if_all(ends_with("Width"), ~ . > 2))
```
### c_across()
Combine values from multiple columns
```
df <- tibble(id = 1:4, w = runif(4), x = runif(4), y = runif(4), z = runif(4))
df %>%
rowwise() %>%
mutate(
sum = sum(c_across(w:z)),
sd = sd(c_across(w:z))
  )
```
### pick()
Select a subset of columns
```
df <- tibble(
x = c(3, 2, 2, 2, 1),
y = c(0, 2, 1, 1, 4),
z1 = c("a", "a", "a", "b", "a"),
z2 = c("c", "d", "d", "a", "c")
)
df
# `pick()` provides a way to select a subset of your columns using
# tidyselect. It returns a data frame.
df %>% mutate(cols = pick(x, y))
```
## 5. Vector functions
Unlike other dplyr functions, these functions work on individual vectors, not data frames.
### between()
Detect where values fall in a specified range
between(1:10, 2, 3)
### case_match()
A general vectorised switch()
```
x <- c("a", "b", "a", "d", "b", NA, "c", "e")
# `case_match()` acts like a vectorized `switch()`.
# Unmatched values "fall through" as a missing value.
case_match(
x,
"a" ~ 1,
"b" ~ 2,
"c" ~ 3,
"d" ~ 4
)
```
### case_when()
A general vectorised if-else
```
x <- 1:70
case_when(
x %% 35 == 0 ~ "fizz buzz",
x %% 5 == 0 ~ "fizz",
x %% 7 == 0 ~ "buzz",
.default = as.character(x)
)
```
### coalesce()
Find the first non-missing element
```
x <- sample(c(1:5, NA, NA, NA))
x
coalesce(x, 0L)
```
### consecutive_id()
Generate a unique identifier for consecutive combinations
```
consecutive_id(c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, NA, NA))
consecutive_id(c(1, 1, 1, 2, 1, 1, 2, 2))
df <- data.frame(x = c(0, 0, 1, 0), y = c(2, 2, 2, 2))
df %>% group_by(x, y) %>% summarise(n = n())
df %>% group_by(id = consecutive_id(x, y), x, y) %>% summarise(n = n())
```
### cumall() cumany() cummean()
Cumulative versions of any(), all(), and mean()
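For example:
```
x <- c(1, 3, 5, 2, 2)
cumall(x < 4) # TRUE TRUE FALSE FALSE FALSE
cumany(x > 4) # FALSE FALSE TRUE TRUE TRUE
cummean(x)    # 1.00 2.00 3.00 2.75 2.60
```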
### desc()
Descending order
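desc() is mostly used inside arrange() to sort one column in descending order:
```
desc(c(2, 1, 3)) # -2 -1 -3
mtcars %>% arrange(desc(mpg))
```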
### if_else()
Vectorised if-else
x <- c(-5:5, NA)
if_else(x < 0, NA, x)
### lag() lead()
Compute lagged or leading values
```
lag(1:5, n = 1)
lag(1:5, n = 2)
lead(1:5, n = 1)
lead(1:5, n = 2)
```
### n_distinct()
Count unique combinations
n_distinct(c(1, 1, 3, 4, 5, 5))
### na_if()
Convert values to NA
```
na_if(1:5, 5:1)
x <- c(1, -1, 0, 10)
100 / x
100 / na_if(x, 0)
```
### near()
Compare two numeric vectors with a tolerance (a safe way to check floating-point equality)
near(c(1, 2, 3), c(1, 2, 4))
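Because of the tolerance, near() succeeds where == fails:
```
sqrt(2)^2 == 2     # FALSE, floating-point error
near(sqrt(2)^2, 2) # TRUE
```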
### nth() first() last()
Extract the first, last, or nth value from a vector
```
x <- 1:10
y <- 10:1
first(x)
last(y)
nth(x, 1)
nth(x, 5)
```
### ntile()
ntile() is a sort of very rough rank, which breaks the input vector into n buckets. If length(x) is not an integer multiple of n, the size of the buckets will differ by up to one, with larger buckets coming first.
```
x <- c(5, 1, 3, 2, 2, NA)
ntile(x, 2)
ntile(x, 4)
```
### order_by()
A helper function for ordering window function output
order_by(10:1, cumsum(1:10))
### percent_rank() cume_dist()
Proportional ranking functions
x <- c(5, 1, 3, 2, 2)
cume_dist(x)
percent_rank(x)
### recode() recode_factor()
Recode values
```
char_vec <- sample(c("a", "b", "c"), 10, replace = TRUE)
# `recode()` is superseded by `case_match()`
recode(char_vec, a = "Apple", b = "Banana")
num_vec <- c(1:4, NA)
recode_factor(
num_vec,
`1` = "z",
`2` = "y",
`3` = "x",
.default = "D",
.missing = "M"
)
```
### row_number() min_rank() dense_rank()
Integer ranking functions
x <- c(5, 1, 3, 2, 2, NA)
row_number(x)
min_rank(x)
dense_rank(x)
row_number() gives every input a unique rank, so that c(10, 20, 20, 30) would get ranks c(1, 2, 3, 4). It's equivalent to rank(ties.method = "first").
min_rank() gives every tie the same (smallest) value so that c(10, 20, 20, 30) gets ranks c(1, 2, 2, 4). It's the way that ranks are usually computed in sports and is equivalent to rank(ties.method = "min").
dense_rank() works like min_rank(), but doesn't leave any gaps, so that c(10, 20, 20, 30) gets ranks c(1, 2, 2, 3).
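The c(10, 20, 20, 30) example from the descriptions above, verified in code:
```
y <- c(10, 20, 20, 30)
row_number(y) # 1 2 3 4
min_rank(y)   # 1 2 2 4
dense_rank(y) # 1 2 2 3
```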
# tidymodels
## 1. explore the data
```
library(tidymodels) # also loads dplyr, needed for %>% and mutate() below
library(readr)
urchins <-
# Data were assembled for a tutorial
# at https://www.flutterbys.com.au/stats/tut/tut7.5a.html
read_csv("https://tidymodels.org/start/models/urchins.csv") %>%
# Change the names to be a little more verbose
setNames(c("food_regime", "initial_volume", "width")) %>%
# Factors are very helpful for modeling, so we convert one column
mutate(food_regime = factor(food_regime, levels = c("Initial", "Low", "High")))
```
## 2. build a model
### set up the engine / fit the model / tidy() the results
Pick an engine for the model (the default engine for linear_reg() is "lm"), fit it to the data, and inspect the results with tidy():
```
linear_reg() %>%
set_engine("keras")
#> Linear Regression Model Specification (regression)
lm_mod <- linear_reg()
lm_fit <-
lm_mod %>%
fit(width ~ initial_volume * food_regime, data = urchins)
lm_fit
tidy(lm_fit)
```
```
library(cowplot)
library(dotwhisker)
tidy(lm_fit) %>%
dwplot(dot_args = list(size = 2, color = "black"),
whisker_args = list(color = "black"),
vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2))
```
## 3. Use a model to predict
### predict mean values
```
new_points <- expand.grid(initial_volume = 20,
                          food_regime = c("Initial", "Low", "High"))
new_points
mean_pred <- predict(lm_fit, new_data = new_points)
mean_pred
```
### predict confidence interval
```
conf_int_pred <- predict(lm_fit,
new_data = new_points,
type = "conf_int")
conf_int_pred
```
```
plot_data <-
new_points %>%
bind_cols(mean_pred) %>%
bind_cols(conf_int_pred)
ggplot(plot_data, aes(x = food_regime)) +
geom_point(aes(y = .pred)) +
geom_errorbar(aes(ymin = .pred_lower,
ymax = .pred_upper),
width = .2) +
labs(y = "urchin size")
```
## 4. preprocess your data with recipes
### 4.1 data checking
```
library(nycflights13) # source of the flights and weather tables
set.seed(123)
flight_data <-
flights %>%
mutate(
# Convert the arrival delay to a factor
arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
arr_delay = factor(arr_delay),
# We will use the date (not date-time) in the recipe below
date = lubridate::as_date(time_hour)
) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance,
         carrier, date, arr_delay, time_hour) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings)
  mutate_if(is.character, as.factor)
```
```
flight_data %>%
count(arr_delay) %>%
mutate(prop = n/sum(n))
flight_data %>%
skimr::skim(dest, carrier)
```
### 4.2 data splitting
```
# Fix the random numbers by setting the seed
# This enables the analysis to be reproducible when random numbers are used
set.seed(222)
# Put 3/4 of the data into the training set
data_split <- initial_split(flight_data, prop = 3/4)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data <- testing(data_split)
```
### 4.3 create recipe
```
flights_rec <-
recipe(arr_delay ~ ., data = train_data)
```
```
flights_rec <-
recipe(arr_delay ~ ., data = train_data) %>%
update_role(flight, time_hour, new_role = "ID")
```
`summary(flights_rec)` lists each variable with its role: predictor, ID, or outcome.
```
flights_rec <-
recipe(arr_delay ~ ., data = train_data) %>%
update_role(flight, time_hour, new_role = "ID") %>%
step_date(date, features = c("dow", "month")) %>%
step_holiday(date,
holidays = timeDate::listHolidays("US"),
keep_original_cols = FALSE) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors())
```
### 4.4 create models
```
lr_mod <-
logistic_reg() %>%
set_engine("glm")
```
### 4.5 create the workflow
```
flights_wflow <-
workflow() %>%
add_model(lr_mod) %>%
add_recipe(flights_rec)
```
### 4.6 Use the workflow to train the model
Fit the workflow to the training data to get a trained workflow:
```
flights_fit <-
flights_wflow %>%
fit(data = train_data)
```
### 4.7 extract the results
```
flights_fit %>%
extract_fit_parsnip() %>%
tidy()
```
### 4.8 Use the trained workflow to predict results
`predict(flights_fit, test_data)`
Use `type = "prob"` to get predicted class probabilities; `augment()` returns the data with class predictions and probabilities attached:
```
predict(flights_fit, test_data, type="prob")
flights_aug <-
augment(flights_fit, test_data)
```
```
flights_aug %>%
select(arr_delay, time_hour, flight, .pred_class, .pred_on_time)
flights_aug %>%
roc_auc(truth = arr_delay, .pred_late)
```
## 5. evaluate your model with resamples
```
data(cells, package = "modeldata")
cells %>%
count(class) %>%
mutate(prop = n/sum(n))
```
### 5.1 splitting the data based on a categorical variable (stratified sampling)
```
set.seed(123)
cell_split <- initial_split(cells %>% select(-case),
strata = class)
cell_train <- training(cell_split)
cell_test <- testing(cell_split)
nrow(cell_train)
nrow(cell_train)/nrow(cells)
cell_train %>%
count(class) %>%
mutate(prop = n/sum(n))
cell_test %>%
count(class) %>%
mutate(prop = n/sum(n))
```
### 5.2 create models
```
rf_mod <-
rand_forest(trees = 1000) %>%
set_engine("ranger") %>%
set_mode("classification")
```
```
set.seed(234)
rf_fit <-
rf_mod %>%
fit(class ~ ., data = cell_train)
rf_fit
```
### 5.3 estimating performance
training set
```
rf_training_pred <-
predict(rf_fit, cell_train) %>%
bind_cols(predict(rf_fit, cell_train, type = "prob")) %>%
bind_cols(cell_train %>%
select(class))
```
```
rf_training_pred %>% # training set predictions
roc_auc(truth = class, .pred_PS)
```
```
rf_training_pred %>% # training set predictions
accuracy(truth = class, .pred_class)
```
test set
```
rf_testing_pred <-
predict(rf_fit, cell_test) %>%
bind_cols(predict(rf_fit, cell_test, type = "prob")) %>%
bind_cols(cell_test %>% select(class))
```
```
rf_testing_pred %>% # test set predictions
roc_auc(truth = class, .pred_PS)
```
```
rf_testing_pred %>% # test set predictions
accuracy(truth = class, .pred_class)
```
### 5.4 fit a model with resampling
```
set.seed(345)
folds <- vfold_cv(cell_train, v = 10)
rf_wf <-
  workflow() %>%         # create a workflow
  add_model(rf_mod) %>%
  add_formula(class ~ .)
set.seed(456)
rf_fit_rs <-
rf_wf %>%
fit_resamples(folds)
rf_fit_rs
collect_metrics(rf_fit_rs)
```
## 6. tune model parameters
```
library(tidymodels) # for the tune package, along with the rest of tidymodels
library(rpart.plot) # for visualizing a decision tree
library(vip)        # for variable importance plots
```
### Prepare to tune hyperparameters
Set up what to tune and the resampling object:
```
set.seed(123)
cell_split <- initial_split(cells %>% select(-case),
strata = class)
cell_train <- training(cell_split)
cell_test <- testing(cell_split)
```
```
tune_spec <-
decision_tree(
cost_complexity = tune(),
tree_depth = tune()
) %>%
set_engine("rpart") %>%
set_mode("classification")
tune_spec
```
```
tree_grid <- grid_regular(cost_complexity(),
tree_depth(),
levels = 5)
tree_grid
tree_grid %>%
count(tree_depth)
```
```
set.seed(234)
cell_folds <- vfold_cv(cell_train)
```
### model tuning with a grid
```
set.seed(345)
tree_wf <- workflow() %>%
add_model(tune_spec) %>%
add_formula(class ~ .)
tree_res <-
tree_wf %>%
tune_grid(
resamples = cell_folds,
grid = tree_grid
)
```
```
tree_res
tree_res %>%
collect_metrics()
```
### compare the results and select the best model
```
tree_res %>%
collect_metrics() %>%
mutate(tree_depth = factor(tree_depth)) %>%
ggplot(aes(cost_complexity, mean, color = tree_depth)) +
geom_line(size = 1.5, alpha = 0.6) +
geom_point(size = 2) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10(labels = scales::label_number()) +
scale_color_viridis_d(option = "plasma", begin = .9, end = 0)
```
```
tree_res %>%
show_best(metric = "accuracy")
```
```
best_tree <- tree_res %>%
select_best(metric = "accuracy")
best_tree
```
```
final_wf <-
tree_wf %>%
finalize_workflow(best_tree)
final_wf
```
### Use the best model on test data
```
final_fit <-
final_wf %>%
last_fit(cell_split)
final_fit %>%
collect_metrics()
```
```
final_fit %>%
collect_predictions() %>%
roc_curve(class, .pred_PS) %>%
autoplot()
```
```
final_tree <- extract_workflow(final_fit)
final_tree
```
```
final_tree %>%
extract_fit_engine() %>%
rpart.plot(roundint = FALSE)
```
```
library(vip)
final_tree %>%
extract_fit_parsnip() %>%
vip()
```
```
args(decision_tree)
```
## 7. a predictive modeling case study
```
library(tidymodels)
library(readr) # for importing data
library(vip)
```
### check and manipulate data
```
hotels <-
read_csv("https://tidymodels.org/start/case-study/hotels.csv") %>%
mutate(across(where(is.character), as.factor))
dim(hotels)
glimpse(hotels)
```
```
hotels %>%
count(children) %>%
mutate(prop = n/sum(n))
```
### split data into training and testing
```
set.seed(123)
splits <- initial_split(hotels, strata = children)
hotel_other <- training(splits)
hotel_test <- testing(splits)
hotel_other %>%
count(children) %>%
mutate(prop = n/sum(n))
hotel_test %>%
count(children) %>%
mutate(prop = n/sum(n))
```
### split training data for validation
```
set.seed(234)
val_set <- validation_split(hotel_other,
strata = children,
prop = 0.80)
```
### build the model
```
lr_mod <-
  logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")
```
### create the recipe
```
holidays <- c("AllSouls", "AshWednesday", "ChristmasEve", "Easter",
"ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")
lr_recipe <-
recipe(children ~ ., data = hotel_other) %>%
step_date(arrival_date) %>%
step_holiday(arrival_date, holidays = holidays) %>%
step_rm(arrival_date) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
```
### Create the workflow
```
lr_workflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(lr_recipe)
```
### Create the grid for tuning
```
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
lr_reg_grid %>% top_n(-5) # lowest penalty values
lr_reg_grid %>% top_n(5)  # highest penalty values
```
### Train and tune the model
Let's use tune::tune_grid() to train these 30 penalized logistic regression models. We'll also save the validation set predictions (via the call to control_grid()) so that diagnostic information is available after the model fit. The area under the ROC curve will be used to quantify how well the model performs across a continuum of event thresholds (recall that the event rate, the proportion of stays including children, is very low for these data).
```
lr_res <-
lr_workflow %>%
tune_grid(val_set,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE),
metrics = metric_set(roc_auc))
```
```
lr_plot <-
lr_res %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
ylab("Area under the ROC Curve") +
scale_x_log10(labels = scales::label_number())
lr_plot
```
```
top_models <-
lr_res %>%
show_best(metric = "roc_auc", n = 15) %>%
arrange(penalty)
top_models
```
```
lr_best <-
lr_res %>%
collect_metrics() %>%
arrange(penalty) %>%
slice(12)
lr_best
```
```
lr_auc <-
lr_res %>%
collect_predictions(parameters = lr_best) %>%
roc_curve(children, .pred_children) %>%
mutate(model = "Logistic Regression")
autoplot(lr_auc)
```
## A tree-based ensemble model
### build the model and improve training time
```
cores <- parallel::detectCores()
cores
```
```
rf_mod <-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_engine("ranger", num.threads = cores) %>%
set_mode("classification")
```
### Create the recipe and workflow
```
rf_recipe <-
recipe(children ~ ., data = hotel_other) %>%
step_date(arrival_date) %>%
step_holiday(arrival_date) %>%
step_rm(arrival_date)
```
```
rf_workflow <-
workflow() %>%
add_model(rf_mod) %>%
add_recipe(rf_recipe)
```
### Train and tune the model
```
rf_mod
extract_parameter_set_dials(rf_mod)
```
```
set.seed(345)
rf_res <-
rf_workflow %>%
tune_grid(val_set,
grid = 25,
control = control_grid(save_pred = TRUE),
metrics = metric_set(roc_auc))
```
```
rf_res %>%
show_best(metric = "roc_auc")
```
```
autoplot(rf_res)
rf_best <-
rf_res %>%
select_best(metric = "roc_auc")
rf_best
```
```
rf_res %>%
collect_predictions()
```
```
rf_auc <-
rf_res %>%
collect_predictions(parameters = rf_best) %>%
roc_curve(children, .pred_children) %>%
mutate(model = "Random Forest")
```
### compare the logistic regression and random forest models
```
bind_rows(rf_auc, lr_auc) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
geom_path(lwd = 1.5, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal() +
scale_color_viridis_d(option = "plasma", end = .6)
```
### the last fit
```
# the last model
last_rf_mod <-
  rand_forest(mtry = 8, min_n = 7, trees = 1000) %>%
  set_engine("ranger", num.threads = cores, importance = "impurity") %>%
  set_mode("classification")
# the last workflow
last_rf_workflow <-
  rf_workflow %>%
  update_model(last_rf_mod)
# the last fit
set.seed(345)
last_rf_fit <-
last_rf_workflow %>%
last_fit(splits)
last_rf_fit
```
```
last_rf_fit %>%
collect_metrics()
```
```
last_rf_fit %>%
extract_fit_parsnip() %>%
vip(num_features = 20)
```
```
last_rf_fit %>%
collect_predictions() %>%
roc_curve(children, .pred_children) %>%
autoplot()
```