# tidyverse

https://dplyr.tidyverse.org/reference/index.html#data-frame-verbs

## 1. rows

### arrange()

Order rows using column values.

```
arrange(data, var1, var2, ...)
```

### distinct()

Keep only unique/distinct rows from a data frame. This is similar to `unique.data.frame(data)`.

```
distinct(data)
```

### filter()

Keep rows that match one or more conditions.

```
filter(data, condition1, condition2, ...)
```

### slice()

Subset rows using their positions.

```
slice(data, number)
slice_head(data, n = number)
slice_tail(data, n = number)
slice_min(data, n = number)
slice_max(data, n = number)
slice_sample(data, n = number)
```

## 2. columns

### glimpse()

Get a glimpse of your data.

```
glimpse(data)
```

### select()

Keep or drop columns using their names and types.

```
select(data, var1, var2)
```

### mutate()

Create, modify, and delete columns.

```
mutate(data, var1 = , var2 = , .keep = "none")
```

### pull()

Extract a single column.

```
mtcars %>% pull(1)
```

### relocate()

Change column order.

```
relocate(data, vars, .before = var1, .after = var2)
```

### rename() rename_with()

Rename columns.

```
rename(data, var1 = var3)
rename_with(data, toupper)
rename_with(data, tolower)
```

## 3. groups

Verbs that principally operate on groups of rows.

### count() tally() add_count() add_tally()

Count the observations in each group.

```
starwars %>% count(species)         # count by species
starwars %>% tally()                # count the total number of rows
df %>% add_count(gender, wt = runs) # add a per-gender total as a column
df %>% add_tally(wt = runs)         # add an overall total as a column
```

### group_by() ungroup()

Group by one or more variables.

### dplyr_by()

Per-operation grouping with `.by`/`by`.

### rowwise()

Group input by rows, to compute things one row at a time.

```
df <- tibble(x = runif(6), y = runif(6), z = runif(6))
df %>% rowwise() %>% mutate(m = mean(c(x, y, z)))
```

### summarise() summarize()

Summarise each group down to one row.

```
mtcars %>% summarise(mean = mean(disp), n = n())
```

### reframe()

Transform each group to an arbitrary number of rows.

### n() cur_group() cur_group_id() cur_group_rows() cur_column()

Information about the "current" group or variable:

- `n()` gives the current group size.
- `cur_group()` gives the group keys, a tibble with one row and one column for each grouping variable.
- `cur_group_id()` gives a unique numeric identifier for the current group.
- `cur_group_rows()` gives the row indices for the current group.
- `cur_column()` gives the name of the current column (in `across()` only).

## 4. data frames

Verbs that principally operate on pairs of data frames.

### bind_cols()

Bind multiple data frames by column.

```
bind_cols(dataframe1, dataframe2)
```

### bind_rows()

Bind multiple data frames by row.

```
bind_rows(dataframe1, dataframe2)
```

### intersect() union() union_all() setdiff() setequal() symdiff()

Set operations.

```
intersect(df1, df2)
union(df1, df2)
union_all(df1, df2)
setdiff(df1, df2)
setdiff(df2, df1)
symdiff(df1, df2)
setequal(df1, df2)
setequal(df1, df1[3:1, ])
```

### inner_join() left_join() right_join() full_join()

Mutating joins.

```
band_members %>% inner_join(band_instruments)
band_members %>% left_join(band_instruments)
band_members %>% right_join(band_instruments)
band_members %>% full_join(band_instruments)
```

### nest_join()

Nest join. A nest join leaves `x` almost unchanged, except that it adds a new list-column, where each element contains the rows from `y` that match the corresponding row in `x`.
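A minimal sketch with dplyr's built-in band data: the result has one row per member, and the new list-column (named after `y`, here `band_instruments`) holds each member's matching instrument rows.

```
# each band_members row gains a list-column of its matching instrument rows
band_members %>% nest_join(band_instruments)
```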
### semi_join() anti_join()

Filtering joins.

```
band_members %>% semi_join(band_instruments)
band_members %>% anti_join(band_instruments)
```

### cross_join()

Cross join: give all the combinations.

```
cross_join(band_instruments, band_members)
```

### join_by()

Join specifications.

```
sales <- tibble(
  id = c(1L, 1L, 1L, 2L, 2L),
  sale_date = as.Date(c("2018-12-31", "2019-01-02", "2019-01-05",
                        "2019-01-04", "2019-01-01"))
)
sales

promos <- tibble(
  id = c(1L, 1L, 2L),
  promo_date = as.Date(c("2019-01-01", "2019-01-05", "2019-01-02"))
)
promos

# Match `id` to `id`, and `sale_date` to `promo_date`
by <- join_by(id, sale_date == promo_date)
left_join(sales, promos, by)
```

### rows_insert() rows_append() rows_update() rows_patch() rows_upsert() rows_delete()

Manipulate individual rows.

```
data <- tibble(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
data

# Insert
rows_insert(data, tibble(a = 4, b = "z"))
```

By default, if a key in `y` matches a key in `x`, it can't be inserted and will throw an error. Alternatively, you can ignore rows in `y` containing keys that conflict with keys in `x` with `conflict = "ignore"`, or you can use `rows_append()` to ignore keys entirely.

```
try(rows_insert(data, tibble(a = 3, b = "z")))
rows_insert(data, tibble(a = 3, b = "z"), conflict = "ignore")
rows_append(data, tibble(a = 3, b = "z"))

# Update
rows_update(data, tibble(a = 2:3, b = "z"))
rows_update(data, tibble(b = "z", a = 2:3), by = "a")

# Variants: patch and upsert
rows_patch(data, tibble(a = 2:3, b = "z"))
rows_upsert(data, tibble(a = 2:4, b = "z"))

# Delete
rows_delete(data, tibble(a = 2:3))
rows_delete(data, tibble(a = 2:3, b = "b"))
```

### across() if_any() if_all()

Apply a function (or functions) across multiple columns.

```
iris %>% mutate(across(c(1, 2), round))
iris %>% filter(if_any(ends_with("Width"), ~ . > 4))
iris %>% filter(if_all(ends_with("Width"), ~ . > 2))
```

### c_across()

Combine values from multiple columns.

```
df <- tibble(id = 1:4, w = runif(4), x = runif(4), y = runif(4), z = runif(4))
df %>%
  rowwise() %>%
  mutate(
    sum = sum(c_across(w:z)),
    sd = sd(c_across(w:z))
  )
```

### pick()

Select a subset of columns.

```
df <- tibble(
  x = c(3, 2, 2, 2, 1),
  y = c(0, 2, 1, 1, 4),
  z1 = c("a", "a", "a", "b", "a"),
  z2 = c("c", "d", "d", "a", "c")
)
df

# `pick()` provides a way to select a subset of your columns using
# tidyselect. It returns a data frame.
df %>% mutate(cols = pick(x, y))
```

## 5. Vector functions

Unlike other dplyr functions, these functions work on individual vectors, not data frames.

### between()

Detect where values fall in a specified range.

```
between(1:10, 2, 3)
```
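In practice these vector functions are mostly used inside verbs like `filter()` and `mutate()`; a minimal sketch with the built-in `mtcars` data:

```
# equivalent to filter(mtcars, mpg >= 20 & mpg <= 25), but shorter
mtcars %>% filter(between(mpg, 20, 25))
```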
### case_match()

A general vectorised `switch()`.

```
x <- c("a", "b", "a", "d", "b", NA, "c", "e")

# `case_match()` acts like a vectorised `switch()`.
# Unmatched values "fall through" as a missing value.
case_match(
  x,
  "a" ~ 1,
  "b" ~ 2,
  "c" ~ 3,
  "d" ~ 4
)
```

### case_when()

A general vectorised if-else.

```
x <- 1:70
case_when(
  x %% 35 == 0 ~ "fizz buzz",
  x %% 5 == 0 ~ "fizz",
  x %% 7 == 0 ~ "buzz",
  .default = as.character(x)
)
```

### coalesce()

Find the first non-missing element.

```
x <- sample(c(1:5, NA, NA, NA))
x
coalesce(x, 0L)
```

### consecutive_id()

Generate a unique identifier for consecutive combinations.

```
consecutive_id(c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, NA, NA))
consecutive_id(c(1, 1, 1, 2, 1, 1, 2, 2))

df <- data.frame(x = c(0, 0, 1, 0), y = c(2, 2, 2, 2))
df %>% group_by(x, y) %>% summarise(n = n())
df %>% group_by(id = consecutive_id(x, y), x, y) %>% summarise(n = n())
```

### cumall() cumany() cummean()

Cumulative versions of any, all, and mean.

### desc()

Descending order.

### if_else()

Vectorised if-else.

```
x <- c(-5:5, NA)
if_else(x < 0, NA, x)
```

### lag() lead()

Compute lagged or leading values.

```
lag(1:5, n = 1)
lag(1:5, n = 2)
lead(1:5, n = 1)
lead(1:5, n = 2)
```

### n_distinct()

Count unique combinations.

```
n_distinct(c(1, 1, 3, 4, 5, 5))
```

### na_if()

Convert values to NA.

```
na_if(1:5, 5:1)

x <- c(1, -1, 0, 10)
100 / x
100 / na_if(x, 0)
```

### near()

Compare two numeric vectors.

```
near(c(1:3), c(1, 2, 4))
```

### nth() first() last()

Extract the first, last, or nth value from a vector.

```
x <- 1:10
y <- 10:1
first(x)
last(y)
nth(x, 1)
nth(x, 5)
```

### ntile()

`ntile()` is a sort of very rough rank, which breaks the input vector into `n` buckets. If `length(x)` is not an integer multiple of `n`, the size of the buckets will differ by up to one, with larger buckets coming first.

```
x <- c(5, 1, 3, 2, 2, NA)
ntile(x, 2)
ntile(x, 4)
```

### order_by()

A helper function for ordering window function output.

```
order_by(10:1, cumsum(1:10))
```

### percent_rank() cume_dist()

Proportional ranking functions.

```
x <- c(5, 1, 3, 2, 2)
cume_dist(x)
percent_rank(x)
```

### recode() recode_factor()

Recode values.

```
char_vec <- sample(c("a", "b", "c"), 10, replace = TRUE)

# `recode()` is superseded by `case_match()`
recode(char_vec, a = "Apple", b = "Banana")

num_vec <- c(1:4, NA)
recode_factor(
  num_vec,
  `1` = "z", `2` = "y", `3` = "x",
  .default = "D",
  .missing = "M"
)
```

### row_number() min_rank() dense_rank()

Integer ranking functions.

```
x <- c(5, 1, 3, 2, 2, NA)
row_number(x)
min_rank(x)
dense_rank(x)
```

`row_number()` gives every input a unique rank, so that `c(10, 20, 20, 30)` would get ranks `c(1, 2, 3, 4)`. It's equivalent to `rank(ties.method = "first")`.

`min_rank()` gives every tie the same (smallest) value, so that `c(10, 20, 20, 30)` gets ranks `c(1, 2, 2, 4)`. It's the way ranks are usually computed in sports and is equivalent to `rank(ties.method = "min")`.

`dense_rank()` works like `min_rank()`, but doesn't leave any gaps, so that `c(10, 20, 20, 30)` gets ranks `c(1, 2, 2, 3)`.
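To see those three tie-handling rules side by side, a quick sketch on the tied vector from the text:

```
x <- c(10, 20, 20, 30)
row_number(x)  # 1 2 3 4 -- ties broken by position
min_rank(x)    # 1 2 2 4 -- "sports" ranking, leaves a gap after ties
dense_rank(x)  # 1 2 2 3 -- no gaps
```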
# tidymodels

## 1. explore the data

```
library(tidymodels)  # loads parsnip, recipes, etc., plus dplyr and ggplot2
library(readr)

urchins <-
  # Data were assembled for a tutorial
  # at https://www.flutterbys.com.au/stats/tut/tut7.5a.html
  read_csv("https://tidymodels.org/start/models/urchins.csv") %>%
  # Change the names to be a little more verbose
  setNames(c("food_regime", "initial_volume", "width")) %>%
  # Factors are very helpful for modeling, so we convert one column
  mutate(food_regime = factor(food_regime, levels = c("Initial", "Low", "High")))
```

## 2. build a model

### set up the engine, set up the model, use tidy()

```
# linear_reg() is the model specification; the default engine is "lm",
# but others can be swapped in, e.g. linear_reg() %>% set_engine("keras")
linear_reg()
#> Linear Regression Model Specification (regression)

lm_mod <- linear_reg()

lm_fit <-
  lm_mod %>%
  fit(width ~ initial_volume * food_regime, data = urchins)
lm_fit

tidy(lm_fit)
```

```
library(cowplot)
library(dotwhisker)

tidy(lm_fit) %>%
  dwplot(dot_args = list(size = 2, color = "black"),
         whisker_args = list(color = "black"),
         vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2))
```

## 3. use a model to predict

### predict y values

```
new_points <- expand.grid(initial_volume = 20,
                          food_regime = c("Initial", "Low", "High"))
new_points

mean_pred <- predict(lm_fit, new_data = new_points)
mean_pred
```

### predict confidence intervals

```
conf_int_pred <- predict(lm_fit,
                         new_data = new_points,
                         type = "conf_int")
conf_int_pred
```

```
plot_data <-
  new_points %>%
  bind_cols(mean_pred) %>%
  bind_cols(conf_int_pred)

ggplot(plot_data, aes(x = food_regime)) +
  geom_point(aes(y = .pred)) +
  geom_errorbar(aes(ymin = .pred_lower,
                    ymax = .pred_upper),
                width = .2) +
  labs(y = "urchin size")
```

## 4. preprocess your data with recipes

### 4.1 data checking

```
library(nycflights13)  # for the flights and weather data

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = lubridate::as_date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance,
         carrier, date, arr_delay, time_hour) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings)
  mutate_if(is.character, as.factor)
```

```
flight_data %>%
  count(arr_delay) %>%
  mutate(prop = n / sum(n))

flight_data %>%
  skimr::skim(dest, carrier)
```

### 4.2 data splitting

```
# Fix the random numbers by setting the seed
# This enables the analysis to be reproducible when random numbers are used
set.seed(222)

# Put 3/4 of the data into the training set
data_split <- initial_split(flight_data, prop = 3/4)

# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)
```

### 4.3 create the recipe

```
flights_rec <-
  recipe(arr_delay ~ ., data = train_data)
```

```
flights_rec <-
  recipe(arr_delay ~ ., data = train_data) %>%
  update_role(flight, time_hour, new_role = "ID")
```

`summary(flights_rec)` lists each variable's role: predictor, ID, or outcome.

```
flights_rec <-
  recipe(arr_delay ~ ., data = train_data) %>%
  update_role(flight, time_hour, new_role = "ID") %>%
  step_date(date, features = c("dow", "month")) %>%
  step_holiday(date,
               holidays = timeDate::listHolidays("US"),
               keep_original_cols = FALSE) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())
```
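To sanity-check what those steps produce, a minimal sketch (not part of the original tutorial): `prep()` estimates the recipe on the training data, and `bake(new_data = NULL)` returns the processed training set, so you can inspect the dummy, date, and holiday columns before modeling.

```
flights_rec %>%
  prep(training = train_data) %>%
  bake(new_data = NULL) %>%  # the processed training data
  glimpse()
```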
### 4.4 create the model

```
lr_mod <-
  logistic_reg() %>%
  set_engine("glm")
```

### 4.5 create the workflow

```
flights_wflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(flights_rec)
```

### 4.6 use the workflow to train the model

Create the trained workflow:

```
flights_fit <-
  flights_wflow %>%
  fit(data = train_data)
```

### 4.7 extract the results

```
flights_fit %>%
  extract_fit_parsnip() %>%
  tidy()
```

### 4.8 use the trained workflow to predict results

`predict(flights_fit, test_data)` returns the predicted classes; `type = "prob"` returns class probabilities instead, and `augment()` combines the predictions with the original data.

```
predict(flights_fit, test_data, type = "prob")

flights_aug <- augment(flights_fit, test_data)
```

```
flights_aug %>%
  select(arr_delay, time_hour, flight, .pred_class, .pred_on_time)

flights_aug %>%
  roc_auc(truth = arr_delay, .pred_late)
```

## 5. evaluate your model with resamples

```
data(cells, package = "modeldata")

cells %>%
  count(class) %>%
  mutate(prop = n / sum(n))
```

### 5.1 split the data, stratified on a categorical variable

```
set.seed(123)
cell_split <- initial_split(cells %>% select(-case), strata = class)

cell_train <- training(cell_split)
cell_test  <- testing(cell_split)

nrow(cell_train)
nrow(cell_train) / nrow(cells)

cell_train %>%
  count(class) %>%
  mutate(prop = n / sum(n))

cell_test %>%
  count(class) %>%
  mutate(prop = n / sum(n))
```

### 5.2 create the model

```
rf_mod <-
  rand_forest(trees = 1000) %>%
  set_engine("ranger") %>%
  set_mode("classification")
```

```
set.seed(234)
rf_fit <-
  rf_mod %>%
  fit(class ~ ., data = cell_train)
rf_fit
```

### 5.3 estimating performance

Training set:

```
rf_training_pred <-
  predict(rf_fit, cell_train) %>%
  bind_cols(predict(rf_fit, cell_train, type = "prob")) %>%
  bind_cols(cell_train %>% select(class))

rf_training_pred %>%  # training set predictions
  roc_auc(truth = class, .pred_PS)

rf_training_pred %>%  # training set predictions
  accuracy(truth = class, .pred_class)
```

Test set:

```
rf_testing_pred <-
  predict(rf_fit, cell_test) %>%
  bind_cols(predict(rf_fit, cell_test, type = "prob")) %>%
  bind_cols(cell_test %>% select(class))

rf_testing_pred %>%  # test set predictions
  roc_auc(truth = class, .pred_PS)

rf_testing_pred %>%  # test set predictions
  accuracy(truth = class, .pred_class)
```

### 5.4 fit a model with resampling

```
set.seed(345)
folds <- vfold_cv(cell_train, v = 10)

rf_wf <-
  workflow() %>%  # create a workflow
  add_model(rf_mod) %>%
  add_formula(class ~ .)

set.seed(456)
rf_fit_rs <-
  rf_wf %>%
  fit_resamples(folds)

rf_fit_rs
collect_metrics(rf_fit_rs)

# compare the resampling estimates with the test set predictions from above
rf_testing_pred %>%
  roc_auc(truth = class, .pred_PS)
rf_testing_pred %>%
  accuracy(truth = class, .pred_class)
```

## 6. tune model parameters

```
library(tidymodels)  # for the tune package, along with the rest of tidymodels
library(rpart.plot)  # for visualizing a decision tree
library(vip)         # for variable importance plots
```

### prepare for hyperparameter tuning

Set up what to tune and the resampled object.

```
set.seed(123)
cell_split <- initial_split(cells %>% select(-case), strata = class)
cell_train <- training(cell_split)
cell_test  <- testing(cell_split)
```

```
tune_spec <-
  decision_tree(
    cost_complexity = tune(),
    tree_depth = tune()
  ) %>%
  set_engine("rpart") %>%
  set_mode("classification")

tune_spec
```

```
tree_grid <- grid_regular(cost_complexity(),
                          tree_depth(),
                          levels = 5)
tree_grid
tree_grid %>% count(tree_depth)
```

```
set.seed(234)
cell_folds <- vfold_cv(cell_train)
```
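As a quick sanity check (a sketch, not from the original tutorial): each row of the resulting rset is one resample, and rsample's `analysis()`/`assessment()` pull out the two halves of a given fold.

```
cell_folds                                     # the 10-fold CV object
analysis(cell_folds$splits[[1]]) %>% nrow()    # rows used for fitting
assessment(cell_folds$splits[[1]]) %>% nrow()  # rows held out for evaluation
```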
### model tuning with a grid

```
set.seed(345)

tree_wf <- workflow() %>%
  add_model(tune_spec) %>%
  add_formula(class ~ .)

tree_res <-
  tree_wf %>%
  tune_grid(
    resamples = cell_folds,
    grid = tree_grid
  )
```

```
tree_res
tree_res %>% collect_metrics()
```

### compare the results and select the best model

```
tree_res %>%
  collect_metrics() %>%
  mutate(tree_depth = factor(tree_depth)) %>%
  ggplot(aes(cost_complexity, mean, color = tree_depth)) +
  geom_line(size = 1.5, alpha = 0.6) +
  geom_point(size = 2) +
  facet_wrap(~ .metric, scales = "free", nrow = 2) +
  scale_x_log10(labels = scales::label_number()) +
  scale_color_viridis_d(option = "plasma", begin = .9, end = 0)
```

```
tree_res %>%
  show_best(metric = "accuracy")
```

```
best_tree <- tree_res %>%
  select_best(metric = "accuracy")
best_tree
```

```
final_wf <-
  tree_wf %>%
  finalize_workflow(best_tree)
final_wf
```

### use the best model on the test data

```
final_fit <-
  final_wf %>%
  last_fit(cell_split)

final_fit %>%
  collect_metrics()
```

```
final_fit %>%
  collect_predictions() %>%
  roc_curve(class, .pred_PS) %>%
  autoplot()
```

```
final_tree <- extract_workflow(final_fit)
final_tree
```

```
final_tree %>%
  extract_fit_engine() %>%
  rpart.plot(roundint = FALSE)
```

```
final_tree %>%
  extract_fit_parsnip() %>%
  vip()
```

```
args(decision_tree)
```

## 7. a predictive modeling case study

```
library(tidymodels)
library(readr)  # for importing data
library(vip)    # for variable importance plots
```

### check and manipulate the data

```
hotels <-
  read_csv("https://tidymodels.org/start/case-study/hotels.csv") %>%
  mutate(across(where(is.character), as.factor))

dim(hotels)
glimpse(hotels)
```

```
hotels %>%
  count(children) %>%
  mutate(prop = n / sum(n))
```

### split the data into training and testing sets

```
set.seed(123)
splits <- initial_split(hotels, strata = children)

hotel_other <- training(splits)
hotel_test  <- testing(splits)

hotel_other %>%
  count(children) %>%
  mutate(prop = n / sum(n))

hotel_test %>%
  count(children) %>%
  mutate(prop = n / sum(n))
```

### split the training data for validation

```
set.seed(234)
val_set <- validation_split(hotel_other,
                            strata = children,
                            prop = 0.80)
```

### build the model

```
lr_mod <-
  logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")
```

### create the recipe

```
holidays <- c("AllSouls", "AshWednesday", "ChristmasEve", "Easter",
              "ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")

lr_recipe <-
  recipe(children ~ ., data = hotel_other) %>%
  step_date(arrival_date) %>%
  step_holiday(arrival_date, holidays = holidays) %>%
  step_rm(arrival_date) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors())
```

### create the workflow

```
lr_workflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(lr_recipe)
```

### create the grid for tuning

```
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))

lr_reg_grid %>% top_n(-5)  # the lowest penalty values
lr_reg_grid %>% top_n(5)   # the highest penalty values
```
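An equivalent way to build the same grid with dials (a sketch, not in the original case study): `penalty()` works on the log10 scale, so `grid_regular()` spaces its levels evenly in log units, just like `10^seq(-4, -1, ...)` above.

```
# 30 evenly log-spaced penalty values between 10^-4 and 10^-1
grid_regular(penalty(range = c(-4, -1)), levels = 30)
```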
### train and tune the model

Let's use `tune::tune_grid()` to train these 30 penalized logistic regression models. We'll also save the validation set predictions (via the call to `control_grid()`) so that diagnostic information is available after the model fit. The area under the ROC curve will be used to quantify how well the model performs across a continuum of event thresholds (recall that the event rate, the proportion of stays including children, is very low for these data).

```
lr_res <-
  lr_workflow %>%
  tune_grid(val_set,
            grid = lr_reg_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(roc_auc))
```

```
lr_plot <-
  lr_res %>%
  collect_metrics() %>%
  ggplot(aes(x = penalty, y = mean)) +
  geom_point() +
  geom_line() +
  ylab("Area under the ROC Curve") +
  scale_x_log10(labels = scales::label_number())
lr_plot
```

```
top_models <-
  lr_res %>%
  show_best(metric = "roc_auc", n = 15) %>%
  arrange(penalty)
top_models
```

```
lr_best <-
  lr_res %>%
  collect_metrics() %>%
  arrange(penalty) %>%
  slice(12)
lr_best
```

```
lr_auc <-
  lr_res %>%
  collect_predictions(parameters = lr_best) %>%
  roc_curve(children, .pred_children) %>%
  mutate(model = "Logistic Regression")

autoplot(lr_auc)
```

## 8. a tree-based ensemble model

### build the model and improve training time

```
cores <- parallel::detectCores()
cores
```

```
rf_mod <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_engine("ranger", num.threads = cores) %>%
  set_mode("classification")
```

### create the recipe and workflow

```
rf_recipe <-
  recipe(children ~ ., data = hotel_other) %>%
  step_date(arrival_date) %>%
  step_holiday(arrival_date) %>%
  step_rm(arrival_date)
```

```
rf_workflow <-
  workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(rf_recipe)
```

### train and tune the model

```
rf_mod
extract_parameter_set_dials(rf_mod)
```

```
set.seed(345)
rf_res <-
  rf_workflow %>%
  tune_grid(val_set,
            grid = 25,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(roc_auc))
```

```
rf_res %>%
  show_best(metric = "roc_auc")
```

```
autoplot(rf_res)

rf_best <-
  rf_res %>%
  select_best(metric = "roc_auc")
rf_best
```

```
rf_res %>%
  collect_predictions()
```

```
rf_auc <-
  rf_res %>%
  collect_predictions(parameters = rf_best) %>%
  roc_curve(children, .pred_children) %>%
  mutate(model = "Random Forest")
```

### compare the logistic regression and random forest models

```
bind_rows(rf_auc, lr_auc) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
  geom_path(lwd = 1.5, alpha = 0.8) +
  geom_abline(lty = 3) +
  coord_equal() +
  scale_color_viridis_d(option = "plasma", end = .6)
```

### the last fit

```
# the last model
last_rf_mod <-
  rand_forest(mtry = 8, min_n = 7, trees = 1000) %>%
  set_engine("ranger", num.threads = cores, importance = "impurity") %>%
  set_mode("classification")

# the last workflow
last_rf_workflow <-
  rf_workflow %>%
  update_model(last_rf_mod)

# the last fit
set.seed(345)
last_rf_fit <-
  last_rf_workflow %>%
  last_fit(splits)

last_rf_fit
```

```
last_rf_fit %>%
  collect_metrics()
```

```
last_rf_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 20)
```

```
last_rf_fit %>%
  collect_predictions() %>%
  roc_curve(children, .pred_children) %>%
  autoplot()
```
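To reuse the final model outside this analysis, a closing sketch (assumes the objects above): `extract_workflow()` pulls the fitted workflow out of the `last_fit()` result so it can be saved and used to score new data.

```
final_model <- extract_workflow(last_rf_fit)
final_model

# e.g. score new bookings (`new_hotels` is a hypothetical data frame
# with the same columns as the training data):
# predict(final_model, new_data = new_hotels, type = "prob")
```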