INBO CODING CLUB

# INBO CODING CLUB 27 June 2024 Welcome! ## Share your code snippet If you want to share your code snippet, copy paste your snippet within a section of three backticks (```): As an **example**: ``` library(tidyverse) ``` (*you can copy paste this example and add your code further down*) ## Yellow sticky notes No yellow sticky notes online. Put your name + " | " and add a "*" each time you solve a challenge (see below). ## Participants Name | Challenges --- | --- Damiano Oldoni | *** Pieter Huybrechts | *** Nele Mullens | ** Ward Standaert | ** Rhea Maesele | * Dirk Maes | Dirk Festraets | Jo-Hannes Nowé |** Heleen Deroo | Sanne Govaert | ** Siebe Indestege | ** Anja Leyman | * Soria Delva | * Tina Tuerlings | *** Katrijn Alaerts | ** Lotte Pohl | * Lucia Manzanares | * ## Challenge 1 ### Damiano's solution (example) Copy paste this section to show your solutions. ```r # dummy code a <- 1 print("This is how to insert text which looks like code.") ``` ### Heleen's solution ```` map(swiss, mean) map(iris, class) list_with_dfs <- list(swiss = swiss, iris = iris) map(list_with_dfs, nrow) map_dbl(swiss, mean) map_chr(iris, class) map_dbl(list_with_dfs, nrow) map_df(swiss, mean) map_df(iris, class) map_df(list_with_dfs, nrow) ```` ### Nele's solution ``` #1.1 Compute the mean of every column in the swiss data.frame. #swiss is a basic package in R base, just like iris map(swiss, mean) #1.2 Determine the type of each column in the iris data.frame. map(iris, class) #1.3 After creating the variable list_with_dfs as a named list with swiss and # iris (code provided), compute the number of rows of each data.frame. list_with_dfs <- list(iris = iris, swiss = swiss) #list with dataframes map(list_with_dfs, nrow) #1.4 map() ALWAYS returns a list, ALWAYS. Apply the right map_*() variants to # the previous exercises to return a numeric vector (exercises 1 and 3), a # character vector (exercise 2) or a data.frame (exercises 1, 2 and 3). 
# Hint: use cheat sheet or the sections map-atomic and the-map-functions. map_vec(swiss, mean) map_vec(list_with_dfs, nrow) map_chr(iris, class) map_df(swiss, mean) map_df(iris, class) map_df(list_with_dfs, nrow) ``` ### Jo-Hannes' solution ``` ## CHALLENGE 1 #Write map() statements to: #1. map(swiss, mean) #2. map(iris,typeof) #3. list_with_dfs <- list(swiss = swiss, iris = iris) map(list_with_dfs,nrow) #4. map_dbl(swiss,mean) map_dbl(list_with_dfs,nrow) map_chr(iris,typeof) map_df(swiss,mean) map_df(iris,typeof) map_df(list_with_dfs,nrow) ``` ### Siebe's Solution ``` # 1.1 map(swiss, mean) # 1.2 map(iris, class) # 1.3 list_with_dfs <- list(swiss, iris) names(list_with_dfs) <- c("swiss","iris") map(list_with_dfs, nrow) # 1.4 map_dbl(swiss, mean) map_dbl(list_with_dfs, nrow) map_chr(iris,class) map_df(swiss, mean) map_df(iris, class) map_df(list_with_dfs, nrow) ``` ## Challenge 2 ### Sanne's solution ```r # 2.1 map(sensors, read_sensor_data) # 2.2 sensor_dfs <- map( sensors, read_sensor_data, path_pattern = path_pattern_datafiles, extension = file_extension ) # 2.3 map2(means, st_dev, rnorm, n = 10) %>% str() # 2.4 datetimes <- function(sensor_df) { summarize( sensor_df, min_datetime = min(datetime), max_datetime = max(datetime) ) } map_df(sensor_dfs, datetimes) ``` ### Ward's solution ```r # 2.1 # Define the list with the sensor IDs sensors <- list("A1", "G7", "H4") names(sensors) <- sensors #' Function to read a comma separated sensor data file. #' #' @param sensor character. A sensor ID. #' @return A tibble data.frame as returned by `readr::read_csv()`. #' @examples #' # example code #' library(readr) #' read_sensor_data(sensor = "H4") read_sensor_data <- function( sensor, path_pattern, extension) { path <- paste0("./data/20240627/20240627_sensor_", sensor, ".txt") readr::read_csv(path, na = "NA", col_types = cols( datetime = col_datetime(format = "%d/%m/%Y %H:%M:%S") ) ) } # Apply function to one sensor ID as check. 
sensor_df <- read_sensor_data(sensor = "A1") sensor_list <- map(sensors, \(x) read_sensor_data(x)) # 2.2 #' Function to read a comma separated sensor data file. #' #' @param sensor character. A sensor ID. #' @param path_pattern character. The path common to all sensor files. #' @param extension character. The file extension starting with `.`. #' @return A tibble data.frame as returned by `readr::read_csv()`. #' @examples #' # example code #' library(readr) #' path_pattern_datafiles <- "./data/20240627/20240627_sensor_" #' file_extension <- ".txt" #' read_sensor_data(sensor = "H4", path_pattern_datafiles, file_extension) read_sensor_data <- function( sensor, path_pattern, extension) { path <- paste0(path_pattern, sensor, extension) readr::read_csv(path, na = "NA", col_types = cols( datetime = col_datetime(format = "%d/%m/%Y %H:%M:%S") ) ) } # If data files are where they should be, here the path_pattern and # extension to use. Just copied from the @example above the function. path_pattern_datafiles <- "./data/20240627/20240627_sensor_" file_extension <- ".txt" # Apply function to one sensor ID as check. sensor_df <- read_sensor_data(sensor = "A1", path_pattern = path_pattern_datafiles, extension = file_extension ) map(sensors, read_sensor_data, path_pattern = path_pattern_datafiles, extension = file_extension) # 2.3 # Define means and standard deviations means <- c(-10, 0, 10, 20) st_dev <- c(1, 3, 2, 1.5) map2(means, st_dev, rnorm, n = 10) # 2.4 max_min_fun <- \(x) summarize(x, min_datetime = min(datetime), max_datetime = max(datetime)) map_df(sensor_list, max_min_fun) ``` ### Jo-Hannes' solution ```r # 2.1 # Define the list with the sensor IDs sensors <- list("A1", "G7", "H4") names(sensors) <- sensors #' Function to read a comma separated sensor data file. #' #' @param sensor character. A sensor ID. #' @return A tibble data.frame as returned by `readr::read_csv()`. 
#' @examples
#' # example code
#' library(readr)
#' read_sensor_data(sensor = "H4")
read_sensor_data <- function(
    sensor,
    path_pattern,
    extension) {
  # `path_pattern` and `extension` are accepted but not used in this first
  # version: the directory, file prefix and extension are hard-coded below.
  path <- paste0("./data/20240627/20240627_sensor_", sensor, ".txt")
  readr::read_csv(
    path,
    na = "NA",
    # Namespace-qualified like read_csv(), so the function also works when
    # readr is not attached.
    col_types = readr::cols(
      datetime = readr::col_datetime(format = "%d/%m/%Y %H:%M:%S")
    )
  )
}

# Apply function to one sensor ID as check.
sensor_df <- read_sensor_data(sensor = "A1")

# Read the file of every sensor; the result is a list named by sensor ID.
sensor_data <- map(sensors, read_sensor_data)

# 2.2 Now, you want to generalize your script

#' Function to read a comma separated sensor data file.
#'
#' The file path is built by pasting together `path_pattern`, `sensor` and
#' `extension`, in that order. The `datetime` column is parsed with the
#' format `"%d/%m/%Y %H:%M:%S"`.
#'
#' @param sensor character. A sensor ID.
#' @param path_pattern character. The path common to all sensor files.
#' @param extension character. The file extension starting with `.`.
#' @return A tibble data.frame as returned by `readr::read_csv()`.
#' @examples
#' # example code
#' library(readr)
#' path_pattern_datafiles <- "./data/20240627/20240627_sensor_"
#' file_extension <- ".txt"
#' read_sensor_data(sensor = "H4", path_pattern_datafiles, file_extension)
read_sensor_data <- function(
    sensor,
    path_pattern,
    extension) {
  path <- paste0(path_pattern, sensor, extension)
  readr::read_csv(
    path,
    na = "NA",
    # Namespace-qualified like read_csv(), so the function also works when
    # readr is not attached.
    col_types = readr::cols(
      datetime = readr::col_datetime(format = "%d/%m/%Y %H:%M:%S")
    )
  )
}

# If data files are where they should be, here the path_pattern and
# extension to use. Just copied from the @example above the function.
path_pattern_datafiles <- "./data/20240627/20240627_sensor_"
file_extension <- ".txt"

# Apply function to one sensor ID as check.
sensor_df <- read_sensor_data(sensor = "A1", path_pattern = path_pattern_datafiles, extension = file_extension ) sensor_data <- map(sensors, \(x) read_sensor_data(x,path_pattern= path_pattern_datafiles, extension=file_extension)) # 2.3 # Define means and standard deviations means <- c(-10, 0, 10, 20) st_dev <- c(1, 3, 2, 1.5) #for one example rnorm(n=10, mean = -10, sd=1) #map to all elements map2(means,st_dev, \(x,y) rnorm(n=10,mean=x,sd=y)) # 2.4 #sensor_data is a list of the sensor data.frames #Try it for one example summarize(sensor_data[[1]], min_datetime = min(datetime), max_datetime = max(datetime)) #map it to all elements of the list (so each dataframe) map(sensor_data, \(x) summarize(x, min_datetime = min(datetime), max_datetime = max(datetime))) # 2.3 # Define means and standard deviations means <- c(-10, 0, 10, 20) st_dev <- c(1, 3, 2, 1.5) #for one example rnorm(n=10, mean = -10, sd=1) #map to all elements map2(means,st_dev, \(x,y) rnorm(n=10,mean=x,sd=y)) # 2.4 #sensor_data is a list of the sensor data.frames #Try it for one example summarize(sensor_data[[1]], min_datetime = min(datetime), max_datetime = max(datetime)) #map it to all elements of the list (so each dataframe) map(sensor_data, \(x) summarize(x, min_datetime = min(datetime), max_datetime = max(datetime))) ``` ### Nele's solution ```r # 2.1 # Define the list with the sensor IDs sensors <- list("A1", "G7", "H4") names(sensors) <- sensors #' Function to read a comma separated sensor data file. #' #' @param sensor character. A sensor ID. #' @return A tibble data.frame as returned by `readr::read_csv()`. #' @examples #' # example code #' library(readr) #' read_sensor_data(sensor = "H4") read_sensor_data <- function( sensor, path_pattern, extension) { path <- paste0("./data/20240627/20240627_sensor_", sensor, ".txt") readr::read_csv(path, na = "NA", col_types = cols( datetime = col_datetime(format = "%d/%m/%Y %H:%M:%S") ) ) } # Apply function to one sensor ID as check. 
sensor_df <- read_sensor_data(sensor = "A1")

# Read the file of every sensor; the result is a list named by sensor ID.
all_sensors <- map(sensors, read_sensor_data)

# 2.2

#' Function to read a comma separated sensor data file.
#'
#' The file path is built by pasting together `path_pattern`, `sensor` and
#' `extension`, in that order. The `datetime` column is parsed with the
#' format `"%d/%m/%Y %H:%M:%S"`.
#'
#' @param sensor character. A sensor ID.
#' @param path_pattern character. The path common to all sensor files.
#' @param extension character. The file extension starting with `.`.
#' @return A tibble data.frame as returned by `readr::read_csv()`.
#' @examples
#' # example code
#' library(readr)
#' path_pattern_datafiles <- "./data/20240627/20240627_sensor_"
#' file_extension <- ".txt"
#' read_sensor_data(sensor = "H4", path_pattern_datafiles, file_extension)
read_sensor_data <- function(
    sensor,
    path_pattern,
    extension) {
  path <- paste0(path_pattern, sensor, extension)
  readr::read_csv(
    path,
    na = "NA",
    # Namespace-qualified like read_csv(), so the function also works when
    # readr is not attached.
    col_types = readr::cols(
      datetime = readr::col_datetime(format = "%d/%m/%Y %H:%M:%S")
    )
  )
}

# If data files are where they should be, here the path_pattern and
# extension to use. Just copied from the @example above the function.
path_pattern_datafiles <- "./data/20240627/20240627_sensor_"
file_extension <- ".txt"

# Apply function to one sensor ID as check.
sensor_df <- read_sensor_data(sensor = "A1", path_pattern = path_pattern_datafiles, extension = file_extension ) map(sensors, read_sensor_data, path_pattern_datafiles, file_extension) # er zijn drie argumenten in de functie, de eerste vult ze vanzelf in, de tweede # en derde niet, deze moeten na de functie (hier read_sensor_data) zelf nog in- # gevuld worden # 2.3 Calculate 10 random numbers from a normal distribution (rnorm(n = 10)) for # each pair of the 4 provided means and standard deviations and return it as a list of 4 # Define means and standard deviations means <- c(-10, 0, 10, 20) st_dev <- c(1, 3, 2, 1.5) map2(means, st_dev, function(x, y) rnorm(n = 10, mean = x, sd = y)) # 2.4 # get sensor data ready sensor_data <- map(sensors, read_sensor_data, path_pattern_datafiles, file_extension) # we need to get a function to do what we want function(x) summarize(x, min_datetime = min(datetime), max_datetime = max(datetime)) # apply the function, output a dataframe min_max_time <- map_df(sensor_data, function(x) summarize(x, min_datetime = min(datetime), max_datetime = max(datetime))) # but I want to see what dates are from what sensor. Add names: min_max_time <- map2_df(sensor_data, sensors, function(x, sensors) summarize(x, sensor = sensors, min_datetime = min(datetime), max_datetime = max(datetime))) ``` ### Pieter's solution ```r # 2.1 # Define the list with the sensor IDs sensors <- list("A1", "G7", "H4") names(sensors) <- sensors #' Function to read a comma separated sensor data file. #' #' @param sensor character. A sensor ID. #' @return A tibble data.frame as returned by `readr::read_csv()`. #' @examples #' # example code #' library(readr) #' read_sensor_data(sensor = "H4") read_sensor_data <- function( sensor, path_pattern, extension) { path <- paste0("./data/20240627/20240627_sensor_", sensor, ".txt") readr::read_csv(path, na = "NA", col_types = cols( datetime = col_datetime(format = "%d/%m/%Y %H:%M:%S") ) ) } # Apply function to one sensor ID as check. 
sensor_df <- read_sensor_data(sensor = "A1")

### 2.1
# Read the file of every sensor; the result is a list named by sensor ID.
map(sensors, read_sensor_data)

# 2.2

#' Function to read a comma separated sensor data file.
#'
#' The file path is built by pasting together `path_pattern`, `sensor` and
#' `extension`, in that order. The `datetime` column is parsed with the
#' format `"%d/%m/%Y %H:%M:%S"`.
#'
#' @param sensor character. A sensor ID.
#' @param path_pattern character. The path common to all sensor files.
#' @param extension character. The file extension starting with `.`.
#' @return A tibble data.frame as returned by `readr::read_csv()`.
#' @examples
#' # example code
#' library(readr)
#' path_pattern_datafiles <- "./data/20240627/20240627_sensor_"
#' file_extension <- ".txt"
#' read_sensor_data(sensor = "H4", path_pattern_datafiles, file_extension)
read_sensor_data <- function(
    sensor,
    path_pattern,
    extension) {
  path <- paste0(path_pattern, sensor, extension)
  readr::read_csv(
    path,
    na = "NA",
    # Namespace-qualified like read_csv(), so the function also works when
    # readr is not attached.
    col_types = readr::cols(
      datetime = readr::col_datetime(format = "%d/%m/%Y %H:%M:%S")
    )
  )
}

# If data files are where they should be, here the path_pattern and
# extension to use. Just copied from the @example above the function.
# The date in the path matches the 20240627 data folder used by every other
# path in these notes (it previously pointed to a 20200630 folder).
path_pattern_datafiles <- "./data/20240627/20240627_sensor_"
file_extension <- ".txt"

# Apply function to one sensor ID as check.
sensor_df <- read_sensor_data(sensor = "A1", path_pattern = path_pattern_datafiles, extension = file_extension ) all_sensor_dfs <- map( sensors, ~ read_sensor_data(.x, path_pattern = "./data/20240627/20240627_sensor_", extension = ".txt" ) ) map( sensors, read_sensor_data, path_pattern = "./data/20240627/20240627_sensor_", extension = ".txt" ) # 2.3 # Define means and standard deviations means <- c(-10, 0, 10, 20) st_dev <- c(1, 3, 2, 1.5) map2(means, st_dev, \(mean_to_calc, sd_to_calc) rnorm(n = 10, mean = mean_to_calc, sd = sd_to_calc) ) # 2.4 get_min_max_dtime <- function(df) { summarize(df, min_datetime = min(datetime), max_datetime = max(datetime)) |> ungroup() } all_sensor_dfs |> map(get_min_max_dtime) ``` ## Challenge 3 ### Sanne's solution ```r # 3.1 lepidoptera_in_prot_areas <- readRDS("./data/20240627/20240627_lepidoptera_in_prot_areas.RData") map(lepidoptera_in_prot_areas, length) %>% str() # 3.2 obs <- readr::read_tsv("./data/20240627/20240627_lepidoptera_2024.tsv", na = "") slice(obs, unlist(lepidoptera_in_prot_areas)) 3.3 # Species data.frames species_1 <- tibble( species = c("Balanus tintinnabulum", "Callinectes sapidus", "Mnemiopsis leidyi"), is_marine = c(TRUE, TRUE, TRUE) ) species_2 <- tibble( species = c("Leuciscus aspius", "Rhithropanopeus harrisii", "Mnemiopsis leidyi"), genus = c("Leuciscus", "Rhithropanopeus", "Mnemiopsis leidyi") ) species_3 <- tibble( species = c("Rhithropanopeus harrisii","Potamopyrgus antipodarum"), informal_group = c("mollusca", "mollusca") ) reduce(list(species_1, species_2, species_3), full_join, by = "species") ``` ### Tina's solution ```r #### CHALLENGE 3A#### # Load lepidoptera_in_prot_areas lepidoptera_in_prot_areas <- readRDS("./data/20240627/20240627_lepidoptera_in_prot_areas.RData") lepidoptera_in_prot_areas #list with the (row numbers of the) lepidoptera observations taken in protected areas. # It's a kind of "special" list, but still a list! So, you can apply all purrr # functions to it. 
class(lepidoptera_in_prot_areas)

# 3.1 How many observations for each protected area?
# The length of each list element is the number of observations in that area.
map_int(lepidoptera_in_prot_areas, length)

# 3.2 Given obs, the data.frame with the GBIF observations
# (see the provided code), how to get a data.frame with only the observations
# taken in protected areas?
obs <- readr::read_tsv(
  "./data/20240627/20240627_lepidoptera_2024.tsv",
  na = ""
)
str(obs)
# Slice obs once per protected area and row-bind the pieces into one
# data.frame.
protected <- map_df(lepidoptera_in_prot_areas, \(rows) slice(obs, rows))
# Small check of how slice() selects rows by position.
slice(iris, c(1, 3, 6))

# 3.3 purrr is more than the map family. To join the data.frames species_1,
# species_2 and species_3 (code provided) you could apply full_join() twice:

# Species data.frames
species_1 <- tibble(
  species = c(
    "Balanus tintinnabulum",
    "Callinectes sapidus",
    "Mnemiopsis leidyi"
  ),
  is_marine = c(TRUE, TRUE, TRUE)
)
species_2 <- tibble(
  species = c(
    "Leuciscus aspius",
    "Rhithropanopeus harrisii",
    "Mnemiopsis leidyi"
  ),
  genus = c("Leuciscus", "Rhithropanopeus", "Mnemiopsis leidyi")
)
species_3 <- tibble(
  species = c("Rhithropanopeus harrisii", "Potamopyrgus antipodarum"),
  informal_group = c("mollusca", "mollusca")
)

full_join(species_1, species_2, by = "species") %>%
  full_join(species_3, by = "species")

# Same join for any number of data.frames. `by` is passed on to full_join()
# so the key is explicit, as in the two-step version above.
reduce(list(species_1, species_2, species_3), full_join, by = "species")

#### Challenge 3B####

# Height of 6 giraffes
height_giraffe <- runif(n = 6, min = 4.3, max = 5.7)
# Weight of 6 giraffes
weight_giraffe <- rnorm(n = 6, mean = 1192, sd = 300)
# GPS tracker ID
gps_tracker_giraffe <- c("A31", "E4T", "RT4", "YU7", "3G1", "ON9")
# Data.frame with weight, height and GPS tracker IDs.
df <- tibble::tibble(
  "weight" = weight_giraffe,
  "height" = height_giraffe,
  "gpsID" = gps_tracker_giraffe
)

# 3B.1 FOR EACH column OF df and class numeric, APPLY FUNCTION floor().
# Modify the columns.
# floor() is vectorised, so it is applied to each whole column directly.
df <- df %>%
  mutate(across(where(is.numeric), floor))

# 3B.2 FOR EACH column OF df ending with ight (height, weight),
# APPLY FUNCTION floor(). Modify the columns. Same result of exercise 1 expected.
df <- df %>% mutate(across(ends_with("ight"), ~ floor(.))) # 3B.3 Instead of modifying the columns, how to add new ones? df <- df %>% mutate(across(ends_with("ight"), list(floored=~ floor(.)))) ``` ### Pieter's solution ```r # 3.1 map_vec(lepidoptera_in_prot_areas, length) # 3.2 obs <- readr::read_tsv("./data/20240627/20240627_lepidoptera_2024.tsv", na = "") str(obs) glimpse(obs) map(lepidoptera_in_prot_areas, ~ slice(obs, .x)) # 3.3 # Species data.frames species_1 <- tibble( species = c("Balanus tintinnabulum", "Callinectes sapidus", "Mnemiopsis leidyi"), is_marine = c(TRUE, TRUE, TRUE) ) species_2 <- tibble( species = c("Leuciscus aspius", "Rhithropanopeus harrisii", "Mnemiopsis leidyi"), genus = c("Leuciscus", "Rhithropanopeus", "Mnemiopsis leidyi") ) species_3 <- tibble( species = c("Rhithropanopeus harrisii","Potamopyrgus antipodarum"), informal_group = c("mollusca", "mollusca") ) # Not DRY solution (hardcoded) full_join(species_1, species_2, by = "species") %>% full_join(species_3, by = "species") reduce( list( species_1, species_2, species_3 ), ~ full_join(.x , .y, by = join_by(species)) ) ## Challenge 3B # Height of 5 giraffes height_giraffe <- runif(n = 6, min = 4.3, max = 5.7) # Weight of 5 giraffes weight_giraffe <- rnorm(n = 6, mean = 1192, sd = 300) # GPS tracker ID gps_tracker_giraffe <- c("A31", "E4T", "RT4", "YU7", "3G1", "ON9") # Data.frame with weight, height and GPS tracker IDs. 
df <- tibble::tibble( "weight" = weight_giraffe, "height" = height_giraffe, "gpsID" = gps_tracker_giraffe ) # 3B.1 mutate(df, across(where(is.numeric), floor)) # 3B.2 mutate(df, across(ends_with('ight'), floor)) # 3B.3 summarise(df, .by = where(is.numeric), across(everything())) ``` ## Bonus Challenge ### Pieter's solution ```r # BC.1 penguin_plot_tbl <- penguins %>% group_by(species) %>% nest() %>% mutate(plot = map(data, ~ ggplot(.x) + geom_point(aes( x = bill_length_mm, y = bill_depth_mm, colour = sex ) ) + ggtitle(species) ) ) # BC.2 paths <- paste0(penguin_plot_tbl$species, ".png") plots <- pull(penguin_plot_tbl, plot) pwalk(list(paths, plots), ggsave, path = "./data/20240627") ```