# INBO CODING CLUB 30 January 2025 Welcome! ## Share your code snippet If you want to share your code snippet, copy paste your snippet within a section of three backticks (```): As an **example**: ``` library(tidyverse) ``` (*you can copy paste this example and add your code further down*) ## Yellow sticky notes No yellow sticky notes online. Put your name + " | " and add a "*" each time you solve a challenge (see below). ## Participants Name | Challenges --- | --- Damiano Oldoni | *** Dirk Maes | * Emma Cartuyvels|*** Falk Mielke | ** Pieter Huybrechts | *** Berber Meulepas | * Jonathan De Schrijver |* Adriaan Seynaeve | Sanne Govaert | *** Arc'hantael Labriere | Cyrielle Delvenne ## Tipps ### break long pipes *(with greetings from Floris)* Data pipe chains with `%>%` tend to get long. Here is a useful package for debugging long chains of piped operations on the fly. https://github.com/MilesMcBain/breakerofchains > Break your chain at the cursor line. Run the first bit. See the output. Be free. ## Challenge 1 ### Damiano's solution (example) Copy paste this section to show your solutions. ```r # dummy code print("This is how to insert code.") ``` ### Emma's solution ```r butterflies_eu_distr_all |> distinct(threatStatus) |> pull() butterflies_eu_distr <- butterflies_eu_distr_all |> filter(!is.na(country)) butterflies_eu_distr |> group_by(threatStatus) |> summarise(number_of_species = n_distinct(taxonKey)) butterflies_eu_distr |> group_by(country, threatStatus) |> summarise(number_of_species = n_distinct(taxonKey)) butterflies_eu_distr |> group_by(country, threatStatus) |> summarise(number_of_species = n_distinct(taxonKey)) |> arrange(desc(number_of_species)) butterflies_eu_distr |> select(-source, -remarks) butterflies_eu_distr |> select(taxonKey, country, locality, locationId, threatStatus) butterflies_eu_distr |> select(-any_of(c("source", "remarks", "test"))) ``` ### Falk's attempt ```r butterflies_eu_distr_all %>% select(threatStatus) %>% unique butterflies_eu_distr_all %>% pull(threatStatus) %>% unique butterflies_eu_distr <- butterflies_eu_distr_all %>% filter(!is.na(country)) butterflies_eu_distr %>% count(country, threatStatus) %>% arrange(desc(n)) butterflies_eu_distr <- butterflies_eu_distr %>% relocate(country, .before = locality) %>% relocate(threatStatus, .after = locationId) %>% select(!any_of(c("source", "remarks"))) # I often prefer this output to `glimpse()`... knitr::kable(t(head(butterflies_eu_distr, 3))) ``` ### Sanne's solution ```r # Get the unique values of column threatStatus in butterflies_eu_distr. (Bonus: both as vector (basic R) or data.frame with 1 one column (dplyr)). butterflies_eu_distr_all %>% pull(threatStatus) %>% unique() unique(butterflies_eu_distr_all$threatStatus) # Remove the rows without country, i.e. the data at European level. Save it as butterflies_eu_distr. butterflies_eu_distr <- butterflies_eu_distr_all %>% filter(!is.na(country)) # Count the number of species for each threatStatus. butterflies_eu_distr %>% group_by(threatStatus) %>% summarise(species_nr = n()) # Count the number of species for each threatStatus and country. butterflies_eu_distr %>% group_by(threatStatus, country) %>% summarise(species_nr = n()) # Order the count of species for each threatStatus and country in descending order. butterflies_eu_distr %>% group_by(threatStatus, country) %>% summarise(species_nr = n()) %>% arrange(desc(species_nr), .by_group = TRUE) # Remove columns source and remarks from butterflies_eu_distr. butterflies_eu_distr %>% select(-source, -remarks) # Move column country between taxonKey and locality. Move threatStatus after locationId. # Optional. Sometimes we cannot know in advance that the columns we want to select/remove from a data.frame exist. For example, how to remove source and remarks without returning an error if they do not exist? butterflies_eu_distr %>% select(taxonKey, country, locality, locationId, threatStatus) ``` ### Janne's solution ```r #Get the unique values of column threatStatus in butterflies_eu_distr. #(Bonus: both as vector (basic R) or data.frame with 1 one column (dplyr)). threatstatus_names <- butterflies_eu_distr_all$threatStatus |> unique() threatstatus_names <- butterflies_eu_distr_all |> dplyr::select(threatStatus) |> dplyr::distinct(threatStatus) |> as.data.frame() # Remove the rows without country, i.e. the data at European level. # Save it as butterflies_eu_distr. butterflies_eu_distr <- butterflies_eu_distr_all |> dplyr::filter(!is.na(country)) butterflies_eu_distr <- butterflies_eu_distr_all |> tidyr::drop_na(country) # Count the number of taxon keys (column taxonKey) for each threatStatus. butterflies_eu_distr_sum <- butterflies_eu_distr |> dplyr::group_by(threatStatus) |> #dplyr::distinct(taxonKey) |> dplyr::summarise(n_species = n()) |> dplyr::ungroup() butterflies_eu_distr_sum <- butterflies_eu_distr |> dplyr::count(threatStatus) # Count the number of taxon keys (column taxonKey) for each threatStatus and country. butterflies_eu_distr_sum <- butterflies_eu_distr |> dplyr::count(threatStatus, country) # Order the count of taxon keys (column taxonKey) for each threatStatus and country in descending order. butterflies_eu_distr |> dplyr::count(threatStatus, country) |> dplyr::arrange(n |> desc()) # Remove columns source and remarks from butterflies_eu_distr. butterflies_eu_distr_upd <- butterflies_eu_distr |> dplyr::select(!c(source, remarks)) # Move column country between taxonKey and locality. Move threatStatus after locationId. butterflies_eu_distr_upd <- butterflies_eu_distr |> dplyr::relocate(country, .after = taxonKey) |> dplyr::relocate(threatStatus, .after = locationId) # Optional. Sometimes we cannot know in advance that the columns we want to select/remove from a data.frame exist. For example, how to remove source and remarks without returning an error if they do not exist? butterflies_eu_distr_upd <- butterflies_eu_distr |> dplyr::select(!tidyselect::any_of(c("source", "remarks"))) ``` ### Pieter's solution ```r # unique values of column threatStatus in butterflies_eu_distr distinct(butterflies_eu_distr_all, threatStatus) unique(pull(butterflies_eu_distr_all, threatStatus)) # Remove the rows without country butterflies_eu_distr <- filter(butterflies_eu_distr_all, !is.na(country)) # Count the number of species for each threatStatus select(butterflies_eu_distr, taxonKey, threatStatus) %>% count(threatStatus, sort = TRUE) # Count the number of species for each threatStatus and country select(butterflies_eu_distr, taxonKey, threatStatus, country) %>% group_by(threatStatus, country) %>% tally(sort = FALSE) # Order the count of species for each threatStatus and country in descending # order arrange(butterflies_eu_distr, desc(threatStatus), desc(country)) %>% glimpse() # Remove columns source and remarks from butterflies_eu_distr select(butterflies_eu_distr, -all_of(c("remarks", "source"))) %>% glimpse() # Move column country between taxonKey and locality. Move threatStatus after # locationId relocate( butterflies_eu_distr, country, .before = locality ) %>% relocate( threatStatus, .after = locationId ) %>% glimpse() # how to remove source and remarks without returning an error if they do not # exist? select(butterflies_eu_distr, -any_of(c("source", "remarks"))) %>% glimpse() ``` ## Cyrielle's ```r #1.1. Get the unique values of column threatStatus in butterflies_eu_distr. #R basics unique(butterflies_eu_distr_all$threatStatus) #dplyr butterflies_eu_distr_all %>% distinct(threatStatus) #1.2. Remove the rows without country, i.e. the data at European level. Save it as butterflies_eu_distr. butterflies_eu_distr <- butterflies_eu_distr_all %>% filter(!is.na(country)) ##1.3. Count the number of taxon keys (column taxonKey) for each threatStatus butterflies_eu_distr %>% group_by(threatStatus) %>% summarise(count = n()) ##1.4. Count the number of taxon keys (column taxonKey) for each threatStatus and country butterflies_eu_distr %>% group_by(threatStatus,country) %>% summarise(count = n()) ##1.5. Order the count of taxon keys (column taxonKey) for each threatStatus and country in descending order. butterflies_eu_distr %>% arrange(desc(threatStatus)) #how to add country as an argument?? ##1.6. Remove columns source and remarks from butterflies_eu_distr select(butterflies_eu_distr,-c(source, remarks)) ##1.7. Move column country between taxonKey and locality. Move threatStatus after locationId. butterflies_eu_distr %>% relocate(country, .after= taxonKey ``` ## Challenge 2 ### Falk's suggestions ```r butterflies_be <- butterflies_eu %>% filter(key %in% pull(butterflies_eu_distr %>% filter(country == "BE"), taxonKey)) # (Pieter's solution is better ;) countries_unreported_threatstatus <- butterflies_eu_distr %>% count(country, threatStatus) %>% filter(is.na(threatStatus)) %>% pull(country) %>% unique # (3) transect_counts %>% mutate(idx = tolower(species)) %>% left_join( butterflies_eu_vern_dutch %>% mutate(idx2 = tolower(vernacularName)) %>% select(idx2, taxonKey) , by = join_by(idx == idx2), relationship = "many-to-many", unmatched = "drop", # "error" suffix = c("", ".duplicates") ) %>% select(-idx) %>% right_join( butterflies_be %>% select(nubKey, scientificName), by = join_by(taxonKey == nubKey), relationship = "many-to-one", unmatched = "drop", # "error" suffix = c("", ".duplicates") ) ``` ## Pieter's solution ```r belgian_butterfly_keys <- filter(butterflies_eu_distr, country == "BE") %>% pull(taxonKey) butterflies_be <- butterflies_eu %>% filter(key %in% belgian_butterfly_keys) # Some countries haven't reported the threatStatus (threatStatus = NA) for any # species in butterflies_eu_distr. Which ones? butterflies_eu_distr %>% summarise( no_threatstatus_reported = all(is.na(threatStatus)), .by = country ) %>% filter(no_threatstatus_reported) %>% pull(country) # Add to transect_counts a column called scientificName with the scientific name # contained in butterflies_be vernacular_lut <- select(butterflies_eu_vern_dutch, taxonKey, vernacularName) %>% distinct() %>% left_join( select(butterflies_be, nubKey, scientificName), by = join_by(taxonKey == nubKey) ) %>% select(taxonKey, vernacularName, scientificName) %>% mutate(vernacularName_lower = tolower(vernacularName)) %>% select(scientificName, vernacularName_lower) %>% distinct() transect_counts %>% mutate(vernacularName_lower = tolower(species)) %>% left_join(vernacular_lut, by = "vernacularName_lower") %>% select(c(colnames(transect_counts), "scientificName")) ``` ### Emma's solution ``` r belgian_butterflies <- butterflies_eu_distr |> filter(country == "BE") |> pull(taxonKey) butterflies_be <- butterflies_eu |> filter(speciesKey %in% belgian_butterflies) butterflies_eu_distr |> group_by(country) |> summarise(test = sum(!is.na(threatStatus))) |> filter(test == 0) transect_counts <- transect_counts |> left_join(butterflies_eu_vern_dutch |> distinct(taxonKey, vernacularName), join_by(species == vernacularName)) |> left_join(butterflies_be |> select(nubKey, scientificName), join_by(taxonKey == nubKey)) ``` ### Janne's solution ```r #Filter butterflies_eu to get only the Belgian butterflies (country code: BE in butterflies_eu_distr). Save the result as butterflies_be. Important: no need to join the two datasets! keys_BE <- butterflies_eu_distr |> dplyr::filter(grepl("BE", country)) |> dplyr::pull(taxonKey) butterflies_be <- butterflies_eu |> dplyr::filter(key %in% keys_BE) #Some countries haven't reported the threatStatus (threatStatus = NA) for any species in butterflies_eu_distr. Which ones? Important: many countries reported the threatStatus for some species, but not for all of them and should be excluded. butterflies_eu_distr_upd <- butterflies_eu_distr |> dplyr::mutate( n_rows = dplyr::n(), n_rows_na = is.na(threatStatus) |> sum(), .by = country ) |> dplyr::filter(n_rows == n_rows_na) # Add to transect_counts a column called scientificName with the scientific name contained in butterflies_be (column scientificName). transect_counts_upd <- transect_counts |> dplyr::mutate(species = species |>tolower()) butterflies_eu_vern_dutch_upd <- butterflies_eu_vern_dutch |> dplyr::mutate(vernacularName = vernacularName |> tolower()) |> dplyr::distinct(vernacularName, .keep_all = TRUE) butterflies_joined <- dplyr::left_join( x = transect_counts_upd, y = butterflies_eu_vern_dutch_upd, by = c("species" = "vernacularName") ) |> dplyr::left_join( x = _, y = butterflies_be, by = c("taxonKey" = "nubKey") ) ``` ### Sanne's solution ```r #1 taxonKey_be <- butterflies_eu_distr %>% filter(country == "BE") %>% pull(taxonKey) butterflies_be <- butterflies_eu %>% filter(key %in% taxonKey_be) #2 countries_no_threatstatus <- butterflies_eu_distr %>% filter(is.na(threatStatus)) %>% distinct(country) %>% pull(country) butterflies_eu_distr %>% filter(!country %in% countries_no_threatstatus) #3 vernacular_names <- butterflies_eu_vern_dutch %>% select(vernacularName, taxonKey) %>% distinct() scientific_names <- butterflies_be %>% select(nubKey, scientificName) transect_counts %>% left_join(vernacular_names, join_by(species == vernacularName)) %>% left_join(scientific_names, join_by(taxonKey == nubKey)) ``` ## Challenge 3 ### Emma's solution ``` r butterflies_eu_vern_dutch |> group_by(taxonKey) |> summarize(n = n_distinct(vernacularName)) butterflies_eu_vern_dutch |> mutate(vernacularName_lower = tolower(vernacularName)) |> group_by(taxonKey) |> mutate(n = n_distinct(vernacularName_lower)) |> distinct(taxonKey, vernacularName_lower, n) |> arrange(desc(n)) ``` ### Pieter's solution ```r # The data.frame butterflies_eu_vern_dutch contains several vernacular names for # the same species (taxonKey). Some of them are exactly the same: they are just # collected from different checklists. How many unique vernacular names are # there for each taxon? butterflies_eu_vern_dutch %>% summarise(n_unique_names = n_distinct(vernacularName), .by = taxonKey) # We can see that some unique vernacular names are just the same name written in # different ways. For example, "Atalanta" and "atalanta". How many unique # vernacular names are there for each taxon if we ignore the # case/capitalization? Return a data.frame with two columns: taxonKey and n. # Order the result by n, from high to low. Tip: stringr package can be useful. butterflies_eu_vern_dutch %>% summarise(n = n_distinct(tolower(vernacularName)), .by = taxonKey) %>% arrange(desc(n)) ## More verbose butterflies_eu_vern_dutch %>% mutate(lower_vernacular = stringr::str_to_lower(vernacularName), taxonKey, .keep = "none") %>% distinct() %>% group_by(taxonKey) %>% tally(sort = TRUE) ``` ## Bonus ### Pieter`s solution ```r ## BONUS CHALLENGE 1 - Choices 🤷 # Only preferred filter(butterflies_eu_vern_dutch, preferred) # Only not preferred filter(butterflies_eu_vern_dutch, !preferred) %>% mutate(vernacularName = first(vernacularName), .by = "taxonKey") # Only one vernacularName butterflies_eu_vern_dutch %>% filter(.by = taxonKey, n_distinct(vernacularName) == 1) # In one step (Not sure this is what we want) butterflies_eu_vern_dutch %>% mutate( .by = "taxonKey", vernacularName = case_when( preferred ~ vernacularName, !preferred ~ first(vernacularName), n_distinct(vernacularName) == 1 ~ vernacularName, .default = vernacularName ) ) ## BONUS CHALLENGE 2 - Tidying up your data 🧹 sheet_path <- "data/20250130/20250130_butterfly_transect_counts_raw.xls" ## counts counts_raw <- readxl::read_excel( path = sheet_path, skip = 4, .name_repair = "universal", range = "A5:AB36" ) counts_raw %>% pivot_longer(cols = matches("O[0-9]{2}"), values_to = "n", names_to = "section") %>% rename(species = NAAM) %>% filter(!is.na(n)) ## names observers get_observer_names <- function(sheet_path) { # Read the entire first column and then extract the observers only by detecting # the counts table, this allows for any number of counters raw_a_col <- readxl::read_excel( path = sheet_path, range = readxl::cell_cols("A")) %>% dplyr::pull("Tellers") row_index_of_counts <- seq(raw_a_col)[stringr::str_detect(raw_a_col, stringr::fixed("NAAM"))] %>% min(na.rm = TRUE) observer_names <- head(raw_a_col, n = row_index_of_counts - 1) %>% .[!is.na(.)] # remove NA # Add identifier based on file and return as tibble dplyr::tibble( tellers = observer_names, sheet_id = tools::md5sum(sheet_path) ) } get_ymd <- function(sheet_path){ raw_date <- readxl::read_excel( path = sheet_path, range = "C1:H2") dplyr::tibble( date = lubridate::ymd( sprintf("%i-%i-%i", raw_date$Jaar, raw_date$Maand, raw_date$Dag) ), sheet_id = tools::md5sum(sheet_path) ) } get_start_end_time <- function(sheet_path){ raw_times <- readxl::read_excel( path = sheet_path, range = "J1:M2" ) get_ymd(sheet_path = sheet_path) %>% dplyr::mutate( start_time = lubridate::ymd_hms( paste( date, lubridate::hour(raw_times$`Tijd begin`), lubridate::minute(raw_times$`Tijd begin`), lubridate::second(raw_times$`Tijd begin`) ) ), end_time = lubridate::ymd_hms( paste( .data$date, lubridate::hour(raw_times$`Tijd einde`), lubridate::minute(raw_times$`Tijd einde`), lubridate::second(raw_times$`Tijd einde`) ) ) ) %>% dplyr::relocate(dplyr::ends_with("id"), .after = dplyr::everything()) } get_temp <- function(sheet_path){ raw_temp <- readxl::read_excel( path = sheet_path, range = "O2:P2", col_names = c("temp", "unit") ) %>% # Use the readr parser to get sensible types readr::type_convert(readr::cols(), guess_integer = TRUE) # little helper to reverse string to fix temp unit reverse_string <- function(x) { stringr::str_c(stringr::str_split(x, "")[[1]] %>% rev(), collapse = "") } temperature_unit <- raw_temp$unit %>% stringr::str_remove(stringr::fixed("/")) %>% reverse_string() temperature <- raw_temp$temp %>% units::set_units(temperature_unit, mode = "standard") tibble( temperature = temperature, sheet_id = tools::md5sum(sheet_path) ) } get_cloud <- function(sheet_path){ raw_cloud <- readxl::read_excel( path = sheet_path, range = "Q2:R2", col_names = c("cloud", "unit") ) tibble( cloud = paste0(raw_cloud$cloud, raw_cloud$unit), sheet_id = tools::md5sum(sheet_path) ) } get_wind_speed <- function(sheet_path){ raw_wind_speed <- readxl::read_excel( path = sheet_path, range = "W1:AB2", ) %>% # drop emtpy columns dplyr::select(dplyr::where(~!is.logical(.x))) %>% # take last, and highest value in case of more than one dplyr::pull(var = -1) # convert beaufort into m/S ## https://en.wikipedia.org/wiki/Beaufort_scale wind_speed <- 0.836 * raw_wind_speed^(3/2) %>% units::set_units("m/s") # return as tibble tibble( wind_speed = wind_speed, sheet_id = tools::md5sum(sheet_path) ) } ## Build identifier based on exact file (md5sum), or based on combination of ## observers and measurements combined_measurements <- list( get_observer_names(sheet_path), get_start_end_time(sheet_path), get_temp(sheet_path), get_cloud(sheet_path), get_wind_speed(sheet_path) ) %>% purrr::reduce(~dplyr::full_join(.x, .y, by = dplyr::join_by(sheet_id))) # build identifier from the combined measurements combined_measurements_id <- dplyr::mutate( combined_measurements, measurement_id = digest::digest(combined_measurements, "spookyhash") ) %>% dplyr::relocate( dplyr::ends_with("_id"), .after = dplyr::everything() ) glimpse(combined_measurements_id) ```