INBO CODING CLUB

# INBO CODING CLUB 27 May 2025 Welcome! ## Share your code snippet If you want to share your code snippet, copy paste your snippet within a section of three backticks (```): As an **example**: ``` library(tidyverse) ``` (*you can copy paste this example and add your code further down*) ## Yellow sticky notes No yellow sticky notes online. Put your name + " | " and add a "*" each time you solve a challenge (see below). ## Participants Name | Challenges --- | --- Damiano Oldoni | *** Raïsa Carmen | * Jorre | ** Falk Mielke | Charlotte Van Moorleghem | Florian Van Hecke | Pieter | ** ## Challenge 1 ### Damiano's solution (example) You can copy paste this section to show your solution. ```r # dummy code print("This is how to insert code.") ``` ### Falks attempt ```r ### 1.1 # first, confirm on "dates" counters_dates <- counts %>% select(date_count, counter_name) %>% distinct() %>% count(counter_name, name = "n_dates") %>% arrange(desc(n_dates)) # then, apply to "organisations" counters_organisations <- counts %>% select(organisation_id, counter_name) %>% distinct() %>% count(organisation_id, name = "n_organisations") %>% arrange(desc(n_organisations)) knitr::kable(head(counters_organisations)) ### 1.2 counts %>% select(type_obs, counter_name) %>% summarise( organisations = paste0( sort(unique(as.character(type_obs))), collapse = "," ), .by = counter_name ) counters_organisations <- counts %>% select(organisation_id, counter_name) %>% distinct() %>% summarise( organisations = paste0(as.character(organisation_id), collapse = ", "), .by = counter_name ) ### 1.3 # https://tidyr.tidyverse.org/reference/unite.html counts <- counts %>% unite( "org_counter", organisation_id, counter_name, sep = "-", remove = FALSE ) ### 1.4 # if you just want the unique org_counters, do counts %>% arrange(org_counter, date_digitalisation) %>% slice(1, .by = org_counter) # if you want all observations arranged, join to the original data set counts %>% left_join( counts %>% arrange(org_counter, date_digitalisation) %>% slice(1, .by = org_counter) %>% rename(c(first_digitalisation = date_digitalisation)) %>% select(org_counter, first_digitalisation) , by = join_by(org_counter), relationship = "many-to-one", unmatched = "error" ) %>% arrange(first_digitalisation, organisation_id, counter_name) %>% pull(org_counter) ``` *Jorre:* suggestion instead of the whole left_join(), do ```r counts |> mutate(first_digitalisation=min(date_digitalisation),.by=org_counter) ``` ;-) *Falk:* good idea, thank you, Jorre! Ideas to check: (i) could we directly mutate in the `slice` part as well? (ii) Does `min()` also work with non-numerics? *Jorre:* Yes :-) (i) ```r counts |> slice_min(date_digitalisation,by=org_counter) ``` but then you still have duplicates, so you still need distinct(). (ii) min() does work for non-numerics. It selects the first alphabetical value for character vectors. It does not work on factors. *Falk:* good to know, thank you! ### Jorre ```r # Some counters changed organisation during the years. Create a data.frame with # two columns counter_name and n_organisations with the number of organisations # a counter has belonged to. Order by n_organisations from high to low, and by # counter_name alphabetically. Save the result as counters_organisations. counts_n_organisations <- counts |> summarize( n_organisations=n_distinct(organisation_id), .by=counter_name ) |> arrange(desc(n_organisations),counter_name) # Instead of only having the amount of organisations a counter belongs to, we # now want an additional column called organisations containing the organisation # IDs, comma separated. Same order as before. Save the result as # counters_organisations (overwrite). counts_n_organisations2 <- counts |> summarize( n_organisations=n_distinct(organisation_id), organisations=paste(unique(organisation_id),collapse=';'), .by=counter_name ) |> arrange(desc(n_organisations),counter_name) # Add a new column called org_counter to counts with a string formatted as # follows: organisation_id-counter_name, e.g. 398668-Germayne Galea. counts_org_counter <- counts |> mutate(org_counter=sprintf('%i-%s',organisation_id,counter_name)) # Now we want to create a vector of organisations/counters (column org_counter) which is ordered according to: # (1) the first date they submitted the data (date_digitalisation), # (2) the organisation ID (organisation_id)*, alphabetically, # (3) their name (counter_name)*, alphabetically vec_org_counter <- counts_org_counter |> distinct(date_digitalisation,organisation_id,counter_name,org_counter) |> filter(date_digitalisation==min(date_digitalisation),.by=org_counter) |> arrange(date_digitalisation,organisation_id,counter_name) |> pull(org_counter) ``` ### Raïsa ```r ## 1.1 #### # I'll work with the dates that the counters went on-site # instead of the organisations counters_dates <- counts %>% group_by(counter_name) %>% summarize(nb_dates = n_distinct(date_count)) %>% arrange(-nb_dates, counter_name) ## 1.2 #### # I'll work with the dates that the counters went on-site # instead of the organisations counters_dates <- counts %>% group_by(counter_name) %>% summarize(nb_dates = n_distinct(date_count), dates = paste0(as.Date(sort(unique(date_count))), collapse = ", ")) %>% arrange(-nb_dates, counter_name) ## 1.3 #### counts <- counts %>% mutate(org_counter = paste0(organisation_id, "-", counter_name)) ## 1.4 #### counts %>% group_by(date_digitalisation, organisation_id, counter_name) %>% dplyr::filter(row_number() == 1) %>%#keep only the first observation for each group ungroup() %>% arrange(-date_digitalisation, organisation_id, counter_name) %>% dplyr::pull(org_counter) ``` ### Charlotte's attempt # Challenge 1 ```r ## 1.1 #### counters_organisations <- prt %>% group_by(counter_name) %>% summarise(n_organisations = length(unique(organisation_id))) %>% setorder(counter_name) ## 1.2 #### counters_organisations <- prt %>% group_by(counter_name) %>% summarise(n_organisations = length(unique(organisation_id)), organisation_ids = first(paste(unique(organisation_id),collapse = ",")), org_counter = first(paste(organisation_id,counter_name, sep = "-"))) %>% setorder(counter_name) ## 1.4 #### vect <- prt %>% setorder(counter_name) %>% setorder(organisation_id) %>% group_by(counter_name) %>% summarise(first_date = first(min(date_digitalisation)), n_organisations = length(unique(organisation_id)), organisation_ids = first(paste(unique(organisation_id),collapse = ",")), org_counter = first(paste(organisation_id,counter_name, sep = "-"))) %>% setorder(first_date) vect1 <- vect1$org_counter ``` ### Robbie ```r ## 1.1 #### counters_organisation <- counts %>% group_by(counter_name, organisation_id) %>% summarise(n_organisations = n_distinct(organisation_id), .groups = 'drop') %>% arrange(desc(n_organisations)) ## 1.2 #### counters_organisation <- counters_organisation %>% group_by(organisation_id, counter_name, n_organisations) %>% summarise(organisations = toString(organisation_id)) ## 1.3 #### counters_organisation <- counters_organisation %>% mutate(org_counter = paste((organisation_id), "-", counter_name)) ``` ## Challenge 2 ### Jorre ```r # Create an overview table containing the number of counts and the dates and the # counters per each hunting ground, as shown here below. overview_table <- counts |> mutate( date_counter=sprintf( '%s (%i-%s)', as.Date(date_count), organisation_id, counter_name ) ) |> summarize( n_counts=n_distinct(date_counter), dates_counters=paste(unique(date_counter),collapse=', '), .by=hunting_ground ) |> arrange(hunting_ground) ``` ## Challenge 3 ### Jorre ```r # Replace counter_name in counts with the anonymized version of the counter # name. Use the digest::digest() function with algo = "sha256". Save the result # as counts_anonymised. counts_anonymised <- counts |> mutate( counter_name=map_chr(counter_name,~digest(.x,algo = "sha256")) ) # Starting from counts_anonymised we could create an overview table with # anonymysed counter names. However, such a table would be unpractical, as it # would be unreadable due to the long hashes. Order the hashes alphabetically* # and use their index as counter name. You can store the hashes and their index # in a two-column data.frame called ordered_hashes. ordered_hashes <- counts_anonymised |> distinct(counter_name) |> arrange(counter_name) |> mutate(counter_name_index=row_number()) counts_anonymised2 <- counts_anonymised |> left_join(ordered_hashes,join_by(counter_name)) |> select(-counter_name) ``` ### Falk ```r ### 3.1 counts_anonymized <- counts %>% mutate(counter_name = purrr::map( counter_name, function(counter_name) digest::digest(object = counter_name, algo = "sha256") )) # note: better to use `map_chr`! (see the others) # just to check counts_anonymized %>% sample_n(5) %>% pull(org_counter, counter_name) # do not forget to drop the `org_counter` and other de-anonymizing columns! counts_anonymized <- counts_anonymized %>% select(-org_counter) ### 3.2 ordered_hashes <- counts_anonymized %>% select(counter_name) %>% distinct() %>% arrange(counter_name) %>% tibble::rowid_to_column("counter_index") %>% mutate(counter_name = unlist(counter_name)) knitr::kable(head(ordered_hashes)) ``` ### Raïsa ```r ## 3A.1 #### counts_anonymised <- counts %>% mutate(counter_name = map_chr(.x = org_counter, .f = function(x){ digest::digest(object = x, algo = "sha256")})) ## 3A.2 #### ordered_hashes <- counts_anonymised %>% dplyr::select(counter_name) %>% distinct() %>% arrange(counter_name) %>% mutate(counter_nb = row_number()) overview_table_anonymised <- counts_anonymised |> left_join(ordered_hashes) |> mutate( date_counter = sprintf( '%s (%.0f)', as.Date(date_count), counter_nb ) ) |> summarize( n_counts = n_distinct(date_counter), dates_counters = paste(unique(date_counter), collapse = ', '), .by = hunting_ground ) |> arrange(hunting_ground) ``` ### Charlotte ````r counts_anonymised <- counts %>% group_by(counter_name) %>% mutate(counter_name = digest::digest(object = counter_name, algo = "sha256")) %>% ungroup() ```` ### Robbie ```r ## 3A.1 #### counters_organisation$counter_name2 <- sapply(counters_organisation$counter_name, digest, algo = "sha256") ```