# INBO CODING CLUB
30 January 2025
Welcome!
## Share your code snippet
If you want to share your code snippet, copy paste your snippet within a section of three backticks (```):
As an **example**:
```
library(tidyverse)
```
(*you can copy paste this example and add your code further down*)
## Yellow sticky notes
No yellow sticky notes online. Put your name + " | " and add a "*" each time you solve a challenge (see below).
## Participants
Name | Challenges
--- | ---
Damiano Oldoni | ***
Dirk Maes | *
Emma Cartuyvels|***
Falk Mielke | **
Pieter Huybrechts | ***
Berber Meulepas | *
Jonathan De Schrijver |*
Adriaan Seynaeve |
Sanne Govaert | ***
Arc'hantael Labriere |
Cyrielle Delvenne
## Tipps
### break long pipes
*(with greetings from Floris)*
Data pipe chains with `%>%` tend to get long.
Here is a useful package for debugging long chains of piped operations on the fly.
https://github.com/MilesMcBain/breakerofchains
> Break your chain at the cursor line. Run the first bit. See the output. Be free.
## Challenge 1
### Damiano's solution (example)
Copy paste this section to show your solutions.
```r
# dummy code
print("This is how to insert code.")
```
### Emma's solution
```r
butterflies_eu_distr_all |>
distinct(threatStatus) |>
pull()
butterflies_eu_distr <-
butterflies_eu_distr_all |>
filter(!is.na(country))
butterflies_eu_distr |>
group_by(threatStatus) |>
summarise(number_of_species = n_distinct(taxonKey))
butterflies_eu_distr |>
group_by(country, threatStatus) |>
summarise(number_of_species = n_distinct(taxonKey))
butterflies_eu_distr |>
group_by(country, threatStatus) |>
summarise(number_of_species = n_distinct(taxonKey)) |>
arrange(desc(number_of_species))
butterflies_eu_distr |>
select(-source, -remarks)
butterflies_eu_distr |>
select(taxonKey, country, locality, locationId, threatStatus)
butterflies_eu_distr |>
select(-any_of(c("source", "remarks", "test")))
```
### Falk's attempt
```r
butterflies_eu_distr_all %>%
select(threatStatus) %>%
unique
butterflies_eu_distr_all %>%
pull(threatStatus) %>%
unique
butterflies_eu_distr <- butterflies_eu_distr_all %>%
filter(!is.na(country))
butterflies_eu_distr %>%
count(country, threatStatus) %>%
arrange(desc(n))
butterflies_eu_distr <- butterflies_eu_distr %>%
relocate(country, .before = locality) %>%
relocate(threatStatus, .after = locationId) %>%
select(!any_of(c("source", "remarks")))
# I often prefer this output to `glimpse()`...
knitr::kable(t(head(butterflies_eu_distr, 3)))
```
### Sanne's solution
```r
# Get the unique values of column threatStatus in butterflies_eu_distr. (Bonus: both as vector (basic R) or data.frame with 1 one column (dplyr)).
butterflies_eu_distr_all %>%
pull(threatStatus) %>%
unique()
unique(butterflies_eu_distr_all$threatStatus)
# Remove the rows without country, i.e. the data at European level. Save it as butterflies_eu_distr.
butterflies_eu_distr <-
butterflies_eu_distr_all %>%
filter(!is.na(country))
# Count the number of species for each threatStatus.
butterflies_eu_distr %>%
group_by(threatStatus) %>%
summarise(species_nr = n())
# Count the number of species for each threatStatus and country.
butterflies_eu_distr %>%
group_by(threatStatus, country) %>%
summarise(species_nr = n())
# Order the count of species for each threatStatus and country in descending order.
butterflies_eu_distr %>%
group_by(threatStatus, country) %>%
summarise(species_nr = n()) %>%
arrange(desc(species_nr), .by_group = TRUE)
# Remove columns source and remarks from butterflies_eu_distr.
butterflies_eu_distr %>%
select(-source, -remarks)
# Move column country between taxonKey and locality. Move threatStatus after locationId.
# Optional. Sometimes we cannot know in advance that the columns we want to select/remove from a data.frame exist. For example, how to remove source and remarks without returning an error if they do not exist?
butterflies_eu_distr %>%
select(taxonKey, country, locality, locationId, threatStatus)
```
### Janne's solution
```r
#Get the unique values of column threatStatus in butterflies_eu_distr.
#(Bonus: both as vector (basic R) or data.frame with 1 one column (dplyr)).
threatstatus_names <- butterflies_eu_distr_all$threatStatus |>
unique()
threatstatus_names <- butterflies_eu_distr_all |>
dplyr::select(threatStatus) |>
dplyr::distinct(threatStatus) |>
as.data.frame()
# Remove the rows without country, i.e. the data at European level.
# Save it as butterflies_eu_distr.
butterflies_eu_distr <- butterflies_eu_distr_all |>
dplyr::filter(!is.na(country))
butterflies_eu_distr <- butterflies_eu_distr_all |>
tidyr::drop_na(country)
# Count the number of taxon keys (column taxonKey) for each threatStatus.
butterflies_eu_distr_sum <- butterflies_eu_distr |>
dplyr::group_by(threatStatus) |>
#dplyr::distinct(taxonKey) |>
dplyr::summarise(n_species = n()) |>
dplyr::ungroup()
butterflies_eu_distr_sum <- butterflies_eu_distr |>
dplyr::count(threatStatus)
# Count the number of taxon keys (column taxonKey) for each threatStatus and country.
butterflies_eu_distr_sum <- butterflies_eu_distr |>
dplyr::count(threatStatus, country)
# Order the count of taxon keys (column taxonKey) for each threatStatus and country in descending order.
butterflies_eu_distr |>
dplyr::count(threatStatus, country) |>
dplyr::arrange(n |> desc())
# Remove columns source and remarks from butterflies_eu_distr.
butterflies_eu_distr_upd <- butterflies_eu_distr |>
dplyr::select(!c(source, remarks))
# Move column country between taxonKey and locality. Move threatStatus after locationId.
butterflies_eu_distr_upd <- butterflies_eu_distr |>
dplyr::relocate(country, .after = taxonKey) |>
dplyr::relocate(threatStatus, .after = locationId)
# Optional. Sometimes we cannot know in advance that the columns we want to select/remove from a data.frame exist. For example, how to remove source and remarks without returning an error if they do not exist?
butterflies_eu_distr_upd <- butterflies_eu_distr |>
dplyr::select(!tidyselect::any_of(c("source", "remarks")))
```
### Pieter's solution
```r
# unique values of column threatStatus in butterflies_eu_distr
distinct(butterflies_eu_distr_all, threatStatus)
unique(pull(butterflies_eu_distr_all, threatStatus))
# Remove the rows without country
butterflies_eu_distr <-
filter(butterflies_eu_distr_all, !is.na(country))
# Count the number of species for each threatStatus
select(butterflies_eu_distr, taxonKey, threatStatus) %>%
count(threatStatus, sort = TRUE)
# Count the number of species for each threatStatus and country
select(butterflies_eu_distr, taxonKey, threatStatus, country) %>%
group_by(threatStatus, country) %>%
tally(sort = FALSE)
# Order the count of species for each threatStatus and country in descending
# order
arrange(butterflies_eu_distr, desc(threatStatus), desc(country)) %>%
glimpse()
# Remove columns source and remarks from butterflies_eu_distr
select(butterflies_eu_distr, -all_of(c("remarks", "source"))) %>%
glimpse()
# Move column country between taxonKey and locality. Move threatStatus after
# locationId
relocate(
butterflies_eu_distr,
country,
.before = locality
) %>%
relocate(
threatStatus,
.after = locationId
) %>%
glimpse()
# how to remove source and remarks without returning an error if they do not
# exist?
select(butterflies_eu_distr, -any_of(c("source", "remarks"))) %>%
glimpse()
```
## Cyrielle's
```r
#1.1. Get the unique values of column threatStatus in butterflies_eu_distr.
#R basics
unique(butterflies_eu_distr_all$threatStatus)
#dplyr
butterflies_eu_distr_all %>%
distinct(threatStatus)
#1.2. Remove the rows without country, i.e. the data at European level. Save it as butterflies_eu_distr.
butterflies_eu_distr <- butterflies_eu_distr_all %>%
filter(!is.na(country))
##1.3. Count the number of taxon keys (column taxonKey) for each threatStatus
butterflies_eu_distr %>%
group_by(threatStatus) %>%
summarise(count = n())
##1.4. Count the number of taxon keys (column taxonKey) for each threatStatus and country
butterflies_eu_distr %>%
group_by(threatStatus,country) %>%
summarise(count = n())
##1.5. Order the count of taxon keys (column taxonKey) for each threatStatus and country in descending order.
butterflies_eu_distr %>%
arrange(desc(threatStatus))
#how to add country as an argument??
##1.6. Remove columns source and remarks from butterflies_eu_distr
select(butterflies_eu_distr,-c(source, remarks))
##1.7. Move column country between taxonKey and locality. Move threatStatus after locationId.
butterflies_eu_distr %>%
relocate(country, .after= taxonKey
```
## Challenge 2
### Falk's suggestions
```r
butterflies_be <- butterflies_eu %>%
filter(key %in% pull(butterflies_eu_distr %>%
filter(country == "BE"), taxonKey))
# (Pieter's solution is better ;)
countries_unreported_threatstatus <- butterflies_eu_distr %>%
count(country, threatStatus) %>%
filter(is.na(threatStatus)) %>%
pull(country) %>%
unique
# (3)
transect_counts %>%
mutate(idx = tolower(species)) %>%
left_join(
butterflies_eu_vern_dutch %>%
mutate(idx2 = tolower(vernacularName)) %>%
select(idx2, taxonKey)
,
by = join_by(idx == idx2),
relationship = "many-to-many",
unmatched = "drop", # "error"
suffix = c("", ".duplicates")
) %>%
select(-idx) %>%
right_join(
butterflies_be %>% select(nubKey, scientificName),
by = join_by(taxonKey == nubKey),
relationship = "many-to-one",
unmatched = "drop", # "error"
suffix = c("", ".duplicates")
)
```
## Pieter's solution
```r
belgian_butterfly_keys <-
filter(butterflies_eu_distr, country == "BE") %>%
pull(taxonKey)
butterflies_be <-
butterflies_eu %>%
filter(key %in% belgian_butterfly_keys)
# Some countries haven't reported the threatStatus (threatStatus = NA) for any
# species in butterflies_eu_distr. Which ones?
butterflies_eu_distr %>%
summarise(
no_threatstatus_reported = all(is.na(threatStatus)),
.by = country
) %>%
filter(no_threatstatus_reported) %>%
pull(country)
# Add to transect_counts a column called scientificName with the scientific name
# contained in butterflies_be
vernacular_lut <-
select(butterflies_eu_vern_dutch, taxonKey, vernacularName) %>%
distinct() %>%
left_join(
select(butterflies_be, nubKey, scientificName),
by = join_by(taxonKey == nubKey)
) %>%
select(taxonKey, vernacularName, scientificName) %>%
mutate(vernacularName_lower = tolower(vernacularName)) %>%
select(scientificName, vernacularName_lower) %>%
distinct()
transect_counts %>%
mutate(vernacularName_lower = tolower(species)) %>%
left_join(vernacular_lut, by = "vernacularName_lower") %>%
select(c(colnames(transect_counts), "scientificName"))
```
### Emma's solution
``` r
belgian_butterflies <-
butterflies_eu_distr |>
filter(country == "BE") |>
pull(taxonKey)
butterflies_be <-
butterflies_eu |>
filter(speciesKey %in% belgian_butterflies)
butterflies_eu_distr |>
group_by(country) |>
summarise(test = sum(!is.na(threatStatus))) |>
filter(test == 0)
transect_counts <-
transect_counts |>
left_join(butterflies_eu_vern_dutch |>
distinct(taxonKey, vernacularName),
join_by(species == vernacularName)) |>
left_join(butterflies_be |>
select(nubKey, scientificName),
join_by(taxonKey == nubKey))
```
### Janne's solution
```r
#Filter butterflies_eu to get only the Belgian butterflies (country code: BE in butterflies_eu_distr). Save the result as butterflies_be. Important: no need to join the two datasets!
keys_BE <- butterflies_eu_distr |>
dplyr::filter(grepl("BE", country)) |>
dplyr::pull(taxonKey)
butterflies_be <- butterflies_eu |>
dplyr::filter(key %in% keys_BE)
#Some countries haven't reported the threatStatus (threatStatus = NA) for any species in butterflies_eu_distr. Which ones? Important: many countries reported the threatStatus for some species, but not for all of them and should be excluded.
butterflies_eu_distr_upd <- butterflies_eu_distr |>
dplyr::mutate(
n_rows = dplyr::n(),
n_rows_na = is.na(threatStatus) |> sum(),
.by = country
) |>
dplyr::filter(n_rows == n_rows_na)
# Add to transect_counts a column called scientificName with the scientific name contained in butterflies_be (column scientificName).
transect_counts_upd <- transect_counts |>
dplyr::mutate(species = species |>tolower())
butterflies_eu_vern_dutch_upd <- butterflies_eu_vern_dutch |>
dplyr::mutate(vernacularName = vernacularName |> tolower()) |>
dplyr::distinct(vernacularName, .keep_all = TRUE)
butterflies_joined <- dplyr::left_join(
x = transect_counts_upd,
y = butterflies_eu_vern_dutch_upd,
by = c("species" = "vernacularName")
) |>
dplyr::left_join(
x = _,
y = butterflies_be,
by = c("taxonKey" = "nubKey")
)
```
### Sanne's solution
```r
#1
taxonKey_be <- butterflies_eu_distr %>%
filter(country == "BE") %>%
pull(taxonKey)
butterflies_be <-
butterflies_eu %>%
filter(key %in% taxonKey_be)
#2
countries_no_threatstatus <-
butterflies_eu_distr %>%
filter(is.na(threatStatus)) %>%
distinct(country) %>%
pull(country)
butterflies_eu_distr %>%
filter(!country %in% countries_no_threatstatus)
#3
vernacular_names <-
butterflies_eu_vern_dutch %>%
select(vernacularName, taxonKey) %>%
distinct()
scientific_names <-
butterflies_be %>%
select(nubKey, scientificName)
transect_counts %>%
left_join(vernacular_names, join_by(species == vernacularName)) %>%
left_join(scientific_names, join_by(taxonKey == nubKey))
```
## Challenge 3
### Emma's solution
``` r
butterflies_eu_vern_dutch |>
group_by(taxonKey) |>
summarize(n = n_distinct(vernacularName))
butterflies_eu_vern_dutch |>
mutate(vernacularName_lower = tolower(vernacularName)) |>
group_by(taxonKey) |>
mutate(n = n_distinct(vernacularName_lower)) |>
distinct(taxonKey, vernacularName_lower, n) |>
arrange(desc(n))
```
### Pieter's solution
```r
# The data.frame butterflies_eu_vern_dutch contains several vernacular names for
# the same species (taxonKey). Some of them are exactly the same: they are just
# collected from different checklists. How many unique vernacular names are
# there for each taxon?
butterflies_eu_vern_dutch %>%
summarise(n_unique_names = n_distinct(vernacularName), .by = taxonKey)
# We can see that some unique vernacular names are just the same name written in
# different ways. For example, "Atalanta" and "atalanta". How many unique
# vernacular names are there for each taxon if we ignore the
# case/capitalization? Return a data.frame with two columns: taxonKey and n.
# Order the result by n, from high to low. Tip: stringr package can be useful.
butterflies_eu_vern_dutch %>%
summarise(n = n_distinct(tolower(vernacularName)), .by = taxonKey) %>%
arrange(desc(n))
## More verbose
butterflies_eu_vern_dutch %>%
mutate(lower_vernacular = stringr::str_to_lower(vernacularName),
taxonKey,
.keep = "none") %>%
distinct() %>%
group_by(taxonKey) %>%
tally(sort = TRUE)
```
## Bonus
### Pieter`s solution
```r
## BONUS CHALLENGE 1 - Choices 🤷
# Only preferred
filter(butterflies_eu_vern_dutch, preferred)
# Only not preferred
filter(butterflies_eu_vern_dutch, !preferred) %>%
mutate(vernacularName = first(vernacularName), .by = "taxonKey")
# Only one vernacularName
butterflies_eu_vern_dutch %>%
filter(.by = taxonKey,
n_distinct(vernacularName) == 1)
# In one step (Not sure this is what we want)
butterflies_eu_vern_dutch %>%
mutate(
.by = "taxonKey",
vernacularName = case_when(
preferred ~ vernacularName,
!preferred ~ first(vernacularName),
n_distinct(vernacularName) == 1 ~ vernacularName,
.default = vernacularName
)
)
## BONUS CHALLENGE 2 - Tidying up your data 🧹
sheet_path <- "data/20250130/20250130_butterfly_transect_counts_raw.xls"
## counts
counts_raw <- readxl::read_excel(
path = sheet_path,
skip = 4,
.name_repair = "universal",
range = "A5:AB36"
)
counts_raw %>%
pivot_longer(cols = matches("O[0-9]{2}"),
values_to = "n",
names_to = "section") %>%
rename(species = NAAM) %>%
filter(!is.na(n))
## names observers
get_observer_names <- function(sheet_path) {
# Read the entire first column and then extract the observers only by detecting
# the counts table, this allows for any number of counters
raw_a_col <-
readxl::read_excel(
path = sheet_path,
range = readxl::cell_cols("A")) %>%
dplyr::pull("Tellers")
row_index_of_counts <-
seq(raw_a_col)[stringr::str_detect(raw_a_col, stringr::fixed("NAAM"))] %>%
min(na.rm = TRUE)
observer_names <-
head(raw_a_col, n = row_index_of_counts - 1) %>%
.[!is.na(.)] # remove NA
# Add identifier based on file and return as tibble
dplyr::tibble(
tellers = observer_names,
sheet_id = tools::md5sum(sheet_path)
)
}
get_ymd <- function(sheet_path){
raw_date <- readxl::read_excel(
path = sheet_path,
range = "C1:H2")
dplyr::tibble(
date =
lubridate::ymd(
sprintf("%i-%i-%i", raw_date$Jaar, raw_date$Maand, raw_date$Dag)
),
sheet_id = tools::md5sum(sheet_path)
)
}
get_start_end_time <- function(sheet_path){
raw_times <- readxl::read_excel(
path = sheet_path,
range = "J1:M2"
)
get_ymd(sheet_path = sheet_path) %>%
dplyr::mutate(
start_time = lubridate::ymd_hms(
paste(
date,
lubridate::hour(raw_times$`Tijd begin`),
lubridate::minute(raw_times$`Tijd begin`),
lubridate::second(raw_times$`Tijd begin`)
)
),
end_time = lubridate::ymd_hms(
paste(
.data$date,
lubridate::hour(raw_times$`Tijd einde`),
lubridate::minute(raw_times$`Tijd einde`),
lubridate::second(raw_times$`Tijd einde`)
)
)
) %>%
dplyr::relocate(dplyr::ends_with("id"), .after = dplyr::everything())
}
get_temp <- function(sheet_path){
raw_temp <- readxl::read_excel(
path = sheet_path,
range = "O2:P2",
col_names = c("temp", "unit")
) %>%
# Use the readr parser to get sensible types
readr::type_convert(readr::cols(),
guess_integer = TRUE)
# little helper to reverse string to fix temp unit
reverse_string <- function(x) {
stringr::str_c(stringr::str_split(x, "")[[1]] %>% rev(), collapse = "")
}
temperature_unit <-
raw_temp$unit %>%
stringr::str_remove(stringr::fixed("/")) %>%
reverse_string()
temperature <- raw_temp$temp %>%
units::set_units(temperature_unit, mode = "standard")
tibble(
temperature = temperature,
sheet_id = tools::md5sum(sheet_path)
)
}
get_cloud <- function(sheet_path){
raw_cloud <- readxl::read_excel(
path = sheet_path,
range = "Q2:R2",
col_names = c("cloud", "unit")
)
tibble(
cloud = paste0(raw_cloud$cloud, raw_cloud$unit),
sheet_id = tools::md5sum(sheet_path)
)
}
get_wind_speed <- function(sheet_path){
raw_wind_speed <-
readxl::read_excel(
path = sheet_path,
range = "W1:AB2",
) %>%
# drop emtpy columns
dplyr::select(dplyr::where(~!is.logical(.x))) %>%
# take last, and highest value in case of more than one
dplyr::pull(var = -1)
# convert beaufort into m/S
## https://en.wikipedia.org/wiki/Beaufort_scale
wind_speed <-
0.836 * raw_wind_speed^(3/2) %>%
units::set_units("m/s")
# return as tibble
tibble(
wind_speed = wind_speed,
sheet_id = tools::md5sum(sheet_path)
)
}
## Build identifier based on exact file (md5sum), or based on combination of
## observers and measurements
combined_measurements <-
list(
get_observer_names(sheet_path),
get_start_end_time(sheet_path),
get_temp(sheet_path),
get_cloud(sheet_path),
get_wind_speed(sheet_path)
) %>%
purrr::reduce(~dplyr::full_join(.x, .y, by = dplyr::join_by(sheet_id)))
# build identifier from the combined measurements
combined_measurements_id <-
dplyr::mutate(
combined_measurements,
measurement_id =
digest::digest(combined_measurements, "spookyhash")
) %>%
dplyr::relocate(
dplyr::ends_with("_id"),
.after = dplyr::everything()
)
glimpse(combined_measurements_id)
```