# INBO CODING CLUB 30 January 2024 Welcome! ## Share your code snippet If you want to share your code snippet, copy paste your snippet within a section of three backticks (```): As an **example**: ``` library(tidyverse) ``` (*you can copy paste this example and add your code further down*) ## Yellow sticky notes No yellow sticky notes online. Put your name + " | " and add a "*" each time you solve a challenge (see below). ## Participants Name | Challenges --- | --- Damiano Oldoni | Pieter Huybrechts |*** Nele Mullens | * Wouter Depaepe |** Patrik Oosterlynck | Dirk Maes | ** Ward Langeraert |*** Margot Vanhellemont |** Rhea Maesele | * Adriaan Seynaeve | Jorre Vannieuwenhuyze | *** Anja Leyman | * Emma Cartuyvels| ** ## Challenge 1 ### Ward ```{r} # Look at data glimpse(birds) #color_ring: column containing the color rings #metal_ring: column containing the metal rings #1. Get the length of the metal rings. sort(unique(str_length(birds$metal_ring))) #2. Do the color rings start with a "C"? all(str_starts(birds$color_ring, "C")) #3. Do the color rings end with a "R"? all(str_ends(birds$color_ring, "R")) #4. Are the color rings uppercase? all(str_detect(birds$color_ring, "[a-z]", negate = TRUE)) #5. Solve all the anomalies found in (4) by setting all color rings uppercase. birds_cleaned <- birds %>% mutate(color_ring = str_to_upper(color_ring)) all(str_detect(birds_cleaned$color_ring, "[a-z]", negate = TRUE)) ``` ### Pieter's solution :) ```R library(tidyverse) # Challenge 1 ------------------------------------------------------------- color_ring <- data.table::fread("data/20240130/20240130_bird_rings.txt", select = "color_ring") %>% dplyr::pull(color_ring) metal_ring <- readr::read_tsv("data/20240130/20240130_bird_rings.txt", col_select = "metal_ring") %>% dplyr::pull(metal_ring) ## 1. 
Get the length of the metal rings stringr::str_length(metal_ring) nchar(metal_ring) # unique lengths unique(nchar(metal_ring)) # count lengths tibble(metal_ring, nchar_ring = nchar(metal_ring)) %>% group_by(nchar_ring) %>% tally() ## 2. Do the color rings start with a "C"? color_ring %>% stringr::str_starts(fixed("C")) %>% all() # These rings do not! color_ring[stringr::str_starts(color_ring, pattern = "C", negate = TRUE)] # again, but with a regex stringr::str_extract(color_ring, "^(?!C).*") %>% .[!is.na(.)] # again but in base R color_ring[grepl("^[^C].*", color_ring)] ## 3. Do the color rings end with a "R"? stringr::str_ends(color_ring, fixed("R")) %>% all() # not all color rings end with R # These do not: stringr::str_extract(color_ring, ".*(?<!R)$") %>% stringr::str_view() # have a more detailed view ## 4. Are the color rings uppercase? # A ring counts as uppercase when it holds no lowercase letter; digits are allowed. stringr::str_detect(color_ring, "[a-z]", negate = TRUE) %>% all() # they are not all uppercase # again, comparing against the upper-cased vector instead all.equal(color_ring, stringr::str_to_upper(color_ring)) # These are not (a lowercase letter at any position, including first or last): stringr::str_extract(color_ring, ".*[a-z].*") %>% .[!is.na(.)] # or using a comparison instead color_ring[color_ring != stringr::str_to_upper(color_ring)] ## 5. Solve all the anomalies found in (4) by setting all color rings uppercase. stringr::str_to_upper(color_ring) # overwriting the existing column readr::read_tsv("data/20240130/20240130_bird_rings.txt") %>% mutate(color_ring = stringr::str_to_upper(color_ring)) ## Extra: tidyverse packages are made to work nicely together. Use stringr and dplyr to ## get the birds with a 6 characters long metal ring and a color ring starting ## with a C and ending with a R. arrow::read_tsv_arrow("data/20240130/20240130_bird_rings.txt") %>% filter( nchar(metal_ring) == 6, stringr::str_starts(color_ring, fixed("C")), stringr::str_ends(color_ring, fixed("R")) ) ``` ### Rhea solution ```{r} #1. Get the length of the metal rings. 
birds <- birds %>% mutate(length_metalring = str_length(metal_ring)) View(birds) #2. Do the color rings start with a "C"? birds <- birds %>% mutate(C_colorrings = str_starts(color_ring, "C")) View(birds) #If it starts with a C -> TRUE; otherwise FALSE #3. Do the color rings end with a "R"? birds <- birds %>% mutate(R_colorrings = str_ends(color_ring, "R")) View(birds) #4. Are the color rings uppercase? #TRUE only when the ring contains no lowercase letter at all birds <- birds %>% mutate(uppercase_colorrings = !str_detect(color_ring, "[[:lower:]]")) View(birds) #5. Solve all the anomalies found in (4) by setting all color rings uppercase. birds <- birds %>% mutate(upper_colorrings = str_to_upper(color_ring)) View(birds) #Extra: tidyverse packages are made to work nicely together. #Use stringr and dplyr to get the birds with a 6 characters long metal ring and #a color ring starting with a C and ending with a R. bird_extra <- birds %>% filter(length_metalring == 6, C_colorrings, R_colorrings) View(bird_extra) ``` ### Emma ``` str_length(birds$metal_ring) str_starts(birds$color_ring, "C") str_ends(birds$color_ring, "R") str_detect(birds$color_ring, regex("[:lower:]+")) str_to_upper(birds$color_ring) birds %>% filter(str_length(metal_ring) == 6, str_starts(color_ring, "C"), str_ends(color_ring, "R")) ``` ## Challenge 2 ### Emma ``` test <- birds %>% mutate(color_ring_complete = str_c(background_color, inscription_color, "(", color_ring, ")")) birds %>% filter(str_length(color_ring) == 4, str_detect(color_ring, regex("^.{2}A"))) str_detect(birds$color_ring, regex("\\d+")) test <- birds %>% mutate(digit = str_extract(color_ring, regex("\\d"))) ``` ### Pieter's solution :) ```R # Challenge 2 ------------------------------------------------------------- color_ring_tbbl <- readr::read_tsv("data/20240130/20240130_bird_rings.txt") ## 1. Create a new column called color_ring_complete containing color ring #information in this format: #background_color+inscription_color+"("+color_ring+")", e.g. 
RW(FJAC) # using glue color_ring_tbbl %>% mutate( color_ring_complete = stringr::str_glue("{background_color}{inscription_color}({color_ring})") ) %>% glimpse() # let's have a look # using concatenation color_ring_tbbl %>% mutate( color_ring_complete = stringr::str_c(background_color, inscription_color, "(", color_ring, ")") ) %>% glimpse() # let's have a look # using paste! color_ring_tbbl %>% mutate(color_ring_complete = paste0(background_color, inscription_color, "(", color_ring, ")")) %>% glimpse() # let's have a look ## 2. Are the color rings 4-letter only long and is the third letter an "A"? color_ring %>% stringr::str_detect("^[a-zA-Z]{2}A[a-zA-Z]{1}$") %>% all() # not all of them # These do: color_ring[stringr::str_detect(color_ring, "^[a-zA-Z]{2}A[a-zA-Z]{1}$")] # Without regex; the parentheses are required because %>% binds tighter than == and & (str_length(color_ring) == 4 & str_sub(color_ring, 3, 3) == "A") %>% all() # not all do # These do: color_ring[str_length(color_ring) == 4 & str_sub(color_ring, 3, 3) == "A"] ## 3. Do the color rings contain at least a digit? color_ring %>% stringr::str_detect("[0-9]+") %>% all() # not all of them! # these do not: stringr::str_extract(color_ring, "^\\D+$") %>% .[!is.na(.)] ## 4. Create a new column called digit containing the first digit, if any, as a ## number. color_ring_tbbl %>% mutate(digit = as.numeric(stringr::str_extract(color_ring, "\\d")), .keep = "used") %>% glimpse() ## Extra: again, by combining dplyr and stringr, select the birds whose color ## rings satisfy the condition in (2). color_ring_tbbl %>% filter(nchar(color_ring) == 4) %>% filter(stringr::str_sub(color_ring, 3, 3) == "A") %>% glimpse() ``` ### Ward (! with regex) ```{r} #1. Create a new column called color_ring_complete containing color ring # information in this format: # background_color+inscription_color+"("+color_ring+")", e.g. RW(FJAC) birds <- birds %>% mutate(color_ring_complete = str_glue("{background_color}{inscription_color}", "({color_ring})")) #2. 
Are the color rings 4-letter only long and is the third letter an "A"? # 4 letters long str_equal(str_length(birds$color_ring), 4) # third letter str_starts(birds$color_ring, "[a-zA-Z]{2}A") # 4 letters AND third letter A str_detect(birds$color_ring, "^[a-zA-Z]{2}A[a-zA-Z]$") #3. Do the color rings contain at least a digit? str_detect(birds$color_ring, "\\d") #4. Create a new column called digit containing the first digit, if any, # as a number. birds <- birds %>% mutate(digit = as.numeric(str_match(color_ring, "\\d")[, 1])) #str_extract Pieter solution better #5. By combining dplyr and stringr, select the birds whose color rings satisfy # the condition in (2). # Same regex as in (2), applied to the column itself (no birds$ inside filter). birds %>% filter(str_detect(color_ring, "^[a-zA-Z]{2}A[a-zA-Z]$")) ``` ### Jorre ```{r} #Create a new column called color_ring_complete containing color ring #information in this format: #background_color+inscription_color+"("+color_ring+")", e.g. RW(FJAC) birds2 <- mutate(birds, color_ring_complete=sprintf("%s%s(%s)",background_color,inscription_color,color_ring), ) # Are the color rings 4-letter only long and is the third letter an "A"? birds |> mutate(test=nchar(color_ring)==4 & str_sub(color_ring,3,3)=='A' ) |> pull(test) |> table() # Do the color rings contain at least a digit? birds |> pull(color_ring) |> str_detect('\\d') |> table() # Create a new column called digit containing the first digit, if any, as a # number. A single [0-9] keeps only the first digit, not a run of digits. birds2 <- birds |> mutate(digit=str_extract(color_ring,"[0-9]") |> parse_integer() ) # Extra: again, by combining dplyr and stringr, select the birds whose color # rings satisfy the condition in (2). 
birds |> filter(nchar(color_ring)==4 & str_sub(color_ring,3,3)=='A') ``` ## INTERMEZZO Extra resources for regex: - https://www.datacamp.com/tutorial/regex-r-regular-expressions-guide - The R for data science book has a chapter: https://r4ds.hadley.nz/regexps.html - yet another regex tester: https://www.regextester.com/ - Another common pattern used in shells is globbing, for example: `*.txt` for files ending in `.txt`, you can convert these to regex with a base function: `glob2rx()` ### Extracting version numbers is extra difficult! There are **a lot** of exceptions in the wild: [Falsehoods programmers believe about versions](https://github.com/xenoterracide/falsehoods/blob/master/versions.md) ## Challenge 3 ### Emma ``` # Challenge 3A birds$color_ring == str_remove_all(birds$color_ring_dots, fixed(".")) birds$metal_ring <- str_replace(birds$metal_ring, "^\\*+", "") str_detect(birds$color_ring, "[aeiouAEIOU]{2}") # Challenge 3B # Escape the literal dots and anchor on word boundaries so only whole abbreviations are removed. substrings_to_remove <- c("spec.", "indet.", "cf", "nov.", "ined.", "spp.", "sp.") abbreviation_pattern <- paste0("\\b(?:", paste0(str_escape(substrings_to_remove), collapse = "|"), ")(?!\\w)") cleaned_names <- str_remove_all(sc_names, abbreviation_pattern) str_squish(cleaned_names) ``` ### Pieter's solution :smiley: ```R # Challenge 3 ------------------------------------------------------------- ## 1. The dots in color rings (column color_ring_dots), e.g. KRO.C, KZ.AC, are # used for improving readability. Apart from that, the values in column # color_ring_dots should be exactly the same as the ones in column color_ring. # Find anomalies. color_ring_tbbl %>% filter(str_remove_all(color_ring_dots, fixed(".")) != color_ring) ## 2. Some metal rings (column metal_ring) start with one or more asterisks. ## Remove them. metal_ring %>% str_remove("^[\\*]+") ## 3. Find color rings (column color_ring) containing two consecutive vowels. # Vowels are complicated! 
https://en.wikipedia.org/wiki/Vowel color_ring[stringr::str_which(color_ring, "[aeiouAEIOU]{2}")] # Challenge 3B ------------------------------------------------------------ # do notice Salix cf alba scientific_names <- readr::read_lines( file.path("data", "20240130", "20240130_scientificnames.txt"), skip = 1) stringr::str_remove_all(scientific_names, "[a-zA-Z]+(?=\\.).") %>% stringr::str_squish() # remove cf using lookahead and lookbehind stringr::str_remove_all(scientific_names, "[a-zA-Z]+(?=\\.).") %>% str_remove_all("(?<= )cf(?= )") %>% stringr::str_squish() ``` ### Ward ```{r} # Challenge 3A #1. The dots in color rings (column color_ring_dots), e.g. KRO.C, KZ.AC, are # used for improving readability. Apart from that, the values in column # color_ring_dots should be exactly the same as the ones in column color_ring. # Find anomalies. birds$color_ring == str_remove_all(birds$color_ring_dots, pattern = fixed(".")) #2. Some metal rings (column metal_ring) start with one or more asterisks. # Remove them. birds <- birds %>% mutate(metal_ring = str_replace(metal_ring, pattern = "^\\*+", replacement = "")) #3. Find color rings (column color_ring) containing two consecutive vowels. # y is sometimes considered a vowel but not always ... vowels <- "aeiouAEIOU" birds %>% filter(str_detect(color_ring, pattern = paste0("[", vowels, "]{2}"))) # Challenge 3B # Scientific names sometimes contain abbreviations like "sp.", "spec.", # "indet.", "cf", "nov.", "ined". Try to clean the names provided in # 20240130_scientificnames.txt by removing such abbreviations. # Ensure also that the resulting scientific names have no whitespaces at the # start or at the end and also that they have single spaces between words. str_remove_all(sc_names, pattern = "[a-zA-Z]*\\.") %>% str_remove_all("\\bcf\\b") %>% # whole-word "cf" only; see Pieter's solution str_squish() ``` ### Jorre ```{r} # The dots in color rings (column color_ring_dots), e.g. KRO.C, KZ.AC, are used # for improving readability. 
Apart from that, the values in column # color_ring_dots should be exactly the same as the ones in column color_ring. # Find anomalies. birds |> filter( str_remove_all(color_ring_dots,'[.]') != color_ring ) |> View() # Some metal rings (column metal_ring) start with one or more asterisks. Remove # them. birds |> mutate( metal_ring2 = str_remove_all(metal_ring,'^[*]+') ) |> View() # Find color rings (column color_ring) containing two consecutive vowels. birds |> filter( str_detect(color_ring,'[aeiouAEIOU]{2}') ) |> View() # Are you bored of working with bird rings? Maybe you find cleaning scientific # names something more similar to your daily tasks. This alternative challenge is # for you! # # Matching scientific names against the GBIF Taxonomy Backbone fails sometimes # just because the provided scientific name contains abbreviations like "sp.", # "spec.", "indet.", "cf", "nov.", "ined". Try to clean the names provided in # 20240130_scientificnames.txt by removing such abbreviations. # # Ensure also that the resulting scientific names have no whitespaces at the # start or at the end and also that they have single spaces between words. Hint: # Check the cheatsheet to find the stringr function to remove all these # whitespaces. sc_names |> str_remove_all('[A-Za-z]+[.]') |> str_replace_all('\\bcf\\b',' ') |> # only the whole word 'cf'; dropping every word without a/e/i/o/u would also delete genera such as Lynx str_squish() ``` ## Dirk ```{r} # Challenge 3B scientific_names <- read_delim("./data/20240130/20240130_scientificnames.txt", delim = "\t") head(scientific_names) scientific_names$scientific_name_clean <- word(scientific_names$scientific_name, 1, 1) head(scientific_names) ``` ## Bonus challenge There is no bonus challenge!?