owned this note
owned this note
Published
Linked with GitHub
# INBO CODING CLUB
26 September 2024
Welcome!
## Share your code snippet
If you want to share your code snippet, copy paste your snippet within a section of three backticks (```):
As an **example**:
```
library(tidyverse)
```
(*you can copy paste this example and add your code further down*)
## Yellow sticky notes
No yellow sticky notes online. Put your name + " | " and add a "*" each time you solve a challenge (see below).
## Participants
Name | Challenges
--- | ---
Damiano Oldoni | ***
Nele Mullens | **
Jorre Vannieuwenhuyze |
Sanne Govaert |
Pieter Huybrechts | ***
Falk Mielke |
Adriaan Seynaeve |
## Challenge 0
### Cecilia:
```{r}
make_bread <- function(grains, yeast, water, salt) {
# Code to generate `bread`
bread <- grains + yeast + water + salt
return(bread)
}
make_focaccia <- function(grains, yeast, water, salt) {
# Code to generate `focaccia`
focaccia <- grains + 1.5 * yeast + 0.7 * water + 2 * salt
return(focaccia)
}
make_doughs <- function(grains, yeast, water, salt) {
# Function to generate `bread`
bread <- make_bread(grains, yeast, water, salt)
# Function to generate `bread`
focaccia <- make_focaccia(grains, yeast, water, salt)
doughs <- list(bread = bread, focaccia = focaccia)
return(doughs)
}
```
### Thierry
```
make_doughs <- function(grains, yeast, water, salt) {
doughs <- list(
bread = make_bread(
grains = grains, yeast = yeast, water = water, salt = salt
),
focaccia = make_focaccia(
grains = grains, yeast = yeast, water = water, salt = salt
)
)
return(doughs)
}
make_bread <- function(grains, yeast, water, salt) {
grains + yeast + water + salt
}
make_focaccia <- function(grains, yeast, water, salt) {
grains + 1.5 * yeast + 0.7 * water + 2 * salt
}
```
### Falk
```{r}
make_dough_making_function <- function(m1, m2, m3, m4) {
function(grains, yeast, water, salt) {
m1*grains + m2*yeast + m3*water + m4*salt
}
}
make_bread <- make_dough_making_function(1, 1, 1, 1)
make_focaccia <- make_dough_making_function(1, 1.5, 0.7, 2)
make_doughs <- function(grains, yeast, water, salt) {
# Combine bread and focaccia as a list of doughs
doughs <- list(bread = make_bread(grains, yeast, water, salt),
focaccia = make_focaccia(grains, yeast, water, salt))
return(doughs)
}
```
### Kaat
```r
make_bread <- function(grains, yeast, water, salt) {
# Code to generate `bread`
bread <- grains + yeast + water + salt
return(bread)}
make_foccacia <- function(grains, yeast, water, salt) {
# Code to generate `bread`
focaccia <- grains + 1.5 * yeast + 0.7 * water + 2 * salt
return(focaccia)}
make_doughs <- function(grains, yeast, water, salt) {
# Code to generate `bread` and `focaccia`
bread <- make_bread(grains, yeast, water, salt)
focaccia <- make_foccacia(grains, yeast, water, salt)
# Combine bread and focaccia as a list of doughs
doughs <- list(bread = bread, focaccia = focaccia)
return(doughs)
}
```
## Robrecht
```r
make_doughs <- function(grains, yeast, water, salt, output = "both"){
if(output %in% c("both", "bread", "foccacia")){
bread <- grains + yeast + water + salt
foccacia <- grains + 1.5 * yeast + 0.7 * water + 2 * salt
doughs <- list(bread = bread, foccacia = foccacia)
ifelse(output == "bread", return(doughs$bread),
ifelse(output == "foccacio",return(doughs$foccacia),
return(doughs)))
}
else{
return("please provide output in the form as 'both',
'bread' or 'foccacia'")
}
}
```
## Challenge 1
### Damiano's solution (example)
Copy paste this section to show your solutions.
```r
# dummy code
print("This is how to insert code.")
```
### Jorre
```r
get_obs_2010 <- function(species){
# Set scientific name to lowercase
species <- tolower(species)
# Replace spaces with underscores
species <- str_replace_all(
species,
pattern = " ",
replacement = "_"
)
# Compose filename
file_name <- paste0("20240926_", species, "_2010", ".txt")
# Read file
ha_2010 <- read_tsv(paste0("./data/20240926/",file_name))
return(ha_2010)
}
get_obs <- function(speciesm,year){
# Set scientific name to lowercase
species <- tolower(species)
# Replace spaces with underscores
species <- str_replace_all(
species,
pattern = " ",
replacement = "_"
)
# Compose filename
file_name <- paste0("20240926_", species, "_",year, ".txt")
# Read file
ha <- read_tsv(paste0("./data/20240926/",file_name))
return(ha)
}
```
### Kaat
```
get_obs_2010 <- function (species){
# Set scientific name to lowercase
species <- tolower(species)
# Replace spaces with underscores
species <- str_replace_all(
species,
pattern = " ",
replacement = "_"
)
# Compose filename
file_name <- paste0("20240926_", species, "_2010", ".txt")
# Read file
ha_2010 <- read_tsv(paste0("./data/20240926/",
file_name))
return(ha_2010)
}
get_obs <- function (species, year){
# Set scientific name to lowercase
species <- tolower(species)
# Replace spaces with underscores
species <- str_replace_all(
species,
pattern = " ",
replacement = "_"
)
# Compose filename
file_name <- paste0("20240926_", species,"_", year, ".txt")
# Read file
ha <- read_tsv(paste0("./data/20240926/",
file_name))
return(ha)
}
```
### Nele
```r
get_obs_2010 <- function(species) {
species <- tolower(species)
species <- str_replace_all(
species,
pattern = " ",
replacement = "_"
)
file_name <- paste0("20240926_", species, "_2010", ".txt")
ha_2010 <- read_tsv(paste0("./data/20240926/",
file_name))
return(as.data.frame(ha_2010))
}
df_H_axyridis <- get_obs_2010("Harmonia axyridis")
get_obs <- function(species, year) {
species <- tolower(species)
species <- str_replace_all(
species,
pattern = " ",
replacement = "_"
)
file_name <- paste0("20240926_", species, "_", year, ".txt")
dataset_obs <- read_tsv(paste0("./data/20240926/",
file_name))
return(as.data.frame(dataset_obs))
}
df_H_axyridis <- get_obs("Harmonia axyridis", "2011")
```
### Thierry
```
read_data <- function(species = "Harmonia axyridis", year = 2010, root = ".") {
stopifnot(require(assertthat), require(readr))
assert_that(
is.string(species), noNA(species), is.string(root), noNA(root),
file_test("-d", root), is.count(year)
)
tolower(species) |>
gsub(pattern = " ", replacement = "_") |>
sprintf(fmt = "20240926_%s_%i.txt", year) -> filename
file.path(root, filename) |>
read_tsv()
}
```
### Sanne's solution
```r
# Challenge 1.1
get_obs_2010 <- function(species) {
# Set scientific name to lowercase and replace spaces with underscores
species <- tolower(species) %>%
stringr::str_replace_all(pattern = " ", replacement = "_")
# Compose filename
file_name <- paste0("20240926_", species, "_2010", ".txt")
# Read file
observations_2010 <- readr::read_tsv(paste0("./data/20240926/", file_name))
return(observations_2010)
}
# Challenge 1.2
get_obs <- function(species, year) {
# Set scientific name to lowercase and replace spaces with underscores
species <- tolower(species) %>%
stringr::str_replace_all(pattern = " ", replacement = "_")
# Compose filename
file_name <- paste0("20240926_", species, "_", year, ".txt")
# Read file
observations <- readr::read_tsv(paste0("./data/20240926/", file_name))
return(observations)
}
```
### Cecilia
```{r}
# Define functions
get_obs_2010 <- function(species) {
# Set scientific name to lowercase
species <- tolower(species)
# Replace spaces with underscores
species <- str_replace_all(
species,
pattern = " ",
replacement = "_"
)
# Compose filename
file_name <- paste0("20240926_", species, "_2010", ".txt")
# Read file
ha_2010 <- read_tsv(paste0("./data/20240926/",
file_name))
return(ha_2010)
}
get_obs <- function(species, year) {
# Set scientific name to lowercase
species <- tolower(species)
# Replace spaces with underscores
species <- str_replace_all(
species,
pattern = " ",
replacement = "_"
)
# Compose filename
file_name <- paste0("20240926_", species, "_", year, ".txt")
# Read file
ha <- read_tsv(paste0("./data/20240926/",
file_name))
return(ha)
}
# Define variable
species <- "Harmonia axyridis"
year <- "2011"
# Call function related
ha_2010 <- get_obs_2010(species)
ha <- get_obs(species, year)
```
### Robrecht
```
get_obs_2010 <- function(...){
files <- list.files(pattern = ".*.txt")
all_data <- c()
for(file in files){
data <- read.delim(file, header = TRUE)
all_data <- merge(all_data, data, all = TRUE)
}
return(all_data)
}
data <- get_obs_2010()
```
### Falk
```r
get_obs <- function(year, species) {
# Set scientific name to lowercase
fn_species <- tolower(species)
# Replace spaces with underscores
fn_species <- stringr::str_replace_all(fn_species, pattern = " ", replacement = "_")
# check year input
if (!is.character(year)) {
fn_year <- paste0(year)
} else {
fn_year <- year
}
# get current folder
fn_date <- "20240926" # format(lubridate::today(), format = "%Y%m%d")
file_name <- paste0("./data/",
fn_date, "/",
fn_date, "_",
fn_species, "_",
fn_year, ".txt"
)
# Read file
data <- readr::read_tsv(file_name)
return(data)
}
get_obs(2024, "Harmonia axyridis")
```
## Challenge 2
### Thierry
```
clean_data <- function(
data, max_coord_uncertain = 1000,
issues_to_discard = c(
"ZERO_COORDINATE", "COORDINATE_OUT_OF_RANGE", "COORDINATE_INVALID",
"COUNTRY_COORDINATE_MISMATCH"
),
occurrenceStatus_to_discard = c("absent", "excluded")
) {
stopifnot(require(assertthat), require(tidyverse))
assert_that(
inherits(data, "data.frame"), has_name(data, "issue"),
has_name(data, "occurrenceStatus"),
has_name(data, "coordinateUncertaintyInMeters"),
is.number(max_coord_uncertain), noNA(max_coord_uncertain)
)
issues_to_discard <- match.arg(issues_to_discard, several.ok = TRUE)
occurrenceStatus_to_discard <- match.arg(
occurrenceStatus_to_discard, several.ok = TRUE
)
data |>
filter(
.data$coordinateUncertaintyInMeters < max_coord_uncertain |
is.na(.data$coordinateUncertaintyInMeters),
!.data$issue %in% issues_to_discard,
!.data$occurrenceStatus %in% occurrenceStatus_to_discard
)
}
calc_grid_cell <- function(df, lon_step = 0.1, lat_step = 0.05) {
stopifnot(require(assertthat), require(tidyverse))
assert_that(
inherits(df, "data.frame"), has_name(df, "decimalLongitude"),
has_name(df, "decimalLongitude"), is.number(lon_step), noNA(lon_step),
is.number(lat_step), noNA(lat_step)
)
df |>
mutate(
cell_code = sprintf(
fmt = "01x005E%iN%i", floor(.data$decimalLongitude / lon_step),
floor(.data$decimalLatitude / lat_step))
)
}
calc_grid_cell_base <- function(
df, lon = "decimalLongitude", lat = "decimalLatitude", lon_step = 0.1,
lat_step = 0.05
) {
stopifnot(require(assertthat))
assert_that(
inherits(df, "data.frame"), has_name(df, lat), has_name(df, lon),
is.number(lon_step), noNA(lon_step), is.number(lat_step), noNA(lat_step)
)
df$cell_code <- sprintf(
fmt = "01x005E%iN%i", floor(df[, lon] / lon_step),
floor(df[, lat] / lat_step)
)
}
calc_n_obs_ind <- function(df) {
stopifnot(require(assertthat), require(tidyverse))
assert_that(
inherits(df, "data.frame"), has_name(df, "cell_code"),
has_name(df, "individualCount")
)
df |>
group_by(.data$cell_code) %>%
summarise(
n_observations = n(),
n_individuals = sum(.data$individualCount)
)
}
plot_distr_cells <- function(df, binwidth = 5) {
stopifnot(require(assertthat), require(tidyverse))
assert_that(
inherits(df, "data.frame"), has_name(df, "n_observations"),
has_name(df, "n_individuals"), is.number(binwidth), noNA(binwidth),
binwidth > 0
)
df |>
pivot_longer(
cols = c("n_observations", "n_individuals"), values_to = "n",
names_to = "indicator", names_pattern = "n_?(.*)"
) |>
ggplot() +
geom_histogram(
aes(x = n, fill = indicator), position = "dodge", binwidth = binwidth
) +
xlab("n (binwidth: 5)") +
ggtitle(label = "Grid cells distribution")
}
```
### Cecilia
```{r}
clean_data <- function(df,
max_coord_uncertain = 1000,
issues_to_discard = c(
"ZERO_COORDINATE",
"COORDINATE_OUT_OF_RANGE",
"COORDINATE_INVALID",
"COUNTRY_COORDINATE_MISMATCH"),
occurrenceStatus_to_discard = c(
"absent",
"excluded")
) {
## STEP 2: data cleaning
# Remove observations with coordinate uncertainty higher than 1000 meters
df <-
df %>%
filter(coordinateUncertaintyInMeters < max_coord_uncertain | is.na(coordinateUncertaintyInMeters))
# Remove data with some geographic issues
df <-
df %>%
filter(!issue %in% issues_to_discard)
# Remove absences
df <-
df %>%
filter(!occurrenceStatus %in% occurrenceStatus_to_discard)
return(df)
}
calc_grid_cell <- function(df, lon = 0.1, lan = 0.05) {
## STEP 3: get the grid cell code each observation belongs to and add it to
## the data.frame. We use grid cells of 0.1 lon degrees x 0.05 lat degrees
df <-
df %>%
mutate(cell_code = paste0(
"01x005",
"E", floor(decimalLongitude/lan), # cell size
"N", floor(decimalLatitude/lon)))
return(df)
}
ha_clean <- clean_data(ha)
ha_clean_extra_col <- calc_grid_cell(ha_clean)
```
### Nele
```{r}
#2.1: clean_data():
clean_data <- function(obs_data) {
obs_dat <- obs_data %>%
filter(coordinateUncertaintyInMeters < 1000 | is.na(coordinateUncertaintyInMeters))
obs_dat %>% filter(!issue %in% c(
"ZERO_COORDINATE",
"COORDINATE_OUT_OF_RANGE",
"COORDINATE_INVALID",
"COUNTRY_COORDINATE_MISMATCH"
))
obs_dat <-
obs_dat %>%
filter(!occurrenceStatus %in% c(
"absent",
"excluded"
))
}
clean_H_axyridis <- clean_data(df_H_axyridis)
#2.2: calc_grid_cell():
calc_grid_cell <- function(clean_data, name_long = "decimalLongitude",
name_lat = "decimalLatitude",lon = 0.1, lat = 0.05) {
clean_data <-
clean_data %>%
mutate(cell_code = paste0(
"01x005",
"E", floor(.data[[name_long]]/lon), #not just use name_long, because R will expect a colname names name_long
"N", floor(.data[[name_lat]]/lat)))
}
grid_H_axyridis <- calc_grid_cell(clean_H_axyridis)
#2.3: calc_n_obs_ind():
calc_n_obs_ind <- function(dataframe_obs) {
n_obs <- dataframe_obs %>%
group_by(cell_code) %>%
summarise(n_observations = n(), # number of observations (rows)
n_individuals = sum(individualCount)) # number of individuals
}
obs_H_axyridis <- calc_n_obs_ind(grid_H_axyridis)
#2.4: plot_distr_cells():
plot_distr_cells <- function (obs_count, binsize = 5) {
n_obs <- obs_count %>%
tidyr::pivot_longer(cols = c(n_observations, n_individuals),
values_to = "n",
names_to = "indicator",
names_pattern = "n_?(.*)"
)
p <- ggplot(n_obs) +
geom_histogram(aes(x = n, fill = indicator),
position = "dodge",
binwidth = binsize) +
xlab("n (binwidth: 5)") +
ggtitle(label = "Grid cells distribution")
p
return(p)
}
plot_distr_cells(obs_H_axyridis, 10)
```
### Jorre
```
clean_data <- function(
df,
max_coord_uncertain=1000,
issues_to_discard=c(
"ZERO_COORDINATE",
"COORDINATE_OUT_OF_RANGE",
"COORDINATE_INVALID",
"COUNTRY_COORDINATE_MISMATCH"
),
occurrenceStatus_to_discard=c(
"absent",
"excluded"
)
){
stopifnot(require(assertthat), require(tidyverse))
assert_that(
is.data.frame(df),
is.numeric(max_coord_uncertain), max_coord_uncertain>=0,
is.character(issues_to_discard),
is.character(occurrenceStatus_to_discard)
)
df |>
filter(
coordinateUncertaintyInMeters < max_coord_uncertain |
is.na(coordinateUncertaintyInMeters)
) |>
filter(!issue %in% issues_to_discard) |>
filter(!occurrenceStatus %in% occurrenceStatus_to_discard)
}
calc_grid_cell <- function(df,
lon=.1,lat=.05,
lonvar='decimalLongitude',latvar='decimalLatitude'
){
stopifnot(require(assertthat), require(tidyverse))
assert_that(
is.data.frame(df),
is.numeric(lon), is.numeric(lat),
is.character(lonvar),is.character(latvar)
)
df |>
mutate(
cell_code = paste0(
"01x005",
"E", floor(!!sym(lonvar)/lon),
"N", floor(!!sym(latvar)/lat)
)
)
}
```
### Falk
```r
clean_data <- function(df,
max_coord_uncertain = 1000,
issues_to_discard = c("ZERO_COORDINATE",
"COORDINATE_OUT_OF_RANGE",
"COORDINATE_INVALID",
"COUNTRY_COORDINATE_MISMATCH"
),
occurrenceStatus_to_discard = c("absent", "excluded")
) {
# Remove observations with coordinate uncertainty higher than max_coord_uncertain
df <- df %>%
filter(coordinateUncertaintyInMeters < max_coord_uncertain
, !is.na(coordinateUncertaintyInMeters)
)
# Remove data with some geographic issues
df <- df %>%
filter(!issue %in% issues_to_discard)
# Remove absences
df <- df %>%
filter(!occurrenceStatus %in% occurrenceStatus_to_discard)
return(df)
}
```
*(I got distracted afterwards...)*
### Emma's solution
```r
clean_data <- function(df,
max_coord_uncertain = 1000,
issues_to_discard = c(
"ZERO_COORDINATE",
"COORDINATE_OUT_OF_RANGE",
"COORDINATE_INVALID",
"COUNTRY_COORDINATE_MISMATCH"
),
occurrenceStatus_to_discard = c(
"absent",
"excluded"
)){
# Remove observations with coordinate uncertainty higher than 1000 meters
data_clean <-
df %>%
filter(coordinateUncertaintyInMeters < max_coord_uncertain |
is.na(coordinateUncertaintyInMeters))
# Remove data with some geographic issues
data_clean <-
data_clean %>%
filter(!issue %in% issues_to_discard)
# Remove absences
data_clean <-
data_clean %>%
filter(!occurrenceStatus %in% occurrenceStatus_to_discard)
return(data_clean)
}
calc_grid_cell <- function(df,
name_long = "decimalLongitude",
name_lat = "decimalLatitude",
lat = 0.05,
lon = 0.1){
df_grid <-
df %>%
mutate(cell_code = paste0(
"01x005",
"E", floor(get(name_long)/lon),
"N", floor(get(name_lat)/lat)))
return(df_grid)
}
calc_n_obs_ind <- function(df){
n_obs_ind <-
df %>%
group_by(cell_code) %>%
summarise(n_observations = n(), # number of observations (rows)
n_individuals = sum(individualCount)) # number of individuals
return(n_obs_ind)
}
plot_distr_cells <- function(df, binwidth = 5){
n_obs_ind_ha <-
df %>%
tidyr::pivot_longer(cols = c(n_observations, n_individuals),
values_to = "n",
names_to = "indicator",
names_pattern = "n_?(.*)"
)
p <- ggplot(n_obs_ind_ha) +
geom_histogram(aes(x = n, fill = indicator),
position = "dodge",
binwidth = binwidth) +
xlab(paste0("n (binwidth: ", binwidth, ")")) +
ggtitle(label = "Grid cells distribution")
return(p)
}
ha_2010 <- get_obs("Harmonia axyridis", 2010)
cleaned_ha_2010 <- clean_data(ha_2010)
grid_ha_2010 <- calc_grid_cell(cleaned_ha_2010)
n_obs_ha_2010 <- calc_n_obs_ind(grid_ha_2010)
plot_distr_cells(n_obs_ha_2010)
```
### Pieter
```r
#' Data Cleaning
#'
#' @param df data.frame with observations
#' @param max_coord_uncertain maximum of coordinateUncertaintyInMeters allowed (numeric).
#' @param issues_to_discard issues whose obs have to be removed (character).
#' @param occurrenceStatus_to_discard the occurrenceStatus values whose obs have to be removed (character).
#'
#' @return data.frame with the cleaned observations
#'
#' @examples clean_data(df = get_obs(species = "Harmonia axyridis", year = 2011))
clean_data <- function(df,
max_coord_uncertain = 1000,
issues_to_discard = c(
"ZERO_COORDINATE",
"COORDINATE_OUT_OF_RANGE",
"COORDINATE_INVALID",
"COUNTRY_COORDINATE_MISMATCH"
),
occurrenceStatus_to_discard = c(
"absent",
"excluded"
)) {
# Check if all the required columns are present in the input data.frame
assertthat::assert_that(assertthat::has_name(df, "coordinateUncertaintyInMeters"))
assertthat::assert_that(assertthat::has_name(df, "issue"))
assertthat::assert_that(assertthat::has_name(df, "occurrenceStatus"))
# Remove observations with coordinate uncertainty higher than the maximum
# allowed value
if (!missing(max_coord_uncertain)) {
df <-
dplyr::filter(df, coordinateUncertaintyInMeters < max_coord_uncertain) %>%
dplyr::filter(!is.na(coordinateUncertaintyInMeters))
}
# Remove data with some geographic issues
if (!missing(issues_to_discard)) {
df <- dplyr::filter(df, !issue %in% issues_to_discard)
}
# Remove absences
if (!missing(occurrenceStatus_to_discard)) {
df <- dplyr::filter(df, !occurrenceStatus %in% occurrenceStatus_to_discard)
}
# Return cleaned data.frame
return(df)
}
#' Calculate grid cell code based on decimal longitude and latitude
#'
#' This function calculates the grid cell code for each row in a data frame based on the decimal longitude and latitude values.
#' The grid cell code is generated by dividing the longitude and latitude values by the specified cell sizes and concatenating them with the grid cell code prefix.
#'
#' @param df The input data frame.
#' @param decimalLongitude The name of the column containing the decimal longitude values.
#' @param decimalLatitude The name of the column containing the decimal latitude values.
#' @param cell_size_E The size of the grid cells in the east-west direction.
#' @param cell_size_N The size of the grid cells in the north-south direction.
#'
#' @return The input data frame with an additional column "cell_code" containing the grid cell codes.
calculate_grid_cell_code <- function(df, decimalLongitude, decimalLatitude, cell_size_E, cell_size_N) {
# Function implementation goes here
# ...
# ...
}
# Usage example:
# df <- data.frame(decimalLongitude = c(1.234, 2.345, 3.456),
# decimalLatitude = c(4.567, 5.678, 6.789))
# df <- calculate_grid_cell_code(df, "decimalLongitude", "decimalLatitude", 0.1, 0.1)
#'
#' @examples
#' df <- data.frame(decimalLongitude = c(10.5, 11.2, 12.7),
#' decimalLatitude = c(50.2, 51.8, 52.3))
#' calc_grid_cell(df)
#' # Output:
#' # decimalLongitude decimalLatitude cell_code
#' # 1 10.5 50.2 01x005E210N100
#' # 2 11.2 51.8 01x005E224N103
#' # 3 12.7 52.3 01x005E254N104
calc_grid_cell <- function(df,
decimalLongitude = "decimalLongitude",
decimalLatitude = "decimalLatitude",
cell_size_E = 0.1,
cell_size_N = 0.05){
dplyr::mutate(df,
cell_code = paste0(
"01x005",
"E", floor(.data[[decimalLongitude]]/cell_size_E),
"N", floor(.data[[decimalLatitude]]/cell_size_N)))
}
#' Calculate the number of observations and individuals per cell code
#'
#' This function takes a data frame as input and calculates the number of observations
#' (rows) and the number of individuals per cell code. It groups the data frame by the
#' cell code and then calculates the counts using the `n()` and `sum()` functions.
#'
#' @param df The input data frame
#' @return A data frame with the number of observations and individuals per cell code
#'
#' @examples
#' df <- data.frame(cell_code = c("A", "A", "B", "B"),
#' individualCount = c(1, 2, 3, 4))
#' calc_n_obs_ind(df)
#'
calc_n_obs_ind <- function(df){
df %>%
group_by(cell_code) %>%
summarise(n_observations = n(), # number of observations (rows)
n_individuals = sum(individualCount)) # number of individuals
}
#' Plot Distribution of Cells
#'
#' This function takes a data frame and a width parameter as input and plots the distribution of grid cells.
#'
#' @param df The data frame containing the data for the plot.
#' @param width The width parameter for the histogram binwidth.
#'
#' @return Plot of the distribution of grid cells.
#'
#' @examples
#' df <- data.frame(n_observations = c(10, 20, 30),
#' n_individuals = c(5, 10, 15))
#' plot_distr_cells(df, 5)
#'
plot_distr_cells <- function(df, width){
# Prepare the data for the plot
n_obs_ind <- df %>%
tidyr::pivot_longer(cols = c(n_observations, n_individuals),
values_to = "n",
names_to = "indicator",
names_pattern = "n_?(.*)"
)
# Create the plot
p <- ggplot(n_obs_ind) +
geom_histogram(aes(x = n, fill = indicator),
position = "dodge",
binwidth = width) +
xlab(sprintf("n (binwidth: %s)", width)) +
ggtitle(label = "Grid cells distribution")
# Return the plot
return(p)
}
```
## Challenge 3
### Emma's solution
```r
analyse_obs <- function(species, year, ...){
ha_2010 <- get_obs(species, year)
cleaned_ha_2010 <- clean_data(ha_2010, ...)
grid_ha_2010 <- calc_grid_cell(cleaned_ha_2010)
n_obs_ha_2010 <- calc_n_obs_ind(grid_ha_2010)
p <- plot_distr_cells(n_obs_ha_2010)
return(list(data = n_obs_ha_2010, plot = p))
}
```
### Pieter
```r
analyse_obs <- function(species,
year,
max_coord_uncertain = 1000,
issues_to_discard = c(
"ZERO_COORDINATE",
"COORDINATE_OUT_OF_RANGE",
"COORDINATE_INVALID",
"COUNTRY_COORDINATE_MISMATCH"
),
occurrenceStatus_to_discard = c(
"absent",
"excluded"
),
decimalLongitude = "decimalLongitude",
decimalLatitude = "decimalLatitude",
cell_size_E = 0.1,
cell_size_N = 0.05,
width = 10) {
n_obs_ind <-
get_obs(species, year) %>%
clean_data(max_coord_uncertain, issues_to_discard, occurrenceStatus_to_discard) %>%
calc_grid_cell(decimalLongitude, decimalLatitude, cell_size_E, cell_size_N) %>%
calc_n_obs_ind()
plot <- plot_distr_cells(n_obs_ind, width)
return(list(n_obs_ind = n_obs_ind, plot = plot))
}
```
```r
ana_result <-
analyse_obs(
species = "Chorthippus biguttulus",
year = 2010,
max_coord_uncertain = 5000,
decimalLatitude = "latitude",
decimalLongitude = "longitude",
width = 5
)
ana_result$n_obs_ind
ana_result$plot
analyse_obs(
species = "Harmonia axyridis",
year = 2011
)$plot
```
## Notes
### Example of data masking using tidyselect
```r
my_select <- function(df, cols_to_select){
dplyr::select(df, dplyr::all_of(cols_to_select))
}
my_select(population, c("country", "year"))
`
```