24 Feburary 2022
Welcome!
If you want to share your code snippet, copy paste your snippet within a section of three backticks (```):
As an example:
library(tidyverse)
(you can copy paste this example and add your code further down)
No yellow sticky notes online. Put your name + " | " and add a "*" each time you solve a challenge (see below).
Name | Challenges |
---|---|
Damiano Oldoni | *** |
Emma Cartuyvels | ** |
Matthieu Chastel | |
Hans Van Calster | * |
Vincent Smeekens | * |
Adriaan Seynaeve | |
Abhinav Malasi | |
Jasmijn Hillaert |
Emma:
# read data file `20220224_gent_groeiperwijk.txt` containing the demographic
# evolution of Ghent districts from 1999 to 2009
gent_groeiperwijk <- read_delim("data/20220224/20220224_gent_groeiperwijk.txt",
delim = ";",
col_types = "cnnnnnnnnnnn",
quote = "''")
# read sheet `ALL DATA` from Excel datafile `20220224_manta_2014_2015.xlsx` with
# manta related data
manta_2014_2015 <- read_excel("data/20220224/20220224_manta_2014_2015.xlsx",
sheet = "ALL DATA",
skip = 1,
na = "/")
# Read butterfly related data file `20220224_butterflies_counts.txt` with
# semicolon (;) as delimiter, dot (.) as decimal mark and question mark (?) as
# NA. Read only the first 500 rows. Be sure column `Date` is parsed as
# datetime by using `format = "%d/%m/%Y %H:%M:%S"`.
butterflies_counts <- read_delim("data/20220224/20220224_butterflies_counts.txt",
delim = ";",
locale = locale(date_format = "%d/%m/%Y %H:%M:%S",
decimal_mark = "."),
na = "?",
n_max = 500)
Hans
library(readr)
library(readxl)
library(googlesheets4)
library(here)
## CHALLENGE 1
# read data file `20220224_gent_groeiperwijk.txt` containing the demographic
# evolution of Ghent districts from 1999 to 2009
datapath <- here("data", "20220224")
gent_groei <- read_delim(
here(datapath, "20220224_gent_groeiperwijk.txt"),
quote = "'")
gent_groei
# read sheet `ALL DATA` from Excel datafile `20220224_manta_2014_2015.xlsx` with
# manta related data
manta <- read_excel(here(datapath, "20220224_manta_2014_2015.xlsx"),
sheet = "ALL DATA", skip = 1)
# Read butterfly related data file `20220224_butterflies_counts.txt` with
# semicolon (;) as delimiter, dot (.) as decimal mark and question mark (?) as
# NA. Read only the first 500 rows. Be sure column `Date` is parsed as
# datetime by using `format = "%d/%m/%Y %H:%M:%S"`.
butterflies <- read_delim(
file = here(datapath, "20220224_butterflies_counts.txt"),
delim = ";",
na = "?",
locale = locale(
decimal_mark = ".",
date_format = "%d/%m/%Y %H:%M:%S"),
n_max = 500)
butterflies
Matthieu
gent_groeip <- read.csv2("./data/20220224/20220224_gent_groeiperwijk.txt")
Manta <- read_excel("./data/20220224/20220224_manta_2014_2015.xlsx", sheet = "ALL DATA")
Hans
## CHALLENGE 2: googlesheets
# 1. Read the sheet `HyaHoVrBl` from a googlesheet about [hyacint coverage in Ename](https://docs.google.com/spreadsheets/d/1Tc8U-ud4dEcxgOojS80w7gdQQ-Ac85A4dN-u47NSmYg)(https://docs.google.com/spreadsheets/d/1Tc8U-ud4dEcxgOojS80w7gdQQ-Ac85A4dN-u47NSmYg) paying attention to import all columns correctly
hya <- read_sheet(
ss = "1Tc8U-ud4dEcxgOojS80w7gdQQ-Ac85A4dN-u47NSmYg",
sheet = "HyaHoVrBl",
na = c("NA", "na", "NaN")) %>%
janitor::clean_names()
hya
# Extra: column names contain spaces, let's get rid of them with janitor package
if (!"janitor" %in% installed.packages()) {
install.packages("janitor")
}
# 2. Get metadata of the Google Sheet file
hya_meta <- sheet_properties(ss = "1Tc8U-ud4dEcxgOojS80w7gdQQ-Ac85A4dN-u47NSmYg")
hya_meta
# 3. But what if you don't want to allow tidyverse API to get access to your INBO
# Google account? Well, you can publish the sheet to the web as csv and then
# import it via `read_delim()` or `read_csv()`. Note: This method is useful if you
# don't have to import a lot of Googlesheets and they can be publicly available.
# Read the csv generated in this way by using this link:
hyacint_cov_link <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vTpyMEyzb_M8vrSFUSzTegR4czallWdSUsDCuivdUIwdy7g8PIfrEJtgGBIXPV2UKKRaIZxN7o2O085/pub?gid=1007047746&single=true&output=csv"
hya_csv <- read_csv(hyacint_cov_link,
na = c("NA", "na", "NaN")) %>%
janitor::clean_names()
hya_csv
Emma:
ss = "https://docs.google.com/spreadsheets/d/1Tc8U-ud4dEcxgOojS80w7gdQQ-Ac85A4dN-u47NSmYg"
HyaHoVrBl <- read_sheet(ss,
sheet = "HyaHoVrBl")
# Extra: column names contain spaces, let's get rid of them with janitor package
if (!"janitor" %in% installed.packages()) {
install.packages("janitor")
}
library(janitor)
HyaHoVrBl <- clean_names(HyaHoVrBl)
# 2. Get metadata of the Google Sheet file
gs4_get(ss)
# 3. But what if you don't want to allow tidyverse API to get access to your INBO
# Google account? Well, you can publish the sheet to the web as csv and then
# import it via `read_delim()` or `read_csv()`. Note: This method is useful if you
# don't have to import a lot of Googlesheets and they can be publicly available.
# Read the csv generated in this way by using this link:
hyacint_cov_link <- read_delim("https://docs.google.com/spreadsheets/d/e/2PACX-1vTpyMEyzb_M8vrSFUSzTegR4czallWdSUsDCuivdUIwdy7g8PIfrEJtgGBIXPV2UKKRaIZxN7o2O085/pub?gid=1007047746&single=true&output=csv",
delim = ",")
Hans
## CHALLENGE 3: locale
# You get meteorological data from Wallonia (Waterloo) in Walloon! 1. Parse
# column `Hour` as a column containing number (by default read as a character
# column) 2. Parse column `AV Quality Code` as a column containing numbers, not
# as logicals 3. Parse the column `Date` as a date column. Hint: give a look to
# the readr's [`locale`
# vignette](https://readr.tidyverse.org/articles/locales.html) and check the
# structure of a typical date names locale, e.g. nl <- locale("nl")$date_names.
# This is a nice datetime [format specifications
# compendium](https://www.stat.berkeley.edu/~s133/dates.html)
days_walloon <- c("londi", "mårdi", "mierkidi", "djudi", "vénrdi",
"semdi", "dimegne")
months_walloon <- c("djanvî","fevrî", "måss", "avri", "may", "djun", "djulete",
"awousse","setimbre", "octôbe", "nôvimbe", "decimbe")
days_abbr_walloon <- c("lon", "mår", "mie", "dju", "vén", "sem", "dim")
months_abbr_walloon <- c("djan","fev", "mås", "avr", "may", "djun", "djul",
"awou","set", "oct", "nôv", "dec")
walloon_names <- date_names(mon = months_walloon,
mon_ab = months_abbr_walloon,
day = days_walloon,
day_ab = days_abbr_walloon,
am_pm = c("AM", "PM"))
read_delim(file = here(datapath, "20220224_rainfall_waterloo.txt"),
delim = ",", skip = 6,
col_types = cols(Date = col_date("%a %d %B %Y"),
Hour = col_time("%H")),
locale = locale(date_names = walloon_names,
grouping_mark = "'",
decimal_mark = ".",
date_format = "%a %d %B %Y")
)
## CHALLENGE 3: encoding
# Read file 20220224_latin-1_character_set.txt
latin1 <- read_delim(here(datapath, "20220224_latin-1_character_set.txt"),
locale = locale(encoding = "LATIN1"))
iconvlist()
latin1
Encoding(latin1$Char)
latin1$Char[22]
# Read file 20220224_turkish_iso8859-9.txt. Use View() to open the data.frame.
# Do you get the same while printing the data.frame on Console? Why?
turkish <- read_delim(here(datapath, "20220224_turkish_iso8859-9_encoding.txt"),
locale = locale(encoding = "x-mac-turkish"))
#not working...