---
title: "Target Actor 2023"
author: "Jia-Shen"
date: "2023-08-24"
output: html_document
---
# Target Actor 2023
## Read all data from source
```{r}
rm(list = ls())
setwd('~/🪅Master/12_internship/2023 Global Climate Action report/Raw_data_formatted/data_2023/CDP Cities')
library(tidyverse)

get_dirs <- function(path){
  # Takes a path to the folder containing the cleaned data folders and
  # returns a vector of all folders whose name contains "clean data"
  dirs <- list.dirs(path)
  return(dirs[grepl("clean data", dirs, ignore.case = TRUE)])
}

path <- ".."
directory <- get_dirs(path)
get_files <- function(directory, path){
  # Iterates through the folders containing clean data, reads each file into
  # the global environment, and appends "_<data source folder name>" to each
  # table name
  for (x in directory){
    ds <- gsub(paste0(path, "/(.*?)/.*"), "\\1", x)
    for (y in list.files(x)){
      assign(paste0(gsub("(.*?)\\.csv", "\\1", y), "_", ds),
             read.csv(paste0(x, "/", y)),
             envir = .GlobalEnv)
    }
  }
}
get_files(directory, path)
combine_df <- function(table_name){
  # Collects all per-source tables whose name contains table_name followed by "_",
  # coerces every column to character, and binds them into one de-duplicated table
  tables <- ls(.GlobalEnv, pattern = paste0(table_name, "_"))
  if (length(tables) == 0){
    print(paste0("No tables found for ", table_name))
    return(NULL)
  }
  df <- mget(tables, envir = .GlobalEnv) %>%
    lapply(function(x) mutate(x, across(everything(), as.character))) %>%
    bind_rows() %>%
    distinct()
  return(df)
}
```
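Since `get_files()` writes each CSV into the global environment with the data source name appended, a quick optional check (my addition, not part of the original workflow) is to count how many per-source tables were created for each of the three prefixes that `combine_df()` looks for in the next chunk:

```{r}
# Optional sanity check (not in the original pipeline): count the per-source
# tables created by get_files() for each prefix used by combine_df() below.
loaded <- ls(.GlobalEnv)
sapply(c("Target", "Actor", "Territory"),
       function(p) sum(grepl(paste0(p, "_"), loaded)))
```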
## Combine the sources
```{r}
# Prioritise data for targets/emissions
Target <- combine_df('Target') # 12591 obs. of 16 variables
Actor <- combine_df('Actor') # 18655 obs. of 11 variables
Territory <- combine_df('Territory') # 18353 obs. of 10 variables
```
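Because `combine_df()` coerces every column to character before `bind_rows()`, later filters such as `target_year >= 2023` compare strings rather than numbers. A small optional check (my addition, assuming `target_year` is present as selected below) flags any values that would not parse as numbers:

```{r}
# Optional check (not in the original workflow): combine_df() returns all
# columns as character, so flag target_year values that are not numeric.
sum(!is.na(Target$target_year) &
      is.na(suppressWarnings(as.numeric(Target$target_year))))
```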
```{r}
Target_i <- Target %>%
  select(target_id, actor_id, target_type, target_year, target_unit, target_value,
         baseline_value, baseline_year, net.emiss, baseline_unit, summary) # 12591 obs. of 11 variables
Target_I <- distinct(Target_i, target_id, .keep_all = TRUE) # 12157 obs. of 11 variables
Actor_I <- Actor %>%
  select(actor_id, name, iso) # 18655 obs. of 3 variables
Territory_I <- Territory %>%
  select(actor_id, area, lat, lng, City.Location, before) # 18353 obs. of 6 variables
# Join plan:
#   Target_I:    target attributes keyed by target_id
#   Actor_I:     name and iso, joined by actor_id
#   Territory_I: area, lat, lng, City.Location and before, joined by actor_id
Target_a <- left_join(Target_I, Actor_I, by = "actor_id") # 15155 obs. of 13 variables
# 15155 - 12157 = 2998 extra rows, i.e. duplicated actor_id values in the Actor table
Target_b <- left_join(Target_a, Territory_I, by = "actor_id") # 23436 obs. of 18 variables
# further row inflation indicates 5083 duplicated actor_id values in the Territory table
Target_Final <- distinct(Target_b, target_id, .keep_all = TRUE) # 12157 obs. of 18 variables
Target_F_2023 <- subset(Target_Final, target_year >= 2023)
write.csv(Target_F_2023, file = "target_2023.csv")
```
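The comments above infer duplicated `actor_id` values from the row-count inflation after each `left_join()`. A direct, optional cross-check (my addition, not part of the original workflow) is to count duplicates in the lookup tables themselves:

```{r}
# Optional cross-check: count duplicated actor_id values directly in the
# Actor and Territory lookup tables used in the joins above.
sum(duplicated(Actor_I$actor_id))
sum(duplicated(Territory_I$actor_id))
```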
## Read the target_actor datasets
```{r compare the dataset}
ta2022 <- read.csv('target_actor2022.csv')
ta2023 <- read.csv('target_actor2023.csv')
ta2022_n <- ta2022 %>%
  subset(target_year >= 2022)
ta2023_n <- ta2023 %>%
  subset(target_year >= 2023)
# targets (target_year >= 2022) present in the 2022 file but absent from the 2023 file
ta_ant <- anti_join(ta2022_n, ta2023_n, by = 'target_id')
ta_ant
```
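`ta_ant` holds the 2022 targets whose `target_id` no longer appears in the 2023 file. As an optional summary (my addition; it assumes `target_year` is present in both CSVs), the carried-over targets can be tallied by year before they are bound back in:

```{r}
# Optional summary (assumption: target_year exists in both CSVs): distribution
# of target years among the 2022 targets that drop out of the 2023 file.
ta_ant %>% count(target_year, sort = TRUE)
```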
```{r combine the 2022 targets that have not expired}
ta2023_nn <- bind_rows(ta2023_n, ta_ant)
ta2023_nn # 5314 obs. of 21 variables
write.csv(ta2023_nn, file = 'ta2023.csv')
```
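Since `bind_rows()` only appends rows, an optional check (my addition, not part of the original workflow) is to confirm that `target_id` remains unique after adding the carried-over 2022 targets:

```{r}
# Optional check: target_id should still be unique after bind_rows();
# a value of 0 means no target was duplicated by the carry-over step.
sum(duplicated(ta2023_nn$target_id))
```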
## Compare the dataset
```{r}
## Find the targets that exist in JS's analysis (Target_F_2023) but not in the combined 'target_actor2023.csv' data (ta2023_nn)
A_comp <- anti_join(Target_F_2023, ta2023_nn, by = 'target_id') # 2012 obs. of 18 variables
write.csv(A_comp, file = 'comp2023.csv')
```
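As an optional follow-up (my addition; `iso` comes from the `Actor_I` selection above), a quick tally by country code can hint at where the unmatched targets originate before deciding whether to append them to `ta2023.csv`:

```{r}
# Optional follow-up: where do the unmatched targets come from?
# Tally by ISO country code (column inherited from Actor_I).
A_comp %>% count(iso, sort = TRUE) %>% head(10)
```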