--- title: "Target Actor 2023" author: "Jia-Shen" date: "2023-08-24" output: html_document --- # Target Actor 2023 ## read all data from source ```{r} rm(list = ls()) setwd('~/🪅Master/12_internship/2023 Global Climate Action report/Raw_data_formatted/data_2023/CDP Cities') library(tidyverse) x <- list.dirs("..")[grepl("clean data", list.dirs(".."), ignore.case = T)] get_dirs <- function(path){ # Takes a path to the folder containing the cleaned data folders and # returns a vector that contains all the folders with clean data x <- list.dirs("..")[grepl("clean data", list.dirs(".."), ignore.case = T)] return(x) } path = ".." directory <- get_dirs(path) get_files <- function(directory, path){ # Iterates through the folders containing clean data and reads them into # the Global Env. # Appends _ datasource name to the end of the table name for (x in directory){ ds <- gsub(paste0(path, "/(.*?)/.*"), "\\1", x) for (y in list.files(x)){ assign(paste0(gsub(("(.*?)\\.csv"), "\\1", y), "_", ds), read.csv(paste0(x, "/", y)), envir = .GlobalEnv) } } } get_files(directory, path) combine_df <- function(table_name){ # Creates and combines a table based on the individual data source tables tables <- ls(.GlobalEnv, pattern = paste0(table_name, "_")) if (length(tables) == 0){ print(paste0("No tables found for ", table_name)) } df <- mget(tables, envir = .GlobalEnv) %>% lapply(., function (x) mutate(x, across(names(x), ~as.character(.x)))) %>% bind_rows() %>% distinct() return(df) } # Prioritise data for targets/emissions Target <- combine_df('Target') Actor <- combine_df('Actor') Territory <- combine_df('Territory') ``` ## combine the source ```{r} Target <- combine_df('Target') # 12591 obs. of 16 variables Actor <- combine_df('Actor') # 18655 obs. of 11 variables Territory <- combine_df('Territory') # 18353 obs. of 10 variables ``` ```{r} Target_i <- Target %>% select(target_id, actor_id, target_type, target_year, target_unit, target_value, baseline_value, baseline_year, net.emiss, baseline_unit, summary) # 12591 obs. of 11 variables Target_I <- distinct(Target_i, target_id, .keep_all = TRUE) # 12157 obs. of 11 variables Actor_I <- Actor %>% select(actor_id, name, iso) # 18655 obs. of 3 variables Territory_I <- Territory %>% select(actor_id, area, lat, lng, City.Location, before) # 18353 obs. of 6 variables #actor_id, target_type, target_year, target_unit, target_value, baseline_value, baseline_year, net.emiss, baseline_unit, summary by target_id in Target #type, name, iso in Actor, by actor_id #area, lat, lng, City.location, before in Territory, by actor_id Target_a <- left_join(Target_I, Actor_I, by = "actor_id") # 15155 obs. of 13 variables # indicating 2998 repetitive actor_id in Actor dataset Target_b <- left_join(Target_a, Territory_I, by = "actor_id") # 23436 obs. of 18 variables # indicating 5083 repetitive actor_id in Territory dataset Target_Final <- distinct(Target_b, target_id, .keep_all = TRUE) # 12157 obs. of 18 variables Target_F_2023 <- subset(Target_Final, target_year >= 2023) write.csv(Target_F_2023, file = "target_2023.csv") ``` ## Read the target_actor dataset ```{r compare the dataset} ta2022 <- read.csv('target_actor2022.csv') ta2023 <- read.csv('target_actor2023.csv') ta2022_n <- ta2022 %>% subset(target_year >= 2022) ta2023_n <- ta2023 %>% subset(target_year >= 2023) ta_ant <- anti_join(ta2022_n, ta2023_n, by = 'target_id') ta_ant ``` ```{r combind the 2022 targets that hasn't expired} ta2023_nn <- bind_rows(ta2023_n, ta_ant) ta2023_nn # 5314 obs of 21 variables write.csv(ta2023_nn, file = 'ta2023.csv') ``` ## Compare the dataset ```{r} ## find the data that exist in JS's analysis but not in the 'target_actor2023.csv' file A_comp <- anti_join(Target_F_2023, ta2023_nn, by = 'target_id') # 2012 obs. of 18 variables write.csv(A_comp, file = 'comp2023.csv') ```