Target Actor 2023

read all data from source

rm(list = ls())
setwd('~/🪅Master/12_internship/2023 Global Climate Action report/Raw_data_formatted/data_2023/CDP Cities')
library(tidyverse)

x <- list.dirs("..")[grepl("clean data", list.dirs(".."), ignore.case = T)]

get_dirs <- function(path){
  # Takes a path to the folder containing the cleaned data folders and
  # returns a vector that contains all the folders with clean data
  x <- list.dirs("..")[grepl("clean data", list.dirs(".."), ignore.case = T)]
  return(x)
}

path = ".."

directory <- get_dirs(path)

get_files <- function(directory, path){
  # Iterates through the folders containing clean data and reads them into 
  # the Global Env. 
  # Appends _ datasource name to the end of the table name 
  for (x in directory){
    ds <- gsub(paste0(path, "/(.*?)/.*"), "\\1", x)
    for (y in list.files(x)){
      assign(paste0(gsub(("(.*?)\\.csv"), "\\1", y), "_", ds), read.csv(paste0(x, "/", y)),
             envir = .GlobalEnv)
    }
  }
}

get_files(directory, path)

combine_df <- function(table_name){
  # Creates and combines a table based on the individual data source tables
  tables <- ls(.GlobalEnv, pattern = paste0(table_name, "_"))
  if (length(tables) == 0){
    print(paste0("No tables found for ", table_name))
  }
  df <- mget(tables, envir = .GlobalEnv) %>%
    lapply(., function (x) mutate(x, across(names(x), ~as.character(.x)))) %>% 
    bind_rows() %>%
    distinct()
  return(df)
}

# Prioritise data for targets/emissions 
Target <- combine_df('Target')
Actor <- combine_df('Actor')
Territory <- combine_df('Territory')

combine the source

Target <- combine_df('Target') # 12591 obs. of 16 variables
Actor <- combine_df('Actor') # 18655 obs. of 11 variables
Territory <- combine_df('Territory') # 18353 obs. of 10 variables
Target_i <- Target %>%
  select(target_id, actor_id, target_type, target_year, target_unit, target_value, baseline_value, baseline_year, net.emiss, baseline_unit, summary) # 12591 obs. of 11 variables

Target_I <- distinct(Target_i, target_id, .keep_all = TRUE) # 12157 obs. of 11 variables

Actor_I <- Actor %>%
  select(actor_id, name, iso) # 18655 obs. of 3 variables

Territory_I <- Territory %>%
  select(actor_id, area, lat, lng, City.Location, before) # 18353 obs. of 6 variables

#actor_id, target_type,	target_year,	target_unit,	target_value,	baseline_value,	baseline_year,	net.emiss,	baseline_unit, summary by target_id in Target
#type, name, iso in Actor, by actor_id
#area, lat, lng, City.location, before in Territory, by actor_id


Target_a <- left_join(Target_I, Actor_I, by = "actor_id") # 15155 obs. of 13 variables 
# indicating 2998 repetitive actor_id in Actor dataset
Target_b <- left_join(Target_a, Territory_I, by = "actor_id") # 23436 obs. of 18 variables
# indicating 5083 repetitive actor_id in Territory dataset

Target_Final <- distinct(Target_b, target_id, .keep_all = TRUE) # 12157 obs. of 18 variables

Target_F_2023 <- subset(Target_Final, target_year >= 2023)

write.csv(Target_F_2023, file = "target_2023.csv")

Read the target_actor dataset

ta2022 <- read.csv('target_actor2022.csv')
ta2023 <- read.csv('target_actor2023.csv')

ta2022_n <- ta2022 %>%
  subset(target_year >= 2022)

ta2023_n <- ta2023 %>%
  subset(target_year >= 2023)

ta_ant <- anti_join(ta2022_n, ta2023_n, by = 'target_id')
ta_ant
ta2023_nn <- bind_rows(ta2023_n, ta_ant)
ta2023_nn # 5314 obs of 21 variables
write.csv(ta2023_nn, file = 'ta2023.csv')

Compare the dataset

## find the data that exist in JS's analysis but not in the 'target_actor2023.csv' file
A_comp <- anti_join(Target_F_2023, ta2023_nn, by = 'target_id') # 2012 obs. of 18 variables
write.csv(A_comp, file = 'comp2023.csv')