# Using Big Query and the Voter File
## Some notes for Bang and Rongbo
There is "voters" and there is "votes" -- these files are built from the raw voter file -- one is "wide" (characteristics of voters), one is "long" (votes on the rows of a tabular data). The data are linkable by "respondent_id"
Run this to clean up these data
```
--- machine learning data
with temp1 as(
SELECT
st_geogpoint(long, lat) AS location,
registrant_id,
status,
cast(year_of_birth as int) as birth_year,
parse_date("%m/%d/%Y", left(registration_date, 10)) as registration_date,
parse_date("%m/%d/%Y", left(effective_date_of_change, 10)) as registration_change,
county, party,
occupation,
cast(REGEXP_EXTRACT(congressional, r"([0-9]+)") as int64) as old_cd_int,
cast(REGEXP_EXTRACT(congressional, r"([0-9]+)") as string) as old_cd_str,
cast(REGEXP_EXTRACT(legislative, r"([0-99]+)") as int64) as old_ld_int,
cast(REGEXP_EXTRACT(legislative, r"([0-99]+)") as string) as old_ld_str,
zipcode,
cast(registrant_id as string) as ident,
CASE
WHEN party = "REP" THEN "Republican (REP)"
WHEN party = "DEM" THEN "Democrat (DEM)"
WHEN party = "LBT" THEN "Libertarian (LIB)"
WHEN party = "GRN" THEN "Green (GREEN)"
ELSE "Independent"
END as party_identification,
CASE
WHEN status LIKE "inactive" THEN 0
ELSE 1
END as voter_status,
FROM `az-voter-file.az_file.parsed_voters_11-08-21`),
--- Pull in the voting data.
t1 as(
SELECT
election,
CASE
WHEN election = "GENERAL 2020" THEN 1
ELSE 0
END as gen_2020,
CASE
WHEN election = "PRIMARY 2020" THEN 1
ELSE 0
END as primary_2020,
CASE
WHEN election = "2018 PRIMARY ELECTION" THEN 1
ELSE 0
END as primary_2018,
CASE
WHEN election = "2016 PRIMARY ELECTION" THEN 1
ELSE 0
END as primary_2016,
CASE
WHEN election = "2014 GENERAL ELECTION" THEN 1
ELSE 0
END as gen_2014,
CASE
WHEN election = "2018 GENERAL ELECTION" THEN 1
ELSE 0
END as gen_2018,
CASE
WHEN election = "2016 GENERAL ELECTION" THEN 1
ELSE 0
END as gen_2016,
cast(is_general as int64) as general_participation,
cast(is_primary as int64) as primary_participation,
cast(registrant_id as string) as ident,
cast(REGEXP_CONTAINS(upper(election), "GENERAL 2020") as int64) as vote2020
FROM `az-voter-file.az_file.parsed_votes_08-14-2021`
),
t2 as(
select
ident,
sum(gen_2020) as general_2020,
sum(primary_2020) as primary_2020,
sum(gen_2018) as general_2018,
sum(primary_2018) as primary_2018,
sum(gen_2016) as general_2016,
sum(primary_2016) as primary_2016,
sum(gen_2014) as general_2014,
sum(general_participation) as general_participation,
sum(primary_participation) as primary_participation,
sum(vote2020) as vote2020
from t1
group by ident
),
vote_data as(
select *,
from t2 RIGHT JOIN temp1
ON t2.ident = temp1.registrant_id ),
tracts as(
select
vote_data.*,
bg.geo_ID
FROM vote_data
CROSS JOIN `bigquery-public-data.geo_census_tracts.census_tracts_arizona` as bg
WHERE
st_contains(bg.tract_geom, location)
),
coded_data AS(
SELECT *,
cast(cds.USERregistrantid as string) as ident,
cast(REGEXP_EXTRACT(cds.cdSHORTNAME, r"([0-99]+)") as string) as new_cd_str,
cast(REGEXP_EXTRACT(lds.ldSHORTNAME, r"([0-99]+)") as string) as new_ld_str,
cast(REGEXP_EXTRACT(cds.cdSHORTNAME, r"([0-99]+)") as int) as new_cd_int,
cast(REGEXP_EXTRACT(lds.ldSHORTNAME, r"([0-99]+)") as int) as new_ld_int
FROM `az_file.f_cds` as cds
JOIN `az_file.f_lds` as lds
ON cds.USERregistrantid = lds.USERregistrantid),
merged_files as(
select
*,
generate_uuid() as voter_identification,
cdDISTRICT as congressional_district,
ldDISTRICT as legislative_district,
from coded_data as t1 join tracts on t1.ident = tracts.registrant_id),
merged_files_districts as (
select
*,
cast(census.congressional_district as int) as congressional_district,
FROM `az_file.census_2020` as census join merged_files
on census.congressional_district = merged_files.congressional_district
)
select
*,
from `bigquery-public-data.census_bureau_acs.censustract_2018_5yr` as a
LEFT JOIN merged_files_districts
ON merged_files_districts.geo_ID = a.geo_id
```
The census data import instructions are here:
https://colab.research.google.com/drive/1YE6vN1Dwp98tABTI4urwgQN72sP9q66H?usp=sharing
This produces a rather large table, with tract and district characteristics. The labeling is somewhat poor, which is something I address below. From here, I download the data and run a neural network model. This is accomplished as follows.
Download and pickle the data.
```
import tensorflow as tf
import os
import pandas as pd
from datetime import datetime
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "/Users/chrisweber/Dropbox/Keys/az-voter-file-30395362c45b.json"
from google.cloud import bigquery
from sklearn.preprocessing import MinMaxScaler
import pandas_gbq
bqclient = bigquery.Client()
project_id = "az-voter-file"
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
### Formulate the SQL query to pull the data from BigQuery
query = """
SELECT
geo_id,
registrant_id,
general_2020,
primary_2020,
general_2018,
primary_2018,
general_2016,
primary_2016,
general_2014,
bachelors_degree_2,
poverty,
total_pop,
birth_year,
registration_date,
registration_change,
median_age,
median_income,
white_pop,
black_pop,
asian_pop,
hispanic_pop,
amerindian_pop,
gini_index,
housing_units,
children,
employed_pop,
armed_forces,
pop_in_labor_force,
in_undergrad_college,
speak_only_english_at_home,
less_than_high_school_graduate,
P1_001N as total_district,
P1_002N as white_district,
P1_003N as black_district,
P1_004N as indian_district,
P1_005N as asian_district,
P4_001N as total_ethnicity,
P4_002N as latino,
FROM `az-voter-file.az_file.clean_data_machine_learning`
"""
df = pandas_gbq.read_gbq(query, project_id=project_id)
df.to_pickle('/Users/chrisweber/Desktop/voter_file00_00_01.pkl') ## For later load
```
You'll see a saved a file here that is called clean_data_machine_learning.
The neural network is then run on these data and the data are reloaded into big query as a table called nn<version> in the az_file dataset. The documentation for the machine learning protocol can be found in the Jupyter notebook entitled nn.ipynb.
# Extracting summary information
The dashboard requires us to note the lowest level of observation. Here, we'll say the tract. However, we won't load in much by way of tract geometries, just their centroids. The idea: Read in the NN data, join it to the primary data table.
```
-- Tract level data, useful summary statistics.
with m1 as (SELECT
geo_id as census_tract,
congressional_district,
legislative_district,
party_identification,
voter_identification as random_voter_identification,
general_2020,
primary_2020,
general_2018,
primary_2018,
general_2016,
primary_2016,
general_2014,
bachelors_degree_2 as tract_college,
poverty as tract_poverty,
total_pop as tract_n,
median_age as tract_age,
median_income as tract_income,
white_pop as tract_white,
hispanic_pop as tract_hispanic,
gini_index as tract_gini,
housing_units as tract_housing_units,
children as tract_children,
employed_pop as tract_employed,
armed_forces as tract_armed_forces,
pop_in_labor_force as tract_in_labor_force,
in_undergrad_college as tract_in_college,
P1_001N as total_race_district,
P1_002N as white_district,
P1_003N as black_district,
P1_004N as indian_district,
P1_005N as asian_district,
P4_001N as total_ethnicity_district,
P4_002N as latino_district,
registrant_id as registrant_id,
total_pop as tn,
FROM `az-voter-file.az_file.clean_data_machine_learning`),
--- Select out and construct percentages and meaningful measures.
m2 as (SELECT
census_tract,
SUM(CASE
WHEN party_identification = "Republican (REP)" THEN 1
ELSE 0 END) as Republicans,
SUM(CASE
WHEN party_identification = "Democrat (DEM)" THEN 1
ELSE 0 END) as Democrats,
SUM(CASE
WHEN party_identification = "Libertarian (LIB)" THEN 1
ELSE 0 END) as Libertarian,
SUM(CASE
WHEN party_identification = "Green (GREEN)" THEN 1
ELSE 0 END) as Greens,
SUM(CASE
WHEN party_identification <> "Republican (REP)" AND party_identification <> "Republican (REP)" AND
party_identification <> "Libertarian (LIB)" AND party_identification <> "Green (GREEN)"
THEN 1
ELSE 0 END) as Independents,
avg(safe_divide(point, tn)) as engaged_count_pr,
avg(safe_divide(tract_white, tn)) as white_tract,
avg(safe_divide(tract_hispanic, tn)) as hispanic_tract,
avg(tract_gini) as gini_tract,
avg(safe_divide(tract_college, tn)) as college_tract,
avg(safe_divide(tract_children, tn)) as children_tract,
avg(safe_divide(tract_employed, tn)) as employed_tract,
avg(safe_divide(tract_in_college, tn)) as tract_college,
avg(safe_divide(tract_poverty, tn)) as tract_poverty,
avg(safe_divide(tract_in_labor_force, tn)) as tract_in_labor_force,
avg(engaged_pr)*100 as engagement_score,
sum(CASE
WHEN general_2020 = 1
AND general_2018 = 1
AND general_2016 = 1
AND primary_2016 = 1
AND primary_2018 = 1
AND primary_2020 = 1 THEN 1
ELSE 0 END) as threeXthree,
sum(CASE
WHEN general_2020 = 1
AND general_2016 = 1
AND primary_2016 = 1
AND primary_2020 = 1 THEN 1
ELSE 0 END) as twoXtwo,
from m1 JOIN
`az-voter-file.az_file.nn02` as m2 ON
m1.registrant_id = m2.registrant_id
WHERE congressional_district IS NOT NULL
GROUP BY census_tract
)
select
*,
geo_id as geoid,
internal_point_lat as latitude,
internal_point_lon as longitude,
from `bigquery-public-data.geo_census_tracts.census_tracts_arizona` as centroids
right join m2 on centroids.geo_id = m2.census_tract
```
Now construct district and legislative summaries.
Here are the CDs
```
with m1 as (SELECT
geo_id as census_tract,
congressional_district,
legislative_district,
party_identification,
voter_identification as random_voter_identification,
general_2020,
primary_2020,
general_2018,
primary_2018,
general_2016,
primary_2016,
general_2014,
bachelors_degree_2 as tract_college,
poverty as tract_poverty,
total_pop as tract_n,
median_age as tract_age,
median_income as tract_income,
white_pop as tract_white,
hispanic_pop as tract_hispanic,
gini_index as tract_gini,
housing_units as tract_housing_units,
children as tract_children,
employed_pop as tract_employed,
armed_forces as tract_armed_forces,
pop_in_labor_force as tract_in_labor_force,
in_undergrad_college as tract_in_college,
P1_001N as total_race_district,
P1_003N as white_district,
P1_004N as black_district,
P1_005N as indian_district,
P1_006N as asian_district,
P4_001N as total_ethnicity_district,
P4_002N as latino_district,
registrant_id as registrant_id,
FROM `az-voter-file.az_file.clean_data_machine_learning`)
--- Select out and construct percentages and meaningful measures.
SELECT
congressional_district,
SUM(CASE
WHEN party_identification = "Republican (REP)" THEN 1
ELSE 0 END) as Republicans_cd,
SUM(CASE
WHEN party_identification = "Democrat (DEM)" THEN 1
ELSE 0 END) as Democrats_cd,
SUM(CASE
WHEN party_identification = "Libertarian (LIB)" THEN 1
ELSE 0 END) as Libertarian_cd,
SUM(CASE
WHEN party_identification = "Green (GREEN)" THEN 1
ELSE 0 END) as Greens_cd,
SUM(CASE
WHEN party_identification <> "Republican (REP)" AND party_identification <> "Republican (REP)" AND
party_identification <> "Libertarian (LIB)" AND party_identification <> "Green (GREEN)"
THEN 1
ELSE 0 END) as Independents_cd,
avg(safe_divide(point, total_race_district)) as engaged_pr_cd,
avg(safe_divide(white_district, total_race_district)) as white_cd,
avg(safe_divide(black_district, total_race_district)) as black_cd,
avg(safe_divide(indian_district, total_race_district)) as amerindian_cd,
avg(safe_divide(latino_district, total_ethnicity_district)) as hispanic_cd,
avg(engaged_pr)*100 as engagement_score_cd,
sum(CASE
WHEN general_2020 = 1
AND general_2018 = 1
AND general_2016 = 1
AND primary_2016 = 1
AND primary_2018 = 1
AND primary_2020 = 1 THEN 1
ELSE 0 END) as threeXthree,
sum(CASE
WHEN general_2020 = 1
AND general_2016 = 1
AND primary_2016 = 1
AND primary_2020 = 1 THEN 1
ELSE 0 END) as twoXtwo,
from m1 JOIN
`az-voter-file.az_file.nn02` as m2 ON
m1.registrant_id = m2.registrant_id
WHERE congressional_district IS NOT NULL
GROUP BY congressional_district
-- with m1 as (SELECT
-- geo_id as census_tract,
-- congressional_district,
-- legislative_district,
-- party_identification,
-- voter_identification as random_voter_identification,
-- general_2020,
-- primary_2020,
-- general_2018,
-- primary_2018,
-- general_2016,
-- primary_2016,
-- general_2014,
-- bachelors_degree_2 as tract_college,
-- poverty as tract_poverty,
-- total_pop as tract_n,
-- median_age as tract_age,
-- median_income as tract_income,
-- white_pop as tract_white,
-- hispanic_pop as tract_hispanic,
-- gini_index as tract_gini,
-- housing_units as tract_housing_units,
-- children as tract_children,
-- employed_pop as tract_employed,
-- armed_forces as tract_armed_forces,
-- pop_in_labor_force as tract_in_labor_force,
-- in_undergrad_college as tract_in_college,
-- P1_001N as total_race_district,
-- P1_003N as white_district,
-- P1_004N as black_district,
-- P1_005N as indian_district,
-- P1_006N as asian_district,
-- P4_001N as total_ethnicity_district,
-- P4_002N as latino_district,
-- registrant_id as registrant_id,
-- FROM `az-voter-file.az_file.clean_data_machine_learning`)
-- --- Select out and construct percentages and meaningful measures.
-- SELECT
-- legislative_district,
-- SUM(CASE
-- WHEN party_identification = "Republican (REP)" THEN 1
-- ELSE 0 END) as Republicans_cd,
-- SUM(CASE
-- WHEN party_identification = "Democrat (DEM)" THEN 1
-- ELSE 0 END) as Democrats_ld,
-- SUM(CASE
-- WHEN party_identification = "Libertarian (LIB)" THEN 1
-- ELSE 0 END) as Libertarian_ld,
-- SUM(CASE
-- WHEN party_identification = "Green (GREEN)" THEN 1
-- ELSE 0 END) as Greens_ld,
-- SUM(CASE
-- WHEN party_identification <> "Republican (REP)" AND party_identification <> "Republican (REP)" AND
-- party_identification <> "Libertarian (LIB)" AND party_identification <> "Green (GREEN)"
-- THEN 1
-- ELSE 0 END) as Independents_ld,
-- avg(safe_divide(point, total_race_district)) as engaged_pr_ld,
-- avg(safe_divide(white_district, total_race_district)) as white_ld,
-- avg(safe_divide(black_district, total_race_district)) as black_ld,
-- avg(safe_divide(indian_district, total_race_district)) as amerindian_ld,
-- avg(safe_divide(latino_district, total_ethnicity_district)) as hispanic_ld,
-- avg(engaged_pr)*100 as engagement_score_ld,
-- sum(CASE
-- WHEN general_2020 = 1
-- AND general_2018 = 1
-- AND general_2016 = 1
-- AND primary_2016 = 1
-- AND primary_2018 = 1
-- AND primary_2020 = 1 THEN 1
-- ELSE 0 END) as threeXthree,
-- sum(CASE
-- WHEN general_2020 = 1
-- AND general_2016 = 1
-- AND primary_2016 = 1
-- AND primary_2020 = 1 THEN 1
-- ELSE 0 END) as twoXtwo,
-- from m1 JOIN
-- `az-voter-file.az_file.nn02` as m2 ON
-- m1.registrant_id = m2.registrant_id
-- WHERE legislative_district IS NOT NULL
-- GROUP BY legislative_district
```
To recap, I now have several useful tables. 1) A table of tract characteristics, called **tracts_actionable_v2** 2) a table of LD, and a table of 3) CD characteristics. The tract data also includes the tract centroid latitude and longitude for mapping. Now the idea is to use the raw,
"clean_machine_learning" data, aggregated to
* tract
* LD
* CD
* Party Identification
These form the measures. Create this table, then join in the actionable files.
## Create a Star Table
```
with raw_1 as (SELECT
geo_id as census_tract,
congressional_district,
legislative_district,
party_identification,
registrant_id as registrant_id,
total_pop as t
FROM `az-voter-file.az_file.clean_data_machine_learning`),
tmp_dat as (SELECT
nn.engaged_pr as engaged_pr_unit,
nn.point as engaged_point_unit,
raw_1.*,
FROM `az-voter-file.az_file.nn02` as nn
JOIN raw_1 ON nn.registrant_id = raw_1.registrant_id),
star_voter as(
SELECT
count(distinct registrant_id) as voters,
census_tract as geoid,
congressional_district as cd,
legislative_district as ld,
party_identification as pid,
avg(engaged_pr_unit)*100 as engagement_unit,
avg(engaged_point_unit)*100 as engagement_point_unit,
FROM tmp_dat
WHERE party_identification is not null
group by 2, 3, 4, 5),
star_p2 as (
SELECT
*,
FROM `az-voter-file.public_data.congressional_districts_actionable_v2` as cd RIGHT JOIN
star_voter on star_voter.cd = cd.congressional_district),
star_p3 as (
SELECT
*,
FROM `az-voter-file.public_data.legislative_districts_actionable_v2` as ld RIGHT JOIN
star_p2 on star_p2.ld = ld.legislative_district)
SELECT
*,
cast(REGEXP_EXTRACT(latitude, r"([0-9.]+)") as FLOAT64 ) as lat,
cast(REGEXP_EXTRACT(longitude, r"([0-9.]+)") as FLOAT64 )*-1 as long
FROM `az-voter-file.public_data.tracts_actionable_v2` as tracts RIGHT JOIN
star_p3 on star_p3.geoid = tracts.geoid
```
This creates an 8000 or so row data table. Each row contains some count of LD x CD x Party x Tract combinations in the data. I'm basically just disaggregating within tract, though tract is the lowest unit of observation. In theory, this star_table_v2 can be read into tableau, with only two join-- the CD and LD geojsons.
A useful one:
```
with t1 as (SELECT
CASE
WHEN general_2020 = 1
AND general_2018 = 1
AND general_2016 = 1
AND primary_2016 = 1
AND primary_2018 = 1
AND primary_2020 = 1
THEN 1 ELSE 0 END as this,
CASE
WHEN general_2020 = 1
AND general_2018 = 1
AND general_2016 = 1
AND primary_2016 = 1
AND primary_2018 = 1
AND primary_2020 = 1 THEN 1
ELSE 0 END as threeXthree,
CASE
WHEN general_2020 = 1
AND general_2016 = 1
AND primary_2016 = 1
AND primary_2020 = 1 THEN 1
ELSE 0 END as twoXtwo,
registrant_id,
FROM `az-voter-file.az_file.clean_data_machine_learning`
)
SELECT
engaged_pr,
threeXthree,
twoXtwo,
from `az-voter-file.az_file.nn02` as nn
join t1 on t1.registrant_id = nn.registrant_id
```
## Data Pipeline
All data are delivered to us as a file with voter and votes data. Rongbo "cleaned" these data. We then wrote a useful utility to clean, merge, and identify new voters.
```
-- Construct the A file: Original file
with a as (
SELECT
residence_address,
registrant_id,
concat(residence_address,"+", registrant_id) as ckey
FROM `az-voter-file.az_file.parsed_voters_11-08-21`),
b as (
SELECT
status,
residence_address,
registrant_id,
concat(residence_address,"+", registrant_id) as ckey
FROM `az-voter-file.registration.registration_may2022_raw`
where status = "Active")
SELECT
a.residence_address,
a.registrant_id,
b.residence_address,
b.registrant_id,
b.status,
from a right join b
on a.ckey = b.ckey
WHERE (a.ckey IS NULL)
```
Then construct something akin to "updated_parsed_voters."
-- Connect the uncoded to coded to uncoded data.
-- This takes a few steps.
--- 1) Join the file called, missing_geo to the
with missing_geo as(
SELECT
cast(USER_registrant_id as string) as registered,
X as longitude,
Y as latitude,
from `az-voter-file.registration.missing_geo_091622`),
m2 as (SELECT
*,
FROM `az-voter-file.registration.registration_may2022_raw` as a LEFT JOIN
missing_geo ON
missing_geo.registered = a.registrant_id)
SELECT
*,
IFNULL(longitude, long) as updated_longitude,
IFNULL(latitude, lat) as updated_latitude,
FROM m2 LEFT JOIN `az-voter-file.az_file.parsed_voters_11-08-21` as t ON
m2.registrant_id = t.registrant_id
```
Call this "registration_voters." It's basically just the data with updated latitudes and longitudes. So, let's clean this up. Then build an updated parsed votes table. Call this full_registration_votes_May2022
```
SELECT
votes.*,
cast(registrant_id as STRING) as registrant_id_s,
parse_date("%m/%d/%Y", left(variable, 10)) as election_date,
right(variable, length(variable) - 11) as election,
extract(year from parse_date("%m/%d/%Y", left(variable, 10)) ) as year,
extract(month from parse_date("%m/%d/%Y", left(variable, 10)) ) as month,
extract(day from parse_date("%m/%d/%Y", left(variable, 10)) ) as day,
REGEXP_CONTAINS(upper(variable), "GENERAL") as is_general,
REGEXP_CONTAINS(upper(variable), "PRIMARY") as is_primary,
ROW_NUMBER() OVER (partition by registrant_id order by parse_date("%m/%d/%Y", left(variable, 10))) as election_number
FROM `az-voter-file.registration.votes` as votes
```
Construct the machine learning data, let's call that registration_machinelearning_may22
```
--- machine learning data
with registration_voters as(
SELECT
st_geogpoint(updated_longitude, updated_latitude) AS location,
updated_longitude,
updated_latitude,
registrant_id,
status,
cast(year_of_birth as int) as birth_year,
parse_date("%m/%d/%Y", left(registration_date, 10)) as registration_date,
parse_date("%m/%d/%Y", left(effective_date_of_change, 10)) as registration_change,
county, party,
occupation,
cast(REGEXP_EXTRACT(congressional, r"([0-9]+)") as string) as CD,
cast(REGEXP_EXTRACT(legislative, r"([0-9]+)") as string) as LD,
zipcode,
cast(registrant_id as string) as ident,
CASE
WHEN party = "REP" THEN "Republican (REP)"
WHEN party = "DEM" THEN "Democrat (DEM)"
WHEN party = "LBT" THEN "Libertarian (LIB)"
WHEN party = "GRN" THEN "Green (GREEN)"
ELSE "Independent"
END as party_identification,
CASE
WHEN status LIKE "inactive" THEN 0
ELSE 1
END as voter_status,
FROM `az-voter-file.registration.registration_voters`),
--- Pull in the voting data.
registration_votes as(
SELECT
election,
CASE
WHEN election = "GENERAL 2020" THEN 1
ELSE 0
END as gen_2020,
CASE
WHEN election = "PRIMARY 2020" THEN 1
ELSE 0
END as primary_2020,
CASE
WHEN election = "2018 PRIMARY ELECTION" THEN 1
ELSE 0
END as primary_2018,
CASE
WHEN election = "2016 PRIMARY ELECTION" THEN 1
ELSE 0
END as primary_2016,
CASE
WHEN election = "2014 GENERAL ELECTION" THEN 1
ELSE 0
END as gen_2014,
CASE
WHEN election = "2018 GENERAL ELECTION" THEN 1
ELSE 0
END as gen_2018,
CASE
WHEN election = "2016 GENERAL ELECTION" THEN 1
ELSE 0
END as gen_2016,
cast(is_general as int64) as general_participation,
cast(is_primary as int64) as primary_participation,
cast(registrant_id as string) as ident,
cast(REGEXP_CONTAINS(upper(election), "GENERAL 2020") as int64) as vote2020
FROM `az-voter-file.registration.full_registration_votes_2022`),
registration_count as(
select
ident,
sum(gen_2020) as general_2020,
sum(primary_2020) as primary_2020,
sum(gen_2018) as general_2018,
sum(primary_2018) as primary_2018,
sum(gen_2016) as general_2016,
sum(primary_2016) as primary_2016,
sum(gen_2014) as general_2014,
sum(general_participation) as general_participation,
sum(primary_participation) as primary_participation,
sum(vote2020) as vote2020
from registration_votes
group by ident
),
registration_merged as (
select
*,
from registration_count RIGHT JOIN registration_voters
ON registration_count.ident = registration_voters.registrant_id ),
tracts as(
select
registration_merged.*,
bg.geo_ID
FROM registration_merged
CROSS JOIN `bigquery-public-data.geo_census_tracts.census_tracts_arizona` as bg
WHERE
st_contains(bg.tract_geom, location))
select
*,
from `bigquery-public-data.census_bureau_acs.censustract_2018_5yr` as a
LEFT JOIN tracts
ON tracts.geo_ID = a.geo_id
where status = "Active"
```
After the updated neural network file is uploaded, then:
```
-- Tract level data, useful summary statistics.
with m1 as (SELECT
geo_id as census_tract,
CD,
LD,
party_identification,
general_2020,
primary_2020,
general_2018,
primary_2018,
general_2016,
primary_2016,
general_2014,
bachelors_degree_2 as tract_college,
poverty as tract_poverty,
total_pop as tract_n,
median_age as tract_age,
median_income as tract_income,
white_pop as tract_white,
hispanic_pop as tract_hispanic,
gini_index as tract_gini,
housing_units as tract_housing_units,
children as tract_children,
employed_pop as tract_employed,
armed_forces as tract_armed_forces,
pop_in_labor_force as tract_in_labor_force,
in_undergrad_college as tract_in_college,
registrant_id as registrant_id,
total_pop as tn,
FROM `az-voter-file.registration.clean_data_machine_learning`),
--- Select out and construct percentages and meaningful measures.
m2 as (SELECT
census_tract,
SUM(CASE
WHEN party_identification = "Republican (REP)" THEN 1
ELSE 0 END) as Republicans,
SUM(CASE
WHEN party_identification = "Democrat (DEM)" THEN 1
ELSE 0 END) as Democrats,
SUM(CASE
WHEN party_identification = "Libertarian (LIB)" THEN 1
ELSE 0 END) as Libertarian,
SUM(CASE
WHEN party_identification = "Green (GREEN)" THEN 1
ELSE 0 END) as Greens,
SUM(CASE
WHEN party_identification <> "Republican (REP)" AND party_identification <> "Republican (REP)" AND
party_identification <> "Libertarian (LIB)" AND party_identification <> "Green (GREEN)"
THEN 1
ELSE 0 END) as Independents,
avg(safe_divide(point, tn)) as engaged_count_pr,
avg(safe_divide(tract_white, tn)) as white_tract,
avg(safe_divide(tract_hispanic, tn)) as hispanic_tract,
avg(tract_gini) as gini_tract,
avg(safe_divide(tract_college, tn)) as college_tract,
avg(safe_divide(tract_children, tn)) as children_tract,
avg(safe_divide(tract_employed, tn)) as employed_tract,
avg(safe_divide(tract_in_college, tn)) as tract_college,
avg(safe_divide(tract_poverty, tn)) as tract_poverty,
avg(safe_divide(tract_in_labor_force, tn)) as tract_in_labor_force,
avg(engaged_pr)*100 as engagement_score,
sum(CASE
WHEN general_2020 = 1
AND general_2018 = 1
AND general_2016 = 1
AND primary_2016 = 1
AND primary_2018 = 1
AND primary_2020 = 1 THEN 1
ELSE 0 END) as threeXthree,
sum(CASE
WHEN general_2020 = 1
AND general_2016 = 1
AND primary_2016 = 1
AND primary_2020 = 1 THEN 1
ELSE 0 END) as twoXtwo,
from m1 JOIN
`az-voter-file.registration.nn03` as m2 ON
m1.registrant_id = m2.registrant_id
WHERE CD IS NOT NULL
GROUP BY census_tract
)
select
*,
geo_id as geoid,
internal_point_lat as latitude,
internal_point_lon as longitude,
from `bigquery-public-data.geo_census_tracts.census_tracts_arizona` as centroids
right join m2 on centroids.geo_id = m2.census_tract
```