Created
October 1, 2024 20:10
-
-
Save gordonje/b815a9df498ccde614364c32c1eac149 to your computer and use it in GitHub Desktop.
Loading and Transforming Voter Registration Data from BLOCKS (in DuckDB)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE OR REPLACE TABLE | |
batch | |
AS | |
FROM | |
read_csv_auto( | |
'extracts/blocks/voter_registration_scan_batches.csv', | |
normalize_names=TRUE | |
) | |
; | |
CREATE OR REPLACE TABLE | |
delivery | |
AS | |
FROM | |
read_csv_auto( | |
'extracts/blocks/deliveries.csv', | |
normalize_names=TRUE | |
) | |
; | |
CREATE OR REPLACE TABLE | |
location | |
AS | |
FROM | |
read_csv_auto( | |
'extracts/blocks/locations.csv', | |
normalize_names=TRUE | |
) | |
; | |
CREATE OR REPLACE TABLE | |
qc | |
AS | |
FROM | |
read_csv_auto( | |
'extracts/blocks/scans_qc_overview.csv', | |
normalize_names=TRUE | |
) | |
; | |
CREATE OR REPLACE TABLE | |
reg_form | |
AS | |
FROM | |
read_csv_auto( | |
'extracts/blocks/registration_forms.csv', | |
normalize_names=TRUE | |
) | |
; | |
CREATE OR REPLACE TABLE | |
scan | |
AS | |
FROM | |
read_csv_auto( | |
'extracts/blocks/voter_registration_scans.csv', | |
normalize_names=TRUE | |
) | |
; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ATTACH './build/blocks.duckdb' AS blocks (READ_ONLY); | |
CREATE OR REPLACE TABLE | |
vended_form AS | |
SELECT | |
r.id, | |
r.created_at::DATE AS date_created, | |
d.turn_in_date, | |
l._name AS turn_in_location, | |
CASE | |
r.extras::JSON->>'$.registration_type_MO' | |
WHEN 'address_change' | |
THEN 'Address Change' | |
WHEN 'name_change' | |
THEN 'Name Change' | |
WHEN 'new_registration' | |
THEN 'New Registration' | |
ELSE NULL | |
END AS reg_type, | |
UPPER( TRIM( r.name_prefix )) AS name_prefix, | |
UPPER( TRIM( r.first_name )) AS first_name, | |
UPPER( TRIM( r.middle_name )) AS middle_name, | |
UPPER( TRIM( r.last_name )) AS last_name, | |
TRY_CAST( r.date_of_birth AS DATE) AS dob, | |
UPPER( TRIM( r.party )) AS party, | |
UPPER( TRIM( r.voting_street_address_one )) AS voting_street_address_one, | |
UPPER( TRIM( r.voting_street_address_two )) AS voting_street_address_two, | |
UPPER( TRIM( r.voting_city )) AS voting_city, | |
UPPER( TRIM( r.voting_state )) AS voting_state, | |
REGEXP_EXTRACT( r.voting_zipcode, '\d{5}' ) AS voting_zipcode, | |
UPPER( TRIM( | |
REPLACE( | |
COALESCE( q.visual_qc_county, r.county), | |
'Saint ', 'St. ' | |
) | |
)) AS county, | |
CASE | |
WHEN | |
LENGTH(r.mailing_street_address_one) > 3 OR | |
LENGTH(r.mailing_street_address_two) > 3 | |
THEN | |
TRIM( | |
REGEXP_REPLACE( | |
CONCAT_WS( | |
' ', | |
UPPER(r.mailing_street_address_one), | |
UPPER(r.mailing_street_address_two) | |
), | |
'--.+--', | |
'' | |
) | |
) | |
WHEN r.extras::JSON->>'$.mailing_street_address' NOT IN ( | |
'n/a', 'na', 'same', 'address on last voter registration', | |
'unknown', 'other', 'not sure' | |
) | |
AND NOT REGEXP_MATCHES( r.extras::JSON->>'$.mailing_street_address', '--.+--' ) | |
AND LENGTH( r.extras::JSON->>'$.mailing_street_address' ) > 2 | |
THEN | |
UPPER( TRIM( r.extras::JSON->>'$.mailing_street_address' )) | |
ELSE | |
NULL | |
END AS mailing_street_address, | |
CASE | |
WHEN REGEXP_MATCHES( r.mailing_city, '--.+--') | |
THEN NULL | |
ELSE UPPER( TRIM( r.mailing_city )) | |
END AS mailing_city, | |
REGEXP_EXTRACT( r.mailing_zipcode, '\d{5}' ) AS mailing_zipcode, | |
NULLIF( | |
TRIM( REGEXP_REPLACE( r.phone_number, '\D', '', 'g' )), | |
'' | |
) AS phone_number, | |
COALESCE( | |
LOWER( TRIM( REPLACE(r.email_address, ' ', '_' ))), | |
LIST_ANY_VALUE( LIST_FILTER( r.extras::JSON->>'$.*', x -> CONTAINS( x, '@'))) | |
) AS email_address, | |
CASE | |
WHEN r.extras::JSON->>'$.previous_name' IN ( | |
'n/a', 'na', 'same', 'address on last voter registration', | |
'unknown', 'other', 'not sure' | |
) | |
OR REGEXP_MATCHES( r.extras::JSON->>'$.previous_name', '--.+--') | |
OR LENGTH( r.extras::JSON->>'$.previous_name' ) < 2 | |
THEN NULL | |
ELSE UPPER( TRIM( r.extras::JSON->>'$.previous_name' )) | |
END AS previous_name, | |
CASE | |
WHEN r.extras::JSON->>'$.previous_residential_address' IN ( | |
'n/a', 'na', 'same', 'address on last voter registration', | |
'unknown', 'other', 'not sure' | |
) | |
OR REGEXP_MATCHES( r.extras::JSON->>'$.previous_residential_address', '--.+--') | |
OR LENGTH( r.extras::JSON->>'$.previous_residential_address' ) < 2 | |
THEN NULL | |
ELSE UPPER( TRIM( r.extras::JSON->>'$.previous_residential_address' )) | |
END AS previous_residential_address, | |
CASE | |
WHEN r.extras::JSON->>'$.previous_city' IN ( | |
'n/a', 'na', 'same', 'address on last voter registration', | |
'unknown', 'other', 'not sure' | |
) | |
OR REGEXP_MATCHES( r.extras::JSON->>'$.previous_city', '--.+--') | |
OR LENGTH( r.extras::JSON->>'$.previous_city' ) < 2 | |
THEN NULL | |
ELSE UPPER( TRIM( r.extras::JSON->>'$.previous_city' )) | |
END AS previous_city, | |
CASE | |
WHEN r.extras::JSON->>'$.previous_county' IN ( | |
'n/a', 'na', 'same', 'address on last voter registration', | |
'unknown', 'other', 'not sure' | |
) | |
OR REGEXP_MATCHES( r.extras::JSON->>'$.previous_county', '--.+--') | |
OR LENGTH( r.extras::JSON->>'$.previous_county' ) < 2 | |
THEN NULL | |
ELSE UPPER( TRIM( r.extras::JSON->>'$.previous_county' )) | |
END AS previous_county, | |
CASE | |
WHEN r.extras::JSON->>'$.previous_state' IN ( | |
'n/a', 'na', 'same', 'address on last voter registration', | |
'unknown', 'other', 'not sure' | |
) | |
OR REGEXP_MATCHES( r.extras::JSON->>'$.previous_state', '--.+--') | |
OR LENGTH( r.extras::JSON->>'$.previous_state') < 2 | |
THEN NULL | |
ELSE UPPER( TRIM( r.extras::JSON->>'$.previous_state' )) | |
END AS previous_state, | |
CASE | |
WHEN r.extras::JSON->>'$.location_of_birth' IN ( | |
'n/a', 'na', 'same', 'address on last voter registration', | |
'unknown', 'other', 'not sure' | |
) | |
OR REGEXP_MATCHES( r.extras::JSON->>'$.location_of_birth', '--.+--') | |
OR LENGTH( r.extras::JSON->>'$.location_of_birth' ) < 2 | |
THEN NULL | |
ELSE UPPER( TRIM( r.extras::JSON->>'$.location_of_birth' )) | |
END AS location_of_birth | |
FROM | |
blocks.reg_form AS r | |
JOIN | |
blocks.scan AS s | |
ON | |
r.scan_id = s.id | |
JOIN | |
blocks.qc AS q | |
ON | |
r.id = q.registration_form_id | |
AND | |
s.id = q.voter_registration_scan_id | |
JOIN | |
blocks.delivery AS d | |
ON | |
s.delivery_id = d.id | |
JOIN | |
blocks.location AS l | |
ON | |
d.turn_in_location_id = l.id | |
WHERE | |
d.status = 3 | |
AND | |
( r.extras::JSON->>'$.registration_type_MO' ) <> '--impossible--' | |
AND | |
r.voting_state = 'MO' | |
AND | |
q.through_visual_qc | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment