Skip to content

Instantly share code, notes, and snippets.

@gordonje
Created October 1, 2024 20:10
Show Gist options
  • Save gordonje/b815a9df498ccde614364c32c1eac149 to your computer and use it in GitHub Desktop.
Save gordonje/b815a9df498ccde614364c32c1eac149 to your computer and use it in GitHub Desktop.
Loading and Transforming Voter Registration Data from BLOCKS (in DuckDB)
CREATE OR REPLACE TABLE
batch
AS
FROM
read_csv_auto(
'extracts/blocks/voter_registration_scan_batches.csv',
normalize_names=TRUE
)
;
CREATE OR REPLACE TABLE
delivery
AS
FROM
read_csv_auto(
'extracts/blocks/deliveries.csv',
normalize_names=TRUE
)
;
CREATE OR REPLACE TABLE
location
AS
FROM
read_csv_auto(
'extracts/blocks/locations.csv',
normalize_names=TRUE
)
;
CREATE OR REPLACE TABLE
qc
AS
FROM
read_csv_auto(
'extracts/blocks/scans_qc_overview.csv',
normalize_names=TRUE
)
;
CREATE OR REPLACE TABLE
reg_form
AS
FROM
read_csv_auto(
'extracts/blocks/registration_forms.csv',
normalize_names=TRUE
)
;
CREATE OR REPLACE TABLE
scan
AS
FROM
read_csv_auto(
'extracts/blocks/voter_registration_scans.csv',
normalize_names=TRUE
)
;
ATTACH './build/blocks.duckdb' AS blocks (READ_ONLY);
CREATE OR REPLACE TABLE
vended_form AS
SELECT
r.id,
r.created_at::DATE AS date_created,
d.turn_in_date,
l._name AS turn_in_location,
CASE
r.extras::JSON->>'$.registration_type_MO'
WHEN 'address_change'
THEN 'Address Change'
WHEN 'name_change'
THEN 'Name Change'
WHEN 'new_registration'
THEN 'New Registration'
ELSE NULL
END AS reg_type,
UPPER( TRIM( r.name_prefix )) AS name_prefix,
UPPER( TRIM( r.first_name )) AS first_name,
UPPER( TRIM( r.middle_name )) AS middle_name,
UPPER( TRIM( r.last_name )) AS last_name,
TRY_CAST( r.date_of_birth AS DATE) AS dob,
UPPER( TRIM( r.party )) AS party,
UPPER( TRIM( r.voting_street_address_one )) AS voting_street_address_one,
UPPER( TRIM( r.voting_street_address_two )) AS voting_street_address_two,
UPPER( TRIM( r.voting_city )) AS voting_city,
UPPER( TRIM( r.voting_state )) AS voting_state,
REGEXP_EXTRACT( r.voting_zipcode, '\d{5}' ) AS voting_zipcode,
UPPER( TRIM(
REPLACE(
COALESCE( q.visual_qc_county, r.county),
'Saint ', 'St. '
)
)) AS county,
CASE
WHEN
LENGTH(r.mailing_street_address_one) > 3 OR
LENGTH(r.mailing_street_address_two) > 3
THEN
TRIM(
REGEXP_REPLACE(
CONCAT_WS(
' ',
UPPER(r.mailing_street_address_one),
UPPER(r.mailing_street_address_two)
),
'--.+--',
''
)
)
WHEN r.extras::JSON->>'$.mailing_street_address' NOT IN (
'n/a', 'na', 'same', 'address on last voter registration',
'unknown', 'other', 'not sure'
)
AND NOT REGEXP_MATCHES( r.extras::JSON->>'$.mailing_street_address', '--.+--' )
AND LENGTH( r.extras::JSON->>'$.mailing_street_address' ) > 2
THEN
UPPER( TRIM( r.extras::JSON->>'$.mailing_street_address' ))
ELSE
NULL
END AS mailing_street_address,
CASE
WHEN REGEXP_MATCHES( r.mailing_city, '--.+--')
THEN NULL
ELSE UPPER( TRIM( r.mailing_city ))
END AS mailing_city,
REGEXP_EXTRACT( r.mailing_zipcode, '\d{5}' ) AS mailing_zipcode,
NULLIF(
TRIM( REGEXP_REPLACE( r.phone_number, '\D', '', 'g' )),
''
) AS phone_number,
COALESCE(
LOWER( TRIM( REPLACE(r.email_address, ' ', '_' ))),
LIST_ANY_VALUE( LIST_FILTER( r.extras::JSON->>'$.*', x -> CONTAINS( x, '@')))
) AS email_address,
CASE
WHEN r.extras::JSON->>'$.previous_name' IN (
'n/a', 'na', 'same', 'address on last voter registration',
'unknown', 'other', 'not sure'
)
OR REGEXP_MATCHES( r.extras::JSON->>'$.previous_name', '--.+--')
OR LENGTH( r.extras::JSON->>'$.previous_name' ) < 2
THEN NULL
ELSE UPPER( TRIM( r.extras::JSON->>'$.previous_name' ))
END AS previous_name,
CASE
WHEN r.extras::JSON->>'$.previous_residential_address' IN (
'n/a', 'na', 'same', 'address on last voter registration',
'unknown', 'other', 'not sure'
)
OR REGEXP_MATCHES( r.extras::JSON->>'$.previous_residential_address', '--.+--')
OR LENGTH( r.extras::JSON->>'$.previous_residential_address' ) < 2
THEN NULL
ELSE UPPER( TRIM( r.extras::JSON->>'$.previous_residential_address' ))
END AS previous_residential_address,
CASE
WHEN r.extras::JSON->>'$.previous_city' IN (
'n/a', 'na', 'same', 'address on last voter registration',
'unknown', 'other', 'not sure'
)
OR REGEXP_MATCHES( r.extras::JSON->>'$.previous_city', '--.+--')
OR LENGTH( r.extras::JSON->>'$.previous_city' ) < 2
THEN NULL
ELSE UPPER( TRIM( r.extras::JSON->>'$.previous_city' ))
END AS previous_city,
CASE
WHEN r.extras::JSON->>'$.previous_county' IN (
'n/a', 'na', 'same', 'address on last voter registration',
'unknown', 'other', 'not sure'
)
OR REGEXP_MATCHES( r.extras::JSON->>'$.previous_county', '--.+--')
OR LENGTH( r.extras::JSON->>'$.previous_county' ) < 2
THEN NULL
ELSE UPPER( TRIM( r.extras::JSON->>'$.previous_county' ))
END AS previous_county,
CASE
WHEN r.extras::JSON->>'$.previous_state' IN (
'n/a', 'na', 'same', 'address on last voter registration',
'unknown', 'other', 'not sure'
)
OR REGEXP_MATCHES( r.extras::JSON->>'$.previous_state', '--.+--')
OR LENGTH( r.extras::JSON->>'$.previous_state') < 2
THEN NULL
ELSE UPPER( TRIM( r.extras::JSON->>'$.previous_state' ))
END AS previous_state,
CASE
WHEN r.extras::JSON->>'$.location_of_birth' IN (
'n/a', 'na', 'same', 'address on last voter registration',
'unknown', 'other', 'not sure'
)
OR REGEXP_MATCHES( r.extras::JSON->>'$.location_of_birth', '--.+--')
OR LENGTH( r.extras::JSON->>'$.location_of_birth' ) < 2
THEN NULL
ELSE UPPER( TRIM( r.extras::JSON->>'$.location_of_birth' ))
END AS location_of_birth
FROM
blocks.reg_form AS r
JOIN
blocks.scan AS s
ON
r.scan_id = s.id
JOIN
blocks.qc AS q
ON
r.id = q.registration_form_id
AND
s.id = q.voter_registration_scan_id
JOIN
blocks.delivery AS d
ON
s.delivery_id = d.id
JOIN
blocks.location AS l
ON
d.turn_in_location_id = l.id
WHERE
d.status = 3
AND
( r.extras::JSON->>'$.registration_type_MO' ) <> '--impossible--'
AND
r.voting_state = 'MO'
AND
q.through_visual_qc
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment