This script takes the Federal Election Commission's weekly PostgreSQL dump file and converts it to a directory of parquet files, using an ephemeral Postgres instance in a Docker container and DuckDB.
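Usage: pass the path to the downloaded dump file and a directory to write the parquet files into. For example (the script and dump filenames here are illustrative):

    ./fec_pgdump_to_parquet.sh fec_schedules.pgdump ./fec_parquet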
#!/bin/bash
# This script takes the FEC's PostgreSQL dump file and converts it to a directory
# of parquet files.
# See https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1.amazonaws.com/bulk-downloads/index.html?prefix=bulk-downloads/data-dump/schedules/
# for the PostgreSQL dump file and more info.
#
# This requires you to
# 1. Have Docker installed and running
# 2. Have the `duckdb` command line tool installed
# 3. Have several hundred GB of free disk space
# 4. Have either patience, or a decent amount of CPU and RAM
#
# Currently this only deals with Schedule A filings. Modify the `tables` array
# at the bottom if you need others.
# Ways to improve this:
# - Containerize the `duckdb` command line tool. Then users would only need Docker.
# - Parallelize (everything?): for every table, spin up a new container
#   and restore just that table, then write the parquet file.
#   Currently it looks like only one CPU is maxed out at a time.
# - Convert to a python script and expose a python API to do conversion/filtering
#   on the fly. This would probably speed up the writing of the parquet files.
# - Benchmark writing the tables to CSV directly with postgres, then converting
#   to parquet with `duckdb`. This might be faster than streaming the records
#   pg->duckdb, which I think happens one-by-one. (A rough sketch of this
#   alternative is in the comments just below.)
# - Include the step for downloading the pgdump file from the FEC's website.
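#
# Untested sketch of that CSV-based alternative (the table name and paths are
# illustrative; it assumes the same container and user set up below):
#   docker exec my_pg_container psql -U myuser -d postgres \
#     -c "\copy disclosure.fec_fitem_sched_a TO '/tmp/fec_fitem_sched_a.csv' WITH (FORMAT csv, HEADER)"
#   docker cp my_pg_container:/tmp/fec_fitem_sched_a.csv .
#   duckdb -c "COPY (SELECT * FROM read_csv_auto('fec_fitem_sched_a.csv')) TO 'fec_fitem_sched_a.parquet';"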
# Check the arguments
if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <pgdump_file> <output_directory>"
    exit 1
fi
pgdump_file=$1
output_directory=$2
container_name=my_pg_container
username=myuser
ensure_pg_running() {
    # Check if the container is already running
    if [ "$(docker ps -q -f name=$container_name)" ]; then
        echo "PostgreSQL container is already running"
        return
    fi
    # Check if the container is stopped
    if [ "$(docker ps -aq -f status=exited -f name=$container_name)" ]; then
        echo "Starting the stopped PostgreSQL container"
        docker start $container_name
    else
        echo "PostgreSQL container is not running, starting a new container"
        docker run \
            --name $container_name \
            -e POSTGRES_USER=$username \
            -e POSTGRES_PASSWORD=mysecretpassword \
            -p 5432:5432 \
            -d \
            postgres
        # Wait for the server to accept connections; the psql commands below
        # would fail if run while postgres is still initializing.
        until docker exec $container_name pg_isready -h localhost -U $username -d postgres > /dev/null 2>&1; do
            sleep 1
        done
        echo "Creating schema and extensions in the database"
        # Run the following SQL commands to create the schema and extensions the restore needs.
        # Ignore any errors you see here, they are expected.
        docker exec $container_name psql -U $username -d postgres -c "CREATE SCHEMA disclosure;"
        docker exec $container_name psql -U $username -d postgres -c "grant all on schema disclosure to public;"
        docker exec $container_name psql -U $username -d postgres -c "CREATE EXTENSION pg_trgm;"
        docker exec $container_name psql -U $username -d postgres -c "CREATE EXTENSION btree_gin;"
    fi
}
restore_table() {
    if [ "$#" -ne 2 ]; then
        echo "Usage: $0 <dump_file> <table_name>"
        exit 1
    fi
    dump_file=$1
    table_name=$2
    if [ ! -f "$dump_file" ]; then
        echo "File $dump_file does not exist"
        exit 1
    fi
    ensure_pg_running
    echo "Restoring table $table_name from $dump_file"
    docker exec -i $container_name pg_restore \
        --verbose \
        --no-acl \
        --no-owner \
        -U $username \
        -d postgres \
        --table $table_name \
        < "$dump_file"
}
parquet_path() {
    if [ "$#" -ne 1 ]; then
        echo "Usage: $0 <table_name>"
        exit 1
    fi
    echo "$output_directory/$1.parquet"
}
write_parquet() {
    if [ "$#" -ne 2 ]; then
        echo "Usage: $0 <table_name> <output_path>"
        exit 1
    fi
    table_name=$1
    out_path=$2
    directory=$(dirname "$out_path")
    if [ ! -d "$directory" ]; then
        echo "Directory $directory does not exist, creating"
        mkdir -p "$directory"
    fi
    # TODO: also containerize this using e.g.:
    # docker run -it --rm \
    #     -v $output_directory:/output \
    #     davidgasquez/duckdb \
    #     ...
echo "Writing table $table_name to $out_path" | |
duckdb -c " | |
SET temp_directory='.duckdb_tmp/'; | |
--- Without this, duckdb has to load the whole thing to keep the order. | |
--- This was running me out of memory. | |
--- https://duckdb.org/docs/guides/performance/how_to_tune_workloads.html#the-preserve_insertion_order-option | |
SET preserve_insertion_order = false; | |
INSTALL postgres; | |
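    --- The postgres extension lets duckdb scan tables directly from the running
    --- container over this connection, so each table streams from postgres into
    --- a parquet file without an intermediate export.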
    ATTACH 'postgres://myuser:mysecretpassword@localhost:5432/postgres' AS p (TYPE postgres);
    COPY (SELECT * FROM p.disclosure.$table_name) TO '$out_path';
    "
}
pgdump_to_parquets() {
    # $1 : pgdump file
    # $2 : output directory
    # rest: table names
    if [ "$#" -lt 3 ]; then
        echo "Usage: $0 <pgdump_file> <output_directory> <table1> <table2> ..."
        exit 1
    fi
    pgdump_file=$1
    output_directory=$2
    shift 2
    tables=("$@")
    ensure_pg_running
    for table in "${tables[@]}"; do
        p=$(parquet_path "$table")
        if [ -f "$p" ]; then
            echo "Parquet file $p already exists, skipping"
            continue
        fi
        restore_table "$pgdump_file" "$table"
        write_parquet "$table" "$p"
    done
}
tables=(
    "fec_fitem_sched_a"
    "fec_fitem_sched_a_1975_1976"
    "fec_fitem_sched_a_1977_1978"
    "fec_fitem_sched_a_1979_1980"
    "fec_fitem_sched_a_1981_1982"
    "fec_fitem_sched_a_1983_1984"
    "fec_fitem_sched_a_1985_1986"
    "fec_fitem_sched_a_1987_1988"
    "fec_fitem_sched_a_1989_1990"
    "fec_fitem_sched_a_1991_1992"
    "fec_fitem_sched_a_1993_1994"
    "fec_fitem_sched_a_1995_1996"
    "fec_fitem_sched_a_1997_1998"
    "fec_fitem_sched_a_1999_2000"
    "fec_fitem_sched_a_2001_2002"
    "fec_fitem_sched_a_2003_2004"
    "fec_fitem_sched_a_2005_2006"
    "fec_fitem_sched_a_2007_2008"
    "fec_fitem_sched_a_2009_2010"
    "fec_fitem_sched_a_2011_2012"
    "fec_fitem_sched_a_2013_2014"
    "fec_fitem_sched_a_2015_2016"
    "fec_fitem_sched_a_2017_2018"
    "fec_fitem_sched_a_2019_2020"
    "fec_fitem_sched_a_2021_2022"
    "fec_fitem_sched_a_2023_2024"
)
pgdump_to_parquets "$pgdump_file" "$output_directory" "${tables[@]}"
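
When the script finishes, the output directory contains one parquet file per restored table, e.g. fec_fitem_sched_a_2021_2022.parquet. As a quick sanity check (the path here is illustrative), you can query a file directly with duckdb:

    duckdb -c "SELECT count(*) FROM '<output_directory>/fec_fitem_sched_a_2021_2022.parquet';"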