Skip to content

Instantly share code, notes, and snippets.

@migurski
Last active January 25, 2025 21:28
Show Gist options
  • Save migurski/1ca4dd455febb6efddafc9f56141214c to your computer and use it in GitHub Desktop.
Save migurski/1ca4dd455febb6efddafc9f56141214c to your computer and use it in GitHub Desktop.
"""Coalesce Excel sheets that appear to be excerpts of larger sheets
Requires pylightxl: https://pylightxl.readthedocs.io/en/latest/
PDF sources like this Gogebic County example (1) can be converted by AWS Textract
into usable Excel workbooks with multiple sheets (2). This scripts makes a best-effort
attempt to stitch together large tables broken up across multiple pages based on
row counts, column counts, and cell values representing precincts. The results still
require a lot of manual interpretation to be turned into useful OpenElections data (3).
1) https://github.com/openelections/openelections-sources-mi/blob/master/2024/general/Gogebic%20MI%20Official%20Statement%20of%20Votes%20Cast%20with%20Certification%2011.5.2024.pdf
2) https://thescoop.org/archives/2024/01/27/a-better-way-to-process-mississippi-precinct-results/
3) https://docs.google.com/spreadsheets/d/1iOEqLFcwuA3J_HXUtEWIZUirA7J1_NdsnVUnc8z0TZY/edit?gid=2129694166#gid=2129694166
"""
import csv
import itertools
import operator
import os
import re
import sys
import pylightxl
# Some Excel PDF outputs repeat headers on each page, others do not
HEADERS_REPEATED = False
sig_getter = operator.itemgetter(1)
def ws_signature(db, ws_name):
"""Create a primitive signature based on the top and bottom of the first column
"""
first_column = db.ws(ws_name).col(1)
cell_signatures = [''.join({w for w in cell.split()}) for cell in first_column]
full_signature = re.sub('\W', '', ''.join(cell_signatures)).lower()
return full_signature[:64] + full_signature[-64:]
if __name__ == '__main__':
input_xlsx, output_xlsx = sys.argv[1:]
db1 = pylightxl.readxl(input_xlsx)
db2 = pylightxl.Database()
ws_sizes = [(nm, ws_signature(db1, nm), *db1.ws(nm).size) for nm in db1.ws_names]
last_col_count = None
for signature, group in itertools.groupby(ws_sizes, key=sig_getter):
ws_names, _, row_counts, col_counts = zip(*group)
row_count, col_count = max(row_counts), sum(col_counts)
if col_count != last_col_count:
print("New sheet", row_count, "x", col_counts, file=sys.stderr)
db2.add_ws(ws=f"Sheet{len(db2.ws_names) + 1}")
input_offset, output_offset = 0, 0
else:
print("...append rows", row_count, "x", col_counts, file=sys.stderr)
input_offset, output_offset = 1 if HEADERS_REPEATED else 0, ws2.size[0]
ws2 = db2.ws(f"Sheet{len(db2.ws_names)}")
for row in range(1, row_count + 1 - input_offset):
rows = [db1.ws(nm).row(row + input_offset) for nm in ws_names]
vals = list(itertools.chain(*rows))
for val, col in zip(vals, range(1, len(vals) + 1)):
ws2.update_index(row=row + output_offset, col=col, val=val)
last_col_count = col_count
pylightxl.writexl(db=db2, fn=output_xlsx)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment