Last active
January 25, 2025 21:28
-
-
Save migurski/1ca4dd455febb6efddafc9f56141214c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Coalesce Excel sheets that appear to be excerpts of larger sheets | |
Requires pylightxl: https://pylightxl.readthedocs.io/en/latest/ | |
PDF sources like this Gogebic County example (1) can be converted by AWS Textract | |
into usable Excel workbooks with multiple sheets (2). This scripts makes a best-effort | |
attempt to stitch together large tables broken up across multiple pages based on | |
row counts, column counts, and cell values representing precincts. The results still | |
require a lot of manual interpretation to be turned into useful OpenElections data (3). | |
1) https://github.com/openelections/openelections-sources-mi/blob/master/2024/general/Gogebic%20MI%20Official%20Statement%20of%20Votes%20Cast%20with%20Certification%2011.5.2024.pdf | |
2) https://thescoop.org/archives/2024/01/27/a-better-way-to-process-mississippi-precinct-results/ | |
3) https://docs.google.com/spreadsheets/d/1iOEqLFcwuA3J_HXUtEWIZUirA7J1_NdsnVUnc8z0TZY/edit?gid=2129694166#gid=2129694166 | |
""" | |
import csv | |
import itertools | |
import operator | |
import os | |
import re | |
import sys | |
import pylightxl | |
# Some Excel PDF outputs repeat headers on each page, others do not | |
HEADERS_REPEATED = False | |
sig_getter = operator.itemgetter(1) | |
def ws_signature(db, ws_name): | |
"""Create a primitive signature based on the top and bottom of the first column | |
""" | |
first_column = db.ws(ws_name).col(1) | |
cell_signatures = [''.join({w for w in cell.split()}) for cell in first_column] | |
full_signature = re.sub('\W', '', ''.join(cell_signatures)).lower() | |
return full_signature[:64] + full_signature[-64:] | |
if __name__ == '__main__': | |
input_xlsx, output_xlsx = sys.argv[1:] | |
db1 = pylightxl.readxl(input_xlsx) | |
db2 = pylightxl.Database() | |
ws_sizes = [(nm, ws_signature(db1, nm), *db1.ws(nm).size) for nm in db1.ws_names] | |
last_col_count = None | |
for signature, group in itertools.groupby(ws_sizes, key=sig_getter): | |
ws_names, _, row_counts, col_counts = zip(*group) | |
row_count, col_count = max(row_counts), sum(col_counts) | |
if col_count != last_col_count: | |
print("New sheet", row_count, "x", col_counts, file=sys.stderr) | |
db2.add_ws(ws=f"Sheet{len(db2.ws_names) + 1}") | |
input_offset, output_offset = 0, 0 | |
else: | |
print("...append rows", row_count, "x", col_counts, file=sys.stderr) | |
input_offset, output_offset = 1 if HEADERS_REPEATED else 0, ws2.size[0] | |
ws2 = db2.ws(f"Sheet{len(db2.ws_names)}") | |
for row in range(1, row_count + 1 - input_offset): | |
rows = [db1.ws(nm).row(row + input_offset) for nm in ws_names] | |
vals = list(itertools.chain(*rows)) | |
for val, col in zip(vals, range(1, len(vals) + 1)): | |
ws2.update_index(row=row + output_offset, col=col, val=val) | |
last_col_count = col_count | |
pylightxl.writexl(db=db2, fn=output_xlsx) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment