migurski · January 25, 2025 21:28
diff --git a/stitch-textract-results.py b/stitch-textract-results.py
 """Coalesce Excel sheets that appear to be excerpts of larger sheets

 Requires pylightxl: https://pylightxl.readthedocs.io/en/latest/

 PDF sources like this Gogebic County example (1) can be converted by AWS Textract
 into usable Excel workbooks with multiple sheets (2). This scripts makes a best-effort
 attempt to stitch together large tables broken up across multiple pages based on
 row counts, column counts, and cell values representing precincts. The results still
 require a lot of manual interpretation to be turned into useful OpenElections data (3).

 1) https://github.com/openelections/openelections-sources-mi/blob/master/2024/general/Gogebic%20MI%20Official%20Statement%20of%20Votes%20Cast%20with%20Certification%2011.5.2024.pdf
 2) https://thescoop.org/archives/2024/01/27/a-better-way-to-process-mississippi-precinct-results/
 3) https://docs.google.com/spreadsheets/d/1iOEqLFcwuA3J_HXUtEWIZUirA7J1_NdsnVUnc8z0TZY/edit?gid=2129694166#gid=2129694166
 """
 import csv
 import itertools
 import operator
 import os
 import re
 import sys

 import pylightxl

 # Some Excel PDF outputs repeat headers on each page, others do not
 HEADERS_REPEATED = False

 sig_getter = operator.itemgetter(1)

 def ws_signature(db, ws_name):
    """Create a primitive signature based on the top and bottom of the first column
    """
    first_column = db.ws(ws_name).col(1)
    cell_signatures = [''.join({w for w in cell.split()}) for cell in first_column]
    full_signature = re.sub('\W', '', ''.join(cell_signatures)).lower()
    return full_signature[:64] + full_signature[-64:]

 if __name__ == '__main__':
    input_xlsx, output_xlsx = sys.argv[1:]
    
    db1 = pylightxl.readxl(input_xlsx)
    db2 = pylightxl.Database()
    ws_sizes = [(nm, ws_signature(db1, nm), *db1.ws(nm).size) for nm in db1.ws_names]
    
    last_col_count = None
    
    for signature, group in itertools.groupby(ws_sizes, key=sig_getter):
        ws_names, _, row_counts, col_counts = zip(*group)
        row_count, col_count = max(row_counts), sum(col_counts)

        if col_count != last_col_count:
            print("New sheet", row_count, "x", col_counts, file=sys.stderr)
            db2.add_ws(ws=f"Sheet{len(db2.ws_names) + 1}")
            input_offset, output_offset = 0, 0
        else:
            print("...append rows", row_count, "x", col_counts, file=sys.stderr)
            input_offset, output_offset = 1 if HEADERS_REPEATED else 0, ws2.size[0]
        
        ws2 = db2.ws(f"Sheet{len(db2.ws_names)}")
        for row in range(1, row_count + 1 - input_offset):
            rows = [db1.ws(nm).row(row + input_offset) for nm in ws_names]
            vals = list(itertools.chain(*rows))
            for val, col in zip(vals, range(1, len(vals) + 1)):
                ws2.update_index(row=row + output_offset, col=col, val=val)
        
        last_col_count = col_count
    
    pylightxl.writexl(db=db2, fn=output_xlsx)
	"""Coalesce Excel sheets that appear to be excerpts of larger sheets

	Requires pylightxl: https://pylightxl.readthedocs.io/en/latest/

	PDF sources like this Gogebic County example (1) can be converted by AWS Textract
	into usable Excel workbooks with multiple sheets (2). This scripts makes a best-effort
	attempt to stitch together large tables broken up across multiple pages based on
	row counts, column counts, and cell values representing precincts. The results still
	require a lot of manual interpretation to be turned into useful OpenElections data (3).

	1) https://github.com/openelections/openelections-sources-mi/blob/master/2024/general/Gogebic%20MI%20Official%20Statement%20of%20Votes%20Cast%20with%20Certification%2011.5.2024.pdf
	2) https://thescoop.org/archives/2024/01/27/a-better-way-to-process-mississippi-precinct-results/
	3) https://docs.google.com/spreadsheets/d/1iOEqLFcwuA3J_HXUtEWIZUirA7J1_NdsnVUnc8z0TZY/edit?gid=2129694166#gid=2129694166
	"""
	import csv
	import itertools
	import operator
	import os
	import re
	import sys

	import pylightxl

	# Some Excel PDF outputs repeat headers on each page, others do not
	HEADERS_REPEATED = False

	sig_getter = operator.itemgetter(1)

	def ws_signature(db, ws_name):
	"""Create a primitive signature based on the top and bottom of the first column
	"""
	first_column = db.ws(ws_name).col(1)
	cell_signatures = [''.join({w for w in cell.split()}) for cell in first_column]
	full_signature = re.sub('\W', '', ''.join(cell_signatures)).lower()
	return full_signature[:64] + full_signature[-64:]

	if __name__ == '__main__':
	input_xlsx, output_xlsx = sys.argv[1:]

	db1 = pylightxl.readxl(input_xlsx)
	db2 = pylightxl.Database()
	ws_sizes = [(nm, ws_signature(db1, nm), *db1.ws(nm).size) for nm in db1.ws_names]

	last_col_count = None

	for signature, group in itertools.groupby(ws_sizes, key=sig_getter):
	ws_names, _, row_counts, col_counts = zip(*group)
	row_count, col_count = max(row_counts), sum(col_counts)

	if col_count != last_col_count:
	print("New sheet", row_count, "x", col_counts, file=sys.stderr)
	db2.add_ws(ws=f"Sheet{len(db2.ws_names) + 1}")
	input_offset, output_offset = 0, 0
	else:
	print("...append rows", row_count, "x", col_counts, file=sys.stderr)
	input_offset, output_offset = 1 if HEADERS_REPEATED else 0, ws2.size[0]

	ws2 = db2.ws(f"Sheet{len(db2.ws_names)}")
	for row in range(1, row_count + 1 - input_offset):
	rows = [db1.ws(nm).row(row + input_offset) for nm in ws_names]
	vals = list(itertools.chain(*rows))
	for val, col in zip(vals, range(1, len(vals) + 1)):
	ws2.update_index(row=row + output_offset, col=col, val=val)

	last_col_count = col_count

	pylightxl.writexl(db=db2, fn=output_xlsx)