Skip to content

Instantly share code, notes, and snippets.

@alexshpilkin
Last active May 28, 2018 22:34
Show Gist options
  • Select an option

  • Save alexshpilkin/71502afd060d2566b3eb29db4605c0e4 to your computer and use it in GitHub Desktop.

Select an option

Save alexshpilkin/71502afd060d2566b3eb29db4605c0e4 to your computer and use it in GitHub Desktop.
Convert Georgian election PDFs into TSV
#!/bin/sh -eu
# Convert one PDF (first argument) to TSV on standard output.
#
# Pipeline:
#   1. pdftotext renders the PDF as layout-preserving text.
#   2. The first iconv turns that text into ISO 8859-1 bytes (transliterating
#      anything that does not fit); the second re-reads the very same bytes as
#      georgian-ps.  Presumably the PDFs use a legacy Georgian font encoding
#      that pdftotext reports as Latin-1 code points -- TODO confirm.
#   3. The companion program "$0.post" turns the text tables into TSV.

if [ "$#" -lt 1 ]; then
	echo "usage: $0 FILE.pdf" >&2
	exit 2
fi

# pipefail is not available in every /bin/sh.  Probe for it in a subshell so
# that an unsupported option neither aborts this script (we run with -e) nor
# prints an error; without it only the exit status of the last stage counts.
if (set -o pipefail) 2>/dev/null; then
	set -o pipefail
fi

# The intermediate encoding is spelled out on both sides so that the first
# conversion does not depend on the caller's locale.
pdftotext -q -layout -enc UTF-8 "$1" - \
| iconv -f UTF-8 -t iso8859-1//translit | iconv -f georgian-ps \
| "$0.post"
#!/usr/bin/env python3
from collections import namedtuple
from sys import exit, stderr, stdin
# One heading line of the table: ``start`` is the number of leading whitespace
# characters, ``end`` the length of the right-stripped line, ``name`` its text.
Column = namedtuple('Column', 'start end name')


class FormatError(Exception):
    """Raised when the input does not follow the page layout parsed here."""


def _read_heading(lines):
    """Consume one page's table heading.

    Reads from the iterator *lines* up to and including the first non-blank
    line made only of digit tokens, which is taken to be the first table row.

    Returns ``(cols, first_row)``: the merged heading columns, ordered by
    start offset, and the raw (unstripped) first row.

    Raises FormatError if a form feed or the end of input is met first.
    """
    heads = []
    for line in lines:
        if line.startswith('\f'):
            raise FormatError('page break inside a table heading')
        if not line.strip():
            continue
        if all(token.isdigit() for token in line.split()):
            break
        text = line.rstrip()
        name = text.lstrip()
        heads.append(Column(len(text) - len(name), len(text), name))
    else:
        raise FormatError('input ended inside a table heading')

    # FIXME This assumes lines inside a single heading go from longest to
    # shortest.  Good enough for now.
    heads.sort(key=lambda head: head.start)  # stable: keeps input order on ties
    cols = []
    for head in heads:
        if cols and cols[-1].end >= head.end:
            # Ends within the previous column's extent: a continuation line
            # of the same heading, so append its text.
            last = cols[-1]
            cols[-1] = last._replace(name=last.name + ' ' + head.name)
        else:
            cols.append(head)
    return cols, line


def _align_row(line, cols):
    """Split a table row into cells, padding omitted leading cells.

    The number of skipped leading columns is found by comparing the row's
    indentation with the end offsets of *cols*.  Only 0 or 2 skipped columns
    are accepted.  When the padded cell count equals ``len(cols)`` the padded
    row is returned; otherwise the bare cells are returned unchanged so the
    caller can report the mismatch.

    Raises FormatError for any other number of skipped columns.
    """
    cells = line.split()
    indent = len(line) - len(line.lstrip())
    shift = 0
    # Bounded so that a row indented past every heading cannot index past
    # the end of cols.
    while shift < len(cols) and indent > cols[shift].end:
        shift += 1
    if shift not in (0, 2):
        raise FormatError('row {!r} skips {} leading columns; only 0 or 2 '
                          'are supported'.format(line.strip(), shift))
    if shift + len(cells) == len(cols):
        return [''] * shift + cells
    return cells


def _tsv_line(fields):
    """Join *fields* with tabs; raise FormatError if a field contains one."""
    if any('\t' in field for field in fields):
        raise FormatError('field contains a tab: {!r}'.format(fields))
    return '\t'.join(fields)


def convert(lines, out=None, err=None):
    """Convert layout text (an iterable of lines) into TSV.

    Expected input: a title line, then for every page a heading, the table
    rows, a blank line, a page-number line, and a line starting with a form
    feed.  A line that is exactly a form feed, or the end of input after a
    page number, ends the document.

    Output on *out* (``sys.stdout`` when None): ``# <title>``, one header
    line taken from the page headings, then all rows of all pages.  Rows
    whose cell count does not match the heading are still written, with a
    warning on *err* (``sys.stderr`` when None).

    Raises FormatError when the input deviates from that layout, including
    pages whose heading names differ from the previous page's.
    """
    if err is None:
        err = stderr
    lines = iter(lines)
    line = next(lines, None)
    if line is None:
        raise FormatError('empty input')
    print('# ' + line.strip(), file=out)  # title

    data = []
    names = None
    while line is not None and line != '\f':
        cols, line = _read_heading(lines)
        page_names = [col.name for col in cols]
        if names is not None and page_names != names:
            raise FormatError('heading {!r} differs from the previous page '
                              '{!r}'.format(page_names, names))
        names = page_names

        # The first row goes through the same alignment as every other row.
        while line is not None and line.split():
            row = _align_row(line, cols)
            data.append(row)
            if len(row) != len(cols):
                print("warning: row {} has {} entries instead of {}"
                      .format(len(data), len(row), len(cols)),
                      file=err)
            line = next(lines, None)

        # A blank line ended the table; skip blanks up to the page number.
        while line is not None and not line.strip():
            line = next(lines, None)
        if line is None or not line.strip().isdigit():
            raise FormatError('expected a page number after the table')
        line = next(lines, None)  # next page title, '\f', or end of input
        if line is not None and not line.startswith('\f'):
            raise FormatError('expected a page break after the page number')

    print(_tsv_line(names or []), file=out)
    for row in data:
        print(_tsv_line(row), file=out)


def main():
    """Filter standard input to standard output; exit 1 on malformed input."""
    try:
        convert(stdin)
    except FormatError as error:
        print('error: {}'.format(error), file=stderr)
        exit(1)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment