alexshpilkin · May 28, 2018 22:34
diff --git a/convert b/convert
 #!/bin/sh -eu
 # Usage: convert FILE.pdf
 #
 # Extracts the text of FILE.pdf with its layout preserved, re-encodes it,
 # and hands the result to the companion "<this script>.post" program.

 # pipefail is not available in every /bin/sh, and an unsupported `set -o`
 # option aborts the script. Probe for it in a subshell and enable it only
 # where the shell understands it.
 if (set -o pipefail) 2>/dev/null; then set -o pipefail; fi

 # The two iconv calls first squeeze the text into ISO 8859-1 bytes and then
 # reinterpret those bytes as Georgian-PS (presumably the PDF stores
 # Georgian text under a legacy font encoding -- confirm against the input).
 pdftotext -q -layout "$1" - \
  | iconv -t iso8859-1//translit | iconv -f georgian-ps \
  | "$0.post"
diff --git a/convert.post b/convert.post
 #!/usr/bin/env python3
 # Turn the text produced by `pdftotext -layout` for a paginated table
 # (read from stdin) into tab-separated values on stdout: a '# ' title
 # line, one line of column names, then one line per data row. Warnings
 # about rows with an unexpected number of entries go to stderr.
 from collections import namedtuple
 from sys import exit, stderr, stdin

 # A heading fragment or a merged column heading: `start` is the offset of
 # its first non-blank character, `end` the length of the right-stripped
 # line it was read from, `name` the text itself.
 Column = namedtuple('Column', 'start end name')

 lines = iter(stdin)
 line = next(lines, None)
 if line is None:
 	# Nothing to convert; fail with a message rather than a bare
 	# StopIteration traceback.
 	exit('error: empty input')
 print('# ' + line.strip()) # title

 data = []
 prevcols = None

 # One iteration per page. On entry `line` holds the page's title line. The
 # loop ends on a lone form feed, or when the input simply runs out after a
 # page number (`line` is None).
 while line is not None and line != '\f':
 	# FIXME This assumes lines inside a single heading go from longest to
 	# shortest. Good enough for now.

 	# Collect heading lines, one fragment per non-blank line, up to the
 	# first all-numeric line; that line stays in `line` as the first
 	# data row of the page.
 	heads = []
 	for line in lines:
 		assert not line.startswith('\f')
 		if len(line.strip()) == 0: continue
 		if all(s.isdigit() for s in line.split()): break
 		line = line.rstrip()
 		heads.append(Column(len(line)-len(line.lstrip()), len(line),
 		                    line.lstrip()))
 	heads.sort(key=lambda x: x.start)

 	# Merge fragments into columns: a fragment that ends no later than
 	# the previous column is appended to that column's name, anything
 	# else opens a new column. The first entry is a sentinel that is
 	# dropped once merging is done.
 	cols = [Column(None, -1, None)]
 	for s, e, n in heads:
 		ps, pe, pn = cols[-1]
 		if pe >= e:
 			cols[-1] = Column(ps, pe, pn+' '+n)
 		else:
 			cols.append(Column(s, e, n))
 	cols = cols[1:]

 	# Every page must repeat the column names of the previous one.
 	assert (prevcols is None or
 	        all(pc.name == c.name for pc, c in zip(prevcols, cols)))
 	prevcols = cols

 	data.append(line.split())
 	for line in lines:
 		row = line.split()
 		if len(row) == 0: break # end of page
 		# `shift` counts the leading columns that end left of the
 		# row's first non-blank character, i.e. columns the row
 		# leaves empty; they are padded with empty fields.
 		start = len(line) - len(line.lstrip()); shift = 0
 		while start > cols[shift].end: shift += 1
 		assert shift == 0 or shift == 2
 		if shift + len(row) == len(cols):
 			data.append([''] * shift + row)
 		else:
 			data.append(row)
 			print("warning: row {} has {} entries instead of {}"
 			      .format(len(data), len(row), len(cols)),
 			      file=stderr)

 	# Skip blank lines up to the page number that closes the page.
 	for line in lines:
 		if len(line.strip()) != 0: break
 	assert line.strip().isdigit() # page number
 	# The next line is either the form-feed-prefixed title of the next
 	# page, a lone form feed after the last page, or None at end of input.
 	line = next(lines, None)
 	assert line is None or line.startswith('\f')  # next page title

 print('\t'.join(c.name for c in cols))
 for row in data:
 	# A tab inside a field would corrupt the tab-separated output.
 	assert(all('\t' not in field for field in row))
 	print('\t'.join(row))
	#!/bin/sh -eu
	# Usage: convert FILE.pdf
	#
	# Extracts the text of FILE.pdf with its layout preserved, re-encodes it,
	# and hands the result to the companion "<this script>.post" program.

	# pipefail is not available in every /bin/sh, and an unsupported `set -o`
	# option aborts the script. Probe for it in a subshell and enable it only
	# where the shell understands it.
	if (set -o pipefail) 2>/dev/null; then set -o pipefail; fi

	# The stages must be joined by real pipes; a backslash-escaped `|` would
	# be passed to the command as a literal argument instead.
	pdftotext -q -layout "$1" - \
		| iconv -t iso8859-1//translit | iconv -f georgian-ps \
		| "$0.post"
	#!/usr/bin/env python3
	# Turn the text produced by `pdftotext -layout` for a paginated table
	# (read from stdin) into tab-separated values on stdout: a '# ' title
	# line, one line of column names, then one line per data row. Warnings
	# about rows with an unexpected number of entries go to stderr.
	from collections import namedtuple
	from sys import exit, stderr, stdin

	# A heading fragment or a merged column heading: `start` is the offset of
	# its first non-blank character, `end` the length of the right-stripped
	# line it was read from, `name` the text itself.
	Column = namedtuple('Column', 'start end name')

	lines = iter(stdin)
	line = next(lines, None)
	if line is None:
		# Nothing to convert; fail with a message rather than a bare
		# StopIteration traceback.
		exit('error: empty input')
	print('# ' + line.strip()) # title

	data = []
	prevcols = None

	# One iteration per page. On entry `line` holds the page's title line. The
	# loop ends on a lone form feed, or when the input simply runs out after a
	# page number (`line` is None).
	while line is not None and line != '\f':
		# FIXME This assumes lines inside a single heading go from longest to
		# shortest. Good enough for now.

		# Collect heading lines, one fragment per non-blank line, up to the
		# first all-numeric line; that line stays in `line` as the first
		# data row of the page.
		heads = []
		for line in lines:
			assert not line.startswith('\f')
			if len(line.strip()) == 0: continue
			if all(s.isdigit() for s in line.split()): break
			line = line.rstrip()
			heads.append(Column(len(line)-len(line.lstrip()), len(line),
			                    line.lstrip()))
		heads.sort(key=lambda x: x.start)

		# Merge fragments into columns: a fragment that ends no later than
		# the previous column is appended to that column's name, anything
		# else opens a new column. The first entry is a sentinel that is
		# dropped once merging is done.
		cols = [Column(None, -1, None)]
		for s, e, n in heads:
			ps, pe, pn = cols[-1]
			if pe >= e:
				cols[-1] = Column(ps, pe, pn+' '+n)
			else:
				cols.append(Column(s, e, n))
		cols = cols[1:]

		# Every page must repeat the column names of the previous one.
		assert (prevcols is None or
		        all(pc.name == c.name for pc, c in zip(prevcols, cols)))
		prevcols = cols

		data.append(line.split())
		for line in lines:
			row = line.split()
			if len(row) == 0: break # end of page
			# `shift` counts the leading columns that end left of the
			# row's first non-blank character, i.e. columns the row
			# leaves empty; they are padded with empty fields.
			start = len(line) - len(line.lstrip()); shift = 0
			while start > cols[shift].end: shift += 1
			assert shift == 0 or shift == 2
			if shift + len(row) == len(cols):
				data.append([''] * shift + row)
			else:
				data.append(row)
				print("warning: row {} has {} entries instead of {}"
				      .format(len(data), len(row), len(cols)),
				      file=stderr)

		# Skip blank lines up to the page number that closes the page.
		for line in lines:
			if len(line.strip()) != 0: break
		assert line.strip().isdigit() # page number
		# The next line is either the form-feed-prefixed title of the next
		# page, a lone form feed after the last page, or None at end of input.
		line = next(lines, None)
		assert line is None or line.startswith('\f') # next page title

	print('\t'.join(c.name for c in cols))
	for row in data:
		# A tab inside a field would corrupt the tab-separated output.
		assert(all('\t' not in field for field in row))
		print('\t'.join(row))