zopieux · April 7, 2018 17:41
diff --git a/ocr-pdf.py b/ocr-pdf.py
 #!/usr/bin/env python3

 """
 Example usage:
 $ pip install reportlab pillow
 $ ./ocr-pdf.py these-lemaire.tiff 'ocr/{p}.json' these-lemaire.pdf
 """

 import argparse
 import json
 from collections import namedtuple
 from pathlib import Path

 from PIL import Image, ImageSequence
 from reportlab.lib import pagesizes
 from reportlab.pdfbase.pdfmetrics import stringWidth
 from reportlab.pdfgen import canvas

 # One recognised piece of text: its content plus the box origin and extent.
 TextAnnotation = namedtuple('TextAnnotation', 'content x y width height')


 def parse_google_ocr(data):
    """Yield a TextAnnotation for each usable annotation in an OCR response.

    Reads ``data['responses'][0]['textAnnotations']``. For every entry the
    box origin is vertex 0 of ``boundingPoly`` and the extent is the
    difference between vertex 2 and vertex 0.

    A vertex may lack its ``x`` or ``y`` key (JSON responses can leave out
    coordinates equal to 0); a missing coordinate is read as 0 instead of
    raising KeyError, which previously made the caller drop the whole page.

    Raises:
        KeyError: if ``responses``/``textAnnotations``, ``boundingPoly`` or
            ``description`` is missing.
        IndexError: if ``responses`` is empty or a polygon has fewer than
            three vertices.
    """
    for annotation in data['responses'][0]['textAnnotations']:
        vertices = annotation['boundingPoly']['vertices']
        origin, opposite = vertices[0], vertices[2]
        x, y = origin.get('x', 0), origin.get('y', 0)
        width = opposite.get('x', 0) - x
        height = opposite.get('y', 0) - y
        # Boxes taller than 200 units are dropped -- presumably the
        # page-wide annotation holding the full text; TODO confirm.
        if height > 200:
            continue
        yield TextAnnotation(annotation['description'], x, y, width, height)


 if __name__ == '__main__':
    # Script entry point: draw each selected TIFF page as an A4 PDF page and
    # overlay the OCR text read from the per-page JSON files.
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--pages', action='append',
                        help="pages to use (1-indexed)")
    parser.add_argument('tiff', type=argparse.FileType('rb'))
    parser.add_argument('jsonpattern')
    parser.add_argument('output', type=argparse.FileType('wb'))

    args = parser.parse_args()

    tiff = Image.open(args.tiff)
    tiff_pages = ImageSequence.Iterator(tiff)

    print(f"Page count: {tiff.n_frames}")

    # Page numbers are 0-indexed internally; the command line is 1-indexed.
    pages = set(range(tiff.n_frames))
    if args.pages:
        wanted = set()
        for spec in args.pages:
            # Each -p value is a comma-separated list of "N" or "A-B" items;
            # items of any other shape are ignored.
            for item in spec.split(','):
                if item.isnumeric():
                    wanted.add(int(item) - 1)
                elif item.count('-') == 1:
                    first, last = item.split('-')
                    wanted.update(range(int(first) - 1, int(last)))
        # Requested pages that do not exist in the TIFF are dropped here.
        pages &= wanted

    width, height = pagesize = pagesizes.A4
    c = canvas.Canvas(args.output, pagesize=pagesize)

    for page in sorted(pages):
        frame = tiff_pages[page]
        print(f"Page {page + 1}: size {frame.size}")

        print("\tdrawing background image")
        c.drawInlineImage(frame, 0, 0, width=width, height=height,
                          preserveAspectRatio=True)

        try:
            # NOTE(security): eval() runs the command-line pattern as an
            # f-string, so any expression inside {...} is executed. Only
            # pass trusted patterns; str.format(p=...) would be a safer
            # replacement if expression support is not needed.
            jsonfile = Path(eval("f'{}'".format(args.jsonpattern), {'p': page + 1}))
            with jsonfile.open() as jsondata:
                texts = list(parse_google_ocr(json.load(jsondata)))
        except (FileNotFoundError, KeyError):
            # Best effort: a page without usable OCR data gets no text.
            texts = []

        # NOTE(review): the factor 2 is unexplained in the original ("???").
        scale = 2 * width / frame.size[0]  # ???
        print(f"\tdrawing {len(texts)} texts, scale: {scale}")
        text_writer = c.beginText()
        # Render mode 3 draws the text without fill or stroke (invisible).
        text_writer.setTextRenderMode(3)
        font_name = text_writer._fontname
        for text in texts:
            target_width = text.width * scale * 0.95
            # A string that measures 0 wide (e.g. an empty description) can
            # never reach a positive target, so the fitting loop below would
            # spin forever; there is nothing to draw for it anyway.
            if (target_width > 0
                    and stringWidth(text.content, font_name, 9) <= 0):
                continue
            # Grow the font in half-point steps until the string is about
            # as wide as its OCR box.
            font_size = 9
            while stringWidth(text.content, font_name,
                              font_size) < target_width:
                font_size += 0.5
            text_writer.setFont(font_name, font_size)
            # The y axis is flipped relative to the OCR coordinates; the
            # 18-unit offset is taken as-is from the original code.
            text_writer.setTextOrigin(text.x * scale,
                                      height - text.y * scale - 18)
            text_writer.textOut(text.content)
        c.drawText(text_writer)

        # finish page
        c.showPage()

    c.save()
	#!/usr/bin/env python3

	"""
	Example usage:
	$ pip install reportlab pillow
	$ ./ocr-pdf.py these-lemaire.tiff 'ocr/{p}.json' these-lemaire.pdf
	"""

	import argparse
	import json
	from collections import namedtuple
	from pathlib import Path

	from PIL import Image, ImageSequence
	from reportlab.lib import pagesizes
	from reportlab.pdfbase.pdfmetrics import stringWidth
	from reportlab.pdfgen import canvas

	# One recognised piece of text: its content plus the box origin and extent.
	TextAnnotation = namedtuple('TextAnnotation', 'content x y width height')


	def parse_google_ocr(data):
	    """Yield a TextAnnotation for each usable annotation in an OCR response.

	    Reads ``data['responses'][0]['textAnnotations']``. For every entry the
	    box origin is vertex 0 of ``boundingPoly`` and the extent is the
	    difference between vertex 2 and vertex 0.

	    A vertex may lack its ``x`` or ``y`` key (JSON responses can leave out
	    coordinates equal to 0); a missing coordinate is read as 0 instead of
	    raising KeyError, which previously made the caller drop the whole page.

	    Raises:
	        KeyError: if ``responses``/``textAnnotations``, ``boundingPoly`` or
	            ``description`` is missing.
	        IndexError: if ``responses`` is empty or a polygon has fewer than
	            three vertices.
	    """
	    for annotation in data['responses'][0]['textAnnotations']:
	        vertices = annotation['boundingPoly']['vertices']
	        origin, opposite = vertices[0], vertices[2]
	        x, y = origin.get('x', 0), origin.get('y', 0)
	        width = opposite.get('x', 0) - x
	        height = opposite.get('y', 0) - y
	        # Boxes taller than 200 units are dropped -- presumably the
	        # page-wide annotation holding the full text; TODO confirm.
	        if height > 200:
	            continue
	        yield TextAnnotation(annotation['description'], x, y, width, height)


	if __name__ == '__main__':
	    # Script entry point: draw each selected TIFF page as an A4 PDF page and
	    # overlay the OCR text read from the per-page JSON files.
	    parser = argparse.ArgumentParser()
	    parser.add_argument('-p', '--pages', action='append',
	                        help="pages to use (1-indexed)")
	    parser.add_argument('tiff', type=argparse.FileType('rb'))
	    parser.add_argument('jsonpattern')
	    parser.add_argument('output', type=argparse.FileType('wb'))

	    args = parser.parse_args()

	    tiff = Image.open(args.tiff)
	    tiff_pages = ImageSequence.Iterator(tiff)

	    print(f"Page count: {tiff.n_frames}")

	    # Page numbers are 0-indexed internally; the command line is 1-indexed.
	    pages = set(range(tiff.n_frames))
	    if args.pages:
	        wanted = set()
	        for spec in args.pages:
	            # Each -p value is a comma-separated list of "N" or "A-B" items;
	            # items of any other shape are ignored.
	            for item in spec.split(','):
	                if item.isnumeric():
	                    wanted.add(int(item) - 1)
	                elif item.count('-') == 1:
	                    first, last = item.split('-')
	                    wanted.update(range(int(first) - 1, int(last)))
	        # Requested pages that do not exist in the TIFF are dropped here.
	        pages &= wanted

	    width, height = pagesize = pagesizes.A4
	    c = canvas.Canvas(args.output, pagesize=pagesize)

	    for page in sorted(pages):
	        frame = tiff_pages[page]
	        print(f"Page {page + 1}: size {frame.size}")

	        print("\tdrawing background image")
	        c.drawInlineImage(frame, 0, 0, width=width, height=height,
	                          preserveAspectRatio=True)

	        try:
	            # NOTE(security): eval() runs the command-line pattern as an
	            # f-string, so any expression inside {...} is executed. Only
	            # pass trusted patterns; str.format(p=...) would be a safer
	            # replacement if expression support is not needed.
	            jsonfile = Path(eval("f'{}'".format(args.jsonpattern), {'p': page + 1}))
	            with jsonfile.open() as jsondata:
	                texts = list(parse_google_ocr(json.load(jsondata)))
	        except (FileNotFoundError, KeyError):
	            # Best effort: a page without usable OCR data gets no text.
	            texts = []

	        # NOTE(review): the factor 2 is unexplained in the original ("???").
	        scale = 2 * width / frame.size[0] # ???
	        print(f"\tdrawing {len(texts)} texts, scale: {scale}")
	        text_writer = c.beginText()
	        # Render mode 3 draws the text without fill or stroke (invisible).
	        text_writer.setTextRenderMode(3)
	        font_name = text_writer._fontname
	        for text in texts:
	            target_width = text.width * scale * 0.95
	            # A string that measures 0 wide (e.g. an empty description) can
	            # never reach a positive target, so the fitting loop below would
	            # spin forever; there is nothing to draw for it anyway.
	            if (target_width > 0
	                    and stringWidth(text.content, font_name, 9) <= 0):
	                continue
	            # Grow the font in half-point steps until the string is about
	            # as wide as its OCR box.
	            font_size = 9
	            while stringWidth(text.content, font_name,
	                              font_size) < target_width:
	                font_size += 0.5
	            text_writer.setFont(font_name, font_size)
	            # The y axis is flipped relative to the OCR coordinates; the
	            # 18-unit offset is taken as-is from the original code.
	            text_writer.setTextOrigin(text.x * scale,
	                                      height - text.y * scale - 18)
	            text_writer.textOut(text.content)
	        c.drawText(text_writer)

	        # finish page
	        c.showPage()

	    c.save()