Last active
April 7, 2018 17:41
-
-
Save zopieux/71707bc63b5e9341e8db3629570287d7 to your computer and use it in GitHub Desktop.
Google OCR to PDF invisible overlay
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Example usage: | |
$ pip install reportlab pillow | |
$ ./overlay.py these-lemaire.tiff 'ocr/{p}.json' these-lemaire.pdf | |
""" | |
import argparse | |
import json | |
from collections import namedtuple | |
from pathlib import Path | |
from PIL import Image, ImageSequence | |
from reportlab.lib import pagesizes | |
from reportlab.pdfbase.pdfmetrics import stringWidth | |
from reportlab.pdfgen import canvas | |
# One OCR-detected piece of text: its string content plus the position
# (x, y) of the first bounding-box vertex and the box extent, all expressed
# in the coordinate system of the OCR JSON.
TextAnnotation = namedtuple('TextAnnotation', 'content x y width height')


def parse_google_ocr(data, max_height=200):
    """Yield a TextAnnotation for each text annotation in an OCR response.

    Args:
        data: Decoded OCR JSON. Only ``data['responses'][0]`` is read.
        max_height: Annotations whose bounding box is taller than this are
            skipped. Presumably this drops the page-wide annotation that
            aggregates the whole text -- TODO confirm.

    Yields:
        TextAnnotation built from vertex 0 (used as the origin) and
        vertex 2 (used as the opposite corner) of each bounding polygon.

    Raises:
        KeyError: if ``responses``, ``textAnnotations``, ``boundingPoly``,
            ``vertices`` or ``description`` is missing. Because this is a
            generator, the error surfaces while iterating.
    """
    for annotation in data['responses'][0]['textAnnotations']:
        vertices = annotation['boundingPoly']['vertices']
        origin, opposite = vertices[0], vertices[2]
        # A vertex may lack 'x' or 'y' when the coordinate is 0 (zero-valued
        # fields are omitted from the JSON), so default to 0 instead of
        # raising KeyError and losing every annotation of the page.
        x, y = origin.get('x', 0), origin.get('y', 0)
        width = opposite.get('x', 0) - x
        height = opposite.get('y', 0) - y
        if height > max_height:
            continue
        yield TextAnnotation(annotation['description'], x, y, width, height)
def parse_page_selection(specs, page_count):
    """Convert ``-p/--pages`` values into a set of 0-based page indexes.

    Args:
        specs: List of strings as collected by argparse (``None`` or empty
            selects every page). Each string is a comma-separated list of
            1-indexed page numbers (``3``) or inclusive ranges (``2-5``).
        page_count: Number of pages available.

    Returns:
        The selected 0-based indexes, restricted to ``range(page_count)``.
        Malformed items are reported on stdout and skipped.
    """
    available = set(range(page_count))
    if not specs:
        return available
    wanted = set()
    for spec in specs:
        for item in spec.split(','):
            item = item.strip()
            if not item:
                continue
            # isdecimal() only accepts characters that int() can parse
            # (isnumeric() also accepts e.g. superscript digits).
            if item.isdecimal():
                wanted.add(int(item) - 1)
                continue
            first, dash, last = item.partition('-')
            if dash and first.isdecimal() and last.isdecimal():
                wanted.update(range(int(first) - 1, int(last)))
            else:
                print(f"Ignoring invalid page specification: {item!r}")
    return available & wanted


def load_page_texts(json_path):
    """Return the list of text annotations stored in *json_path*.

    A missing file, or JSON lacking the expected keys, yields an empty list
    so that the page is still emitted with its background image only.
    """
    try:
        with json_path.open(encoding='utf-8') as json_file:
            return list(parse_google_ocr(json.load(json_file)))
    except (FileNotFoundError, KeyError):
        return []


def fit_font_size(content, font_name, target_width, start=9, step=0.5):
    """Return the smallest size ``start + k * step`` reaching *target_width*.

    The size grows from *start* in increments of *step* until the rendered
    width of *content* is at least *target_width*. Content with no rendered
    width (e.g. an empty string) can never reach the target, so *start* is
    returned instead of looping forever.
    """
    if stringWidth(content, font_name, 1) <= 0:
        return start
    font_size = start
    while stringWidth(content, font_name, font_size) < target_width:
        font_size += step
    return font_size


def main():
    """Build a PDF with one page per TIFF frame and an invisible OCR layer."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--pages', action='append',
                        help="pages to use (1-indexed)")
    parser.add_argument('tiff', type=argparse.FileType('rb'))
    parser.add_argument('jsonpattern')
    parser.add_argument('output', type=argparse.FileType('wb'))
    args = parser.parse_args()

    tiff = Image.open(args.tiff)
    tiff_pages = ImageSequence.Iterator(tiff)
    print(f"Page count: {tiff.n_frames}")
    pages = parse_page_selection(args.pages, tiff.n_frames)

    width, height = pagesize = pagesizes.A4
    pdf = canvas.Canvas(args.output, pagesize=pagesize)
    for page in sorted(pages):
        image = tiff_pages[page]
        print(f"Page {page + 1}: size {image.size}")
        print("\tdrawing background image")
        pdf.drawInlineImage(image, 0, 0, width=width, height=height,
                            preserveAspectRatio=True)

        # NOTE: the pattern used to be eval()'d as an f-string, which ran
        # arbitrary code from the command line and broke on patterns holding
        # a quote. str.format() supports the same '{p}' / '{p:03d}' fields
        # but not arbitrary expressions such as '{p+1}'.
        json_path = Path(args.jsonpattern.format(p=page + 1))
        texts = load_page_texts(json_path)

        # NOTE(review): the factor 2 was marked "???" by the original author;
        # presumably the OCR ran on half-resolution images -- confirm.
        scale = 2 * width / image.size[0]
        print(f"\tdrawing {len(texts)} texts, scale: {scale}")

        text_writer = pdf.beginText()
        text_writer.setTextRenderMode(3)  # PDF render mode 3: invisible text
        font_name = text_writer._fontname
        for text in texts:
            font_size = fit_font_size(text.content, font_name,
                                      text.width * scale * 0.95)
            text_writer.setFont(font_name, font_size)
            # OCR y grows downwards while PDF y grows upwards; the extra 18
            # is the original author's empirical vertical offset.
            text_writer.setTextOrigin(text.x * scale,
                                      height - text.y * scale - 18)
            text_writer.textOut(text.content)
        pdf.drawText(text_writer)

        # finish page
        pdf.showPage()
    pdf.save()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment