Skip to content

Instantly share code, notes, and snippets.

@ties
Last active September 12, 2016 09:57
Show Gist options
  • Select an option

  • Save ties/3a262a351e0eb7fc3bafccf1c33d759c to your computer and use it in GitHub Desktop.

Select an option

Save ties/3a262a351e0eb7fc3bafccf1c33d759c to your computer and use it in GitHub Desktop.
Utility: Grep in OCR (tesseract) output
"""
To be used when you want to grep in OCR output.
python ocr_grep.py Screenshot\*.png -i werkzaamheden
<glob pattern> <grep arguments>
Each file is run through tesseract, its output is then piped through
`grep <grep_arguments>`
#futurework: Cache OCR output
"""
import argparse
import glob
import os
from subprocess import DEVNULL, PIPE, Popen
def tesseract_text(name, grep_pattern):
    """Run tesseract on one image and print the OCR lines that grep selects.

    tesseract is invoked as ``tesseract <name> -`` with its stdout connected
    to the stdin of ``grep <grep_pattern>``. Every non-empty line that grep
    emits is printed as ``<name> <line>``.

    Args:
        name: Path of the image file handed to tesseract.
        grep_pattern: List of command-line arguments appended to ``grep``
            (options and the search pattern).
    """
    # tesseract's stderr is never inspected; sending it to DEVNULL instead of
    # an unread pipe keeps tesseract from blocking once that pipe fills up.
    with Popen(["tesseract", name, "-"],
               stdout=PIPE, stderr=DEVNULL) as proc:
        with Popen(["grep"] + grep_pattern,
                   stdin=proc.stdout, stdout=PIPE) as grep_proc:
            # Drop the parent's copy of the pipe so tesseract receives
            # SIGPIPE if grep exits before consuming all of its input.
            proc.stdout.close()
            raw_out, _ = grep_proc.communicate()

    # errors='replace': one undecodable byte should not abort the whole run.
    grep_out = raw_out.decode('utf8', errors='replace').strip()
    for line in grep_out.split("\n"):
        if line:
            print("{} {}".format(name, line))
def for_globbed_files(pattern, grep_pattern):
    """Run ``tesseract_text`` on every file matching *pattern*.

    Args:
        pattern: Glob pattern selecting the files to OCR.
        grep_pattern: List of grep arguments, forwarded unchanged for each
            matching file.
    """
    # glob.glob() returns matches in arbitrary order; sort them so the output
    # order is reproducible between runs.
    for file_name in sorted(glob.glob(pattern)):
        tesseract_text(os.path.abspath(file_name), grep_pattern)
# Command-line interface: one glob pattern followed by arbitrary grep arguments.
parser = argparse.ArgumentParser(description='Run OCR (tesseract) on all '
                                             'matching files')
# NOTE(review): 'pattern' is a required positional (no nargs='?'), so argparse
# never falls back to the '*.png' default -- confirm whether that is intended.
parser.add_argument('pattern', default='*.png', type=str, help='Glob pattern')
# REMAINDER collects everything after the pattern, including option-like
# arguments such as -i, so they can be handed to grep untouched.
parser.add_argument('args', nargs=argparse.REMAINDER,
                    help='Args passed to grep')

if __name__ == "__main__":
    args = parser.parse_args()
    for_globbed_files(args.pattern, args.args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment