Last active
September 12, 2016 09:57
-
-
Save ties/3a262a351e0eb7fc3bafccf1c33d759c to your computer and use it in GitHub Desktop.
Utility: Grep in OCR (tesseract) output
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| To be used when you want to grep in OCR output. | |
| python ocr_grep.py Screenshot\*.png -i werkzaamheden | |
| <glob pattern> <grep arguments> | |
| Each file is run through tesseract, its output is then piped through | |
| `grep <grep_arguments>` | |
| #futurework: Cache OCR output | |
| """ | |
| import argparse | |
| import glob | |
| import os | |
| from subprocess import Popen, PIPE | |
def tesseract_text(name, grep_pattern):
    """OCR one image with tesseract and print the lines that grep selects.

    Runs ``tesseract <name> -`` and pipes its standard output into
    ``grep <grep_pattern...>``. Every non-empty line that grep emits is
    printed to standard output as ``"<name> <line>"``.

    Args:
        name: Path of the image file handed to tesseract.
        grep_pattern: Sequence of command-line arguments appended to
            ``grep`` (options and the search expression).

    Raises:
        OSError: If the ``tesseract`` or ``grep`` executable cannot be
            started.
    """
    # Local import: the file-level import only brings in Popen and PIPE.
    from subprocess import DEVNULL

    # tesseract's stderr is discarded instead of piped: nothing reads it, so
    # a PIPE could fill up and block tesseract (and therefore grep) forever.
    with Popen(["tesseract", name, "-"],
               stdout=PIPE, stderr=DEVNULL) as proc:
        with Popen(["grep"] + list(grep_pattern),
                   stdin=proc.stdout, stdout=PIPE) as grep_proc:
            # Drop our copy of the pipe's read end so that tesseract receives
            # SIGPIPE if grep exits before consuming all of its input.
            proc.stdout.close()
            raw_output = grep_proc.stdout.read()

    # errors="replace" keeps a stray undecodable byte from aborting the run.
    grep_out = raw_output.decode("utf8", errors="replace").strip()
    for line in grep_out.split("\n"):
        if line:
            print("{} {}".format(name, line))
def for_globbed_files(pattern, grep_pattern):
    """Run ``tesseract_text`` on every file matching a glob pattern.

    Matches are processed in sorted order so that the output is
    reproducible; ``glob.glob`` itself does not guarantee any ordering.
    Each match is converted to an absolute path before it is passed on.

    Args:
        pattern: Glob pattern selecting the files to process.
        grep_pattern: Arguments forwarded unchanged to ``tesseract_text``.
    """
    for file_name in sorted(glob.glob(pattern)):
        tesseract_text(os.path.abspath(file_name), grep_pattern)
# Command-line interface: one glob pattern followed by everything that should
# be handed to grep verbatim (options and the search expression).
parser = argparse.ArgumentParser(description='Run OCR (tesseract) on all '
                                             'matching files')
# NOTE: this positional has no nargs='?', so argparse treats it as required
# and the default below is never applied.
parser.add_argument('pattern', default='*.png', type=str, help='Glob pattern')
parser.add_argument('args', nargs=argparse.REMAINDER,
                    help='Args passed to grep')


def main(argv=None):
    """Parse the command line and grep the OCR output of matching files.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 when the command line is invalid or no
            grep arguments were supplied.
    """
    args = parser.parse_args(argv)
    if not args.args:
        # Fail fast: otherwise every file would be OCR'd only for grep to
        # reject its own (empty) command line.
        parser.error('no grep arguments given; expected: '
                     '<glob pattern> <grep arguments>')
    for_globbed_files(args.pattern, args.args)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment