-
-
Save dterracino/d5e8ec5fdd33db2c24e71145e6fe67ba to your computer and use it in GitHub Desktop.
Script to run Google Drive's OCR on a batch of image files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
from datetime import datetime | |
import os | |
import subprocess | |
import logging | |
# Extensions (leading dot included, as returned by os.path.splitext) of the
# files that main() picks up from the input directory.
FILE_EXTENSIONS = {
    ".jpg",
    ".png",
    ".pdf",
}
def get_ids(s):
    """Yield the id values found in the raw output of a helper command.

    Args:
        s: Bytes to scan; decoded with the default codec and split into
            lines.

    Yields:
        For every line containing ``"id:"``, the text between the first
        and second colon of that line, stripped of surrounding whitespace.
    """
    text = s.decode()
    for row in text.splitlines():
        if "id:" not in row:
            continue
        fields = row.split(":")
        yield fields[1].strip()
def run_command(cmd):
    """Run an external command, logging it and its output at DEBUG level.

    Args:
        cmd: The command and its arguments as a list of strings.

    Returns:
        The ``subprocess.CompletedProcess`` with stdout and stderr captured
        as bytes.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    command_line = " ".join(cmd)
    logging.debug("running " + command_line)
    completed = subprocess.run(cmd, capture_output=True, check=True)
    captured_text = completed.stdout.decode("utf-8")
    logging.debug(captured_text)
    return completed
def make_timestamped_folder(name):
    """Create a folder named ``name`` plus the current UTC timestamp.

    Runs the external ``gdmkdir.py`` command (presumably creating the folder
    on Google Drive -- confirm against that script) through run_command and
    extracts the new folder's id from its output with get_ids.

    Args:
        name: Prefix of the folder name; a space and an ISO-8601 UTC
            timestamp are appended.

    Returns:
        The first id reported in the command's standard output.

    Raises:
        subprocess.CalledProcessError: If ``gdmkdir.py`` exits non-zero.
        RuntimeError: If the command output contains no ``id:`` line.
    """
    # Local import: the module itself only imports ``datetime`` from datetime.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo keeps
    # the folder name identical to what utcnow().isoformat() produced.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    dirname = name + " " + timestamp
    result = run_command(["gdmkdir.py", dirname])

    # A bare next() would leak StopIteration when no id line is present;
    # report the actual problem instead.
    folder_id = next(get_ids(result.stdout), None)
    if folder_id is None:
        raise RuntimeError(
            "gdmkdir.py did not report an id for folder {!r}".format(dirname)
        )
    return folder_id
def _parse_args():
    """Build the command-line parser and return the parsed options."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "indir",
        help="Input directory containing .jpg, .png or .pdf files",
        metavar="INPUT_DIR",
    )
    parser.add_argument(
        "-o",
        "--output-file",
        help="where to save OCR output (default: ocr-result.txt)",
    )
    parser.add_argument(
        "-R",
        "--remove",
        action="store_true",
        help="Remove uploaded files after OCR is completed",
    )
    logging_group = parser.add_mutually_exclusive_group()
    logging_group.add_argument("-v", "--verbose", action="store_true")
    logging_group.add_argument("-q", "--quiet", action="store_true")
    return parser.parse_args()


def _list_input_files(input_dir):
    """Return the sorted paths in input_dir whose extension is in FILE_EXTENSIONS."""
    return sorted(
        os.path.join(input_dir, filename)
        for filename in os.listdir(input_dir)
        # Lower-case the extension so e.g. "scan.JPG" is picked up as well.
        if os.path.splitext(filename)[1].lower() in FILE_EXTENSIONS
    )


def _merge_text_files(textfiles, merged_filename):
    """Concatenate textfiles into merged_filename, adding a blank gap after each."""
    with open(merged_filename, "w") as merged:
        for textfile in textfiles:
            # Context manager so every per-image file is closed promptly.
            with open(textfile, "r") as part:
                merged.writelines(part)
            merged.write("\n\n")


def main():
    """Upload each input file for OCR, download the text and merge it.

    For every matching file in INPUT_DIR this runs ``gdput.py`` to upload it
    into a freshly created timestamped folder, then ``gdget.py`` to download
    the text for each id reported by the upload. All downloaded text files
    are concatenated into the file given by ``--output-file``, or
    ``ocr-result.txt`` inside INPUT_DIR when that option is omitted. With
    ``--remove`` the folder is passed to ``gdrm.py`` afterwards.

    Raises:
        subprocess.CalledProcessError: If any helper command exits non-zero.
        OSError: If INPUT_DIR cannot be listed or a text file cannot be
            read or written.
    """
    options = _parse_args()
    logging.basicConfig(
        level=("DEBUG" if options.verbose else "WARNING" if options.quiet else "INFO"),
        format="%(message)s",
    )

    input_dir = os.path.abspath(options.indir)
    # List the inputs before creating the remote folder so that a missing or
    # unreadable INPUT_DIR fails without a folder having been created.
    infiles = _list_input_files(input_dir)
    folder_id = make_timestamped_folder(os.path.basename(input_dir))

    outfiles = []
    for image in infiles:
        logging.info("uploading %s", image)
        result = run_command(["gdput.py", "-t", "ocr", "-f", folder_id, image])
        for file_id in get_ids(result.stdout):
            # Name the text file after the image's stem. Using the extension
            # here made every output ".jpg.txt"/".png.txt"/".pdf.txt", so the
            # downloads overwrote one another.
            stem = os.path.splitext(os.path.basename(image))[0]
            outfile = os.path.join(input_dir, stem + ".txt")
            outfiles.append(outfile)
            logging.info("downloading OCR text for %s", image)
            run_command(["gdget.py", "-f", "txt", "-s", outfile, file_id])

    # Honour -o/--output-file; fall back to the documented default name
    # inside the input directory.
    merged_filename = options.output_file or os.path.join(input_dir, "ocr-result.txt")
    logging.info("Merging all text files into %s", merged_filename)
    _merge_text_files(outfiles, merged_filename)

    if options.remove:
        logging.info("removing uploaded files from GDrive")
        subprocess.run(["gdrm.py", folder_id], check=True)
    logging.info("Done!")
# Run the batch OCR workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment