Skip to content

Instantly share code, notes, and snippets.

@dterracino
Forked from anthrotype/google-ocr.py
Created October 1, 2019 15:45
Show Gist options
  • Save dterracino/d5e8ec5fdd33db2c24e71145e6fe67ba to your computer and use it in GitHub Desktop.
Save dterracino/d5e8ec5fdd33db2c24e71145e6fe67ba to your computer and use it in GitHub Desktop.
Script to run Google Drive's OCR on a batch of image files
import argparse
from datetime import datetime
import os
import subprocess
import logging
# Extensions (including the leading dot) of the files in the input directory
# that get uploaded for OCR; files with any other extension are skipped.
FILE_EXTENSIONS = {".jpg", ".png", ".pdf"}
def get_ids(s):
    """Pull the ``id:`` values out of captured command output.

    Args:
        s: Raw bytes, as captured from a command's stdout.

    Yields:
        For each line that contains ``id:``, the text found between the
        first and the second colon of that line, with surrounding
        whitespace removed.
    """
    text = s.decode()
    yield from (
        row.split(":")[1].strip()
        for row in text.splitlines()
        if "id:" in row
    )
def run_command(cmd):
    """Execute an external command, logging it and its output at DEBUG level.

    Args:
        cmd: Sequence of strings: the program followed by its arguments.

    Returns:
        The ``subprocess.CompletedProcess`` for the finished command, with
        stdout and stderr captured as bytes.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    command_line = " ".join(cmd)
    logging.debug("running " + command_line)
    # Piping both streams is what capture_output=True does under the hood.
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )
    logging.debug(str(completed.stdout, "utf-8"))
    return completed
def make_timestamped_folder(name):
    """Create a folder via ``gdmkdir.py`` named ``name`` plus a UTC timestamp.

    Args:
        name: Prefix of the folder name. The current UTC time in ISO 8601
            format is appended to it, separated by a single space.

    Returns:
        The first ``id:`` value found in the output of ``gdmkdir.py``.

    Raises:
        subprocess.CalledProcessError: If ``gdmkdir.py`` exits with a
            non-zero status.
        RuntimeError: If the command output contains no ``id:`` line.
    """
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop the tzinfo so the folder name keeps the same format as
    # before (no "+00:00" suffix).
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    dirname = name + " " + timestamp
    result = run_command(["gdmkdir.py", dirname])

    # A bare next() would leak an opaque StopIteration when no id is
    # reported; fail with a message that names the actual problem instead.
    folder_id = next(get_ids(result.stdout), None)
    if folder_id is None:
        raise RuntimeError(
            "gdmkdir.py did not report an id for folder %r" % dirname
        )
    return folder_id
def main():
    """Command-line entry point: OCR every supported file in a directory.

    Each file in INPUT_DIR whose extension is listed in ``FILE_EXTENSIONS``
    (compared case-insensitively) is uploaded with ``gdput.py -t ocr`` into
    a newly created timestamped folder. For every id reported by the upload,
    the text is downloaded with ``gdget.py`` to ``<source name>.txt`` next to
    the source file. All downloaded text files are then concatenated into
    one file: the path given with ``-o/--output-file``, or
    ``ocr-result.txt`` inside INPUT_DIR when the option is omitted. With
    ``-R/--remove`` the upload folder is removed afterwards via ``gdrm.py``.

    Raises:
        subprocess.CalledProcessError: If any of the helper commands exits
            with a non-zero status.
        OSError: If the input directory cannot be listed or a text file
            cannot be read or written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "indir",
        help="Input directory containing .jpg, .png or .pdf files",
        metavar="INPUT_DIR",
    )
    parser.add_argument(
        "-o",
        "--output-file",
        help="where to save OCR output (default: ocr-result.txt)",
    )
    parser.add_argument(
        "-R",
        "--remove",
        action="store_true",
        help="Remove uploaded files after OCR is completed",
    )
    logging_group = parser.add_mutually_exclusive_group()
    logging_group.add_argument("-v", "--verbose", action="store_true")
    logging_group.add_argument("-q", "--quiet", action="store_true")
    options = parser.parse_args()

    logging.basicConfig(
        level=("DEBUG" if options.verbose else "WARNING" if options.quiet else "INFO"),
        format="%(message)s",
    )

    input_dir = os.path.abspath(options.indir)

    # List the inputs before creating anything remotely, so that a missing
    # or unreadable directory fails before a folder has been created.
    # Extensions are lower-cased so that e.g. "SCAN.JPG" is not skipped.
    infiles = sorted(
        os.path.join(input_dir, filename)
        for filename in os.listdir(input_dir)
        if os.path.splitext(filename)[1].lower() in FILE_EXTENSIONS
    )

    folder_id = make_timestamped_folder(os.path.basename(input_dir))

    outfiles = []
    for image in infiles:
        logging.info("uploading %s", image)
        result = run_command(["gdput.py", "-t", "ocr", "-f", folder_id, image])
        for file_id in get_ids(result.stdout):
            # Index 0 is the path without its extension. Index 1 (the
            # extension itself) would make every image of the same type
            # download to one shared file and overwrite the previous text.
            # `image` is already absolute, so no further join is needed.
            outfile = os.path.splitext(image)[0] + ".txt"
            outfiles.append(outfile)
            logging.info("downloading OCR text for %s", image)
            run_command(["gdget.py", "-f", "txt", "-s", outfile, file_id])

    # Honour -o/--output-file; fall back to the documented default name,
    # placed inside the input directory.
    merged_filename = options.output_file or os.path.join(
        input_dir, "ocr-result.txt"
    )
    logging.info("Merging all text files into %s", merged_filename)
    with open(merged_filename, "w") as merged:
        for textfile in outfiles:
            # Close each part as soon as it has been copied.
            with open(textfile, "r") as part:
                merged.writelines(part)
            # Blank lines separate the text of consecutive files.
            merged.write("\n\n")

    if options.remove:
        logging.info("removing uploaded files from GDrive")
        subprocess.run(["gdrm.py", folder_id], check=True)

    logging.info("Done!")
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment