dterracino · October 1, 2019 15:45
diff --git a/google-ocr.py b/google-ocr.py
 import argparse
 from datetime import datetime
 import os
 import subprocess
 import logging


 # Extensions (including the leading dot) of the files main() selects from
 # the input directory for upload.
 FILE_EXTENSIONS = {".jpg", ".png", ".pdf"}


 def get_ids(s):
    """Yield the value of every ``id:`` line found in captured output.

    ``s`` is the raw bytes printed by a helper command.  Each decoded line
    that contains the substring ``id:`` contributes the text between its
    first and second colon, with surrounding whitespace removed.
    """
    decoded_rows = s.decode().splitlines()
    id_rows = (row for row in decoded_rows if "id:" in row)
    yield from (row.split(":")[1].strip() for row in id_rows)


 def run_command(cmd):
    """Run an external command and return its completed-process result.

    Args:
        cmd: Argument list handed to ``subprocess.run`` (no shell involved).

    Returns:
        The ``subprocess.CompletedProcess``; its ``stdout`` and ``stderr``
        hold the captured output as bytes.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.  The
            captured stderr is logged before re-raising, because the
            exception message alone only reports the exit status.
    """
    logging.debug("running %s", " ".join(cmd))
    try:
        result = subprocess.run(cmd, capture_output=True, check=True)
    except subprocess.CalledProcessError as exc:
        # Output is captured, so without this the command's own diagnostics
        # would never reach the user.
        logging.error(
            "command failed with exit status %s: %s",
            exc.returncode,
            " ".join(cmd),
        )
        if exc.stderr:
            logging.error(exc.stderr.decode("utf-8", errors="replace"))
        raise
    # errors="replace" keeps a mere debug line from aborting the run when the
    # command prints bytes that are not valid UTF-8.
    logging.debug(result.stdout.decode("utf-8", errors="replace"))
    return result


 def make_timestamped_folder(name):
    """Create a folder called ``name`` plus a UTC timestamp via ``gdmkdir.py``.

    Args:
        name: Prefix for the folder name; a space and the current UTC time
            in ISO-8601 form are appended to it.

    Returns:
        The first id reported in the ``gdmkdir.py`` output.

    Raises:
        RuntimeError: If the command output contains no ``id:`` line.
        subprocess.CalledProcessError: If ``gdmkdir.py`` exits non-zero
            (propagated from ``run_command``).
    """
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12.  Dropping tzinfo
    # keeps the folder-name format unchanged (no "+00:00" suffix).
    stamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    dirname = name + " " + stamp
    result = run_command(["gdmkdir.py", dirname])

    # A bare next() would surface as an unexplained StopIteration.
    folder_id = next(get_ids(result.stdout), None)
    if folder_id is None:
        raise RuntimeError(
            "gdmkdir.py did not report an id for folder {!r}".format(dirname)
        )
    return folder_id


 def main():
    """Run the OCR workflow from the command line.

    Every ``.jpg``, ``.png`` or ``.pdf`` file in INPUT_DIR is uploaded with
    ``gdput.py -t ocr`` into a folder made by ``make_timestamped_folder``,
    the text for each reported id is fetched with ``gdget.py``, and all text
    files are concatenated into one merged file.  With ``--remove`` the
    folder is passed to ``gdrm.py`` afterwards.

    Raises:
        subprocess.CalledProcessError: If any helper command exits non-zero.
        OSError: If INPUT_DIR cannot be listed or a text file cannot be
            read or written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "indir",
        help="Input directory containing .jpg, .png or .pdf files",
        metavar="INPUT_DIR",
    )
    parser.add_argument(
        "-o",
        "--output-file",
        help="where to save OCR output (default: ocr-result.txt)",
    )
    parser.add_argument(
        "-R",
        "--remove",
        action="store_true",
        help="Remove uploaded files after OCR is completed",
    )
    logging_group = parser.add_mutually_exclusive_group()
    logging_group.add_argument("-v", "--verbose", action="store_true")
    logging_group.add_argument("-q", "--quiet", action="store_true")
    options = parser.parse_args()

    logging.basicConfig(
        level=("DEBUG" if options.verbose else "WARNING" if options.quiet else "INFO"),
        format="%(message)s",
    )

    input_dir = os.path.abspath(options.indir)

    # List the inputs before creating the remote folder, so a bad INPUT_DIR
    # fails without leaving an empty folder behind.
    infiles = sorted(
        os.path.join(input_dir, filename)
        for filename in os.listdir(input_dir)
        # Case-insensitive match so that e.g. "SCAN.JPG" is picked up too.
        if os.path.splitext(filename)[1].lower() in FILE_EXTENSIONS
    )

    folder_id = make_timestamped_folder(os.path.basename(input_dir))

    outfiles = []
    for image in infiles:
        logging.info("uploading %s", image)

        result = run_command(["gdput.py", "-t", "ocr", "-f", folder_id, image])

        for file_id in get_ids(result.stdout):
            # Name the text file after the full input name ("a.jpg.txt").
            # Using only the extension (splitext(image)[1]) made every file
            # of one type overwrite the same ".jpg.txt"; keeping the source
            # extension also stops "a.jpg" and "a.png" from colliding.
            outfile = os.path.join(input_dir, os.path.basename(image) + ".txt")
            outfiles.append(outfile)

            logging.info("downloading OCR text for %s", image)
            run_command(["gdget.py", "-f", "txt", "-s", outfile, file_id])

    # Honour --output-file; without it keep the historical location inside
    # the input directory.
    merged_filename = options.output_file or os.path.join(
        input_dir, "ocr-result.txt"
    )
    logging.info("Merging all text files into %s", merged_filename)

    with open(merged_filename, "w") as merged:
        for textfile in outfiles:
            # Close each part promptly instead of leaking the handle.
            with open(textfile, "r") as part:
                merged.writelines(part)
            merged.write("\n\n")

    if options.remove:
        logging.info("removing uploaded files from GDrive")
        # Deliberately not captured: gdrm.py keeps direct access to the
        # terminal, exactly as before.
        subprocess.run(["gdrm.py", folder_id], check=True)

    logging.info("Done!")


 # Run the command-line interface only when this file is executed as a
 # script, not when it is imported.
 if __name__ == "__main__":
    main()
	import argparse
	from datetime import datetime
	import os
	import subprocess
	import logging


	# Extensions (including the leading dot) of the files main() selects from
	# the input directory for upload.
	FILE_EXTENSIONS = {".jpg", ".png", ".pdf"}


	def get_ids(s):
	    """Yield the value of every ``id:`` line found in captured output.

	    ``s`` is the raw bytes printed by a helper command.  Each decoded line
	    that contains the substring ``id:`` contributes the text between its
	    first and second colon, with surrounding whitespace removed.
	    """
	    # The nesting below was flattened in this copy; restored here.
	    for line in s.decode().splitlines():
	        if "id:" in line:
	            yield line.split(":")[1].strip()


	def run_command(cmd):
	    """Run an external command and return its completed-process result.

	    Args:
	        cmd: Argument list handed to ``subprocess.run`` (no shell involved).

	    Returns:
	        The ``subprocess.CompletedProcess``; its ``stdout`` and ``stderr``
	        hold the captured output as bytes.

	    Raises:
	        subprocess.CalledProcessError: If the command exits non-zero.  The
	            captured stderr is logged before re-raising, because the
	            exception message alone only reports the exit status.
	    """
	    logging.debug("running %s", " ".join(cmd))
	    try:
	        result = subprocess.run(cmd, capture_output=True, check=True)
	    except subprocess.CalledProcessError as exc:
	        # Output is captured, so without this the command's own
	        # diagnostics would never reach the user.
	        logging.error(
	            "command failed with exit status %s: %s",
	            exc.returncode,
	            " ".join(cmd),
	        )
	        if exc.stderr:
	            logging.error(exc.stderr.decode("utf-8", errors="replace"))
	        raise
	    # errors="replace" keeps a mere debug line from aborting the run when
	    # the command prints bytes that are not valid UTF-8.
	    logging.debug(result.stdout.decode("utf-8", errors="replace"))
	    return result


	def make_timestamped_folder(name):
	    """Create a folder called ``name`` plus a UTC timestamp via ``gdmkdir.py``.

	    Args:
	        name: Prefix for the folder name; a space and the current UTC time
	            in ISO-8601 form are appended to it.

	    Returns:
	        The first id reported in the ``gdmkdir.py`` output.

	    Raises:
	        RuntimeError: If the command output contains no ``id:`` line.
	        subprocess.CalledProcessError: If ``gdmkdir.py`` exits non-zero
	            (propagated from ``run_command``).
	    """
	    from datetime import timezone

	    # datetime.utcnow() is deprecated since Python 3.12.  Dropping tzinfo
	    # keeps the folder-name format unchanged (no "+00:00" suffix).
	    stamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
	    dirname = name + " " + stamp
	    result = run_command(["gdmkdir.py", dirname])

	    # A bare next() would surface as an unexplained StopIteration.
	    folder_id = next(get_ids(result.stdout), None)
	    if folder_id is None:
	        raise RuntimeError(
	            "gdmkdir.py did not report an id for folder {!r}".format(dirname)
	        )
	    return folder_id


	def main():
	    """Run the OCR workflow from the command line.

	    Every ``.jpg``, ``.png`` or ``.pdf`` file in INPUT_DIR is uploaded with
	    ``gdput.py -t ocr`` into a folder made by ``make_timestamped_folder``,
	    the text for each reported id is fetched with ``gdget.py``, and all
	    text files are concatenated into one merged file.  With ``--remove``
	    the folder is passed to ``gdrm.py`` afterwards.

	    Raises:
	        subprocess.CalledProcessError: If any helper command exits non-zero.
	        OSError: If INPUT_DIR cannot be listed or a text file cannot be
	            read or written.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "indir",
	        help="Input directory containing .jpg, .png or .pdf files",
	        metavar="INPUT_DIR",
	    )
	    parser.add_argument(
	        "-o",
	        "--output-file",
	        help="where to save OCR output (default: ocr-result.txt)",
	    )
	    parser.add_argument(
	        "-R",
	        "--remove",
	        action="store_true",
	        help="Remove uploaded files after OCR is completed",
	    )
	    logging_group = parser.add_mutually_exclusive_group()
	    logging_group.add_argument("-v", "--verbose", action="store_true")
	    logging_group.add_argument("-q", "--quiet", action="store_true")
	    options = parser.parse_args()

	    logging.basicConfig(
	        level=("DEBUG" if options.verbose else "WARNING" if options.quiet else "INFO"),
	        format="%(message)s",
	    )

	    input_dir = os.path.abspath(options.indir)

	    # List the inputs before creating the remote folder, so a bad INPUT_DIR
	    # fails without leaving an empty folder behind.
	    infiles = sorted(
	        os.path.join(input_dir, filename)
	        for filename in os.listdir(input_dir)
	        # Case-insensitive match so that e.g. "SCAN.JPG" is picked up too.
	        if os.path.splitext(filename)[1].lower() in FILE_EXTENSIONS
	    )

	    folder_id = make_timestamped_folder(os.path.basename(input_dir))

	    outfiles = []
	    for image in infiles:
	        logging.info("uploading %s", image)

	        result = run_command(["gdput.py", "-t", "ocr", "-f", folder_id, image])

	        for file_id in get_ids(result.stdout):
	            # Name the text file after the full input name ("a.jpg.txt").
	            # Using only the extension (splitext(image)[1]) made every file
	            # of one type overwrite the same ".jpg.txt"; keeping the source
	            # extension also stops "a.jpg" and "a.png" from colliding.
	            outfile = os.path.join(input_dir, os.path.basename(image) + ".txt")
	            outfiles.append(outfile)

	            logging.info("downloading OCR text for %s", image)
	            run_command(["gdget.py", "-f", "txt", "-s", outfile, file_id])

	    # Honour --output-file; without it keep the historical location inside
	    # the input directory.
	    merged_filename = options.output_file or os.path.join(
	        input_dir, "ocr-result.txt"
	    )
	    logging.info("Merging all text files into %s", merged_filename)

	    with open(merged_filename, "w") as merged:
	        for textfile in outfiles:
	            # Close each part promptly instead of leaking the handle.
	            with open(textfile, "r") as part:
	                merged.writelines(part)
	            merged.write("\n\n")

	    if options.remove:
	        logging.info("removing uploaded files from GDrive")
	        # Deliberately not captured: gdrm.py keeps direct access to the
	        # terminal, exactly as before.
	        subprocess.run(["gdrm.py", folder_id], check=True)

	    logging.info("Done!")


	# Run the command-line interface only when this file is executed as a
	# script, not when it is imported.  The call below must be nested under
	# the guard; this copy had lost that indentation.
	if __name__ == "__main__":
	    main()