Skip to content

Instantly share code, notes, and snippets.

@BaderSZ
Created November 11, 2024 10:57
Show Gist options
  • Save BaderSZ/ac7b28e2d77ddc2e68b84659447062ba to your computer and use it in GitHub Desktop.
Save BaderSZ/ac7b28e2d77ddc2e68b84659447062ba to your computer and use it in GitHub Desktop.
OCR an input image using Google Vision API.
#!/usr/bin/env python3
"""
OCR an image using Google Vision API.
Requires:
- google-cloud-vision = "^3.8.0"
usage: googleocr.py [-h] -i INPUT_FILE [-o OUTPUT_FILE]
Google Vision OCR script. You'll need an API key from Google Console.
Either set it in the environment via "GOOGLE_API_KEY=" or change the variable in the script.
options:
-h, --help show this help message and exit
-i, --input INPUT_FILE
Path to the image to OCR.
-o, --output OUTPUT_FILE
Output file to write to (Default is ./output/INPUT_BASENAME.txt,
must not exist)
(C) Bader Zaidan 2024 - GPL-2.0
"""
from argparse import ArgumentParser, Namespace
from os import getenv
from pathlib import Path
from typing import Optional, Sequence

from google.cloud.vision import Image, AnnotateImageResponse, ImageAnnotatorClient
# Fallback API key, used only when the GOOGLE_API_KEY environment variable is
# not set. Get one from console.cloud.google.com.
G_API_KEY = "MY_API_KEY"

# Client options handed to ImageAnnotatorClient: the API endpoint to talk to
# and the API key (environment variable first, script constant as fallback).
CLIENT_OPTS = dict(
    api_endpoint="eu-vision.googleapis.com",
    api_key=getenv("GOOGLE_API_KEY", G_API_KEY),
)
def init_argparse(argv: Optional[Sequence[str]] = None) -> Namespace:
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse reads them from ``sys.argv[1:]``.

    Returns:
        The parsed arguments as a namespace with two attributes:
        ``input_file`` (a ``Path``, required) and ``output_file`` (a ``Path``,
        or ``None`` when ``-o/--output`` was not given).
    """
    parser = ArgumentParser(
        # When this is None argparse derives the program name itself.
        prog=__package__,
        epilog="(C) Bader Zaidan 2024 - GPL-2.0",
        description="""Google Vision OCR script. You'll need an API key from Google Console.
Either set it in the environment via "GOOGLE_API_KEY=" or change the variable
in the script.""",
    )
    parser.add_argument(
        "-i",
        "--input",
        dest="input_file",
        required=True,
        help="Path to the image to OCR.",
        type=Path,
    )
    parser.add_argument(
        "-o",
        "--output",
        dest="output_file",
        required=False,
        help="Output file to write to (Default is ./output/INPUT_BASENAME.txt, must not exist)",
        type=Path,
        default=None,
    )
    return parser.parse_args(argv)
def vision_parse(path_arg: Path) -> AnnotateImageResponse:
    """Run Google Vision document text detection on an image file.

    Args:
        path_arg: Path of the image. Its raw bytes are read and sent to the API.

    Returns:
        The response returned by ``document_text_detection``.

    Raises:
        RuntimeError: If the response carries a non-empty error message.
    """
    client = ImageAnnotatorClient(credentials=None, client_options=CLIENT_OPTS)
    image = Image(content=path_arg.read_bytes())
    # pylint: disable=no-member
    response: AnnotateImageResponse = client.document_text_detection(image=image)
    if response.error.message:
        # RuntimeError instead of bare Exception: more specific, and still
        # caught by callers that handle Exception.
        raise RuntimeError(f"{response.error.message}")
    return response
def get_output_path(img_path: Path, extension: str = ".pdf") -> Path:
    """Return the default output path for an image, inside ``output/``.

    The result is ``output/<stem><extension>``, a relative path (so it is
    resolved against the current working directory). Only the final suffix of
    the input name is replaced; any directory part of ``img_path`` is dropped.

    Args:
        img_path: Path of the input image.
        extension: Suffix to give the output file, including the leading dot.

    Returns:
        The output path. Neither the file nor the directory is created.

    Raises:
        FileExistsError: If the output path already exists.
    """
    # Use the stem rather than str.replace on the name: replace() would touch
    # every occurrence of the suffix and, for names without a suffix, insert
    # the extension between all characters.
    output_path = Path("output") / f"{img_path.stem}{extension}"
    if output_path.exists():
        raise FileExistsError(f"File {output_path} already exists.")
    return output_path
def main() -> None:
    """Parse the CLI arguments, OCR the input image and write the text to a file.

    The output goes to the ``-o/--output`` path when given, otherwise to the
    path computed by ``get_output_path`` with a ``.txt`` extension. Any error
    raised while calling the API or writing the result is caught and reported
    with ``print``; the function then returns normally.
    """
    args: Namespace = init_argparse()
    input_image_path: Path = args.input_file
    output_file_path: Path = (
        args.output_file
        if args.output_file
        else get_output_path(input_image_path, extension=".txt")
    )
    try:
        parsed = vision_parse(input_image_path)
        # The default target lives in ./output, which may not exist yet.
        output_file_path.parent.mkdir(parents=True, exist_ok=True)
        # The response has no `text_annotation` field; the recognised text of
        # a document detection is in `full_text_annotation.text`. Write it as
        # UTF-8 so non-ASCII text does not depend on the locale encoding.
        output_file_path.write_text(
            parsed.full_text_annotation.text, encoding="utf-8"
        )
    except Exception as exp:  # pylint: disable=broad-exception-caught
        print(f"Failed to parse file '{input_image_path}':", exp)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment