Last active
December 22, 2023 09:58
-
-
Save mancap314/2a6fa4b0b9be1a1c2a7aa27981adc6ed to your computer and use it in GitHub Desktop.
Single local PDF extraction with Document AI processor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Command line tool to run OCR with a Document AI processor over a single local PDF file | |
`pip install python-dotenv google-cloud google-cloud-documentai` | |
Also create a .env file to populate the default values. It must contain the
keys GCP_PROJECT_ID, PROCESSOR_ID and PROCESSOR_LOCATION, corresponding to an OCR
processor created on the Document AI platform of your GCP project.
""" | |
from google.api_core.client_options import ClientOptions | |
from google.cloud import documentai | |
from typing import Union | |
import os | |
import argparse | |
from dotenv import load_dotenv | |
def ocr_extract(
    file_path: Union[str, os.PathLike],
    mime_type: str,
    project_id: str,
    processor_id: str,
    processor_location: str,
) -> str:
    """Run a Document AI processor over one local file and return its text.

    Args:
        file_path: Path of the local file whose bytes are sent to the processor.
        mime_type: MIME type declared for the file content.
        project_id: ID of the GCP project used to build the processor path.
        processor_id: ID of the Document AI processor to call.
        processor_location: Location of the processor; it is also used as the
            prefix of the API endpoint host name.

    Returns:
        The ``text`` attribute of the document returned by the processor.
    """
    # The API endpoint host name is prefixed with the processor location.
    endpoint = f"{processor_location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Full resource name of the processor, built by the client helper.
    # The processor must already exist in the Cloud Console.
    processor_name = client.processor_path(
        project_id, processor_location, processor_id
    )

    # Load the whole file into memory as raw bytes.
    with open(file_path, "rb") as source_file:
        payload = source_file.read()

    # Wrap the bytes in a RawDocument and attach it to the process request.
    request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=documentai.RawDocument(
            content=payload,
            mime_type=mime_type,
        ),
    )

    # Send the request and pull the processed document out of the response.
    response = client.process_document(request=request)
    processed = response.document
    print("[INFO] ocr_extract(): Document processing complete.")
    return processed.text
def validate_file(f):
    """Argparse ``type=`` callback that checks a path points to an existing file.

    Args:
        f: Path string received from the command line.

    Returns:
        ``f`` unchanged when it refers to an existing regular file.

    Raises:
        argparse.ArgumentTypeError: If ``f`` is not an existing file.
    """
    if not os.path.isfile(f):
        # The f-string already interpolates the path. Calling .format() on the
        # result would re-parse it and fail on paths containing '{' or '}'.
        raise argparse.ArgumentTypeError(f"{f} does not exist")
    return f
if __name__ == "__main__":
    # Load .env before building the parser so that its values are visible to
    # the os.getenv() defaults below.
    load_dotenv()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file-path",
        help="Path of the file to extract",
        type=validate_file,
        dest="file_path",
        required=True,
    )
    parser.add_argument(
        "-m", "--mime-type",
        help="Mime-Type of the file to extract. See: https://cloud.google.com/document-ai/docs/file-types",
        type=str,
        dest="mime_type",
        required=False,
        default="application/pdf",
    )
    parser.add_argument(
        "-p", "--project-id",
        help="GCloud Project ID. Get it through `gcloud config get-value core/project`",
        type=str,
        dest="project_id",
        required=False,
        default=os.getenv("GCP_PROJECT_ID"),
    )
    parser.add_argument(
        "-r", "--processor-id",
        help="ID of the Processor created on the GCloud Document AI console",
        type=str,
        dest="processor_id",
        required=False,
        default=os.getenv("PROCESSOR_ID"),
    )
    parser.add_argument(
        "-l", "--location",
        help="Location of the Document AI Processor. 'eu' or 'us'.",
        type=str,
        dest="processor_location",
        required=False,
        default=os.getenv("PROCESSOR_LOCATION"),
    )
    args = parser.parse_args()

    # These three options fall back to environment variables, so they are None
    # when neither the flag nor the variable is provided. Fail early with a
    # clear message instead of calling the API with "None" in the endpoint.
    required_settings = {
        "-p/--project-id (or GCP_PROJECT_ID)": args.project_id,
        "-r/--processor-id (or PROCESSOR_ID)": args.processor_id,
        "-l/--location (or PROCESSOR_LOCATION)": args.processor_location,
    }
    missing = [label for label, value in required_settings.items() if not value]
    if missing:
        parser.error("missing required setting(s): " + ", ".join(missing))

    text = ocr_extract(
        file_path=args.file_path,
        mime_type=args.mime_type,
        project_id=args.project_id,
        processor_id=args.processor_id,
        processor_location=args.processor_location,
    )
    # Do whatever locally with the text then...
    print(text[:100])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment