Skip to content

Instantly share code, notes, and snippets.

@oyiptong
Created November 5, 2024 04:54
Show Gist options
  • Save oyiptong/efacca1c3ef2c752f78c33cc889a6c80 to your computer and use it in GitHub Desktop.
Save oyiptong/efacca1c3ef2c752f78c33cc889a6c80 to your computer and use it in GitHub Desktop.
A program that takes PDF chunks and sends them to Google's Document AI for OCR
from glob import glob
from google.api_core.client_options import ClientOptions
from google.cloud import documentai # type: ignore
# Sample configuration: replace the placeholder values below before running.
project_id = "PROJECT_ID"

# Format is "us" or "eu"
location = "us"

# Must be unique per project, e.g.: "My Processor"
processor_display_name = "BOOK_OCR_PROCESSOR"

# PDF chunks matched in the current working directory, ordered by file name
# (plain string ordering, so "chunk-10" sorts before "chunk-2").
file_paths = glob("document-chunk-*.pdf")
file_paths.sort()
def quickstart(
    project_id: str,
    location: str,
    file_path: str,
    processor_display_name: str = "Book OCR",
    processor_id: str = "PROCESSOR_ID",
) -> None:
    """Send one PDF to a Document AI processor and save the recognized text.

    The text returned by the processor is written to ``file_path + ".txt"``
    (e.g. ``chunk.pdf`` -> ``chunk.pdf.txt``), encoded as UTF-8.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; also used to build the API endpoint
            ``{location}-documentai.googleapis.com``.
        file_path: Path of the PDF file to process.
        processor_display_name: Not used by the current code path. It is only
            relevant if the processor is created instead of looked up (see
            the comment in the body); kept so existing callers keep working.
        processor_id: ID of an existing processor. The default is a
            placeholder and must be replaced with a real processor ID.

    Raises:
        OSError: If ``file_path`` cannot be read or the output file cannot
            be written.

    Errors raised by the Document AI client (lookup or processing failures)
    are not caught here and propagate to the caller.
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # This function looks up an existing processor. To create one instead,
    # call `client.create_processor` with
    # `parent=client.common_location_path(project_id, location)` and a
    # `documentai.Processor(type_="OCR_PROCESSOR",
    # display_name=processor_display_name)`; see
    # https://cloud.google.com/document-ai/docs/create-processor for how to
    # get available processor types.
    name = client.processor_path(project_id, location, processor_id)
    processor = client.get_processor(name=name)

    # Print the processor information
    print(f"Processor Name: {processor.name}")

    # Read the file into memory
    with open(file_path, "rb") as pdf_file:
        pdf_content = pdf_file.read()

    # Load binary data
    raw_document = documentai.RawDocument(
        content=pdf_content,
        # Refer to https://cloud.google.com/document-ai/docs/file-types for
        # supported file types
        mime_type="application/pdf",
    )

    # Configure the process request
    # `processor.name` is the full resource name of the processor, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}`
    request = documentai.ProcessRequest(name=processor.name, raw_document=raw_document)
    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    document = result.document

    # Save the recognized text next to the source PDF. The encoding is set
    # explicitly because OCR output may contain non-ASCII characters and the
    # platform default encoding may be unable to represent them.
    output_path = file_path + ".txt"
    with open(output_path, "w", encoding="utf-8") as text_file:
        text_file.write(document.text)
    print(f"Wrote recognized text to {output_path}")
# Run the OCR routine once for every PDF chunk found, in list order.
for chunk_path in file_paths:
    quickstart(
        project_id=project_id,
        location=location,
        file_path=chunk_path,
        processor_display_name=processor_display_name,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment