Last active
December 22, 2023 10:02
-
-
Save mancap314/a96ae779eee07e6b9c940e8b72bb9c87 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Command line tool for processing a batch of PDF files stored on gcloud storage with a Document
AI processor.
`pip install python-dotenv google-cloud google-cloud-documentai
google-cloud-documentai-toolbox google-cloud-storage`
Also create a .env file for populating default values, containing the following
keys: GCP_PROJECT_ID, PROCESSOR_ID, PROCESSOR_LOCATION, GCS_DOCUMENT_BUCKET and
GCS_TEXT_BUCKET, corresponding to an OCR
processor created on the Document AI platform of your GCP project. The last two
keys correspond to the names of gcloud storage buckets, the first for the stored PDF
documents and the last for storing the results, without 'gs://'.
""" | |
from typing import Optional | |
from google.api_core.client_options import ClientOptions | |
from google.cloud import documentai | |
from google.cloud import documentai_toolbox | |
import os | |
from dotenv import load_dotenv | |
import argparse | |
def _is_single_document_uri(gcs_input_uri: str) -> bool:
    """Return True when the URI names one object rather than a prefix/directory.

    Only the last component of the object path is inspected, so dots in the
    bucket name or in an intermediate directory do not make a prefix look
    like a file.
    """
    if gcs_input_uri.endswith("/"):
        return False
    # Drop the scheme ("gs://") and the bucket; what remains is the object path.
    without_scheme = gcs_input_uri.split("://", 1)[-1]
    _bucket, _, object_path = without_scheme.partition("/")
    filename = object_path.rsplit("/", 1)[-1]
    return "." in filename


def batch_process_toolbox(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_input_uri: str,
    gcs_output_uri: str,
    processor_version_id: Optional[str] = None,
    input_mime_type: Optional[str] = None,
    field_mask: Optional[str] = None,
) -> None:
    """Run a Document AI batch job and print the start of each result's text.

    Args:
        project_id: GCP project that owns the processor.
        location: Processor location; also used to build the API endpoint.
        processor_id: ID of the Document AI processor.
        gcs_input_uri: Either the URI of a single file (last path component
            contains a dot and there is no trailing slash) or a URI prefix
            under which all documents are processed.
        gcs_output_uri: Cloud Storage URI where results are written.
        processor_version_id: When given, the request targets this processor
            version instead of the processor itself.
        input_mime_type: MIME type attached to the document when a single
            file is submitted; not used for prefix input.
        field_mask: Field mask forwarded to the GCS output configuration.
    """
    # You must set the api_endpoint if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if _is_single_document_uri(gcs_input_uri):
        # Specify specific GCS URIs to process individual documents
        gcs_document = documentai.GcsDocument(
            gcs_uri=gcs_input_uri, mime_type=input_mime_type
        )
        # Load GCS Input URI into a List of document files
        gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
        input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
    else:
        # Specify a GCS URI Prefix to process an entire directory
        gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
        input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)

    # Cloud Storage URI for the Output Directory
    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_output_uri, field_mask=field_mask
    )

    # Where to write results
    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

    if processor_version_id:
        # The full resource name of the processor version, e.g.:
        # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # The full resource name of the processor, e.g.:
        # projects/{project_id}/locations/{location}/processors/{processor_id}
        name = client.processor_path(project_id, location, processor_id)

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    # BatchProcess returns a Long Running Operation (LRO)
    operation = client.batch_process_documents(request)

    # Operation Name Format: projects/{project_id}/locations/{location}/operations/{operation_id}
    # NOTE(review): per the Document AI Toolbox docs this call is expected to
    # wait for the operation before loading its output - confirm for the
    # installed toolbox version.
    documents = documentai_toolbox.document.Document.from_batch_process_operation(
        location=location, operation_name=operation.operation.name
    )

    for document in documents:
        # Read the text recognition output from the processor
        print("The document contains the following text:")
        # Truncated at 100 characters for brevity
        print(document.text[:100])
def normalize_gcs_uri(uri: Optional[str]) -> Optional[str]:
    """Return *uri* with a ``gs://`` scheme, adding it when none is present.

    The .env defaults are documented as bare bucket names (without 'gs://'),
    so they need the scheme before being used as Cloud Storage URIs. Empty
    values and values that already carry a scheme are returned unchanged.
    """
    if not uri or "://" in uri:
        return uri
    return "gs://" + uri.lstrip("/")


def parse_cli_args(argv: Optional[list] = None) -> argparse.Namespace:
    """Parse and validate the command line, using environment values as defaults.

    Environment-backed defaults are read when this function is called, so any
    .env file must be loaded beforehand. Exits through ``parser.error`` when a
    value needed for the batch request is available neither as a flag nor in
    the environment.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed namespace with ``input_uri`` and ``output_uri`` normalized
        to ``gs://`` URIs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i", "--input-uri",
        help="Input URI of GCloud Storage, e.g. gs://truc/[/bidule.pdf]",
        dest="input_uri",
        default=os.getenv("GCS_DOCUMENT_BUCKET"),
    )
    parser.add_argument(
        "-o", "--output-uri",
        help="Output URI of GCloud Storage (a directory), e.g. gs://truc/results/",
        dest="output_uri",
        # GCS_DOCUMENT_TEXT is the key named in the module docstring; accept
        # it as a fallback so a .env written from the docs is not ignored.
        default=os.getenv("GCS_TEXT_BUCKET") or os.getenv("GCS_DOCUMENT_TEXT"),
    )
    parser.add_argument(
        "-m", "--mime-type",
        help="Mime-Type of the file to extract. See: https://cloud.google.com/document-ai/docs/file-types",
        dest="mime_type",
        default="application/pdf",
    )
    parser.add_argument(
        "-p", "--project-id",
        help="GCloud Project ID. Get it through `gcloud config get-value core/project`",
        dest="project_id",
        default=os.getenv("GCP_PROJECT_ID"),
    )
    parser.add_argument(
        "-r", "--processor-id",
        help="ID of the Processor created on the GCloud Document AI console",
        dest="processor_id",
        default=os.getenv("PROCESSOR_ID"),
    )
    parser.add_argument(
        "-l", "--location",
        help="Location of the Document AI Processor. 'eu' or 'us'.",
        dest="processor_location",
        default=os.getenv("PROCESSOR_LOCATION"),
    )
    parser.add_argument(
        "-f", "--field-mask",
        help="Fields to return in the document object",
        dest="field_mask",
        default="text,pages.pageNumber",
    )
    args = parser.parse_args(argv)

    # Fail early with a readable message instead of passing None downstream.
    required = (
        ("input_uri", "-i/--input-uri", "GCS_DOCUMENT_BUCKET"),
        ("output_uri", "-o/--output-uri", "GCS_TEXT_BUCKET"),
        ("project_id", "-p/--project-id", "GCP_PROJECT_ID"),
        ("processor_id", "-r/--processor-id", "PROCESSOR_ID"),
        ("processor_location", "-l/--location", "PROCESSOR_LOCATION"),
    )
    missing = [
        f"{flag} (or {env_key} in the environment/.env)"
        for dest, flag, env_key in required
        if not getattr(args, dest)
    ]
    if missing:
        parser.error("missing required value(s): " + ", ".join(missing))

    args.input_uri = normalize_gcs_uri(args.input_uri)
    args.output_uri = normalize_gcs_uri(args.output_uri)
    return args


if __name__ == "__main__":
    # Load .env first: the parser reads its defaults from the environment.
    load_dotenv()
    cli_args = parse_cli_args()
    batch_process_toolbox(
        project_id=cli_args.project_id,
        location=cli_args.processor_location,
        processor_id=cli_args.processor_id,
        gcs_input_uri=cli_args.input_uri,
        gcs_output_uri=cli_args.output_uri,
        input_mime_type=cli_args.mime_type,
        field_mask=cli_args.field_mask,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment