oyiptong · November 5, 2024 04:54
diff --git a/document_ai_ocr.py b/document_ai_ocr.py
 from glob import glob

 from google.api_core.client_options import ClientOptions
 from google.cloud import documentai  # type: ignore

 # Placeholder settings: replace these values with your own before running.
 project_id = "PROJECT_ID"
 location = "us"  # Either "us" or "eu".
 # Input PDF chunks matched by the pattern, kept in sorted order.
 file_paths = glob("document-chunk-*.pdf")
 file_paths.sort()
 # Processor display name; must be unique per project, e.g.: "My Processor".
 processor_display_name = "BOOK_OCR_PROCESSOR"


 def quickstart(
     project_id: str,
     location: str,
     file_path: str,
     processor_display_name: str = "Book OCR",
     processor_id: str = "PROCESSOR_ID",
 ) -> None:
     """OCR one PDF with an existing Document AI processor and save the text.

     The recognized text is written to ``file_path + ".txt"`` as UTF-8.

     Args:
         project_id: Project ID used to build the processor resource name.
         location: Processor location, "us" or "eu"; also selects the API
             endpoint host.
         file_path: Path of the PDF file to send for processing.
         processor_display_name: Only used by the optional processor-creation
             snippet shown in the comment below; otherwise unused.
         processor_id: ID of the existing processor to look up. The default is
             a placeholder and must be replaced with a real processor ID.
     """
     # You must set the `api_endpoint` if you use a location other than "us".
     opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
     client = documentai.DocumentProcessorServiceClient(client_options=opts)

     # To create a new processor instead of reusing an existing one, replace
     # the lookup below with:
     #   processor = client.create_processor(
     #       parent=client.common_location_path(project_id, location),
     #       processor=documentai.Processor(
     #           type_="OCR_PROCESSOR",
     #           display_name=processor_display_name,
     #       ),
     #   )
     # Refer to https://cloud.google.com/document-ai/docs/create-processor for
     # how to get available processor types.
     name = client.processor_path(project_id, location, processor_id)
     processor = client.get_processor(name=name)

     # Print the processor information
     print(f"Processor Name: {processor.name}")

     # Read the file into memory
     with open(file_path, "rb") as pdf_file:
         pdf_content = pdf_file.read()

     # Refer to https://cloud.google.com/document-ai/docs/file-types for
     # supported file types.
     raw_document = documentai.RawDocument(
         content=pdf_content,
         mime_type="application/pdf",
     )

     # `processor.name` is the full resource name of the processor, e.g.:
     # `projects/{project_id}/locations/{location}/processors/{processor_id}`
     request = documentai.ProcessRequest(name=processor.name, raw_document=raw_document)
     result = client.process_document(request=request)

     # For a full list of `Document` object attributes, reference this page:
     # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
     document = result.document

     # Write with an explicit encoding: OCR output may contain characters the
     # platform's default text encoding cannot represent.
     output_path = file_path + ".txt"
     with open(output_path, "w", encoding="utf-8") as f:
         f.write(document.text)
     print(f"Wrote recognized text to {output_path}")
        
 # Run OCR on every matched PDF chunk, one file at a time.
 for path in file_paths:
     quickstart(
         project_id=project_id,
         location=location,
         file_path=path,
         processor_display_name=processor_display_name,
     )
	from glob import glob

	from google.api_core.client_options import ClientOptions
	from google.cloud import documentai # type: ignore

	# Placeholder settings: replace these values with your own before running.
	project_id = "PROJECT_ID"
	location = "us"  # Either "us" or "eu".
	# Input PDF chunks matched by the pattern, kept in sorted order.
	file_paths = glob("document-chunk-*.pdf")
	file_paths.sort()
	# Processor display name; must be unique per project, e.g.: "My Processor".
	processor_display_name = "BOOK_OCR_PROCESSOR"


	def quickstart(
	    project_id: str,
	    location: str,
	    file_path: str,
	    processor_display_name: str = "Book OCR",
	    processor_id: str = "PROCESSOR_ID",
	) -> None:
	    """OCR one PDF with an existing Document AI processor and save the text.

	    The recognized text is written to ``file_path + ".txt"`` as UTF-8.

	    Args:
	        project_id: Project ID used to build the processor resource name.
	        location: Processor location, "us" or "eu"; also selects the API
	            endpoint host.
	        file_path: Path of the PDF file to send for processing.
	        processor_display_name: Only used by the optional processor-creation
	            snippet shown in the comment below; otherwise unused.
	        processor_id: ID of the existing processor to look up. The default is
	            a placeholder and must be replaced with a real processor ID.
	    """
	    # You must set the `api_endpoint` if you use a location other than "us".
	    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
	    client = documentai.DocumentProcessorServiceClient(client_options=opts)

	    # To create a new processor instead of reusing an existing one, replace
	    # the lookup below with:
	    #   processor = client.create_processor(
	    #       parent=client.common_location_path(project_id, location),
	    #       processor=documentai.Processor(
	    #           type_="OCR_PROCESSOR",
	    #           display_name=processor_display_name,
	    #       ),
	    #   )
	    # Refer to https://cloud.google.com/document-ai/docs/create-processor for
	    # how to get available processor types.
	    name = client.processor_path(project_id, location, processor_id)
	    processor = client.get_processor(name=name)

	    # Print the processor information
	    print(f"Processor Name: {processor.name}")

	    # Read the file into memory
	    with open(file_path, "rb") as pdf_file:
	        pdf_content = pdf_file.read()

	    # Refer to https://cloud.google.com/document-ai/docs/file-types for
	    # supported file types.
	    raw_document = documentai.RawDocument(
	        content=pdf_content,
	        mime_type="application/pdf",
	    )

	    # `processor.name` is the full resource name of the processor, e.g.:
	    # `projects/{project_id}/locations/{location}/processors/{processor_id}`
	    request = documentai.ProcessRequest(name=processor.name, raw_document=raw_document)
	    result = client.process_document(request=request)

	    # For a full list of `Document` object attributes, reference this page:
	    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
	    document = result.document

	    # Write with an explicit encoding: OCR output may contain characters the
	    # platform's default text encoding cannot represent.
	    output_path = file_path + ".txt"
	    with open(output_path, "w", encoding="utf-8") as f:
	        f.write(document.text)
	    print(f"Wrote recognized text to {output_path}")

	# Run OCR on every matched PDF chunk, one file at a time.
	for path in file_paths:
	    # The call must be indented under the `for` to form the loop body.
	    quickstart(project_id, location, path, processor_display_name)