Created
November 30, 2018 18:00
-
-
Save rasmi/835a25e36ffff1c5d460dd2b2819299a to your computer and use it in GitHub Desktop.
Google Cloud Vision API Document OCR
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Copyright 2018 Google Inc. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
"""Perform OCR on a directory of PDF documents in Google Cloud Storage. | |

Example Usage:

    GCS_BUCKET="gs://my-bucket"
    PDF_INPUT_DIRECTORY="${GCS_BUCKET}/pdf-input-directory"
    OCR_OUTPUT_DIRECTORY="${GCS_BUCKET}/ocr-output-directory"
    TEXT_OUTPUT_DIRECTORY="${GCS_BUCKET}/text-output-directory"

    # To kick off an OCR process, run:
    python document_ocr.py run-ocr \
        --input-directory $PDF_INPUT_DIRECTORY \
        --output-directory $OCR_OUTPUT_DIRECTORY

    # Wait until OCR is complete.

    # To write the plaintext output to GCS, run:
    python document_ocr.py convert-ocr-output \
        --input-directory $OCR_OUTPUT_DIRECTORY \
        --output-directory $TEXT_OUTPUT_DIRECTORY

    # To print the plaintext output locally, run:
    python document_ocr.py print-ocr-output \
        --input-directory $OCR_OUTPUT_DIRECTORY

Based on
https://github.com/GoogleCloudPlatform/python-docs-samples/blob/8f28cb6fc85335b0b4fb0b0a113d0248852ac94d/vision/cloud-client/detect/detect.py#L665
""" | |
import argparse | |
import os | |
import re | |
from google.cloud import storage | |
from google.cloud import vision | |
from google.protobuf import json_format | |
def gcs_bucket_and_prefix(gcs_path):
    """Split a Google Cloud Storage URI into bucket name and object prefix.

    Args:
        gcs_path: A URI of the form 'gs://bucket/prefix'. A bare bucket URI
            ('gs://bucket' or 'gs://bucket/') is also accepted.

    Returns:
        A (bucket_name, prefix) tuple. The prefix is '' for a bare bucket URI.

    Raises:
        ValueError: If gcs_path is not a gs:// URI containing a bucket name.
    """
    match = re.match(r'gs://([^/]+)(?:/(.*))?\Z', gcs_path)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            'Expected a path like gs://bucket/prefix, got: {!r}'.format(
                gcs_path))
    bucket_name = match.group(1)
    # group(2) is None when nothing follows the bucket name.
    prefix = match.group(2) or ''
    return (bucket_name, prefix)
def list_blobs(gcs_directory):
    """List the objects stored under a GCS directory.

    Args:
        gcs_directory: URI of the form 'gs://bucket/prefix'.

    Returns:
        A list of the blobs in the bucket whose names start with the prefix,
        excluding directory placeholders (names ending with '/').
    """
    storage_client = storage.Client()
    bucket_name, prefix = gcs_bucket_and_prefix(gcs_directory)
    # Pass the bucket name positionally: the keyword for this parameter is
    # not the same across google-cloud-storage releases.
    bucket = storage_client.get_bucket(bucket_name)
    # List objects with the given prefix, dropping directory entries.
    return [
        blob for blob in bucket.list_blobs(prefix=prefix)
        if not blob.name.endswith('/')
    ]
def write_string_to_gcs(string, uri):
    """Upload a string as the contents of a GCS object.

    Args:
        string: The data to upload.
        uri: Destination object URI of the form 'gs://bucket/object-name'.
    """
    storage_client = storage.Client()
    bucket_name, object_name = gcs_bucket_and_prefix(uri)
    # Pass the bucket name positionally: the keyword for this parameter is
    # not the same across google-cloud-storage releases.
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(object_name)
    blob.upload_from_string(string)
def blob_uri(blob):
    """Return the gs:// URI built from a blob's bucket name and its name."""
    bucket_name = blob.bucket.name
    object_name = blob.name
    return 'gs://{}/{}'.format(bucket_name, object_name)
def filename_extension(path):
    """Split the final component of a path into filename and extension.

    The split happens at the last dot, so 'dir/report.v2.pdf' yields
    ('report.v2', 'pdf'). A name without any dot yields an empty extension.

    Args:
        path: A file path or object name; only its basename is considered.

    Returns:
        A (filename, extension) tuple. The extension has no leading dot.
    """
    full_filename = os.path.basename(path)
    filename, dot, extension = full_filename.rpartition('.')
    if not dot:
        # No dot at all: rpartition puts the whole name in the last slot.
        return (full_filename, '')
    return (filename, extension)
def create_ocr_request(source_blob, output_directory, mime_type=None,
                       batch_size=100):
    """Build an asynchronous document-OCR request for one source file.

    Args:
        source_blob: The GCS blob of the file to OCR.
        output_directory: GCS directory URI under which the JSON results are
            written. Result objects are prefixed with '<source filename>-'.
        mime_type: MIME type of the source file. Supported values are
            'application/pdf' and 'image/tiff'. When None, 'image/tiff' is
            used for .tif/.tiff files and 'application/pdf' otherwise.
        batch_size: How many pages are grouped into each JSON output file
            (max 100).

    Returns:
        A vision.types.AsyncAnnotateFileRequest for the source file.
    """
    filename, extension = filename_extension(source_blob.name)
    if mime_type is None:
        # Infer the type from the extension so TIFF inputs are not
        # mislabelled as PDF.
        is_tiff = extension.lower() in ('tif', 'tiff')
        mime_type = 'image/tiff' if is_tiff else 'application/pdf'

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    # Source file input configuration.
    gcs_source = vision.types.GcsSource(uri=blob_uri(source_blob))
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source,
        mime_type=mime_type)

    # Destination file output configuration.
    destination_uri = '{}/{}-'.format(output_directory.rstrip('/'), filename)
    gcs_destination = vision.types.GcsDestination(uri=destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination,
        batch_size=batch_size)

    return vision.types.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config)
def async_detect_document(input_directory, output_directory):
    """OCR with PDF/TIFF as source files on GCS.

    Submits one asynchronous annotation request per blob found under
    input_directory. This function does not wait for the OCR to finish.

    Args:
        input_directory: GCS directory URI containing the source files.
        output_directory: GCS directory URI to receive the JSON OCR output.

    Returns:
        A list with the value returned by each async_batch_annotate_files
        call, in submission order, so callers can track the submitted work.
    """
    client = vision.ImageAnnotatorClient()
    operations = []
    for blob in list_blobs(input_directory):
        request = create_ocr_request(blob, output_directory)
        # Keep the handle instead of discarding it.
        operations.append(
            client.async_batch_annotate_files(requests=[request]))
    return operations
def read_ocr_output(blob):
    """Download one OCR output blob from GCS and return its text.

    The blob contents are parsed as a JSON-encoded AnnotateFileResponse.
    The full-text annotation of each contained response is joined with
    newlines.

    Args:
        blob: The GCS blob holding the OCR JSON output.

    Returns:
        The concatenated text of all responses in the file.
    """
    raw_json = blob.download_as_string()
    file_response = json_format.Parse(
        raw_json, vision.types.AnnotateFileResponse())
    return '\n'.join(
        page_response.full_text_annotation.text
        for page_response in file_response.responses)
def convert_ocr_output(input_directory, output_directory):
    """Write the text of every OCR output blob to a .txt object on GCS.

    Args:
        input_directory: GCS directory URI containing OCR JSON output.
        output_directory: GCS directory URI to receive the text files. Each
            file is named after the filename of its source blob.
    """
    for ocr_blob in list_blobs(input_directory):
        text = read_ocr_output(ocr_blob)
        stem = filename_extension(ocr_blob.name)[0]
        target_uri = '{}/{}.txt'.format(output_directory.rstrip('/'), stem)
        write_string_to_gcs(text, target_uri)
def print_ocr_output(input_directory):
    """Print the text of every OCR output blob found in a GCS directory.

    Args:
        input_directory: GCS directory URI containing OCR JSON output.
    """
    for ocr_blob in list_blobs(input_directory):
        document_text = read_ocr_output(ocr_blob)
        print(document_text)
def main(argv=None):
    """Parse command-line arguments and run the requested OCR command.

    Args:
        argv: Argument list to parse. When None, argparse reads sys.argv.

    Exits with a usage error (SystemExit, code 2) when the command is
    unknown or when a command that needs --output-directory is run
    without it.
    """
    parser = argparse.ArgumentParser()
    # Restrict the command so that a typo is reported instead of silently
    # doing nothing.
    parser.add_argument(
        'command',
        choices=['run-ocr', 'convert-ocr-output', 'print-ocr-output'])
    parser.add_argument('--input-directory', required=True)
    parser.add_argument('--output-directory')
    args = parser.parse_args(argv)

    # parser.error is used rather than assert: asserts are stripped when
    # Python runs with -O, which would skip the validation entirely.
    if args.command == 'run-ocr':
        if not args.output_directory:
            parser.error('--output-directory is required to run OCR.')
        async_detect_document(args.input_directory, args.output_directory)
    elif args.command == 'convert-ocr-output':
        if not args.output_directory:
            parser.error(
                '--output-directory is required to convert OCR output.')
        convert_ocr_output(args.input_directory, args.output_directory)
    elif args.command == 'print-ocr-output':
        print_ocr_output(args.input_directory)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment