-
-
Save rdapaz/f27ecbf89ab90e2111e77f420ab17c7d to your computer and use it in GitHub Desktop.
A service class for interacting with Azure Document Intelligence API.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import requests | |
import time | |
from typing import Union, Dict | |
from config.settings import get_settings | |
class DocumentIntelligenceService:
    """
    A service class for interacting with Azure Document Intelligence API.

    This class provides methods to analyze documents using Azure's Document Intelligence service.
    A document is submitted with a POST request, and the result endpoint is then polled
    until the reported status is "succeeded" or "failed".

    Attributes:
        request_timeout: Timeout in seconds passed to every HTTP call, so a stalled
            connection raises instead of blocking forever.
        poll_interval: Seconds to sleep before each poll of the result endpoint.
        max_wait: Upper bound in seconds on the total polling time, or None (the
            default) to poll until the service reports a terminal status.
    """

    # Class-level defaults; override on an instance or a subclass to tune them.
    request_timeout: float = 60.0
    poll_interval: float = 2.0
    max_wait: Union[float, None] = None

    def __init__(self):
        """
        Initialize the DocumentIntelligenceService with API credentials and endpoint.
        """
        settings = get_settings()
        self.key = settings.document_intelligence.api_key
        self.endpoint = settings.document_intelligence.endpoint
        self.api_version = "2024-02-29-preview"  # Currently only available in East US, West US2, and West Europe

    def analyze(
        self,
        source: Union[str, bytes],
        is_url: bool = True,
        model_id: str = "prebuilt-layout",
    ) -> Dict:
        """
        Analyze a document using Azure Document Intelligence.

        Args:
            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
            model_id (str): The ID of the model to use for analysis.

        Returns:
            Dict: The decoded JSON body of the final poll. Its "status" is either
                "succeeded" or "failed"; callers should check it before reading results.

        Raises:
            requests.HTTPError: If the API request fails.
            requests.Timeout: If a single HTTP call exceeds ``request_timeout``.
            TimeoutError: If ``max_wait`` is set and polling exceeds it.
        """
        result_id = self._submit_analysis(source, is_url, model_id)
        return self._get_analysis_results(result_id, model_id)

    def _model_url(self, model_id: str) -> str:
        """Return the base URL of a model, tolerating a trailing slash on the endpoint."""
        # Without rstrip, an endpoint ending in "/" would produce "//documentintelligence".
        return f"{self.endpoint.rstrip('/')}/documentintelligence/documentModels/{model_id}"

    def _query_string(self) -> str:
        """Return the query string shared by the submit and result requests."""
        return f"api-version={self.api_version}&outputContentFormat=markdown"

    def _analyze_url(self, model_id: str) -> str:
        """Return the URL used to submit a document to ``model_id``."""
        return f"{self._model_url(model_id)}:analyze?{self._query_string()}"

    def _results_url(self, model_id: str, result_id: str) -> str:
        """Return the URL used to poll the analysis identified by ``result_id``."""
        return f"{self._model_url(model_id)}/analyzeResults/{result_id}?{self._query_string()}"

    @staticmethod
    def _build_payload(source: Union[str, bytes], is_url: bool) -> Dict:
        """
        Build the JSON request body for a submission.

        Bytes input is decoded to ``str`` first, because the body is sent as JSON
        and the JSON encoder cannot serialize bytes.

        Raises:
            UnicodeDecodeError: If ``source`` is bytes that are not ASCII, i.e. not
                base64 text (a subclass of ValueError).
        """
        if isinstance(source, (bytes, bytearray)):
            source = bytes(source).decode("ascii")
        return {"urlSource": source} if is_url else {"base64Source": source}

    @staticmethod
    def _extract_result_id(operation_location: str) -> str:
        """Return the last path segment of ``operation_location``, without its query string."""
        return operation_location.split("/")[-1].split("?")[0]

    def _submit_analysis(
        self, source: Union[str, bytes], is_url: bool, model_id: str
    ) -> str:
        """
        Submit a document for analysis to Azure Document Intelligence.

        Args:
            source (Union[str, bytes]): The document source, either a URL or base64 encoded content.
            is_url (bool): True if the source is a URL, False if it's base64 encoded content.
            model_id (str): The ID of the model to use for analysis.

        Returns:
            str: The result ID for the submitted analysis.

        Raises:
            ValueError: If the Operation-Location header is missing in the response.
            requests.HTTPError: If the API request fails.
            requests.Timeout: If the request exceeds ``request_timeout``.
        """
        url = self._analyze_url(model_id)
        headers = {
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Key": self.key,
        }
        data = self._build_payload(source, is_url)
        logging.info("Submitting document for analysis")
        response = requests.post(
            url, headers=headers, json=data, timeout=self.request_timeout
        )
        response.raise_for_status()
        operation_location = response.headers.get("Operation-Location")
        if not operation_location:
            raise ValueError("Operation-Location header is missing in the response.")
        return self._extract_result_id(operation_location)

    def _get_analysis_results(self, result_id: str, model_id: str) -> Dict:
        """
        Retrieve the analysis results from Azure Document Intelligence.

        Sleeps ``poll_interval`` seconds before every request, including the first,
        and returns as soon as the reported status is "succeeded" or "failed".

        Args:
            result_id (str): The ID of the analysis result to retrieve.
            model_id (str): The ID of the model used for analysis.

        Returns:
            Dict: The analysis results.

        Raises:
            requests.HTTPError: If the API request fails.
            requests.Timeout: If a request exceeds ``request_timeout``.
            TimeoutError: If ``max_wait`` is set and no terminal status was seen in time.
        """
        url = self._results_url(model_id, result_id)
        headers = {"Ocp-Apim-Subscription-Key": self.key}
        # A monotonic clock keeps the deadline immune to wall-clock adjustments.
        deadline = None if self.max_wait is None else time.monotonic() + self.max_wait
        while True:
            logging.info("Waiting for analysis to complete.")
            time.sleep(self.poll_interval)
            response = requests.get(url, headers=headers, timeout=self.request_timeout)
            response.raise_for_status()
            data = response.json()
            if data.get("status") in ("succeeded", "failed"):
                return data
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Analysis {result_id} did not finish within {self.max_wait} seconds."
                )
if __name__ == "__main__":
    # Example usage of the DocumentIntelligenceService: analyze a public PDF by URL
    # with the default model and print the top-level pieces of the response.
    client = DocumentIntelligenceService()
    analysis_results = client.analyze(
        source="https://s2.q4cdn.com/299287126/files/doc_financials/2024/ar/Amazon-com-Inc-2023-Annual-Report.pdf"
    )
    print(analysis_results.keys())
    # analyze() also returns when the service reports "failed"; stop with the full
    # payload in that case rather than indexing into a result that may not be there.
    if analysis_results.get("status") != "succeeded":
        raise SystemExit(f"Document analysis did not succeed: {analysis_results}")
    analyze_result = analysis_results["analyzeResult"]
    print(analyze_result.keys())
    print(analyze_result["content"])
    print(analyze_result["tables"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment