Created
November 8, 2022 06:25
-
-
Save navanchauhan/5fc602b1e023b60a66bc63bd4eecd4f8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fitz | |
from PIL import Image | |
from azure.cognitiveservices.vision.computervision import ComputerVisionClient | |
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes | |
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes | |
from msrest.authentication import CognitiveServicesCredentials | |
from array import array | |
import os | |
import sys | |
import time | |
from io import BytesIO | |
from tqdm import tqdm | |
# Azure Computer Vision credentials. They are read from the environment so the
# key does not have to be written into this file; when the variables are unset
# the values fall back to the original empty placeholders.
subscription_key = os.environ.get("VISION_KEY", "")
endpoint = os.environ.get("VISION_ENDPOINT", "")

# Shared client used by get_text_from_pdf for every OCR request.
computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))

# Pause (in seconds) applied after each OCR request.
cooldown = 20  # seconds, 20 calls per minute for student tier but doesn't work without cooldown
def replace_markdown_chars(text):
    """Remove stray spaces around Markdown punctuation.

    The following literal substitutions are applied one after another, in
    this order: ``"* *"`` -> ``"**"``, ``"[ "`` -> ``"["``,
    ``" ]"`` -> ``"]"`` and ``" ."`` -> ``"."``.

    Returns the text with all substitutions applied.
    """
    # Order matters: each substitution sees the output of the previous one.
    substitutions = (
        ("* *", "**"),
        ("[ ", "["),
        (" ]", "]"),
        (" .", "."),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text
def text2mdtxt(text):
    """Convert line-oriented text into Markdown text.

    Lines are handled in order:

    * A line starting with ``#`` is a header. The pending paragraph is
      flushed on its own line, then the header is emitted with every
      occurrence of ``" #"`` collapsed to ``"#"``.
    * A line containing ``¶`` ends the current paragraph. The pending
      paragraph is flushed followed by a blank line, and the marker line
      itself is dropped.
    * Any other line is stripped and appended, plus one trailing space, to
      the pending paragraph.

    Empty lines are accepted and fall into the last case. The previous
    implementation indexed ``line[0]`` and raised ``IndexError`` on them.

    Returns:
        The assembled text after ``replace_markdown_chars`` has been applied.
    """
    parts = []      # finished output pieces, joined once at the end
    paragraph = ""  # text of the paragraph currently being assembled

    for line in text.splitlines():
        # startswith() is safe on empty strings, unlike line[0].
        if line.startswith("#"):  # Header
            parts.append(paragraph + "\n")
            paragraph = ""
            parts.append(line.replace(" #", "#") + "\n")
        elif "¶" in line:
            parts.append(paragraph + "\n\n")
            paragraph = ""
        else:
            paragraph += line.strip() + " "

    # Flush whatever paragraph text is still pending after the last line.
    parts.append(paragraph + "\n")
    return replace_markdown_chars("".join(parts))
def get_images_from_pdf(pdf_file):
    """Collect the data of every image embedded in a PDF.

    Args:
        pdf_file: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        A list with one entry per image reference found, in page order.
        Each entry is the value stored under the ``"image"`` key of
        ``extract_image``'s result.
    """
    images = []
    # The context manager closes the document even if extraction fails;
    # the previous version never closed it.
    with fitz.open(pdf_file) as doc:
        for page_index, page in enumerate(tqdm(doc, total=len(doc))):
            # Query the page once and reuse the list for both the report
            # and the extraction loop.
            image_list = page.get_images()
            if image_list:
                print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
            else:
                print("[!] No images found on page", page_index)
            for img in image_list:
                # The first item of each entry is the image's XREF.
                xref = img[0]
                images.append(doc.extract_image(xref)["image"])
    return images
def get_text_from_pdf(fname: str):
    """Run the Read (OCR) API over every image in a PDF and return the text.

    Each image returned by ``get_images_from_pdf(fname)`` is submitted through
    the module-level ``computervision_client``. The operation is polled once
    per second until its status is neither ``'notStarted'`` nor ``'running'``.
    Every line of a succeeded operation is appended followed by a newline.
    Results with any other final status contribute nothing. After each image
    the function sleeps for ``cooldown`` seconds.
    """
    collected = []
    for image_bytes in tqdm(get_images_from_pdf(fname)):
        # OCR: Read File using the Read API, extract text - local.
        # This call can also extract handwriting style text.
        # raw=True keeps the raw response so its headers can be read.
        submission = computervision_client.read_in_stream(BytesIO(image_bytes), raw=True)

        # The operation ID is the last path segment of the Operation-Location URL.
        operation_url = submission.headers["Operation-Location"]
        operation_id = operation_url.rsplit("/", 1)[-1]

        # Poll until the operation leaves the pending states.
        while True:
            read_result = computervision_client.get_read_result(operation_id)
            if read_result.status in ('notStarted', 'running'):
                time.sleep(1)
                continue
            break

        # Gather the detected text, line by line.
        if read_result.status == OperationStatusCodes.succeeded:
            for page_result in read_result.analyze_result.read_results:
                for ocr_line in page_result.lines:
                    collected.append(ocr_line.text + "\n")

        # Now, sleep for the cooldown period before the next request.
        time.sleep(cooldown)
    return "".join(collected)
if __name__ == "__main__":
    # Optional command-line overrides: argv[1] is the input PDF and argv[2]
    # the output Markdown file. Without arguments the original hard-coded
    # paths are used.
    fname = sys.argv[1] if len(sys.argv) > 1 else "PDFs/zamn.pdf"
    out_path = sys.argv[2] if len(sys.argv) > 2 else "output.md"

    txt = get_text_from_pdf(fname)

    # Write as UTF-8 explicitly: the text may contain non-ASCII characters
    # (text2mdtxt itself looks for "¶") that the platform default encoding
    # may not be able to represent.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text2mdtxt(txt))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment