Created
August 20, 2024 21:45
-
-
Save ivorkchan/e6caa654e862603e5f0f6c886cfb767f to your computer and use it in GitHub Desktop.
OCR prompt with Gemini.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import google.generativeai as genai | |
import os | |
from dotenv import load_dotenv | |
import time | |
import signal | |
# Load variables from a local .env file into the process environment before
# the key is looked up.
load_dotenv()

# The Gemini client cannot be configured without a key, so fail fast with a
# clear message when the key is missing or empty.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise ValueError("Please set the GOOGLE_API_KEY in the .env file.")

genai.configure(api_key=API_KEY)
# Signal handler used to enforce the script's overall time limit.
def timeout_handler(signum, frame):
    """Abort the interrupted operation by raising ``TimeoutError``.

    The ``(signum, frame)`` signature is the one ``signal.signal`` requires of
    a handler; neither argument is inspected.

    Args:
        signum: Number of the signal that was delivered (unused).
        frame: Stack frame that was executing when the signal arrived (unused).

    Raises:
        TimeoutError: Always.
    """
    timed_out = TimeoutError("The operation timed out.")
    raise timed_out
# Phrase the model is told to emit once the whole requested range has been
# extracted. It must stay in sync with the wording in _OCR_PROMPT_TEMPLATE.
_DONE_MARKER = "Gemini Work Done"

# Instruction sent together with the uploaded PDF; {page_range} is filled in
# per call.
_OCR_PROMPT_TEMPLATE = """
Extract the main body text from {page_range} of the provided PDF file. Exclude cover pages, title pages,
table of contents, appendices, indexes, headers, footers, bookmarks, annotations, images, tables, footnotes,
and any other non-body text elements, include titles and subtitles if exist.
Format the extracted text as Markdown, following these guidelines:
1. Use appropriate heading levels (#, ##, ###, etc.) to represent the document's structure.
2. Maintain paragraph separations from the original text.
3. Use Markdown list formatting (ordered or unordered) for any lists encountered.
4. Preserve important emphasis such as bold or italics (if present in the original).
5. Use Markdown quote formatting for any quotations.
If you've finished extracting all the requested text, end your response with the phrase 'Gemini Work Done'
on a new line. If you haven't finished, simply stop at a natural breakpoint, and I will prompt you to continue.
"""


def extract_text_from_pdf(
    file_path: str,
    start_page: int,
    end_page: int,
    *,
    max_continuations: int = 200,
) -> str:
    """
    Extract main text from the specified range of pages of a PDF file.

    The PDF is uploaded with ``genai.upload_file`` and sent, together with the
    extraction prompt, to a ``gemini-1.5-flash`` chat session. While the
    accumulated reply does not end with the completion marker, the model is
    asked to continue, up to ``max_continuations`` times.

    If start_page is 0, extract from all pages.

    Args:
        file_path (str): Path to the PDF file
        start_page (int): Start page number (0 for all pages)
        end_page (int): End page number (ignored when start_page is 0)
        max_continuations (int): Upper bound on follow-up "continue" requests.
            Prevents an endless (and billable) loop when the model never ends
            a reply with the completion marker.

    Returns:
        str: Extracted text formatted as Markdown, with the completion marker
        removed. If an error occurs during the chat, or the continuation limit
        is reached, the text gathered so far is returned.
    """
    file_source = genai.upload_file(file_path)
    model = genai.GenerativeModel(model_name="gemini-1.5-flash")

    page_range = "all pages" if start_page == 0 else f"pages {start_page} to {end_page}"
    prompt = _OCR_PROMPT_TEMPLATE.format(page_range=page_range)

    extracted_text = ""
    chat = model.start_chat(history=[])
    try:
        response = chat.send_message([prompt, file_source])
        extracted_text += response.text

        continuations = 0
        while not extracted_text.strip().endswith(_DONE_MARKER):
            if continuations >= max_continuations:
                print(
                    f"Stopped after {max_continuations} continuation requests "
                    "without a completion marker; output may be incomplete."
                )
                break
            continuations += 1
            time.sleep(1)  # avoid 429 error
            response = chat.send_message(
                "Please continue the extraction, maintaining the same formatting."
            )
            extracted_text += "\n" + response.text
    except Exception as e:
        # Best effort: report the problem and fall through so the text
        # gathered so far is still returned. TimeoutError is a subclass of
        # Exception, so one raised by a signal handler while this block is
        # running is reported here as well rather than propagated.
        print(f"Error during text extraction: {str(e)}")

    return extracted_text.replace(_DONE_MARKER, "").strip()
def save_to_markdown(content: str, output_path: str):
    """
    Write ``content`` to ``output_path`` as UTF-8 text and report where it went.

    An existing file at ``output_path`` is overwritten.

    Args:
        content (str): Markdown text to write
        output_path (str): Destination file path
    """
    with open(output_path, mode="w", encoding="utf-8") as markdown_file:
        markdown_file.write(content)

    print(f"Output saved to {output_path}")
def get_page_range() -> tuple:
    """
    Prompt on stdin until the user supplies a valid page range.

    Accepted inputs are ``0`` (all pages) or ``a-b`` with ``0 < a <= b``.
    Leading and trailing whitespace is ignored. Anything else prints an
    explanation and asks again.

    Returns:
        tuple: (start_page, end_page); (0, 0) means all pages
    """
    while True:
        # Strip so that stray spaces around the answer are not rejected.
        page_range = input(
            "Enter the page range (format: a-b, or 0 for all pages): "
        ).strip()
        if page_range == "0":
            return 0, 0

        try:
            # Unpacking raises ValueError for a wrong number of parts, and
            # int() raises it for non-numeric parts.
            start, end = (int(part) for part in page_range.split("-"))
        except ValueError:
            print(
                "Invalid input. Please use the format: a-b (e.g., 1-3) or 0 for all pages."
            )
            continue

        if 0 < start <= end:
            return start, end
        print(
            "Invalid range. Start page must be positive and end page must be greater than or equal to start page."
        )
def main():
    """
    Interactive entry point: ask for a PDF and a page range, extract, and save.

    The output file is written next to the source PDF as
    ``ocr_output_<range>.md``. Where the platform provides ``SIGALRM``, the
    whole run (including the interactive prompts) is limited to 30 minutes.
    """
    # Set the timeout
    timeout = 30 * 60  # 30 minutes

    # SIGALRM and signal.alarm() exist only on Unix. Where they are missing,
    # run without a time limit instead of failing with AttributeError.
    can_time_out = hasattr(signal, "SIGALRM")
    if can_time_out:
        # Set up the signal handler
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(timeout)

    try:
        pdf_path = input("Enter the path to the source PDF file: ").strip()
        while not os.path.isfile(pdf_path):
            print("File not found. Please enter a valid file path.")
            pdf_path = input("Enter the path to the source PDF file: ").strip()

        start_page, end_page = get_page_range()

        # The result is saved in the same directory as the source PDF.
        output_dir = os.path.dirname(pdf_path)
        range_label = "all" if start_page == 0 else f"{start_page}-{end_page}"
        output_filename = f"ocr_output_{range_label}.md"
        output_path = os.path.join(output_dir, output_filename)

        extracted_text = extract_text_from_pdf(pdf_path, start_page, end_page)
        save_to_markdown(extracted_text, output_path)
        print("Text extraction completed successfully.")
    except TimeoutError:
        print("The operation timed out.")
    finally:
        # Cancel any pending alarm so it cannot fire after this block.
        if can_time_out:
            signal.alarm(0)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment