Skip to content

Instantly share code, notes, and snippets.

@ivorkchan
Created August 20, 2024 21:45
Show Gist options
  • Save ivorkchan/e6caa654e862603e5f0f6c886cfb767f to your computer and use it in GitHub Desktop.
Save ivorkchan/e6caa654e862603e5f0f6c886cfb767f to your computer and use it in GitHub Desktop.
OCR prompt with Gemini.
import google.generativeai as genai
import os
from dotenv import load_dotenv
import time
import signal
# Load variables via python-dotenv, then read the Gemini API key from the
# environment. Abort at import time when the key is missing or empty.
load_dotenv()
API_KEY = os.environ.get("GOOGLE_API_KEY")
if API_KEY is None or API_KEY == "":
    raise ValueError("Please set the GOOGLE_API_KEY in the .env file.")

# Hand the key to the google.generativeai client used by the rest of the script.
genai.configure(api_key=API_KEY)
def timeout_handler(signum, frame):
    """
    Signal handler that aborts the running operation.

    Args:
        signum: Number of the signal that fired (unused)
        frame: Stack frame that was executing when the signal arrived (unused)
    Raises:
        TimeoutError: Always, with the message "The operation timed out."
    """
    message = "The operation timed out."
    raise TimeoutError(message)
def extract_text_from_pdf(
    file_path: str,
    start_page: int,
    end_page: int,
    max_continuations: int = 100,
) -> str:
    """
    Extract main text from the specified range of pages of a PDF file.
    If start_page is 0, extract from all pages.

    The PDF is uploaded through the Gemini Files API and the model is asked
    to transcribe it as Markdown. Whenever a reply does not end with the
    completion marker, a follow-up message asks the model to continue.

    Errors raised while talking to the model are printed, not propagated, and
    whatever text was collected so far is returned (best effort). Note that
    this also applies to a TimeoutError raised by a signal handler while the
    extraction is in progress.

    Args:
        file_path (str): Path to the PDF file
        start_page (int): Start page number (0 for all pages)
        end_page (int): End page number
        max_continuations (int): Maximum number of "please continue" requests
            sent after the first reply. Guards against looping forever when
            the model never emits the completion marker.
    Returns:
        str: Extracted text formatted as Markdown, with the completion marker
        removed
    """
    # Must stay in sync with the phrase requested at the end of the prompt.
    done_marker = "Gemini Work Done"

    file_source = genai.upload_file(file_path)
    model = genai.GenerativeModel(model_name="gemini-1.5-flash")
    page_range = "all pages" if start_page == 0 else f"pages {start_page} to {end_page}"
    prompt = f"""
Extract the main body text from {page_range} of the provided PDF file. Exclude cover pages, title pages,
table of contents, appendices, indexes, headers, footers, bookmarks, annotations, images, tables, footnotes,
and any other non-body text elements, include titles and subtitles if exist.
Format the extracted text as Markdown, following these guidelines:
1. Use appropriate heading levels (#, ##, ###, etc.) to represent the document's structure.
2. Maintain paragraph separations from the original text.
3. Use Markdown list formatting (ordered or unordered) for any lists encountered.
4. Preserve important emphasis such as bold or italics (if present in the original).
5. Use Markdown quote formatting for any quotations.
If you've finished extracting all the requested text, end your response with the phrase 'Gemini Work Done'
on a new line. If you haven't finished, simply stop at a natural breakpoint, and I will prompt you to continue.
"""
    extracted_text = ""
    chat = model.start_chat(history=[])
    try:
        response = chat.send_message([prompt, file_source])
        extracted_text += response.text
        continuations = 0
        while not extracted_text.strip().endswith(done_marker):
            if continuations >= max_continuations:
                # The model never signalled completion; stop instead of
                # issuing requests indefinitely.
                print(
                    f"Stopped after {max_continuations} continuation requests "
                    "without a completion marker; the output may be incomplete."
                )
                break
            time.sleep(1)  # avoid 429 error
            response = chat.send_message(
                "Please continue the extraction, maintaining the same formatting."
            )
            extracted_text += "\n" + response.text
            continuations += 1
    except Exception as e:
        # Deliberate best effort: report the problem and keep the partial text.
        print(f"Error during text extraction: {str(e)}")
    finally:
        # Remove the uploaded copy so it does not linger in Files API storage.
        # A failed cleanup must not discard the text that was extracted.
        try:
            genai.delete_file(file_source.name)
        except Exception as e:
            print(f"Warning: could not delete the uploaded file: {str(e)}")
    return extracted_text.replace(done_marker, "").strip()
def save_to_markdown(content: str, output_path: str):
    """
    Write text to a Markdown file and report where it was saved.

    The file is opened for writing with UTF-8 encoding, so an existing file
    at the same path is replaced.

    Args:
        content (str): The text to write
        output_path (str): Destination path of the Markdown file
    """
    with open(output_path, mode="w", encoding="utf-8") as markdown_file:
        markdown_file.write(content)

    print(f"Output saved to {output_path}")
def get_page_range() -> tuple:
    """
    Get the page range from user input.

    Keeps prompting until the user enters either "0" (all pages) or a range
    "a-b" with 0 < a <= b. Surrounding whitespace is ignored.

    Returns:
        tuple: (start_page, end_page); (0, 0) means all pages
    """
    while True:
        # Strip the raw input so " 0" or "0 " is accepted as "all pages"
        # rather than being rejected as malformed.
        page_range = input(
            "Enter the page range (format: a-b, or 0 for all pages): "
        ).strip()
        if page_range == "0":
            return 0, 0
        try:
            # ValueError covers both non-numeric parts and a wrong number of
            # "-"-separated parts (the tuple unpacking fails).
            start, end = map(int, page_range.split("-"))
        except ValueError:
            print(
                "Invalid input. Please use the format: a-b (e.g., 1-3) or 0 for all pages."
            )
            continue
        if 0 < start <= end:
            return start, end
        print(
            "Invalid range. Start page must be positive and end page must be greater than or equal to start page."
        )
if __name__ == "__main__":
    # Interactive entry point: ask for a PDF and a page range, extract the
    # text, and save it as Markdown next to the source PDF.

    # Upper bound for the whole run (prompts included), in seconds.
    timeout = 30 * 60  # 30 minutes

    # SIGALRM and signal.alarm() exist only on Unix. Without this guard the
    # script would fail with AttributeError on other platforms before doing
    # any work; there it simply runs without a time limit.
    alarm_supported = hasattr(signal, "SIGALRM")
    if alarm_supported:
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(timeout)

    try:
        pdf_path = input("Enter the path to the source PDF file: ").strip()
        while not os.path.isfile(pdf_path):
            print("File not found. Please enter a valid file path.")
            pdf_path = input("Enter the path to the source PDF file: ").strip()

        start_page, end_page = get_page_range()

        # The output file is placed in the same directory as the PDF.
        output_dir = os.path.dirname(pdf_path)
        page_label = "all" if start_page == 0 else f"{start_page}-{end_page}"
        output_filename = f"ocr_output_{page_label}.md"
        output_path = os.path.join(output_dir, output_filename)

        extracted_text = extract_text_from_pdf(pdf_path, start_page, end_page)
        if extracted_text:
            save_to_markdown(extracted_text, output_path)
            print("Text extraction completed successfully.")
        else:
            # extract_text_from_pdf reports its own errors and returns what it
            # collected; an empty result means there is nothing worth saving.
            print("No text was extracted; no output file was written.")
    except TimeoutError:
        print("The operation timed out.")
    finally:
        # Cancel any pending alarm so it cannot fire after we are done.
        if alarm_supported:
            signal.alarm(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment