ivorkchan · August 20, 2024 21:45
diff --git a/ocr-gemini.py b/ocr-gemini.py
 import google.generativeai as genai
 import os
 from dotenv import load_dotenv
 import time
 import signal

 # Load settings from a local .env file before reading the environment.
 load_dotenv()

 # Fail fast when no key is available: nothing below can work without it.
 API_KEY = os.environ.get("GOOGLE_API_KEY")
 if API_KEY is None or API_KEY == "":
    raise ValueError("Please set the GOOGLE_API_KEY in the .env file.")

 # Register the key with the Gemini client library.
 genai.configure(api_key=API_KEY)


 # Timeout handler
 def timeout_handler(signum, frame):
    """
    Abort the running operation by raising TimeoutError.

    The (signum, frame) signature is the one expected of a handler
    installed with signal.signal(); neither argument is used.

    Args:
    signum: Number of the delivered signal (unused)
    frame: Interrupted stack frame (unused)

    Raises:
    TimeoutError: Always
    """
    timeout_error = TimeoutError("The operation timed out.")
    raise timeout_error


 def extract_text_from_pdf(
    file_path: str,
    start_page: int,
    end_page: int,
    max_continuations: int = 100,
 ) -> str:
    """
    Extract main text from the specified range of pages of a PDF file.
    If start_page is 0, extract from all pages.

    The PDF is uploaded with genai.upload_file() and a chat session is asked
    to return the body text as Markdown. The prompt tells the model to end
    its final reply with a sentinel phrase; as long as the accumulated text
    does not end with that phrase, the chat is asked to continue.

    Args:
    file_path (str): Path to the PDF file
    start_page (int): Start page number (0 for all pages)
    end_page (int): End page number
    max_continuations (int): Maximum number of follow-up "continue" requests.
        Bounds the loop so that a model which never emits the sentinel cannot
        keep the function issuing API calls forever.

    Returns:
    str: Extracted text formatted as Markdown, with every occurrence of the
        sentinel phrase removed. If an error occurs or the continuation limit
        is reached, the text collected so far is returned.
    """
    # Single source of truth for the completion sentinel used in the prompt,
    # the loop condition and the final clean-up.
    done_marker = "Gemini Work Done"

    file_source = genai.upload_file(file_path)
    model = genai.GenerativeModel(model_name="gemini-1.5-flash")

    page_range = "all pages" if start_page == 0 else f"pages {start_page} to {end_page}"

    prompt = f"""
    Extract the main body text from {page_range} of the provided PDF file. Exclude cover pages, title pages, 
    table of contents, appendices, indexes, headers, footers, bookmarks, annotations, images, tables, footnotes, 
    and any other non-body text elements, include titles and subtitles if exist.

    Format the extracted text as Markdown, following these guidelines:
    1. Use appropriate heading levels (#, ##, ###, etc.) to represent the document's structure.
    2. Maintain paragraph separations from the original text.
    3. Use Markdown list formatting (ordered or unordered) for any lists encountered.
    4. Preserve important emphasis such as bold or italics (if present in the original).
    5. Use Markdown quote formatting for any quotations.

    If you've finished extracting all the requested text, end your response with the phrase '{done_marker}' 
    on a new line. If you haven't finished, simply stop at a natural breakpoint, and I will prompt you to continue.
    """

    extracted_text = ""
    chat = model.start_chat(history=[])

    try:
        response = chat.send_message([prompt, file_source])
        extracted_text += response.text

        continuations = 0
        while not extracted_text.strip().endswith(done_marker):
            if continuations >= max_continuations:
                # Give up instead of looping forever; keep the partial text.
                print(
                    f"Warning: no completion marker after {max_continuations} "
                    "continuation requests; the output may be incomplete."
                )
                break
            continuations += 1
            time.sleep(1)  # avoid 429 error
            response = chat.send_message(
                "Please continue the extraction, maintaining the same formatting."
            )
            extracted_text += "\n" + response.text

    except Exception as e:
        # Best effort: report the failure and fall through so that whatever
        # was extracted before the error is still returned.
        print(f"Error during text extraction: {str(e)}")

    return extracted_text.replace(done_marker, "").strip()


 def save_to_markdown(content: str, output_path: str):
    """
    Write text to a file as UTF-8 and print the destination path.

    The file is opened in "w" mode, so an existing file at that path
    is overwritten.

    Args:
    content (str): Text to write
    output_path (str): Destination path of the Markdown file
    """
    with open(output_path, mode="w", encoding="utf-8") as markdown_file:
        markdown_file.write(content)

    print(f"Output saved to {output_path}")


 def get_page_range() -> tuple:
    """
    Get the page range from user input.

    Prompts repeatedly until the answer is either "0" (all pages) or a
    range "a-b" with 0 < a <= b. Leading and trailing whitespace in the
    answer is ignored, so "0 " and " 1-3" are accepted.

    Returns:
    tuple: (start_page, end_page); (0, 0) means all pages
    """
    while True:
        answer = input(
            "Enter the page range (format: a-b, or 0 for all pages): "
        ).strip()

        if answer == "0":
            return 0, 0

        try:
            # ValueError covers both non-numeric parts and a wrong number
            # of "-"-separated parts (unpacking mismatch).
            start, end = map(int, answer.split("-"))
        except ValueError:
            print(
                "Invalid input. Please use the format: a-b (e.g., 1-3) or 0 for all pages."
            )
            continue

        if 0 < start <= end:
            return start, end

        print(
            "Invalid range. Start page must be positive and end page must be greater than or equal to start page."
        )


 if __name__ == "__main__":
    # Overall time budget for the run. The alarm is armed before the
    # prompts, so time spent answering them counts against the budget.
    timeout = 30 * 60  # 30 minutes

    # SIGALRM / signal.alarm() are only available on Unix. Elsewhere the
    # script runs without the watchdog instead of failing at startup.
    alarm_supported = hasattr(signal, "SIGALRM")
    if alarm_supported:
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(timeout)

    try:
        pdf_path = input("Enter the path to the source PDF file: ").strip()

        # Keep asking until the path points at an existing file.
        while not os.path.isfile(pdf_path):
            print("File not found. Please enter a valid file path.")
            pdf_path = input("Enter the path to the source PDF file: ").strip()

        start_page, end_page = get_page_range()

        # The Markdown file is written next to the source PDF and named
        # after the requested page range.
        output_dir = os.path.dirname(pdf_path)
        range_label = "all" if start_page == 0 else f"{start_page}-{end_page}"
        output_filename = f"ocr_output_{range_label}.md"
        output_path = os.path.join(output_dir, output_filename)

        extracted_text = extract_text_from_pdf(pdf_path, start_page, end_page)
        save_to_markdown(extracted_text, output_path)

        print("Text extraction completed successfully.")

    except TimeoutError:
        print("The operation timed out.")

    finally:
        # Cancel any pending alarm so it cannot fire after the work is done.
        if alarm_supported:
            signal.alarm(0)
	import google.generativeai as genai
	import os
	from dotenv import load_dotenv
	import time
	import signal

	# Load settings from a local .env file before reading the environment.
	load_dotenv()

	# Fail fast when no key is available: nothing below can work without it.
	API_KEY = os.environ.get("GOOGLE_API_KEY")
	if API_KEY is None or API_KEY == "":
	    raise ValueError("Please set the GOOGLE_API_KEY in the .env file.")

	# Register the key with the Gemini client library.
	genai.configure(api_key=API_KEY)


	# Timeout handler
	def timeout_handler(signum, frame):
	    """
	    Abort the running operation by raising TimeoutError.

	    The (signum, frame) signature is the one expected of a handler
	    installed with signal.signal(); neither argument is used.

	    Args:
	    signum: Number of the delivered signal (unused)
	    frame: Interrupted stack frame (unused)

	    Raises:
	    TimeoutError: Always
	    """
	    timeout_error = TimeoutError("The operation timed out.")
	    raise timeout_error


	def extract_text_from_pdf(
	    file_path: str,
	    start_page: int,
	    end_page: int,
	    max_continuations: int = 100,
	) -> str:
	    """
	    Extract main text from the specified range of pages of a PDF file.
	    If start_page is 0, extract from all pages.

	    The PDF is uploaded with genai.upload_file() and a chat session is asked
	    to return the body text as Markdown. The prompt tells the model to end
	    its final reply with a sentinel phrase; as long as the accumulated text
	    does not end with that phrase, the chat is asked to continue.

	    Args:
	    file_path (str): Path to the PDF file
	    start_page (int): Start page number (0 for all pages)
	    end_page (int): End page number
	    max_continuations (int): Maximum number of follow-up "continue" requests.
	        Bounds the loop so that a model which never emits the sentinel cannot
	        keep the function issuing API calls forever.

	    Returns:
	    str: Extracted text formatted as Markdown, with every occurrence of the
	        sentinel phrase removed. If an error occurs or the continuation limit
	        is reached, the text collected so far is returned.
	    """
	    # Single source of truth for the completion sentinel used in the prompt,
	    # the loop condition and the final clean-up.
	    done_marker = "Gemini Work Done"

	    file_source = genai.upload_file(file_path)
	    model = genai.GenerativeModel(model_name="gemini-1.5-flash")

	    page_range = "all pages" if start_page == 0 else f"pages {start_page} to {end_page}"

	    prompt = f"""
	Extract the main body text from {page_range} of the provided PDF file. Exclude cover pages, title pages,
	table of contents, appendices, indexes, headers, footers, bookmarks, annotations, images, tables, footnotes,
	and any other non-body text elements, include titles and subtitles if exist.

	Format the extracted text as Markdown, following these guidelines:
	1. Use appropriate heading levels (#, ##, ###, etc.) to represent the document's structure.
	2. Maintain paragraph separations from the original text.
	3. Use Markdown list formatting (ordered or unordered) for any lists encountered.
	4. Preserve important emphasis such as bold or italics (if present in the original).
	5. Use Markdown quote formatting for any quotations.

	If you've finished extracting all the requested text, end your response with the phrase '{done_marker}'
	on a new line. If you haven't finished, simply stop at a natural breakpoint, and I will prompt you to continue.
	"""

	    extracted_text = ""
	    chat = model.start_chat(history=[])

	    try:
	        response = chat.send_message([prompt, file_source])
	        extracted_text += response.text

	        continuations = 0
	        while not extracted_text.strip().endswith(done_marker):
	            if continuations >= max_continuations:
	                # Give up instead of looping forever; keep the partial text.
	                print(
	                    f"Warning: no completion marker after {max_continuations} "
	                    "continuation requests; the output may be incomplete."
	                )
	                break
	            continuations += 1
	            time.sleep(1)  # avoid 429 error
	            response = chat.send_message(
	                "Please continue the extraction, maintaining the same formatting."
	            )
	            extracted_text += "\n" + response.text

	    except Exception as e:
	        # Best effort: report the failure and fall through so that whatever
	        # was extracted before the error is still returned.
	        print(f"Error during text extraction: {str(e)}")

	    return extracted_text.replace(done_marker, "").strip()


	def save_to_markdown(content: str, output_path: str):
	    """
	    Write text to a file as UTF-8 and print the destination path.

	    The file is opened in "w" mode, so an existing file at that path
	    is overwritten.

	    Args:
	    content (str): Text to write
	    output_path (str): Destination path of the Markdown file
	    """
	    with open(output_path, mode="w", encoding="utf-8") as markdown_file:
	        markdown_file.write(content)

	    print(f"Output saved to {output_path}")


	def get_page_range() -> tuple:
	    """
	    Get the page range from user input.

	    Prompts repeatedly until the answer is either "0" (all pages) or a
	    range "a-b" with 0 < a <= b. Leading and trailing whitespace in the
	    answer is ignored, so "0 " and " 1-3" are accepted.

	    Returns:
	    tuple: (start_page, end_page); (0, 0) means all pages
	    """
	    while True:
	        answer = input(
	            "Enter the page range (format: a-b, or 0 for all pages): "
	        ).strip()

	        if answer == "0":
	            return 0, 0

	        try:
	            # ValueError covers both non-numeric parts and a wrong number
	            # of "-"-separated parts (unpacking mismatch).
	            start, end = map(int, answer.split("-"))
	        except ValueError:
	            print(
	                "Invalid input. Please use the format: a-b (e.g., 1-3) or 0 for all pages."
	            )
	            continue

	        if 0 < start <= end:
	            return start, end

	        print(
	            "Invalid range. Start page must be positive and end page must be greater than or equal to start page."
	        )


	if __name__ == "__main__":
	    # Overall time budget for the run. The alarm is armed before the
	    # prompts, so time spent answering them counts against the budget.
	    timeout = 30 * 60  # 30 minutes

	    # SIGALRM / signal.alarm() are only available on Unix. Elsewhere the
	    # script runs without the watchdog instead of failing at startup.
	    alarm_supported = hasattr(signal, "SIGALRM")
	    if alarm_supported:
	        signal.signal(signal.SIGALRM, timeout_handler)
	        signal.alarm(timeout)

	    try:
	        pdf_path = input("Enter the path to the source PDF file: ").strip()

	        # Keep asking until the path points at an existing file.
	        while not os.path.isfile(pdf_path):
	            print("File not found. Please enter a valid file path.")
	            pdf_path = input("Enter the path to the source PDF file: ").strip()

	        start_page, end_page = get_page_range()

	        # The Markdown file is written next to the source PDF and named
	        # after the requested page range.
	        output_dir = os.path.dirname(pdf_path)
	        range_label = "all" if start_page == 0 else f"{start_page}-{end_page}"
	        output_filename = f"ocr_output_{range_label}.md"
	        output_path = os.path.join(output_dir, output_filename)

	        extracted_text = extract_text_from_pdf(pdf_path, start_page, end_page)
	        save_to_markdown(extracted_text, output_path)

	        print("Text extraction completed successfully.")

	    except TimeoutError:
	        print("The operation timed out.")

	    finally:
	        # Cancel any pending alarm so it cannot fire after the work is done.
	        if alarm_supported:
	            signal.alarm(0)