engineervix · March 1, 2026 18:30
diff --git a/pdf_to_md.py b/pdf_to_md.py
 import os
 import sys
 import pathlib
 import time
 import argparse
 import re
 from dotenv import load_dotenv
 from google import genai
 from google.genai import types

 # Load environment variables from .env file. This runs at import time, so any
 # GEMINI_API_KEY / GOOGLE_API_KEY defined there is already in os.environ when
 # main() checks for it.
 load_dotenv()

 def convert_pdf_to_md(pdf_path: str, model_id: str, output_dir: str = "docs"):
    """
    Upload a PDF to the Gemini Files API and convert it to Markdown files.

    The model is asked to emit every output file in a single response, each
    one introduced by a ``--- FILE: <name> ---`` delimiter. The response is
    split on that delimiter and each piece is written under ``output_dir``.
    If no delimiter is found, the whole response is saved as ``index.md``.

    The uploaded remote file is deleted in all cases once the upload has
    succeeded, including when processing fails or a later step raises.

    Args:
        pdf_path: Path to the PDF file to convert.
        model_id: Gemini model name passed to ``generate_content``.
        output_dir: Directory that receives the Markdown files; created
            (with parents) if it does not exist.

    Returns:
        None. Returns early, after printing an error, when the Files API
        reports the upload as FAILED or the model response carries no text.

    Raises:
        SystemExit: With status 1 when ``pdf_path`` does not exist.
    """
    client = genai.Client()
    path = pathlib.Path(pdf_path)

    if not path.exists():
        print(f"Error: File '{pdf_path}' not found.")
        sys.exit(1)

    # The prompt is built before the upload so that everything after the
    # upload can sit inside one try/finally that guarantees remote cleanup.
    system_instruction = """
 <role>
 You are an expert document digitization assistant. You convert complex legislative PDFs into a series of Markdown files optimized for a static site generator.
 </role>

 <instructions>
 1. **Analyze**: Break down the legislative Act into logical sections:
    - index.md: The title page and "Arrangement of Sections".
    - 01-part-i.md, 02-part-ii.md, etc.: Each major Part of the Act.
    - 99-schedules.md: Any schedules or subsidiary legislation at the end.
 2. **Execute**: For each section, generate the Markdown content.
 3. **Format**:
    - Each file MUST start with a YAML front matter block containing `title` and `description`.
    - Use H1 (#) for the main section title inside the file (e.g., # THE CYBER SECURITY ACT, 2025 or # PART I - PRELIMINARY).
    - Preserving legal numbering and hierarchy is the priority.
    - Use HTML `<ol>` tags with the appropriate `type` attribute (e.g., `<ol type="a">` for (a), (b), (c) or `<ol type="i">` for (i), (ii), (iii)) for itemized lists to ensure exact legal lettering is preserved.
    - REMOVE all page headers, footers, and page numbers.
 4. **Output Structure**:
    - You must output the entire set of files as a single response.
    - Use the delimiter `--- FILE: filename.md ---` at the start of each new file's content.
 </instructions>

 <example_output>
 --- FILE: index.md ---
 ---
 title: The Cyber Security Act, 2025
 description: Arrangement of Sections and Preliminary information.
 ---
 # THE CYBER SECURITY ACT, 2025
 **ARRANGEMENT OF SECTIONS**
 ...

 --- FILE: 01-part-i.md ---
 ---
 title: Part I - Preliminary
 description: Short title, commencement, and interpretation.
 ---
 # PART I - PRELIMINARY
 **1. Short title and commencement**
 This Act may be cited as...

 **2. Interpretation**
 In this Act:
 <ol type="a">
  <li>"access" means...</li>
  <li>"Agency" means...</li>
 </ol>
 </example_output>

 <task>
 Convert the uploaded PDF into multiple Markdown files as described.
 Follow the naming convention (index.md, 01-part-i.md, etc.).
 Ensure every definition and section is captured faithfully.
 </task>
 """

    print(f"Uploading {path.name}...")
    uploaded_file = client.files.upload(file=path)

    try:
        print("Processing document...")
        # Poll every 2 seconds until the Files API reports a terminal state.
        while True:
            file_info = client.files.get(name=uploaded_file.name)
            if file_info.state.name == 'ACTIVE':
                break
            elif file_info.state.name == 'FAILED':
                print("Error: File processing failed.")
                return
            time.sleep(2)

        print(f"Converting to Markdown using {model_id}...")

        response = client.models.generate_content(
            model=model_id,
            contents=[
                uploaded_file,
                "Please perform the multi-file conversion for the entire document."
            ],
            config=types.GenerateContentConfig(
                system_instruction=system_instruction,
                temperature=1.0,
            )
        )

        full_text = response.text
        if full_text is None:
            # No text came back (e.g. an empty or blocked response); re.split
            # would raise TypeError on None, so report it and stop.
            print("Error: Model returned no text; nothing was written.")
            return

        # Create the output directory
        out_path = pathlib.Path(output_dir)
        out_path.mkdir(parents=True, exist_ok=True)

        # Split the output based on the delimiter. Because the pattern has one
        # capture group, the result alternates:
        # [preamble, name1, body1, name2, body2, ...].
        file_chunks = re.split(r'--- FILE: ([\w\.-]+) ---', full_text)

        if len(file_chunks) < 2:
            # Fallback if the model didn't use delimiters correctly
            print("Warning: Model did not use delimiters correctly. Saving as a single file.")
            with open(out_path / "index.md", 'w', encoding='utf-8') as f:
                f.write(full_text)
        else:
            # file_chunks[0] is usually empty or preamble before first tag
            for raw_name, raw_body in zip(file_chunks[1::2], file_chunks[2::2]):
                # Trim whitespace left around each piece by the split.
                filename = raw_name.strip()
                content = raw_body.strip()

                # The pattern also matches "." and "..", which name
                # directories rather than writable files.
                if filename in (".", ".."):
                    print(f"Warning: Skipping invalid file name '{filename}'.")
                    continue

                with open(out_path / filename, 'w', encoding='utf-8') as f:
                    f.write(content)
                print(f"Created {output_dir}/{filename}")
    finally:
        # Cleanup: always remove the uploaded file, even on failure or early
        # return, so it is not left behind in the Files API.
        client.files.delete(name=uploaded_file.name)
        print("Cleaned up uploaded file from API.")

 def main():
    """
    Command-line entry point.

    Parses the PDF path, model name and output directory from the command
    line, exits with status 1 if neither GEMINI_API_KEY nor GOOGLE_API_KEY
    is set, and otherwise hands the arguments to convert_pdf_to_md().
    """
    cli = argparse.ArgumentParser(description="Convert PDF to Markdown docs.")
    cli.add_argument("pdf_file", help="Path to the PDF file to convert.")
    cli.add_argument(
        "--model",
        default="gemini-3-flash-preview",
        help="Gemini model to use (default: gemini-3-flash-preview).",
    )
    cli.add_argument("--out", default="docs", help="Output directory (default: docs).")
    options = cli.parse_args()

    # Either variable is enough; an unset or empty value counts as missing.
    key_present = any(
        os.environ.get(var) for var in ("GEMINI_API_KEY", "GOOGLE_API_KEY")
    )
    if not key_present:
        print("Error: Please set GEMINI_API_KEY or GOOGLE_API_KEY in your environment or .env file.")
        sys.exit(1)

    convert_pdf_to_md(options.pdf_file, options.model, options.out)

 if __name__ == "__main__":
    main()
diff --git a/requirements.txt b/requirements.txt
 google-genai==1.65.0
 python-dotenv==1.2.1
	import os
	import sys
	import pathlib
	import time
	import argparse
	import re
	from dotenv import load_dotenv
	from google import genai
	from google.genai import types

	# Load environment variables from .env file. This runs at import time, so any
	# GEMINI_API_KEY / GOOGLE_API_KEY defined there is already in os.environ when
	# main() checks for it.
	load_dotenv()

	def convert_pdf_to_md(pdf_path: str, model_id: str, output_dir: str = "docs"):
	    """
	    Upload a PDF to the Gemini Files API and convert it to Markdown files.

	    The model is asked to emit every output file in a single response, each
	    one introduced by a ``--- FILE: <name> ---`` delimiter. The response is
	    split on that delimiter and each piece is written under ``output_dir``.
	    If no delimiter is found, the whole response is saved as ``index.md``.

	    The uploaded remote file is deleted in all cases once the upload has
	    succeeded, including when processing fails or a later step raises.

	    Args:
	        pdf_path: Path to the PDF file to convert.
	        model_id: Gemini model name passed to ``generate_content``.
	        output_dir: Directory that receives the Markdown files; created
	            (with parents) if it does not exist.

	    Returns:
	        None. Returns early, after printing an error, when the Files API
	        reports the upload as FAILED or the model response carries no text.

	    Raises:
	        SystemExit: With status 1 when ``pdf_path`` does not exist.
	    """
	    client = genai.Client()
	    path = pathlib.Path(pdf_path)

	    if not path.exists():
	        print(f"Error: File '{pdf_path}' not found.")
	        sys.exit(1)

	    # The prompt is built before the upload so that everything after the
	    # upload can sit inside one try/finally that guarantees remote cleanup.
	    system_instruction = """
	<role>
	You are an expert document digitization assistant. You convert complex legislative PDFs into a series of Markdown files optimized for a static site generator.
	</role>

	<instructions>
	1. Analyze: Break down the legislative Act into logical sections:
	- index.md: The title page and "Arrangement of Sections".
	- 01-part-i.md, 02-part-ii.md, etc.: Each major Part of the Act.
	- 99-schedules.md: Any schedules or subsidiary legislation at the end.
	2. Execute: For each section, generate the Markdown content.
	3. Format:
	- Each file MUST start with a YAML front matter block containing `title` and `description`.
	- Use H1 (#) for the main section title inside the file (e.g., # THE CYBER SECURITY ACT, 2025 or # PART I - PRELIMINARY).
	- Preserving legal numbering and hierarchy is the priority.
	- Use HTML `<ol>` tags with the appropriate `type` attribute (e.g., `<ol type="a">` for (a), (b), (c) or `<ol type="i">` for (i), (ii), (iii)) for itemized lists to ensure exact legal lettering is preserved.
	- REMOVE all page headers, footers, and page numbers.
	4. Output Structure:
	- You must output the entire set of files as a single response.
	- Use the delimiter `--- FILE: filename.md ---` at the start of each new file's content.
	</instructions>

	<example_output>
	--- FILE: index.md ---
	---
	title: The Cyber Security Act, 2025
	description: Arrangement of Sections and Preliminary information.
	---
	# THE CYBER SECURITY ACT, 2025
	ARRANGEMENT OF SECTIONS
	...

	--- FILE: 01-part-i.md ---
	---
	title: Part I - Preliminary
	description: Short title, commencement, and interpretation.
	---
	# PART I - PRELIMINARY
	1. Short title and commencement
	This Act may be cited as...

	2. Interpretation
	In this Act:
	<ol type="a">
	<li>"access" means...</li>
	<li>"Agency" means...</li>
	</ol>
	</example_output>

	<task>
	Convert the uploaded PDF into multiple Markdown files as described.
	Follow the naming convention (index.md, 01-part-i.md, etc.).
	Ensure every definition and section is captured faithfully.
	</task>
	"""

	    print(f"Uploading {path.name}...")
	    uploaded_file = client.files.upload(file=path)

	    try:
	        print("Processing document...")
	        # Poll every 2 seconds until the Files API reports a terminal state.
	        while True:
	            file_info = client.files.get(name=uploaded_file.name)
	            if file_info.state.name == 'ACTIVE':
	                break
	            elif file_info.state.name == 'FAILED':
	                print("Error: File processing failed.")
	                return
	            time.sleep(2)

	        print(f"Converting to Markdown using {model_id}...")

	        response = client.models.generate_content(
	            model=model_id,
	            contents=[
	                uploaded_file,
	                "Please perform the multi-file conversion for the entire document."
	            ],
	            config=types.GenerateContentConfig(
	                system_instruction=system_instruction,
	                temperature=1.0,
	            )
	        )

	        full_text = response.text
	        if full_text is None:
	            # No text came back (e.g. an empty or blocked response); re.split
	            # would raise TypeError on None, so report it and stop.
	            print("Error: Model returned no text; nothing was written.")
	            return

	        # Create the output directory
	        out_path = pathlib.Path(output_dir)
	        out_path.mkdir(parents=True, exist_ok=True)

	        # Split the output based on the delimiter. Because the pattern has one
	        # capture group, the result alternates:
	        # [preamble, name1, body1, name2, body2, ...].
	        file_chunks = re.split(r'--- FILE: ([\w\.-]+) ---', full_text)

	        if len(file_chunks) < 2:
	            # Fallback if the model didn't use delimiters correctly
	            print("Warning: Model did not use delimiters correctly. Saving as a single file.")
	            with open(out_path / "index.md", 'w', encoding='utf-8') as f:
	                f.write(full_text)
	        else:
	            # file_chunks[0] is usually empty or preamble before first tag
	            for raw_name, raw_body in zip(file_chunks[1::2], file_chunks[2::2]):
	                # Trim whitespace left around each piece by the split.
	                filename = raw_name.strip()
	                content = raw_body.strip()

	                # The pattern also matches "." and "..", which name
	                # directories rather than writable files.
	                if filename in (".", ".."):
	                    print(f"Warning: Skipping invalid file name '{filename}'.")
	                    continue

	                with open(out_path / filename, 'w', encoding='utf-8') as f:
	                    f.write(content)
	                print(f"Created {output_dir}/{filename}")
	    finally:
	        # Cleanup: always remove the uploaded file, even on failure or early
	        # return, so it is not left behind in the Files API.
	        client.files.delete(name=uploaded_file.name)
	        print("Cleaned up uploaded file from API.")

	def main():
	    """
	    Command-line entry point.

	    Parses the PDF path, model name and output directory from the command
	    line, exits with status 1 if neither GEMINI_API_KEY nor GOOGLE_API_KEY
	    is set, and otherwise hands the arguments to convert_pdf_to_md().
	    """
	    cli = argparse.ArgumentParser(description="Convert PDF to Markdown docs.")
	    cli.add_argument("pdf_file", help="Path to the PDF file to convert.")
	    cli.add_argument(
	        "--model",
	        default="gemini-3-flash-preview",
	        help="Gemini model to use (default: gemini-3-flash-preview).",
	    )
	    cli.add_argument("--out", default="docs", help="Output directory (default: docs).")
	    options = cli.parse_args()

	    # Either variable is enough; an unset or empty value counts as missing.
	    key_present = any(
	        os.environ.get(var) for var in ("GEMINI_API_KEY", "GOOGLE_API_KEY")
	    )
	    if not key_present:
	        print("Error: Please set GEMINI_API_KEY or GOOGLE_API_KEY in your environment or .env file.")
	        sys.exit(1)

	    convert_pdf_to_md(options.pdf_file, options.model, options.out)

	if __name__ == "__main__":
	    main()
No results found