Skip to content

Instantly share code, notes, and snippets.

@engineervix
Last active March 1, 2026 18:30
Show Gist options
  • Select an option

  • Save engineervix/c9fabbff3a2aac90c6315d12fe6569ad to your computer and use it in GitHub Desktop.

Select an option

Save engineervix/c9fabbff3a2aac90c6315d12fe6569ad to your computer and use it in GitHub Desktop.
Convert PDF to Markdown docs using Gemini
import os
import sys
import pathlib
import time
import argparse
import re
from dotenv import load_dotenv
from google import genai
from google.genai import types
# Load environment variables from a local .env file at import time, so the
# GEMINI_API_KEY / GOOGLE_API_KEY check in main() can also be satisfied by
# values stored in .env.
load_dotenv()
def convert_pdf_to_md(
    pdf_path: str,
    model_id: str,
    output_dir: str = "docs",
    *,
    poll_interval: float = 2.0,
    poll_timeout: float = 600.0,
) -> None:
    """
    Upload a PDF to the Gemini Files API and convert it to Markdown files.

    The model is asked to emit every output file in one response, each
    introduced by a ``--- FILE: name.md ---`` delimiter. The response is split
    on that delimiter and each part is written into ``output_dir``. If no
    delimiter is found, the whole response is saved as ``index.md``.

    Args:
        pdf_path: Path of the PDF to convert.
        model_id: Name of the Gemini model used for the conversion.
        output_dir: Directory that receives the Markdown files; created
            (including parents) if it does not exist.
        poll_interval: Seconds to sleep between checks of the uploaded
            file's processing state.
        poll_timeout: Maximum number of seconds to wait for the uploaded
            file to become ACTIVE before giving up.

    Raises:
        SystemExit: With exit code 1 when ``pdf_path`` does not exist.
    """
    # Validate the input before creating a client or touching the network.
    path = pathlib.Path(pdf_path)
    if not path.exists():
        print(f"Error: File '{pdf_path}' not found.", file=sys.stderr)
        sys.exit(1)

    client = genai.Client()

    print(f"Uploading {path.name}...")
    uploaded_file = client.files.upload(file=path)

    # Everything after the upload runs under try/finally so the remote copy is
    # deleted on every exit path (success, early return, or exception).
    try:
        print("Processing document...")
        deadline = time.monotonic() + poll_timeout
        while True:
            file_info = client.files.get(name=uploaded_file.name)
            state = file_info.state.name
            if state == 'ACTIVE':
                break
            if state == 'FAILED':
                print("Error: File processing failed.", file=sys.stderr)
                return
            if time.monotonic() >= deadline:
                # Without a bound, a file stuck in processing would hang the
                # script forever.
                print(
                    f"Error: Timed out after {poll_timeout:.0f}s waiting for "
                    "file processing.",
                    file=sys.stderr,
                )
                return
            time.sleep(poll_interval)

        print(f"Converting to Markdown using {model_id}...")
        # NOTE: the prompt text below is sent verbatim to the model; its lines
        # are intentionally kept flush-left so the string content is unchanged.
        system_instruction = """
<role>
You are an expert document digitization assistant. You convert complex legislative PDFs into a series of Markdown files optimized for a static site generator.
</role>
<instructions>
1. **Analyze**: Break down the legislative Act into logical sections:
- index.md: The title page and "Arrangement of Sections".
- 01-part-i.md, 02-part-ii.md, etc.: Each major Part of the Act.
- 99-schedules.md: Any schedules or subsidiary legislation at the end.
2. **Execute**: For each section, generate the Markdown content.
3. **Format**:
- Each file MUST start with a YAML front matter block containing `title` and `description`.
- Use H1 (#) for the main section title inside the file (e.g., # THE CYBER SECURITY ACT, 2025 or # PART I - PRELIMINARY).
- Preserving legal numbering and hierarchy is the priority.
- Use HTML `<ol>` tags with the appropriate `type` attribute (e.g., `<ol type="a">` for (a), (b), (c) or `<ol type="i">` for (i), (ii), (iii)) for itemized lists to ensure exact legal lettering is preserved.
- REMOVE all page headers, footers, and page numbers.
4. **Output Structure**:
- You must output the entire set of files as a single response.
- Use the delimiter `--- FILE: filename.md ---` at the start of each new file's content.
</instructions>
<example_output>
--- FILE: index.md ---
---
title: The Cyber Security Act, 2025
description: Arrangement of Sections and Preliminary information.
---
# THE CYBER SECURITY ACT, 2025
**ARRANGEMENT OF SECTIONS**
...
--- FILE: 01-part-i.md ---
---
title: Part I - Preliminary
description: Short title, commencement, and interpretation.
---
# PART I - PRELIMINARY
**1. Short title and commencement**
This Act may be cited as...
**2. Interpretation**
In this Act:
<ol type="a">
<li>"access" means...</li>
<li>"Agency" means...</li>
</ol>
</example_output>
<task>
Convert the uploaded PDF into multiple Markdown files as described.
Follow the naming convention (index.md, 01-part-i.md, etc.).
Ensure every definition and section is captured faithfully.
</task>
"""
        response = client.models.generate_content(
            model=model_id,
            contents=[
                uploaded_file,
                "Please perform the multi-file conversion for the entire document."
            ],
            config=types.GenerateContentConfig(
                system_instruction=system_instruction,
                temperature=1.0,
            )
        )

        full_text = response.text
        if not full_text:
            # The response may carry no text at all; splitting or writing
            # None would otherwise fail with a TypeError.
            print("Error: Model returned no text content.", file=sys.stderr)
            return

        # Create the output directory
        out_path = pathlib.Path(output_dir)
        out_path.mkdir(parents=True, exist_ok=True)

        # Splitting on a pattern with one capture group yields
        # [preamble, name1, body1, name2, body2, ...].
        file_chunks = re.split(r'--- FILE: ([\w\.-]+) ---', full_text)
        if len(file_chunks) < 2:
            # Fallback if the model didn't use delimiters correctly
            print(
                "Warning: Model did not use delimiters correctly. Saving as a single file.",
                file=sys.stderr,
            )
            with open(out_path / "index.md", 'w', encoding='utf-8') as f:
                f.write(full_text)
        else:
            # file_chunks[0] is the (usually empty) preamble before the first
            # delimiter; the rest alternates filename, content.
            for raw_name, raw_content in zip(file_chunks[1::2], file_chunks[2::2]):
                filename = raw_name.strip()
                if filename in {".", ".."}:
                    # The filename pattern admits these, but they name
                    # directories rather than files inside output_dir.
                    print(
                        f"Warning: Skipping invalid file name '{filename}'.",
                        file=sys.stderr,
                    )
                    continue
                with open(out_path / filename, 'w', encoding='utf-8') as f:
                    f.write(raw_content.strip())
                print(f"Created {output_dir}/{filename}")
    finally:
        # Cleanup: a failure here is reported but must not mask an earlier
        # error or discard files that were already written.
        try:
            client.files.delete(name=uploaded_file.name)
        except Exception as exc:
            print(
                f"Warning: Could not delete uploaded file from API: {exc}",
                file=sys.stderr,
            )
        else:
            print("Cleaned up uploaded file from API.")
def main():
    """
    Command-line entry point: parse arguments and run the conversion.

    Accepts the PDF path as a positional argument plus optional ``--model``
    and ``--out`` flags, verifies that GEMINI_API_KEY or GOOGLE_API_KEY is
    set in the environment, then calls ``convert_pdf_to_md``.

    Raises:
        SystemExit: With exit code 1 when neither API key variable is set.
    """
    parser = argparse.ArgumentParser(description="Convert PDF to Markdown docs.")
    parser.add_argument("pdf_file", help="Path to the PDF file to convert.")
    parser.add_argument(
        "--model",
        default="gemini-3-flash-preview",
        help="Gemini model to use (default: gemini-3-flash-preview).",
    )
    parser.add_argument("--out", default="docs", help="Output directory (default: docs).")
    args = parser.parse_args()

    has_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not has_api_key:
        # Diagnostics go to stderr so they do not mix with normal output.
        print(
            "Error: Please set GEMINI_API_KEY or GOOGLE_API_KEY in your environment or .env file.",
            file=sys.stderr,
        )
        sys.exit(1)

    convert_pdf_to_md(args.pdf_file, args.model, args.out)


if __name__ == "__main__":
    main()
google-genai==1.65.0
python-dotenv==1.2.1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment