Last active
March 1, 2026 18:30
-
-
Save engineervix/c9fabbff3a2aac90c6315d12fe6569ad to your computer and use it in GitHub Desktop.
Convert PDF to Markdown docs using Gemini
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import sys | |
| import pathlib | |
| import time | |
| import argparse | |
| import re | |
| from dotenv import load_dotenv | |
| from google import genai | |
| from google.genai import types | |
# Load environment variables from .env file.
# This runs at import time, before main() looks up GEMINI_API_KEY /
# GOOGLE_API_KEY in os.environ, so the key can be supplied via a .env file.
load_dotenv()
# Delimiter the model is instructed to emit before each file. The capture
# group makes re.split() return the filenames interleaved with the file bodies.
_FILE_DELIMITER_RE = re.compile(r'--- FILE: ([\w\.-]+) ---')

# Seconds to wait between two polls of the uploaded file's processing state.
_POLL_INTERVAL_SECONDS = 2

# System prompt describing the expected multi-file Markdown output format.
_SYSTEM_INSTRUCTION = """
<role>
You are an expert document digitization assistant. You convert complex legislative PDFs into a series of Markdown files optimized for a static site generator.
</role>
<instructions>
1. **Analyze**: Break down the legislative Act into logical sections:
- index.md: The title page and "Arrangement of Sections".
- 01-part-i.md, 02-part-ii.md, etc.: Each major Part of the Act.
- 99-schedules.md: Any schedules or subsidiary legislation at the end.
2. **Execute**: For each section, generate the Markdown content.
3. **Format**:
- Each file MUST start with a YAML front matter block containing `title` and `description`.
- Use H1 (#) for the main section title inside the file (e.g., # THE CYBER SECURITY ACT, 2025 or # PART I - PRELIMINARY).
- Preserving legal numbering and hierarchy is the priority.
- Use HTML `<ol>` tags with the appropriate `type` attribute (e.g., `<ol type="a">` for (a), (b), (c) or `<ol type="i">` for (i), (ii), (iii)) for itemized lists to ensure exact legal lettering is preserved.
- REMOVE all page headers, footers, and page numbers.
4. **Output Structure**:
- You must output the entire set of files as a single response.
- Use the delimiter `--- FILE: filename.md ---` at the start of each new file's content.
</instructions>
<example_output>
--- FILE: index.md ---
---
title: The Cyber Security Act, 2025
description: Arrangement of Sections and Preliminary information.
---
# THE CYBER SECURITY ACT, 2025
**ARRANGEMENT OF SECTIONS**
...
--- FILE: 01-part-i.md ---
---
title: Part I - Preliminary
description: Short title, commencement, and interpretation.
---
# PART I - PRELIMINARY
**1. Short title and commencement**
This Act may be cited as...
**2. Interpretation**
In this Act:
<ol type="a">
<li>"access" means...</li>
<li>"Agency" means...</li>
</ol>
</example_output>
<task>
Convert the uploaded PDF into multiple Markdown files as described.
Follow the naming convention (index.md, 01-part-i.md, etc.).
Ensure every definition and section is captured faithfully.
</task>
"""


def _split_markdown_files(full_text):
    """Split a model response into ``(filename, content)`` pairs.

    Returns an empty list when the response contains no
    ``--- FILE: name ---`` delimiter at all.
    """
    chunks = _FILE_DELIMITER_RE.split(full_text)
    # chunks[0] is whatever preceded the first delimiter (empty or preamble)
    # and is dropped; after it, filenames and bodies strictly alternate.
    return [
        (name.strip(), body.strip())
        for name, body in zip(chunks[1::2], chunks[2::2])
    ]


def convert_pdf_to_md(pdf_path: str, model_id: str, output_dir: str = "docs",
                      processing_timeout: float = 600.0):
    """
    Uploads a PDF to Gemini Files API and converts it to a set of Markdown files
    structured for a static site generator.

    Args:
        pdf_path: Path of the PDF to convert.
        model_id: Name of the Gemini model passed to ``generate_content``.
        output_dir: Directory that receives the Markdown files; it is created
            (including parents) when missing.
        processing_timeout: Maximum number of seconds to wait for the uploaded
            file to reach the ``ACTIVE`` state before giving up.

    Returns:
        None. Progress is printed to stdout; errors and warnings go to stderr.
        The function returns early, without writing anything, when file
        processing fails or times out, or when the model returns no text.

    Raises:
        SystemExit: With exit code 1 when ``pdf_path`` does not exist.
    """
    # Validate the local input first so a bad path fails fast, before an API
    # client is constructed or any network traffic happens.
    path = pathlib.Path(pdf_path)
    if not path.exists():
        print(f"Error: File '{pdf_path}' not found.", file=sys.stderr)
        sys.exit(1)

    client = genai.Client()

    print(f"Uploading {path.name}...")
    uploaded_file = client.files.upload(file=path)

    # From here on the remote file exists; the finally clause guarantees it is
    # deleted on every exit path (failure, timeout, exception or success).
    try:
        print("Processing document...")
        deadline = time.monotonic() + processing_timeout
        while True:
            file_info = client.files.get(name=uploaded_file.name)
            state = file_info.state.name
            if state == 'ACTIVE':
                break
            if state == 'FAILED':
                print("Error: File processing failed.", file=sys.stderr)
                return
            if time.monotonic() >= deadline:
                # Bound the wait so a file stuck in processing cannot hang
                # the program forever.
                print(
                    f"Error: File processing did not finish within "
                    f"{processing_timeout:g} seconds.",
                    file=sys.stderr,
                )
                return
            time.sleep(_POLL_INTERVAL_SECONDS)

        print(f"Converting to Markdown using {model_id}...")
        response = client.models.generate_content(
            model=model_id,
            contents=[
                uploaded_file,
                "Please perform the multi-file conversion for the entire document."
            ],
            config=types.GenerateContentConfig(
                system_instruction=_SYSTEM_INSTRUCTION,
                temperature=1.0,
            )
        )

        full_text = response.text
        if not full_text:
            # response.text can be None/empty (e.g. no text parts returned);
            # passing that to re.split() would raise a TypeError.
            print("Error: Model returned no text; nothing was written.",
                  file=sys.stderr)
            return

        # Create the output directory
        out_path = pathlib.Path(output_dir)
        out_path.mkdir(parents=True, exist_ok=True)

        # Split the output based on the delimiter
        files = _split_markdown_files(full_text)
        if not files:
            # Fallback if the model didn't use delimiters correctly
            print("Warning: Model did not use delimiters correctly. "
                  "Saving as a single file.", file=sys.stderr)
            (out_path / "index.md").write_text(full_text, encoding='utf-8')
        else:
            for filename, content in files:
                if filename in (".", ".."):
                    # The delimiter pattern accepts dots, but these names
                    # resolve to directories and cannot be written as files.
                    print(f"Warning: Skipping invalid filename '{filename}'.",
                          file=sys.stderr)
                    continue
                (out_path / filename).write_text(content, encoding='utf-8')
                print(f"Created {output_dir}/{filename}")
    finally:
        # Cleanup
        client.files.delete(name=uploaded_file.name)
        print("Cleaned up uploaded file from API.")
def main(argv=None) -> None:
    """Command-line entry point: parse arguments and run the conversion.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. The default ``None`` keeps the normal
            command-line behaviour and lets other code call ``main``
            programmatically.

    Raises:
        SystemExit: With exit code 1 when neither ``GEMINI_API_KEY`` nor
            ``GOOGLE_API_KEY`` is set to a non-empty value; argparse itself
            exits on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Convert PDF to Markdown docs.")
    parser.add_argument("pdf_file", help="Path to the PDF file to convert.")
    parser.add_argument("--model", default="gemini-3-flash-preview",
                        help="Gemini model to use (default: gemini-3-flash-preview).")
    parser.add_argument("--out", default="docs", help="Output directory (default: docs).")
    args = parser.parse_args(argv)

    # Check for a key up front so a missing one produces a clear message
    # before any conversion work starts.
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print("Error: Please set GEMINI_API_KEY or GOOGLE_API_KEY in your environment or .env file.",
              file=sys.stderr)
        sys.exit(1)

    convert_pdf_to_md(args.pdf_file, args.model, args.out)


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| google-genai==1.65.0 | |
| python-dotenv==1.2.1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment