Last active
March 28, 2026 13:33
-
-
Save ttakezawa/de1097d4e182f0eeb8cf014f4f84e181 to your computer and use it in GitHub Desktop.
PDF to Markdown converter using docling VLM pipeline with OpenRouter API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """PDF to Markdown converter using docling VLM pipeline with OpenRouter API.""" | |
| import argparse | |
| import logging | |
| import os | |
| import sys | |
| import time | |
| from docling.datamodel.base_models import InputFormat | |
| from docling.datamodel.pipeline_options import VlmConvertOptions, VlmPipelineOptions | |
| from docling.datamodel.vlm_engine_options import ApiVlmEngineOptions, VlmEngineType | |
| from docling.document_converter import DocumentConverter, PdfFormatOption | |
| from docling.pipeline.vlm_pipeline import VlmPipeline | |
| from docling_core.types.doc.document import ImageRefMode | |
class ProgressLogger(logging.Handler):
    """Logging handler that echoes progress-related records to stderr.

    A record is printed when its formatted message contains, ignoring case,
    any of the substrings in ``_KEYWORDS``. Each printed line is prefixed
    with the number of seconds elapsed since the handler was constructed.
    """

    # Substrings that mark a log message as progress-related. The original
    # author's intent was to pick up page-processing progress from docling's
    # internal log output.
    _KEYWORDS = ("page", "convert", "process", "vlm")

    def __init__(self):
        """Create the handler and remember the construction time."""
        super().__init__()
        # Wall-clock timestamp used as the zero point for elapsed times.
        self.start_time = time.time()

    def emit(self, record):
        """Print ``record`` to stderr if its message looks like progress.

        Args:
            record: The ``logging.LogRecord`` being handled.

        Any exception raised while formatting or printing is routed to
        ``handleError`` instead of propagating, so a malformed log call in a
        library cannot abort the code that emitted it.
        """
        try:
            msg = record.getMessage()
            elapsed = time.time() - self.start_time
            lowered = msg.lower()
            if any(kw in lowered for kw in self._KEYWORDS):
                print(f"[{elapsed:6.1f}s] {msg}", file=sys.stderr)
        except Exception:
            # Standard Handler contract: report the failure through the
            # logging machinery rather than raising into the caller.
            self.handleError(record)
def main():
    """Convert a PDF file to Markdown using docling's VLM pipeline via OpenRouter.

    Reads options from the command line, builds a docling
    ``DocumentConverter`` configured to send VLM requests to the OpenRouter
    chat-completions endpoint, converts the input PDF, and writes the result
    to ``<output>/<input stem>.md``. Status messages go to stderr.

    The API key is taken from ``--api-key`` or, if that flag is omitted, from
    the ``OPENROUTER_API_KEY`` environment variable.

    Raises:
        SystemExit: With status 1 when the input path is missing or is not a
            file; argparse also exits on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="PDF to Markdown (docling + OpenRouter VLM)")
    parser.add_argument("input", help="Input PDF file path")
    parser.add_argument("-o", "--output", default=".", help="Output directory (default: current dir)")
    # Let the key come from the environment so it does not have to appear on
    # the command line; an explicit --api-key still takes precedence.
    env_api_key = os.environ.get("OPENROUTER_API_KEY")
    parser.add_argument(
        "--api-key",
        default=env_api_key,
        required=not env_api_key,
        help="OpenRouter API key (default: $OPENROUTER_API_KEY)",
    )
    parser.add_argument("--model", default="google/gemini-3.1-flash-lite-preview", help="OpenRouter model name")
    parser.add_argument("--preset", default="qwen", help="VLM preset (default: qwen)")
    parser.add_argument("--concurrency", type=int, default=4, help="Concurrent API requests (default: 4)")
    parser.add_argument("--timeout", type=float, default=120, help="API timeout in seconds (default: 120)")
    parser.add_argument("--max-tokens", type=int, default=20000, help="Max tokens per request (default: 20000)")
    args = parser.parse_args()

    # Reject missing paths and non-files (e.g. a directory) before any work.
    if not os.path.isfile(args.input):
        reason = "is not a file" if os.path.exists(args.input) else "not found"
        print(f"Error: {args.input} {reason}", file=sys.stderr)
        sys.exit(1)

    # Show docling's internal logs so that progress can be followed.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")

    print(f"Input: {args.input}", file=sys.stderr)
    print(f"Output: {args.output}/", file=sys.stderr)
    print(f"Model: {args.model}", file=sys.stderr)
    print(f"Preset: {args.preset}", file=sys.stderr)
    print(file=sys.stderr)

    # Create the output directory up front so an unusable path fails before
    # any remote API requests are made.
    os.makedirs(args.output, exist_ok=True)
    stem = os.path.splitext(os.path.basename(args.input))[0]
    out_path = os.path.join(args.output, f"{stem}.md")

    # Monotonic clock: the reported duration is unaffected by system clock
    # adjustments during a long conversion.
    start = time.monotonic()

    vlm_options = VlmConvertOptions.from_preset(
        args.preset,
        engine_options=ApiVlmEngineOptions(
            runtime_type=VlmEngineType.API,
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={"Authorization": f"Bearer {args.api_key}"},
            params={
                "model": args.model,
                "max_tokens": args.max_tokens,
            },
            concurrency=args.concurrency,
            timeout=args.timeout,
        ),
    )
    pipeline_options = VlmPipelineOptions(
        vlm_options=vlm_options,
        # NOTE(review): presumably required for docling to call an external
        # API endpoint; confirm against the docling documentation.
        enable_remote_services=True,
    )
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                pipeline_cls=VlmPipeline,
            )
        }
    )

    print("Converting...", file=sys.stderr)
    result = converter.convert(args.input)

    # NOTE(review): REFERENCED presumably emits image references rather than
    # inline data; this function itself writes no image files.
    md = result.document.export_to_markdown(image_mode=ImageRefMode.REFERENCED)
    # Explicit UTF-8: the platform default encoding may be unable to
    # represent non-ASCII text extracted from the PDF.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(md)

    elapsed = time.monotonic() - start
    print(f"\nDone: {out_path} ({elapsed:.1f}s)", file=sys.stderr)
# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment