Skip to content

Instantly share code, notes, and snippets.

@ttakezawa
Last active March 28, 2026 13:33
Show Gist options
  • Select an option

  • Save ttakezawa/de1097d4e182f0eeb8cf014f4f84e181 to your computer and use it in GitHub Desktop.

Select an option

Save ttakezawa/de1097d4e182f0eeb8cf014f4f84e181 to your computer and use it in GitHub Desktop.
PDF to Markdown converter using docling VLM pipeline with OpenRouter API
"""PDF to Markdown converter using docling VLM pipeline with OpenRouter API."""
import argparse
import logging
import os
import sys
import time
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmConvertOptions, VlmPipelineOptions
from docling.datamodel.vlm_engine_options import ApiVlmEngineOptions, VlmEngineType
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
from docling_core.types.doc.document import ImageRefMode
class ProgressLogger(logging.Handler):
    """Logging handler that surfaces per-page VLM inference progress on stderr.

    Filters docling's internal log stream for page/convert/process/vlm
    messages and prefixes each with the elapsed wall-clock time.
    """

    # Substrings (case-insensitive) that mark a record as progress-related.
    _PROGRESS_KEYWORDS = ("page", "convert", "process", "vlm")

    def __init__(self):
        super().__init__()
        # Reference point for the elapsed-time prefix.
        self.start_time = time.time()

    def emit(self, record):
        message = record.getMessage()
        elapsed = time.time() - self.start_time
        lowered = message.lower()
        # Only forward records that look like page-processing progress.
        if any(keyword in lowered for keyword in self._PROGRESS_KEYWORDS):
            print(f"[{elapsed:6.1f}s] {message}", file=sys.stderr)
def main():
    """Convert a PDF to Markdown using docling's VLM pipeline via OpenRouter.

    Parses CLI arguments, configures a remote (API-backed) VLM engine,
    converts the input PDF and writes ``<stem>.md`` into the output
    directory. Progress and status go to stderr. Exits with status 1 on a
    missing input file or missing API key.
    """
    parser = argparse.ArgumentParser(description="PDF to Markdown (docling + OpenRouter VLM)")
    parser.add_argument("input", help="Input PDF file path")
    parser.add_argument("-o", "--output", default=".", help="Output directory (default: current dir)")
    # Fall back to the environment so the secret does not have to appear in
    # shell history or the process list; still overridable on the CLI.
    parser.add_argument(
        "--api-key",
        default=os.environ.get("OPENROUTER_API_KEY"),
        help="OpenRouter API key (default: $OPENROUTER_API_KEY)",
    )
    parser.add_argument("--model", default="google/gemini-3.1-flash-lite-preview", help="OpenRouter model name")
    parser.add_argument("--preset", default="qwen", help="VLM preset (default: qwen)")
    parser.add_argument("--concurrency", type=int, default=4, help="Concurrent API requests (default: 4)")
    parser.add_argument("--timeout", type=float, default=120, help="API timeout in seconds (default: 120)")
    parser.add_argument("--max-tokens", type=int, default=20000, help="Max tokens per request (default: 20000)")
    args = parser.parse_args()

    if not args.api_key:
        print("Error: --api-key (or $OPENROUTER_API_KEY) is required", file=sys.stderr)
        sys.exit(1)
    if not os.path.exists(args.input):
        print(f"Error: {args.input} not found", file=sys.stderr)
        sys.exit(1)

    # Surface docling's internal INFO logs so conversion progress is visible.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")

    print(f"Input: {args.input}", file=sys.stderr)
    print(f"Output: {args.output}/", file=sys.stderr)
    print(f"Model: {args.model}", file=sys.stderr)
    print(f"Preset: {args.preset}", file=sys.stderr)
    print(file=sys.stderr)

    start = time.time()

    vlm_options = VlmConvertOptions.from_preset(
        args.preset,
        engine_options=ApiVlmEngineOptions(
            runtime_type=VlmEngineType.API,
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={"Authorization": f"Bearer {args.api_key}"},
            params={
                "model": args.model,
                "max_tokens": args.max_tokens,
            },
            concurrency=args.concurrency,
            timeout=args.timeout,
        ),
    )
    pipeline_options = VlmPipelineOptions(
        vlm_options=vlm_options,
        # Required: the VLM engine calls an external (OpenRouter) service.
        enable_remote_services=True,
    )
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                pipeline_cls=VlmPipeline,
            )
        }
    )

    print("Converting...", file=sys.stderr)
    result = converter.convert(args.input)

    os.makedirs(args.output, exist_ok=True)
    stem = os.path.splitext(os.path.basename(args.input))[0]
    out_path = os.path.join(args.output, f"{stem}.md")
    md = result.document.export_to_markdown(image_mode=ImageRefMode.REFERENCED)
    # Explicit UTF-8: VLM-produced Markdown routinely contains non-ASCII
    # text, and the platform default encoding (e.g. cp1252 on Windows)
    # would corrupt it or raise UnicodeEncodeError.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(md)

    elapsed = time.time() - start
    print(f"\nDone: {out_path} ({elapsed:.1f}s)", file=sys.stderr)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment