Skip to content

Instantly share code, notes, and snippets.

@schrodyn
Forked from fr0gger/doclingPDF.py
Created January 10, 2025 00:47
Show Gist options
  • Save schrodyn/f60fad33270fd3ad9e1f1f407123628d to your computer and use it in GitHub Desktop.
Save schrodyn/f60fad33270fd3ad9e1f1f407123628d to your computer and use it in GitHub Desktop.
# Thomas Roccia - Docling demo
import json
from pathlib import Path
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractOcrOptions
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption, SimplePipeline
# Configure the PDF pipeline so that OCR runs through Tesseract.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True  # Enable OCR
pipeline_options.ocr_options = TesseractOcrOptions()  # Use Tesseract OCR

# Document converter that applies the options above to PDF inputs.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)

# Pass the PDF report (given here as a URL) to the converter.
input_doc = "https://cdn-dynmedia-1.microsoft.com/is/content/microsoftcorp/microsoft/final/en-us/microsoft-brand/documents/Exec%20Summary_2024%20Microsoft%20Digital%20Defense%20Report.pdf"
result = converter.convert(input_doc)

# Export to markdown and JSON inside the local "output" directory.
output_path = Path("output")
output_path.mkdir(exist_ok=True)

# The bodies of the two `with` statements below must be indented; at column 0
# the file fails to parse with an IndentationError.
markdown_output = result.document.export_to_markdown()
with open(output_path / "threat_report.md", "w", encoding="utf-8") as md_file:
    md_file.write(markdown_output)

json_output = result.document.export_to_dict()
with open(output_path / "threat_report.json", "w", encoding="utf-8") as json_file:
    json.dump(json_output, json_file)

print("All done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment