-
-
Save schrodyn/f60fad33270fd3ad9e1f1f407123628d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Thomas Roccia - Docling demo
#
# Converts a PDF report to Markdown and JSON with Docling, with OCR enabled,
# and writes both exports into the local "output" directory.
import json
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractOcrOptions
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    SimplePipeline,
    WordFormatOption,
)

# PDF pipeline configuration: run OCR, using the Tesseract OCR options.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True  # Enable OCR
pipeline_options.ocr_options = TesseractOcrOptions()  # Use Tesseract OCR

# Document converter; the custom pipeline options are applied to PDF inputs only.
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

# Pass the PDF report (given here as a URL string).
input_doc = "https://cdn-dynmedia-1.microsoft.com/is/content/microsoftcorp/microsoft/final/en-us/microsoft-brand/documents/Exec%20Summary_2024%20Microsoft%20Digital%20Defense%20Report.pdf"
result = converter.convert(input_doc)

# Export to Markdown and JSON under ./output (created if it does not exist).
output_path = Path("output")
output_path.mkdir(exist_ok=True)

markdown_output = result.document.export_to_markdown()
with open(output_path / "threat_report.md", "w", encoding="utf-8") as md_file:
    md_file.write(markdown_output)

json_output = result.document.export_to_dict()
with open(output_path / "threat_report.json", "w", encoding="utf-8") as json_file:
    json.dump(json_output, json_file)

print("All done")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment