Skip to content

Instantly share code, notes, and snippets.

@owainlewis
Last active September 18, 2025 11:08
Show Gist options
  • Save owainlewis/53010ab975c4246a214c3fbec1db6b6c to your computer and use it in GitHub Desktop.
Docling
import tiktoken
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
def pdf_to_chunks(pdf_url_or_path: str, max_tokens: int = 500) -> list[str]:
    """Convert a PDF into text chunks ready for vector storage.

    Args:
        pdf_url_or_path: URL or local filesystem path to the PDF file.
        max_tokens: Maximum tokens per chunk (default: 500).

    Returns:
        List of plain-text chunks, in document order.
    """
    # Step 1: Parse the PDF into a structured Docling document.
    converter = DocumentConverter()
    document = converter.convert(pdf_url_or_path).document

    # Step 2: Set up a chunker whose token budget is measured with the
    # same tokenizer as the embedding model ("text-embedding-3-large"),
    # so chunks never exceed the model's limit.
    tokenizer_enc = tiktoken.encoding_for_model("text-embedding-3-large")
    tokenizer = OpenAITokenizer(tokenizer=tokenizer_enc, max_tokens=max_tokens)
    chunker = HybridChunker(tokenizer=tokenizer)

    # Steps 3-4: Chunk the document and keep only the text payload.
    return [chunk.text for chunk in chunker.chunk(dl_doc=document)]
if __name__ == "__main__":
    # Demo: chunk the Bitcoin whitepaper and preview the first chunk.
    pdf_url = "https://bitcoin.org/bitcoin.pdf"
    print("Processing PDF...")
    chunks = pdf_to_chunks(pdf_url)
    print(f"Created {len(chunks)} chunks")
    print("\nFirst chunk preview:")
    # NOTE(review): assumes at least one chunk was produced; an empty or
    # unparseable PDF would raise IndexError here.
    print(chunks[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment