Last active
September 18, 2025 11:08
-
-
Save owainlewis/53010ab975c4246a214c3fbec1db6b6c to your computer and use it in GitHub Desktop.
Docling
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tiktoken | |
from docling.document_converter import DocumentConverter | |
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker | |
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer | |
def pdf_to_chunks(pdf_url_or_path: str, max_tokens: int = 500):
    """Convert a PDF into plain-text chunks suitable for vector storage.

    Args:
        pdf_url_or_path: URL or local filesystem path of the PDF.
        max_tokens: Upper bound on tokens per chunk (default: 500).

    Returns:
        A list of chunk text strings.
    """
    # Parse the PDF into a Docling document model.
    dl_doc = DocumentConverter().convert(pdf_url_or_path).document

    # Build a token-aware chunker bounded by the embedding model's tokenizer.
    encoding = tiktoken.encoding_for_model("text-embedding-3-large")
    chunker = HybridChunker(
        tokenizer=OpenAITokenizer(tokenizer=encoding, max_tokens=max_tokens)
    )

    # Materialize the chunks and keep only their text payload.
    return [piece.text for piece in chunker.chunk(dl_doc=dl_doc)]
if __name__ == "__main__":
    # Demo: download and chunk the Bitcoin whitepaper.
    source_url = "https://bitcoin.org/bitcoin.pdf"

    print("Processing PDF...")
    pieces = pdf_to_chunks(source_url)

    print(f"Created {len(pieces)} chunks")
    print("\nFirst chunk preview:")
    print(pieces[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment