Skip to content

Instantly share code, notes, and snippets.

View davidmezzetti's full-sized avatar

David Mezzetti davidmezzetti

View GitHub Profile
from txtai.pipeline import Textractor
# Textractor configured for the docling backend with section-level splitting
textractor = Textractor(backend="docling", sections=True)
# Run it against the BERT paper PDF on arXiv
textractor("https://arxiv.org/pdf/1810.04805")
# Per the original example, the PDF is converted to Markdown and split on Markdown sections:
# ['## BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding...
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from txtai.pipeline import HFTrainer
def metrics(pred):
    """
    Compute accuracy for a set of predictions.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (scores supporting `.argmax(-1)`) — presumably the evaluation
            prediction object passed by the trainer; confirm against the caller

    Returns:
        dict with a single "accuracy" entry
    """

    labels = pred.label_ids

    # Predicted class is the index of the highest score along the last axis
    preds = pred.predictions.argmax(-1)

    # Calculate accuracy
    return {"accuracy": accuracy_score(labels, preds)}
from txtai import Embeddings
from txtai.pipeline import Textractor
# Page to extract and index. Named `url` (singular) because the statements
# below read `url`; the previous `urls` spelling raised a NameError.
url = "https://github.com/neuml/txtai"
# Textractor using the "semantic" chunker
textractor = Textractor(chunker="semantic")
# Embeddings using the ggml backend with q4_0 quantization
embeddings = Embeddings(backend="ggml", ggml={"quantize": "q4_0"})
# Index one (url, chunk) pair per chunk the textractor returns for the page
embeddings.index((url, x) for x in textractor(url))
# Persist the index under the "gguf" path
embeddings.save("gguf")

Browser automation with Playwright

This example adds the Playwright MCP service to txtai agents.

Start the Playwright MCP server locally.

npx @playwright/mcp@latest --port 8931

Text extraction MCP service

Extract text using txtai, docling, and Docker. The service is available via the Model Context Protocol (MCP).

/tmp/config/config.yml

# Enable MCP server
mcp: True

# Enable file uploads
from txtai import Agent
# Agent built from a model path and a single tool entry given as a URL
# (both strings are placeholders to be replaced with real values)
agent = Agent(model="LLM path", tools=["http://mcp.server/path"])

Wikipedia Embeddings MCP Server

config.yml

# Enable MCP server
mcp: True

# Load Wikipedia Embeddings index
cloud:
 provider: huggingface-hub
from txtai import Embeddings
# Start the indexing run with content storage enabled.
# `stream()` is not defined in this snippet; it is assumed to yield the documents to index.
# "checkpoint dir" is a placeholder path; presumably the checkpoint lets an
# interrupted run be resumed (confirm against the txtai documentation).
embeddings = Embeddings(content=True)
embeddings.index(stream(), checkpoint="checkpoint dir")
# Time passes ⏳, then the run is interrupted ⚡💥🔥
# (an error, a power outage, or some other random failure)
# Fix the underlying issue 🧑‍🔧⚙️
from txtai import Embeddings
embeddings = Embeddings(content=True, graph=True)
embeddings.index(...)
# Standard Vector Search
embeddings.search("vector search query")
# Vector SQL query
embeddings.search("""