Skip to content

Instantly share code, notes, and snippets.

View davidmezzetti's full-sized avatar

David Mezzetti davidmezzetti

View GitHub Profile
import soundfile as sf
from txtai.pipeline import TextToSpeech
# Build pipeline: quantized (int8) ONNX export of the Kokoro TTS model
tts = TextToSpeech("neuml/kokoro-int8-onnx")
# Generate speech — by the variable names, returns audio samples and sample rate
# NOTE(review): this snippet is truncated — the triple-quoted input string opened
# below is never closed in this excerpt; confirm against the original gist
speech, rate = tts(
"""Have you ever considered having a snooty British accent?

Image to parse

from txtai.pipeline import Textractor

# Extract text from a remote image with the Docling backend; a browser-like
# user agent header is sent so the host serves the file
extractor = Textractor(backend="docling", headers={"user-agent": "Mozilla/5.0"})

# Parse an image hosted on Medium
extractor("https://miro.medium.com/v2/resize:fit:720/format:webp/1*HHPVwIrcxYcLRvjDpwLQyQ.png")
from smolagents import WebSearchTool
from txtai import LLM
# NOTE(review): indentation was lost in this paste — the lines below belong
# inside webrag's body. The snippet is also truncated: there is no return or
# LLM call, and `search` is defined outside this excerpt (presumably a
# WebSearchTool instance). Confirm against the original gist before use.
def webrag(query):
# Build a retrieval-augmented prompt: web search results become the context
prompt = f"""
Answer the following question using ONLY the context below.
Query: {query}
Context: {search(query)}
"""
from txtai.pipeline import Textractor

# Docling backend with section-aware splitting: the document is converted to
# Markdown and split on Markdown section headings
extract = Textractor(sections=True, backend="docling")

# Extract the BERT paper
extract("https://arxiv.org/pdf/1810.04805")
# PDF converted to Markdown, split on Markdown sections
# ['## BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding...
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from txtai.pipeline import HFTrainer
def metrics(pred):
    """Compute evaluation accuracy for a Trainer prediction output.

    Args:
        pred: object with `label_ids` (gold labels) and `predictions`
            (per-class logits/scores) attributes, as passed by HFTrainer

    Returns:
        dict with a single "accuracy" entry
    """

    # Highest-scoring class index per example
    predicted = pred.predictions.argmax(-1)

    return {"accuracy": accuracy_score(pred.label_ids, predicted)}
from txtai import Embeddings
from txtai.pipeline import Textractor

# URL to extract and index
# Fix: the original assigned `urls` but iterated `url` below (NameError);
# the domain is also restored from a link-rewriting scrape artifact
url = "https://github.com/neuml/txtai"

# Semantic chunking: split extracted text on meaning boundaries
textractor = Textractor(chunker="semantic")

# GGML vector backend with q4_0 quantization
embeddings = Embeddings(backend="ggml", ggml={"quantize": "q4_0"})

# Index (id, text) pairs streamed from the extractor, then save as GGUF
embeddings.index((url, x) for x in textractor(url))
embeddings.save("gguf")

Browser automation with Playwright

This example adds the Playwright MCP service to txtai agents.

Start the Playwright MCP server locally.

npx @playwright/mcp@latest --port 8931

Text extraction MCP service

Extract text using txtai and docling, packaged with Docker. The service is available via the Model Context Protocol (MCP).

/tmp/config/config.yml

# Enable MCP server
mcp: True

# Enable file uploads
from txtai import Agent

# Build an agent backed by an LLM, with tools served from a remote MCP endpoint
agent = Agent(
    model="LLM path",
    tools=["http://mcp.server/path"],
)

Wikipedia Embeddings MCP Server

config.yml

# Enable MCP server
mcp: True

# Load Wikipedia Embeddings index
cloud:
 provider: huggingface-hub