David Mezzetti davidmezzetti

Founder/CEO at NeuML. Building easy-to-use semantic search and workflow applications with txtai.

davidmezzetti / txtai-simple-embeddings-train.py

Last active October 20, 2021 14:17

	from datasets import load_dataset

	from transformers import AutoTokenizer

	from txtai.models import Registry
	from txtai.pipeline import HFTrainer, Labels

	# Load the dataset named "emotion" through load_dataset (imported from the datasets package)
	ds = load_dataset("emotion")

	# Set seed for reproducibility

davidmezzetti / txtai-simple-embeddings-layer.py

Last active October 20, 2021 14:09

	class SimpleEmbeddings(nn.Module):
	    """
	    Thin module around an embeddings layer.

	    The forward pass looks up the input ids in the wrapped layer and returns
	    the result as a one-element tuple.
	    """

	    def __init__(self, embeddings):
	        """
	        Creates a new SimpleEmbeddings module.

	        Args:
	            embeddings: callable layer that maps input ids to vectors
	        """

	        super().__init__()

	        self.embeddings = embeddings

	    def forward(self, input_ids=None, **kwargs):
	        """
	        Runs the wrapped embeddings layer on input_ids.

	        Args:
	            input_ids: ids passed straight to the embeddings layer
	            kwargs: accepted and ignored

	        Returns:
	            one-element tuple holding the embeddings output
	        """

	        # NOTE(review): the tuple presumably mirrors transformers-style model outputs,
	        # where the first element is read by the caller - confirm against the consumer
	        return (self.embeddings(input_ids),)

davidmezzetti / txtai-simple-embeddings-index.py

Last active October 20, 2021 14:45

	from txtai.embeddings import Embeddings

	# Sample sentences to index, one entry per line
	data = [
	    "Glad you found it",
	    "Happy to see you",
	    "What a cute picture",
	    "I'm angry",
	    "That's upsetting",
	    "That is so troubling",
	    "A shocking development right now",
	    "Never thought I would see that",
	    "Didn't see that coming",
	]

	# Embeddings instance using the "pooling" method, with a SimpleEmbeddings wrapper
	# around model.embedding as the model and the bert-base-uncased tokenizer.
	# SimpleEmbeddings and model are defined outside this snippet.
	embeddings = Embeddings(
	    {
	        "method": "pooling",
	        "path": SimpleEmbeddings(model.embedding),
	        "tokenizer": "bert-base-uncased",
	    }
	)

	# Create an index for the list of text

davidmezzetti / txtai-tfidf-logr.py

Created October 20, 2021 15:01

	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model import LogisticRegression
	from sklearn.pipeline import Pipeline

	# Model to train: a TF-IDF vectorizer feeding a logistic regression classifier
	pipeline = Pipeline(
	    steps=[
	        ("tfidf", TfidfVectorizer()),
	        ("lr", LogisticRegression(max_iter=250)),
	    ]
	)

davidmezzetti / txtai-tfidf-logr-run.py

Last active October 20, 2021 15:56

	from txtai.pipeline import MLOnnx, Similarity

	def tokenize(inputs, **kwargs):
	    """
	    Wraps raw inputs in a tokenizer-style dictionary without splitting them.

	    Args:
	        inputs: a single string or an iterable of items
	        kwargs: accepted and ignored

	    Returns:
	        {"input_ids": [[item], ...]} with one single-element list per input
	    """

	    # A lone string is treated as a batch of one
	    if isinstance(inputs, str):
	        inputs = [inputs]

	    # Each input is passed through whole, wrapped in its own list
	    return {"input_ids": [[x] for x in inputs]}

	# Export to ONNX
	# Creates the MLOnnx pipeline instance (imported from txtai.pipeline);
	# the export call itself is not visible in this snippet
	onnx = MLOnnx()

davidmezzetti / txtai-simple-test.py

Created October 20, 2021 15:47

	import time

	from txtai.pipeline import Labels

	# One Labels pipeline per model type, both created with dynamic=False

	# Built from the result of train(...) on the "train" split; train and ds are defined outside this snippet
	tflabels = Labels(
	    train("microsoft/xtremedistil-l6-h384-uncased", ds["train"], logging_steps=2000),
	    dynamic=False,
	)

	# Built from an existing (model, tokenizer) pair defined outside this snippet
	thlabels = Labels((model, tokenizer), dynamic=False)

	# Map class ids 0-5 to readable label names on the underlying model config
	thlabels.pipeline.model.config.id2label = dict(
	    enumerate(["sadness", "joy", "love", "anger", "fear", "surprise"])
	)

davidmezzetti / txtai-workflow-options.py

Created November 15, 2021 13:55

	# In Python
	from txtai.pipeline import Summary, Textractor
	from txtai.workflow import UrlTask, Task, Workflow

	# Pipelines used by the workflow tasks
	textract = Textractor(
	    paragraphs=True,
	    minlength=100,
	    join=True,
	)
	summary = Summary()

	# Two tasks in sequence: a UrlTask running textract, then a Task running summary
	workflow = Workflow(
	    [
	        UrlTask(textract),
	        Task(summary),
	    ]
	)

	# Run the workflow on a single URL and print the collected results
	print(list(workflow(["https://github.com/neuml/txtai"])))

davidmezzetti / txtai-content-storage.py

Created January 23, 2022 12:03

	from txtai.embeddings import Embeddings

	# Sample headlines to index, one entry per line
	data = [
	    "US tops 5 million confirmed virus cases",
	    "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
	    "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
	    "The National Park Service warns against sacrificing slower friends in a bear attack",
	    "Maine man wins $1M from $25 lottery ticket",
	    "Make huge profits without work, earn up to $100,000 a day",
	]

	# Create embeddings index with content enabled. The default behavior is to only store indexed vectors.

davidmezzetti / txtai-query-sql.py

Created January 23, 2022 12:11

	# Index every text as (id, metadata dict, None); the metadata carries the text and its character length.
	# embeddings and data are defined outside this snippet.
	embeddings.index(
	    [
	        (position, {"text": entry, "length": len(entry)}, None)
	        for position, entry in enumerate(data)
	    ]
	)

	# Similarity query restricted to results scoring at least 0.15
	print(
	    embeddings.search(
	        "select text, score from txtai where similar('hiking danger') and score >= 0.15"
	    )
	)

	# Similarity query additionally filtered on the stored 'length' metadata field
	print(
	    embeddings.search(
	        "select text, length, score from txtai where similar('feel good story') and score >= 0.05 and length >= 40"
	    )
	)

	# Run aggregate queries

davidmezzetti / txtai-object-storage.py

Created January 23, 2022 12:16

	# Import the submodule explicitly: a bare "import urllib" does not load urllib.request,
	# so urllib.request.urlopen only works if some other module happened to import it first
	import urllib.request

	from IPython.display import Image

	# Get an image; the context manager closes the HTTP response once its bytes have been read
	with urllib.request.urlopen("https://raw.githubusercontent.com/neuml/txtai/master/demo.gif") as request:
	    # Upsert new record having both text and an object
	    # (embeddings is defined outside this snippet)
	    embeddings.upsert([("txtai", {"text": "txtai executes machine-learning workflows to transform data and build AI-powered semantic search applications.", "object": request.read()}, None)])