Skip to content

Instantly share code, notes, and snippets.

View davidmezzetti's full-sized avatar

David Mezzetti davidmezzetti

View GitHub Profile
from datasets import load_dataset
from transformers import AutoTokenizer
from txtai.models import Registry
from txtai.pipeline import HFTrainer, Labels
# Load the "emotion" dataset through the `datasets` library; code further down
# indexes the result by split name (ds["train"]).
ds = load_dataset("emotion")
# Set seed for reproducibility
class SimpleEmbeddings(nn.Module):
    """
    Thin module wrapper around an embeddings layer.

    Calling the module passes `input_ids` to the wrapped layer and returns the
    result as a one-element tuple.
    """

    def __init__(self, embeddings):
        """
        Stores the embeddings layer this module delegates to.

        Args:
            embeddings: callable invoked with `input_ids` in forward
        """

        super().__init__()
        self.embeddings = embeddings

    def forward(self, input_ids=None, **kwargs):
        """
        Runs the wrapped embeddings layer on input_ids.

        Args:
            input_ids: value passed straight through to the wrapped layer
            kwargs: accepted and ignored, so callers can pass extra keyword arguments

        Returns:
            one-element tuple holding the wrapped layer's output
        """

        # Trailing comma is intentional: the result is a tuple, not a bare value
        return (self.embeddings(input_ids),)
from txtai.embeddings import Embeddings
# Sample sentences, one per entry
data = [
    "Glad you found it",
    "Happy to see you",
    "What a cute picture",
    "I'm angry",
    "That's upsetting",
    "That is so troubling",
    "A shocking development right now",
    "Never thought I would see that",
    "Didn't see that coming",
]
# Configure Embeddings with the "pooling" method, passing the wrapped
# model.embedding layer as the path and bert-base-uncased as the tokenizer.
# NOTE(review): `model` is defined outside this excerpt.
embeddings = Embeddings(
    {
        "method": "pooling",
        "path": SimpleEmbeddings(model.embedding),
        "tokenizer": "bert-base-uncased",
    }
)
# Create an index for the list of text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# Define the model to train: TF-IDF features followed by logistic regression.
# NOTE(review): no fit call is visible in this excerpt.
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("lr", LogisticRegression(max_iter=250)),
    ]
)
from txtai.pipeline import MLOnnx, Similarity
def tokenize(inputs, **kwargs):
    """
    Packages raw inputs as an `input_ids` mapping without any real tokenization.

    Args:
        inputs: a single string or an iterable of items
        kwargs: accepted and ignored

    Returns:
        dict with one key, "input_ids", whose value is a list holding one
        single-element list per input
    """

    # A lone string is treated as a batch of one; otherwise it would be split into characters below
    if isinstance(inputs, str):
        inputs = [inputs]

    return {"input_ids": [[x] for x in inputs]}
# Export to ONNX
# Instantiate the MLOnnx pipeline imported from txtai.pipeline; the export call
# itself is not part of this excerpt.
onnx = MLOnnx()
import time
from txtai.pipeline import Labels
# Create labels instances per model type

# tflabels wraps whatever train(...) returns for the training split
tflabels = Labels(
    train(
        "microsoft/xtremedistil-l6-h384-uncased",
        ds["train"],
        logging_steps=2000,
    ),
    dynamic=False,
)

# thlabels wraps an existing (model, tokenizer) pair
thlabels = Labels((model, tokenizer), dynamic=False)

# Attach names to the six class ids (0-5) on the thlabels model config
thlabels.pipeline.model.config.id2label = dict(
    enumerate(("sadness", "joy", "love", "anger", "fear", "surprise"))
)
# In Python
from txtai.pipeline import Summary, Textractor
from txtai.workflow import UrlTask, Task, Workflow
# Pipelines used by the workflow: text extraction and summarization
textract = Textractor(
    paragraphs=True,
    minlength=100,
    join=True,
)
summary = Summary()

# Two tasks in sequence: UrlTask wraps textract, Task wraps summary
workflow = Workflow(
    [
        UrlTask(textract),
        Task(summary),
    ]
)

# Run the workflow on a single URL and materialize the results before printing
print(list(workflow(["https://github.com/neuml/txtai"])))
from txtai.embeddings import Embeddings
# Sample headlines to index, one per entry
data = [
    "US tops 5 million confirmed virus cases",
    "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
    "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
    "The National Park Service warns against sacrificing slower friends in a bear attack",
    "Maine man wins $1M from $25 lottery ticket",
    "Make huge profits without work, earn up to $100,000 a day",
]
# Create embeddings index with content enabled. The default behavior is to only store indexed vectors.
# NOTE(review): the construction of a content-enabled `embeddings` instance is not
# visible in this excerpt; the queries below select stored fields, so confirm it exists.

# Create an index for the list of text. Each row is (id, {"text", "length"}, None),
# with the list position as the id and the character count as "length".
embeddings.index(
    [
        (position, {"text": sentence, "length": len(sentence)}, None)
        for position, sentence in enumerate(data)
    ]
)

# Filter by score
print(
    embeddings.search(
        "select text, score from txtai where similar('hiking danger') and score >= 0.15"
    )
)

# Filter by metadata field 'length'
print(
    embeddings.search(
        "select text, length, score from txtai where similar('feel good story') and score >= 0.05 and length >= 40"
    )
)
# Run aggregate queries
import urllib
from IPython.display import Image
# Get an image
# A bare `import urllib` does not guarantee that the `urllib.request` submodule is
# loaded, so import it explicitly before calling urlopen.
import urllib.request

# The context manager closes the HTTP response once its bytes have been read
with urllib.request.urlopen("https://raw.githubusercontent.com/neuml/txtai/master/demo.gif") as request:
    # Upsert new record having both text and an object
    embeddings.upsert(
        [
            (
                "txtai",
                {
                    "text": "txtai executes machine-learning workflows to transform data and build AI-powered semantic search applications.",
                    "object": request.read(),
                },
                None,
            )
        ]
    )