This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datasets import load_dataset | |
from transformers import AutoTokenizer | |
from txtai.models import Registry | |
from txtai.pipeline import HFTrainer, Labels | |
# Load the dataset registered under the name "emotion" via the `datasets` library
ds = load_dataset("emotion")

# Set seed for reproducibility
# NOTE(review): the statement this comment introduces is not part of this excerpt.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class SimpleEmbeddings(nn.Module):
    """Expose a bare embeddings callable through a model-style ``forward``.

    Calling the module with ``input_ids`` applies the wrapped embeddings and
    returns the result wrapped in a one-element tuple.
    """

    def __init__(self, embeddings):
        """Store the embeddings callable.

        Args:
            embeddings: callable applied to ``input_ids`` in ``forward``.
        """
        super().__init__()
        self.embeddings = embeddings

    def forward(self, input_ids=None, **kwargs):
        """Apply the wrapped embeddings to ``input_ids``.

        Args:
            input_ids: value passed straight to ``self.embeddings``.
            **kwargs: accepted for call compatibility and ignored.

        Returns:
            A one-element tuple holding ``self.embeddings(input_ids)``.
        """
        return (self.embeddings(input_ids),)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from txtai.embeddings import Embeddings | |
# Sample sentences used as the text to index
data = [
    "Glad you found it", "Happy to see you", "What a cute picture",
    "I'm angry", "That's upsetting", "That is so troubling",
    "A shocking development right now", "Never thought I would see that", "Didn't see that coming",
]
# Build an Embeddings instance whose "path" is the model's embedding layer wrapped
# in SimpleEmbeddings, tokenized with the "bert-base-uncased" tokenizer
embeddings = Embeddings({
    "method": "pooling",
    "path": SimpleEmbeddings(model.embedding),
    "tokenizer": "bert-base-uncased",
})

# Create an index for the list of text
# NOTE(review): the indexing call this comment introduces is not part of this excerpt.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.pipeline import Pipeline | |
# Train the model
# NOTE(review): only the pipeline is assembled here (a TfidfVectorizer step
# followed by a LogisticRegression step); the fitting call is not part of
# this excerpt.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=250)),
])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from txtai.pipeline import MLOnnx, Similarity | |
def tokenize(inputs, **kwargs):
    """Wrap raw inputs in the ``input_ids`` structure without tokenizing them.

    Args:
        inputs: a single string or an iterable of items.
        **kwargs: accepted for call compatibility and ignored.

    Returns:
        A dict with one key, ``"input_ids"``, mapping to a list in which every
        input item is wrapped in its own single-element list.
    """
    # A lone string is treated as a batch of one, not as a sequence of characters
    if isinstance(inputs, str):
        inputs = [inputs]

    return {"input_ids": [[x] for x in inputs]}
# Export to ONNX
# NOTE(review): only the MLOnnx pipeline is instantiated here; the export call
# itself is not part of this excerpt.
onnx = MLOnnx()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from txtai.pipeline import Labels | |
# Create labels instances per model type

# Labels backed by the result of train(...) run on the training split
tflabels = Labels(
    train("microsoft/xtremedistil-l6-h384-uncased", ds["train"], logging_steps=2000),
    dynamic=False,
)

# Labels backed by an existing (model, tokenizer) pair
thlabels = Labels((model, tokenizer), dynamic=False)

# Attach class-id -> label-name mapping to the second instance's model config
thlabels.pipeline.model.config.id2label = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise",
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# In Python
from txtai.pipeline import Summary, Textractor | |
from txtai.workflow import UrlTask, Task, Workflow | |
# Pipelines used by the workflow: a text extractor and a summarizer
textract = Textractor(paragraphs=True, minlength=100, join=True)
summary = Summary()

# Chain the steps: UrlTask wraps the extractor, Task wraps the summarizer
workflow = Workflow([UrlTask(textract), Task(summary)])

# Run the workflow on a single URL and print the collected results
print(list(workflow(["https://github.com/neuml/txtai"])))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from txtai.embeddings import Embeddings | |
# Sample headlines used as the text to index
data = [
    "US tops 5 million confirmed virus cases",
    "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
    "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
    "The National Park Service warns against sacrificing slower friends in a bear attack",
    "Maine man wins $1M from $25 lottery ticket",
    "Make huge profits without work, earn up to $100,000 a day",
]

# Create embeddings index with content enabled. The default behavior is to only store indexed vectors.
# NOTE(review): the constructor call this comment introduces is not part of this excerpt.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create an index for the list of text
# Each row is (id, {"text": ..., "length": ...}, None); "length" is the character count of the text
embeddings.index([(uid, {"text": text, "length": len(text)}, None) for uid, text in enumerate(data)])

# Filter by score
print(embeddings.search("select text, score from txtai where similar('hiking danger') and score >= 0.15"))

# Filter by metadata field 'length'
print(embeddings.search("select text, length, score from txtai where similar('feel good story') and score >= 0.05 and length >= 40"))

# Run aggregate queries
# NOTE(review): the aggregate query this comment introduces is not part of this excerpt.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib | |
from IPython.display import Image | |
# `import urllib` alone does not guarantee the `request` submodule is loaded,
# so import it explicitly before use
import urllib.request

# Get an image
# The context manager closes the HTTP response once its bytes have been read
with urllib.request.urlopen("https://raw.githubusercontent.com/neuml/txtai/master/demo.gif") as request:
    image_bytes = request.read()

# Upsert new record having both text and an object
embeddings.upsert([("txtai", {"text": "txtai executes machine-learning workflows to transform data and build AI-powered semantic search applications.", "object": image_bytes}, None)])