David Mezzetti (davidmezzetti) / GitHub Gists
from txtai.pipeline import Transcription, Translation
from txtai.workflow import FileTask, Task, Workflow

# Transcription instance
transcribe = Transcription("facebook/wav2vec2-large-960h")

# Translation instance
translate = Translation()

# Transcribe .wav files, then translate the text to French (illustrative completion of the truncated list)
tasks = [
    FileTask(transcribe, r"\.wav$"),
    Task(lambda x: translate(x, "fr"))
]
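The preview ends at the task list; a plausible continuation runs these tasks as a workflow over audio files (the filenames below are hypothetical):

# Workflow that transcribes audio, then translates the text to French
workflow = Workflow(tasks)

# Run the workflow over hypothetical audio files
for result in workflow(["speech1.wav", "speech2.wav"]):
    print(result)
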
# Run initial query (assumes the embeddings index and data list built in the indexing snippet below)
uid = embeddings.search("feel good story", 1)[0][0]
print("Initial: ", data[uid])
# Update data
data[0] = "See it: baby panda born"
embeddings.upsert([(0, data[0], None)])
uid = embeddings.search("feel good story", 1)[0][0]
print("After update: ", data[uid])

# pip install txtai
from txtai.embeddings import Embeddings
import requests

# Embeddings with sentence-transformers backend
embeddings = Embeddings({"method": "transformers", "path": "sentence-transformers/paraphrase-mpnet-base-v2"})

# Get titles from the Hacker News front page via the Algolia API
data = [x["title"] for x in requests.get("https://hn.algolia.com/api/v1/search?tags=front_page").json()["hits"]]
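The search/upsert snippet above assumes these titles are already indexed; a minimal sketch using txtai's (id, text, tags) tuples:

# Build the index referenced by the search/upsert snippet above
embeddings.index([(uid, text, None) for uid, text in enumerate(data)])

# Run a search
uid = embeddings.search("feel good story", 1)[0][0]
print(data[uid])
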
# pip install txtai
from txtai.pipeline import Textractor

# Extract text from document into a single string
textractor = Textractor()
textractor("article.pdf")

# Extract text from document as a list of sentences
textractor = Textractor(sentences=True)
textractor("article.pdf")
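Textractor also exposes paragraph-level splitting; a small sketch, assuming the paragraphs option from txtai's pipeline documentation (not shown in the original snippet):

# Extract text from document as a list of paragraphs
textractor = Textractor(paragraphs=True)
textractor("article.pdf")
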
from datasets import load_dataset
from txtai.pipeline import HFTrainer, Labels
# Load 500 GLUE sst2 sentiment records (remove .select to train on full sst2 dataset)
ds = load_dataset("glue", "sst2")["train"].select(range(500)).flatten_indices()
# Train sentiment classifier with txtai
trainer = HFTrainer()
model, tokenizer = trainer("bert-base-uncased", ds, columns=("sentence", "label"))
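The otherwise unused Labels import suggests the gist continues by classifying text with the trained model; a minimal sketch, assuming Labels accepts a (model, tokenizer) tuple as in txtai's training examples:

# Run the trained sentiment classifier
labels = Labels((model, tokenizer), dynamic=False)
labels(["I am happy", "I am mad"])
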
from transformers import pipeline
from txtai.embeddings import Embeddings
from txtai.models import OnnxModel
from txtai.pipeline import HFOnnx, Labels
# Export model to classify.onnx
path = "distilbert-base-uncased-finetuned-sst-2-english"
onnx = HFOnnx()
model = onnx(path, "text-classification", "classify.onnx")
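The extra imports (Labels, pipeline, Embeddings, OnnxModel) suggest the gist goes on to run the exported model; a sketch assuming the pattern from txtai's ONNX example, where the exported model and the original tokenizer path are passed to Labels:

# Run the exported model, reusing the original model's tokenizer
labels = Labels((model, path), dynamic=False)
labels(["I am happy", "I am mad"])
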
from transformers import pipeline
from txtai.pipeline import HFTrainer
# Training data
data = [
    {"question": "What ingredient?", "context": "Pour 1 can whole tomatoes", "answers": "tomatoes"},
    {"question": "What ingredient?", "context": "Dice 1 yellow onion", "answers": "onion"},
    {"question": "What ingredient?", "context": "Cut 1 red pepper", "answers": "pepper"},
    {"question": "What ingredient?", "context": "Peel and dice 1 clove garlic", "answers": "garlic"},
    {"question": "What ingredient?", "context": "Put 1/2 lb beef", "answers": "beef"}
]
from txtai.pipeline import Similarity
# Use default sentiment analysis model
similarity = Similarity(dynamic=False)
# Query with label text
similarity("positive", ["I am happy", "I am mad"])
# Query with label id
similarity("1", ["I am happy", "I am mad"])
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from txtai.models import Models
from txtai.pipeline import MLOnnx
from transformers import pipeline
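This preview ends at the imports; a minimal sketch of a plausible continuation, training a scikit-learn text classifier and exporting it with MLOnnx (the dataset slice and hyperparameters are assumptions):

# Train a TF-IDF + logistic regression sentiment model on a slice of sst2
ds = load_dataset("glue", "sst2")["train"]
classifier = Pipeline([("tfidf", TfidfVectorizer()), ("lr", LogisticRegression(max_iter=250))])
classifier.fit(ds["sentence"][:1000], ds["label"][:1000])

# Export the scikit-learn pipeline to ONNX
onnx = MLOnnx()
model = onnx(classifier)
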
import os
import random
import torch
import numpy as np
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import AutoConfig, AutoTokenizer
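The final preview is imports only; they point at reproducibility seeding and Hugging Face model setup, so here is a hedged sketch built from just those imports (the model path is hypothetical):

def set_seed(seed=42):
    # Seed every RNG source used in training for reproducibility
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

set_seed()

# Load configuration and tokenizer for a hypothetical base model
config = AutoConfig.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")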