David Mezzetti (davidmezzetti)

View GitHub Profile
import pandas as pd
from txtai import Embeddings
# Load Embeddings index
embeddings = Embeddings()
embeddings.load(provider="huggingface-hub", container="neuml/txtai-arxiv")
# Results as a DataFrame
pd.DataFrame(embeddings.search("Intelligent life beyond our planet", 5))
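
When an index stores content, search also accepts SQL. A hedged example against the same index:

# Filter results with a SQL query
embeddings.search("SELECT id, text, score FROM txtai WHERE similar('intelligent life beyond our planet') LIMIT 5")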

Original URL: https://dev.to/neuml/how-rag-with-txtai-works-4lkh

from txtai.pipeline import Textractor

# Pass a browser user-agent since some sites block default Python clients
textractor = Textractor(headers={"user-agent": "Mozilla/5.0"})
textractor("https://dev.to/neuml/how-rag-with-txtai-works-4lkh")

Extracted page title: How RAG with txtai works - DEV Community
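
The article above walks through retrieval augmented generation. A minimal end-to-end sketch, assuming the arxiv index from the first snippet and a placeholder LLM:

from txtai import Embeddings, LLM

# Retrieve context, then generate an answer grounded in it
embeddings = Embeddings()
embeddings.load(provider="huggingface-hub", container="neuml/txtai-arxiv")
llm = LLM("microsoft/Phi-3-mini-4k-instruct")  # assumed model

question = "Is there intelligent life beyond our planet?"
context = "\n".join(x["text"] for x in embeddings.search(question, 3))
print(llm(f"Answer this question using only the context below.\n\nQuestion: {question}\n\nContext: {context}"))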

# CAUTION: PyMuPDF is AGPL-3
# pip install pymupdf4llm
import pymupdf4llm
from glob import glob
from txtai import Embeddings
from txtai.pipeline import Textractor
def stream(dir):
    # Yield Markdown text for each PDF in dir
    # (body is a sketch; the original gist preview truncates here)
    for path in glob(f"{dir}/*.pdf"):
        yield pymupdf4llm.to_markdown(path)
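
The Embeddings import above suggests the extracted Markdown feeds an index. A hypothetical usage of stream, with "docs" as a placeholder directory:

# Index Markdown text extracted from PDFs (hypothetical usage)
embeddings = Embeddings(content=True)
embeddings.index(stream("docs"))
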
from txtai.pipeline import Textractor

textractor = Textractor(sections=True)

# Install [pipeline-data] extra to support extracting text from docx/pdf/xlsx
for section in textractor("https://github.com/neuml/txtai"):
    print(f"\n[SECTION]\n{section}")
import time
from datetime import timedelta
from datasets import load_dataset
from txtai import LLM
from txtai.pipeline import Labels, HFTrainer
def prompt(text):
    # Hypothetical template; the original gist preview truncates mid-string
    text = f"""
Classify the following text as positive or negative. Answer with only one word.

Text: {text}
"""
    return text
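
A minimal sketch of how these imports could fit together: an LLM labels raw text, then HFTrainer fine-tunes a classifier on the results. The model names, dataset and label mapping are all assumptions:

# Label data with an LLM, then train a smaller classifier (sketch)
llm = LLM("microsoft/Phi-3-mini-4k-instruct")  # assumed model
texts = load_dataset("ag_news", split="train[:100]")["text"]  # assumed dataset
data = [{"text": x, "label": 0 if "negative" in llm(prompt(x)).lower() else 1} for x in texts]

start = time.time()
trainer = HFTrainer()
model, tokenizer = trainer("bert-base-uncased", data)
print(f"Training took {timedelta(seconds=time.time() - start)}")

# Run the trained classifier
labels = Labels((model, tokenizer), dynamic=False)
print(labels("I am very happy today"))
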
from txtai.pipeline import Translation
# Load pipeline
translate = Translation()
# Run translations
languages = ["fr", "es", "de", "hi", "ja"]
for language in languages:
    # Translate to the target language and back to English
    text = translate("The sky is blue, the stars are far", language)
    english = translate(text, "en")
    print(f"{language}: {text}")
    print(f"en: {english}")
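
The pipeline also accepts a list of inputs, batching the translations in one call:

# Batch translation (same pipeline, list input)
print(translate(["The sky is blue", "The stars are far"], "fr"))
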
# Install txtai with torch cpu
pip install txtai torch==2.3.1+cpu \
-f https://download.pytorch.org/whl/torch_stable.html
# Install llama.cpp with CUDA support
CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
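
With llama.cpp installed, txtai can run GGUF models directly. The model path below is an assumption; any GGUF file works:

from txtai import LLM

# txtai routes .gguf paths to llama.cpp automatically
llm = LLM("TheBloke/Mistral-7B-OpenOrca-GGUF/mistral-7b-openorca.Q4_K_M.gguf")
print(llm("Where can the stars be seen best at night?"))
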
import time
from txtai.pipeline import LLM, Summary, Textractor
from txtai.workflow import Task, Workflow
# Extract text from HTML, ignore boilerplate text
textractor = Textractor(lines=True, join=True, minlength=100)
text = textractor("https://github.com/neuml/txtai")
# Summarization with standard models
# (completion sketch; the original gist preview truncates here)
summary = Summary()
start = time.time()
print(summary(text))
print(f"Took {time.time() - start:.1f} seconds")
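
Task and Workflow are imported above but unused in the preview. A sketch of how they might chain extraction and summarization:

# Chain the pipelines as a workflow (sketch)
workflow = Workflow([Task(textractor), Task(summary)])
for result in workflow(["https://github.com/neuml/txtai"]):
    print(result)
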
##################################
# Data functions
##################################
import re
from datasets import load_dataset
def clean(text):
    # Collapse newlines and repeated whitespace
    # (return added; the original gist preview truncates here)
    text = text.replace("\n", " ").strip()
    return re.sub(r"\s+", " ", text)
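
Example usage with a Hugging Face dataset; ag_news is a placeholder choice:

ds = load_dataset("ag_news", split="train[:3]")
print([clean(x) for x in ds["text"]])
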
from transformers import AutoTokenizer
from txtai.pipeline import Tokenizer
# Split using built-in Python method
print("Create embeddings for text".split())
print("🚀Create embeddings for text⭐".split())
print("为文本创建嵌入".split())
# Remove stop words
tokenizer = Tokenizer(stopwords=True)
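
A possible continuation comparing both tokenizers on the same strings; bert-base-uncased is an assumed model:

# Tokenize with stop words removed
print(tokenizer("Create embeddings for text"))

# Subword tokenization handles languages without whitespace delimiters
hf = AutoTokenizer.from_pretrained("bert-base-uncased")
print(hf.tokenize("为文本创建嵌入"))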