Examples with Redis Vector and ChromaDB
Embeddings via ru-en-RoSBERTa https://huggingface.co/ai-forever/ru-en-RoSBERTa
embeddings_ru_en_rosberta.py:

import torch
from langchain_core.embeddings import Embeddings
from sentence_transformers import SentenceTransformer


class RuEnRoSBERTaEmbeddings(Embeddings):
    def __init__(
        self,
        model_name: str = "ai-forever/ru-en-RoSBERTa",
        device: str | None = None,
        use_prompt_name: bool = True,  # set to False if your sentence-transformers < 2.4.0
        normalize: bool = True,
        batch_size: int = 32,
    ):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model = SentenceTransformer(model_name, device=self.device)
        self.use_prompt_name = use_prompt_name
        self.normalize = normalize
        self.batch_size = batch_size

    def _encode(self, texts: list[str], prompt_name: str) -> list[list[float]]:
        kwargs = {
            "batch_size": self.batch_size,
            "convert_to_numpy": True,
        }
        if self.normalize:
            kwargs["normalize_embeddings"] = True
        if self.use_prompt_name:
            # Requires sentence-transformers >= 2.4.0
            kwargs["prompt_name"] = prompt_name
            encoded = self.model.encode(texts, **kwargs)
        else:
            # Fallback: manually prepend prefixes
            prefix = {
                "search_query": "search_query: ",
                "search_document": "search_document: ",
            }[prompt_name]
            prefixed = [prefix + t for t in texts]
            encoded = self.model.encode(prefixed, **kwargs)
        return encoded.tolist()

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        # Use "search_document" for indexing documents
        return self._encode(texts, prompt_name="search_document")

    def embed_query(self, text: str) -> list[float]:
        # Use "search_query" for user questions
        return self._encode([text], prompt_name="search_query")[0]
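A quick way to sanity-check the wrapper (a minimal sketch; the example strings are invented): embed_query and embed_documents apply the asymmetric search_query / search_document prefixes, and because normalize=True yields unit-length vectors, a plain dot product is the cosine similarity.

if __name__ == "__main__":
    emb = RuEnRoSBERTaEmbeddings()
    query_vec = emb.embed_query("Какой плащ был у Понтия Пилата?")
    doc_vecs = emb.embed_documents([
        "В белом плаще с кровавым подбоем...",
        "Погода в Москве стояла жаркая.",
    ])
    # With normalize=True the vectors are unit-length, so a dot product is cosine similarity.
    scores = [sum(q * d for q, d in zip(query_vec, doc)) for doc in doc_vecs]
    print(scores)  # the first (relevant) document should score higher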
pyproject.toml:

[project]
name = "gigachain-chat-qa-rag"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "chromadb>=1.0.20",
    "dotenv>=0.9.9",
    "ipywidgets>=8.1.7",
    "jupyter>=1.1.1",
    "langchain>=0.3.27",
    "langchain-chroma>=0.2.5",
    "langchain-community>=0.3.29",
    "langchain-gigachat>=0.3.12",
    "langchain-openai>=0.3.32",
    "langchain-redis>=0.2.3",
    "notebook>=7.4.5",
    "python-dotenv>=1.1.1",
    "sentence-transformers>=5.1.0",
    "torch>=2.8.0",
    "torchvision>=0.23.0",
    "transformers>=4.56.0",
]
Notebook (cells and outputs):

In [4]:
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv())

Out[4]:
True

In [5]:
import logging

logging.getLogger("httpx").setLevel(logging.WARNING)

In [6]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI()
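The project also pulls in langchain-gigachat, so a GigaChat model could be dropped in here instead of ChatOpenAI. This is a minimal sketch, not a cell from the notebook; it assumes credentials are supplied via the GIGACHAT_CREDENTIALS environment variable and leaves the model name at its default.

# Alternative (sketch): GigaChat via langchain-gigachat instead of ChatOpenAI
from langchain_gigachat.chat_models import GigaChat

llm = GigaChat(
    verify_ssl_certs=False,  # commonly disabled when calling the public GigaChat endpoint
)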
In [7]:
from langchain.schema import HumanMessage

question = "Какой плащ был у Понтия Пилата?"
llm.invoke([HumanMessage(content=question)]).content[0:200]

Out[7]:
'По свидетельству Евангелий, у Понтия Пилата был пурпурный плащ.'

In [8]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = TextLoader("./sample_data/мастер_и_маргарита.txt")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
documents = text_splitter.split_documents(documents)
print(f"Total documents: {len(documents)}")

stdout:
Total documents: 91

In [9]:
from embeddings_ru_en_rosberta import RuEnRoSBERTaEmbeddings

embeddings = RuEnRoSBERTaEmbeddings(
    model_name="ai-forever/ru-en-RoSBERTa",
    use_prompt_name=True,
)

stderr:
Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ru-en-RoSBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [10]:
from chromadb.config import Settings as ChromadbSettings
from langchain_chroma import Chroma

chroma_db = Chroma.from_documents(
    documents,
    embeddings,
    client_settings=ChromadbSettings(
        anonymized_telemetry=False,
        persist_directory="./chroma-db",
    ),
)

stderr:
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
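The title (and the langchain-redis dependency) also covers Redis Vector, so the same documents and embeddings could be indexed in Redis instead of Chroma. The following is a minimal sketch, not a cell from the notebook: it assumes a Redis Stack instance at redis://localhost:6379, and the index name and keyword-argument style are assumptions about the langchain-redis RedisVectorStore API.

# Sketch: the same corpus in Redis instead of Chroma (assumes a local Redis Stack instance)
from langchain_redis import RedisVectorStore

redis_db = RedisVectorStore.from_documents(
    documents,
    embeddings,
    redis_url="redis://localhost:6379",  # assumed local Redis Stack
    index_name="master_i_margarita",     # hypothetical index name
)
redis_docs = redis_db.similarity_search(question, k=4)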
In [11]:
docs = chroma_db.similarity_search(question, k=4)
len(docs)

Out[11]:
4

In [12]:
print(f"... {str(docs[0])[620:800]} ...")

stdout:
... ему-то пропал: – Все просто: в белом плаще...


Глава 2

Понтий Пилат

В белом плаще с кровавым подбоем, шаркающей кавалерийской походкой, ранним утром четырнадцатого числа весенн ...
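Slicing str(docs[0]) works, but it is usually clearer to read the LangChain Document fields directly; a small sketch:

# Sketch: inspect the top hit via its Document attributes rather than slicing str(doc)
top = docs[0]
print(top.metadata)            # e.g. the source file path of the chunk
print(top.page_content[:200])  # first 200 characters of the chunk text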
In [13]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=chroma_db.as_retriever(),
)

qa_chain.invoke({"query": question})

Out[13]:
{'query': 'Какой плащ был у Понтия Пилата?',
 'result': 'Понтий Пилат был одет в белый плащ с кровавым подбоем.'}
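RetrievalQA is one of the legacy chains; the same retrieve-then-answer flow can also be written with the newer helper chains. A sketch of that alternative (the prompt wording is an assumption, not taken from the notebook):

# Sketch: equivalent RAG flow with create_retrieval_chain instead of the legacy RetrievalQA
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer the question using only the following context:\n\n{context}"),
    ("human", "{input}"),
])
rag_chain = create_retrieval_chain(
    chroma_db.as_retriever(),
    create_stuff_documents_chain(llm, prompt),
)
rag_chain.invoke({"input": question})  # returns a dict with "input", "context", "answer"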
In [14]:
qa_chain.invoke({"query": "Какая трость была у Воланда?"})

Out[14]:
{'query': 'Какая трость была у Воланда?',
 'result': 'В романе "Мастер и Маргарита" Михаила Булгакова, у Воланда трость с черным набалдашником в виде головы пуделя.'}

In [15]:
qa_chain.invoke({"query": "что не смогли купить герои романа на Патриарших?"})

Out[15]:
{'query': 'что не смогли купить герои романа на Патриарших?',
 'result': 'Герои романа "Мастер и Маргарита" не смогли купить ни нарзану, ни пива на Патриарших прудах.'}

(Notebook kernel: Python 3 (ipykernel), Python 3.13.2; nbformat 4.5.)