Examples with Redis Vector and ChromaDB
Embeddings via ru-en-RoSBERTa https://huggingface.co/ai-forever/ru-en-RoSBERTa
Examples with Redis Vector and ChromaDB
Embeddings via ru-en-RoSBERTa https://huggingface.co/ai-forever/ru-en-RoSBERTa
| import torch | |
| from langchain_core.embeddings import Embeddings | |
| from sentence_transformers import SentenceTransformer | |
class RuEnRoSBERTaEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a SentenceTransformer model.

    Documents are encoded with the "search_document" prompt and queries with
    the "search_query" prompt. The prompt is applied either through the
    ``prompt_name`` argument of ``SentenceTransformer.encode`` or, when
    ``use_prompt_name`` is false, by prepending the textual prefix by hand.
    """

    def __init__(
        self,
        model_name: str = "ai-forever/ru-en-RoSBERTa",
        device: str | None = None,
        use_prompt_name: bool = True,  # set to False if your sentence-transformers < 2.4.0
        normalize: bool = True,
        batch_size: int = 32,
    ):
        """Load the model and remember the encoding options.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.
            device: Device to load the model on. When falsy, "cuda" is chosen
                if ``torch.cuda.is_available()`` and "cpu" otherwise.
            use_prompt_name: Pass the prompt via ``prompt_name`` (True) or
                prepend the prefix to every text manually (False).
            normalize: When true, ``normalize_embeddings=True`` is passed to
                ``encode``.
            batch_size: Batch size passed to ``encode``.
        """
        if not device:
            # Only probe CUDA when the caller did not pick a device.
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.model = SentenceTransformer(model_name, device=self.device)
        self.use_prompt_name = use_prompt_name
        self.normalize = normalize
        self.batch_size = batch_size

    def _encode(self, texts: list[str], prompt_name: str) -> list[list[float]]:
        """Encode ``texts`` under the given prompt and return nested lists.

        Args:
            texts: Texts to embed.
            prompt_name: Either "search_query" or "search_document".

        Returns:
            The result of ``model.encode`` converted with ``.tolist()``.

        Raises:
            KeyError: In manual-prefix mode, if ``prompt_name`` is not one of
                the two known prompts.
        """
        options = {
            "batch_size": self.batch_size,
            "convert_to_numpy": True,
        }
        if self.normalize:
            options["normalize_embeddings"] = True

        if self.use_prompt_name:
            # Requires sentence-transformers >= 2.4.0
            options["prompt_name"] = prompt_name
            inputs = texts
        else:
            # Fallback: manually prepend prefixes
            known_prefixes = {
                "search_query": "search_query: ",
                "search_document": "search_document: ",
            }
            marker = known_prefixes[prompt_name]
            inputs = [marker + text for text in texts]

        vectors = self.model.encode(inputs, **options)
        return vectors.tolist()

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed texts that are being indexed, using the "search_document" prompt."""
        return self._encode(texts, prompt_name="search_document")

    def embed_query(self, text: str) -> list[float]:
        """Embed a single user question, using the "search_query" prompt."""
        query_vectors = self._encode([text], prompt_name="search_query")
        return query_vectors[0]
| [project] | |
| name = "gigachain-chat-qa-rag" | |
| version = "0.1.0" | |
| description = "RAG question-answering examples with Redis Vector and ChromaDB using ru-en-RoSBERTa embeddings" | |
| readme = "README.md" | |
| requires-python = ">=3.13" | |
| dependencies = [ | |
| "chromadb>=1.0.20", | |
| "dotenv>=0.9.9", | |
| "ipywidgets>=8.1.7", | |
| "jupyter>=1.1.1", | |
| "langchain>=0.3.27", | |
| "langchain-chroma>=0.2.5", | |
| "langchain-community>=0.3.29", | |
| "langchain-gigachat>=0.3.12", | |
| "langchain-openai>=0.3.32", | |
| "langchain-redis>=0.2.3", | |
| "notebook>=7.4.5", | |
| "python-dotenv>=1.1.1", | |
| "sentence-transformers>=5.1.0", | |
| "torch>=2.8.0", | |
| "torchvision>=0.23.0", | |
| "transformers>=4.56.0", | |
| ] |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "initial_id", | |
| "metadata": { | |
| "jupyter": { | |
| "is_executing": true | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "from dotenv import find_dotenv, load_dotenv\n", | |
| "\n", | |
| "load_dotenv(find_dotenv())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "241c3f043c50f368", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2025-09-02T13:56:52.234697Z", | |
| "start_time": "2025-09-02T13:56:52.233342Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import logging\n", | |
| "\n", | |
| "logging.getLogger(\"httpx\").setLevel(logging.WARNING)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "a68f1ba58ed7e1ea", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2025-09-02T13:56:52.831505Z", | |
| "start_time": "2025-09-02T13:56:52.236089Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from langchain_openai import ChatOpenAI\n", | |
| "\n", | |
| "llm = ChatOpenAI()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "71f815eecb77bc6", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2025-09-02T13:56:54.842698Z", | |
| "start_time": "2025-09-02T13:56:52.832340Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'По легенде, на картине известного итальянского художника Тициана \"Экспозиция Откровенного Тела Иисуса (Из детища)\" Понтий Пилат изображен в длинном красном плаще с меховым воротником. Однако точно неи'" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "from langchain.schema import HumanMessage\n", | |
| "\n", | |
| "question = \"Какой плащ был у Понтия Пилата?\"\n", | |
| "llm.invoke([HumanMessage(content=question)]).content[0:200]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "a1277e507bd675f5", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2025-09-02T13:56:54.863509Z", | |
| "start_time": "2025-09-02T13:56:54.843878Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Total documents: 91\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from langchain_community.document_loaders import TextLoader\n", | |
| "from langchain.text_splitter import (\n", | |
| " RecursiveCharacterTextSplitter,\n", | |
| ")\n", | |
| "\n", | |
| "loader = TextLoader(\"./sample_data/мастер_и_маргарита.txt\")\n", | |
| "documents = loader.load()\n", | |
| "text_splitter = RecursiveCharacterTextSplitter(\n", | |
| " chunk_size=1000,\n", | |
| " chunk_overlap=200,\n", | |
| ")\n", | |
| "documents = text_splitter.split_documents(documents)\n", | |
| "print(f\"Total documents: {len(documents)}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "9b18a0f34531feb0", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2025-09-02T13:57:01.102908Z", | |
| "start_time": "2025-09-02T13:56:54.865645Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ru-en-RoSBERTa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']\n", | |
| "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from embeddings_ru_en_rosberta import RuEnRoSBERTaEmbeddings\n", | |
| "\n", | |
| "embeddings = RuEnRoSBERTaEmbeddings(\n", | |
| " model_name=\"ai-forever/ru-en-RoSBERTa\",\n", | |
| " use_prompt_name=True,\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "5d8f49398c6d0fd0", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2025-09-02T13:57:16.600433Z", | |
| "start_time": "2025-09-02T13:57:01.103500Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", | |
| "To disable this warning, you can either:\n", | |
| "\t- Avoid using `tokenizers` before the fork if possible\n", | |
| "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "eb97ff4c1d3643d89d8655ad7f341764", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "Batches: 0%| | 0/1 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "17:07:56 redisvl.index.index INFO Index already exists, not overwriting.\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "aa3b26134db447fba437d256a35ab161", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "Batches: 0%| | 0/3 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "from langchain_redis import RedisVectorStore, RedisConfig\n", | |
| "\n", | |
| "config = RedisConfig(\n", | |
| " index_name=\"Master_and_Margarita\",\n", | |
| ")\n", | |
| "\n", | |
| "vector_store = RedisVectorStore.from_documents(\n", | |
| " documents=documents,\n", | |
| " embedding=embeddings,\n", | |
| " config=config,\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "9e8d728ad7df77da", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2025-09-02T13:57:16.722800Z", | |
| "start_time": "2025-09-02T13:57:16.601191Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "a6eb0a3ec13d411b8d059c3feb7e0a36", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "Batches: 0%| | 0/1 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "3" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "docs = vector_store.similarity_search(question, k=3)\n", | |
| "len(docs)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "1339e3f8d207924d", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2025-09-02T13:57:16.725370Z", | |
| "start_time": "2025-09-02T13:57:16.723385Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "... ему-то пропал: – Все просто: в белом плаще...\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "Глава 2\n", | |
| "\n", | |
| "Понтий Пилат\n", | |
| "\n", | |
| "В белом плаще с кровавым подбоем, шаркающей кавалерийской походкой, ранним утром четырнадцатого числа весенн ...\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(f\"... {str(docs[0])[620:800]} ...\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "fc507d167f3ba66f", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2025-09-02T13:57:18.512847Z", | |
| "start_time": "2025-09-02T13:57:16.726017Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "92faeb8c294c46d89459c4940890fcef", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "Batches: 0%| | 0/1 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'query': 'Какой плащ был у Понтия Пилата?',\n", | |
| " 'result': 'Понтий Пилат был одет в белый плащ с кровавым подбоем.'}" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "from langchain.chains import RetrievalQA\n", | |
| "\n", | |
| "qa_chain = RetrievalQA.from_chain_type(\n", | |
| " llm,\n", | |
| " retriever=vector_store.as_retriever(), \n", | |
| ")\n", | |
| "\n", | |
| "qa_chain.invoke({\"query\": question})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "e62199724cd293ed", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2025-09-02T13:57:20.089560Z", | |
| "start_time": "2025-09-02T13:57:18.513966Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "acd8467b50cc40ba9dc51f57e1962a29", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "Batches: 0%| | 0/1 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'query': 'Какая трость была у Воланда?',\n", | |
| " 'result': 'У Воланда в руках была трость с черным набалдашником в виде головы пуделя.'}" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "qa_chain.invoke({\"query\": \"Какая трость была у Воланда?\"})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "c6450392da04e409", | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2025-09-02T13:57:21.780655Z", | |
| "start_time": "2025-09-02T13:57:20.094606Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "8cc070ce89e34d21afa8374a04601fd5", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "Batches: 0%| | 0/1 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'query': 'что не смогли купить герои романа на Патриарших?',\n", | |
| " 'result': 'Герои романа не смогли купить на Патриарших алкоголь, так как им сказали, что пиво будет привезено к вечеру, а нарзану, который запросил Берлиоз, не оказалось. Вместо этого им предложили абрикосовую, но она была теплой.'}" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "qa_chain.invoke({\"query\": \"что не смогли купить герои романа на Патриарших?\"})" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.13.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |