{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Talk to a text using LangChain, GPT4All and FAISS"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"ExecuteTime": {
"end_time": "2023-07-06T00:18:27.054516Z",
"start_time": "2023-07-06T00:18:27.052700Z"
}
},
"outputs": [],
"source": [
"import os\n",
"from pathlib import Path\n",
"\n",
"from langchain.callbacks.manager import CallbackManager\n",
"from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
"from langchain.llms import LlamaCpp"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Initialize LLM"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])\n",
"model_path = os.path.join(str(Path.home()), \".cache\", \"gpt4all\", \"orca-mini-3b.ggmlv3.q4_0.bin\").replace(\"\\\\\", \"\\\\\\\\\")\n",
"llm = LlamaCpp(model_path=model_path, callback_manager=callback_manager, verbose=False)"
]
},
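{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional smoke test, the model can be called directly with a prompt string; this only confirms that the weights loaded and that the streaming callback prints tokens to stdout. It assumes the orca-mini weights are already downloaded to the path above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional smoke test: tokens stream to stdout via the callback handler.\n",
"llm(\"Q: What is a vector store? A:\")"
]
},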
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Embeddings"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"ExecuteTime": {
"end_time": "2023-07-06T00:07:52.099206Z",
"start_time": "2023-07-06T00:07:51.855485Z"
}
},
"outputs": [],
"source": [
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"\n",
"embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')"
]
},
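{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, embed a short query and inspect the result: `embed_query` returns a plain Python list of floats, 384 of them for all-MiniLM-L6-v2."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Embed one query string; the result is a plain list of floats.\n",
"vector = embeddings.embed_query(\"What is a vector store?\")\n",
"len(vector)  # 384 for all-MiniLM-L6-v2"
]
},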
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Document Loader"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"ExecuteTime": {
"end_time": "2023-07-06T00:09:51.689269Z",
"start_time": "2023-07-06T00:09:51.370223Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Created a chunk of size 862, which is longer than the specified 500\n",
"Created a chunk of size 674, which is longer than the specified 500\n"
]
}
],
"source": [
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores import FAISS\n",
"from langchain.document_loaders import TextLoader\n",
"\n",
"loader = TextLoader(\"./data.txt\")\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
"\n",
"db = FAISS.from_documents(docs, embeddings)"
]
},
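{
"cell_type": "markdown",
"metadata": {},
"source": [
"The \"Created a chunk of size ...\" warnings above are expected: `CharacterTextSplitter` only splits on a single separator (two consecutive newlines by default), so any one paragraph longer than `chunk_size` cannot be broken up further. If strict chunk sizes matter, `RecursiveCharacterTextSplitter` falls back through smaller separators until every chunk fits. A sketch of that alternative follows; it uses separate variable names so the cells above are unaffected."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"\n",
"# Default separators \"\\n\\n\", \"\\n\", \" \", \"\" are tried in order until chunks fit.\n",
"recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)\n",
"recursive_docs = recursive_splitter.split_documents(documents)\n",
"max(len(doc.page_content) for doc in recursive_docs)  # should now be <= 500"
]
},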
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Search for similar documents and send them to the LLM with question"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"ExecuteTime": {
"end_time": "2023-07-06T00:26:53.266848Z",
"start_time": "2023-07-06T00:26:46.620748Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Llama.generate: prefix-match hit\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" \n",
"To create a vector store, you need to follow these steps:\n",
"1. Decide on the number of dimensions for your embeddings (k).\n",
"2. Choose a directory on your local machine where you want to save your vector store.\n",
"3. Load the embeddings from your data.\n",
"4. Determine which index you want to use, and save that index as a subdirectory in your chosen directory.\n",
"5. Use the `get_local_vector_store()` function provided by the library to create a new vector store with the saved index. \n",
"6. Finally, call the function with search parameters as an argument, like this:\n",
"```\n",
"new_db = get_local_vector_store(embeddings, index_path)\n",
"return new_db.as_retriever(search_kwargs={\"k\": k})\n",
"```"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"llama_print_timings: load time = 283.81 ms\n",
"llama_print_timings: sample time = 134.72 ms / 188 runs ( 0.72 ms per token, 1395.48 tokens per second)\n",
"llama_print_timings: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second)\n",
"llama_print_timings: eval time = 6114.88 ms / 188 runs ( 32.53 ms per token, 30.74 tokens per second)\n",
"llama_print_timings: total time = 6596.15 ms\n"
]
}
],
"source": [
"from langchain.chains import RetrievalQA\n",
"\n",
"retriever = db.as_retriever(search_kwargs={\"k\": 1})\n",
"qa = RetrievalQA.from_chain_type(\n",
" llm=llm,\n",
" chain_type=\"stuff\",\n",
" retriever=retriever,\n",
" return_source_documents=True\n",
")\n",
"docs = qa(\"how to create a store?\")\n",
"file_paths = [os.path.abspath(s.metadata[\"source\"]) for s in docs['source_documents']]"
]
},
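{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, persist the FAISS index to disk so it does not have to be rebuilt on every run. This is a minimal sketch using the FAISS vector store's built-in `save_local`/`load_local` methods; the `faiss_index` directory name is arbitrary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save the index, then reload it later instead of re-embedding the documents.\n",
"index_path = \"./faiss_index\"  # arbitrary directory name\n",
"db.save_local(index_path)\n",
"\n",
"new_db = FAISS.load_local(index_path, embeddings)\n",
"retriever = new_db.as_retriever(search_kwargs={\"k\": 1})"
]
},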
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 1
}