Created
April 9, 2023 11:37
-
-
Save xinqiu/521c383665e24a3de01a5cd799d5aca5 to your computer and use it in GitHub Desktop.
Creating a QA bot over private data, built entirely with open-source LLM projects (LangChain + llama.cpp)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from langchain import PromptTemplate, LLMChain | |
from langchain.document_loaders import UnstructuredHTMLLoader | |
from langchain.embeddings import LlamaCppEmbeddings | |
from langchain.llms import LlamaCpp | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores.faiss import FAISS | |
# Load the LangChain HTML docs that will serve as the private knowledge base.
loader = UnstructuredHTMLLoader("langchain/docs/_build/html/index.html")
# The same quantized llama.cpp model file is used both for embeddings and for
# text generation below.
embedding = LlamaCppEmbeddings(model_path="path/models/ggml-model-q4_0.bin")
llm = LlamaCpp(model_path="path/models/ggml-model-q4_0.bin")
def split_chunks(sources: list) -> list:
    """Split loaded documents into small overlapping chunks for embedding.

    Args:
        sources: A list of LangChain ``Document`` objects (e.g. from a loader).

    Returns:
        A list of chunked ``Document`` objects, each roughly 256 characters
        long with a 16-character overlap between consecutive chunks.
    """
    # NOTE(review): RecursiveCharacterTextSplitter normally takes a
    # ``separators`` *list*; the ``separator=""`` keyword here looks like it
    # was meant for CharacterTextSplitter — confirm against the installed
    # langchain version.
    splitter = RecursiveCharacterTextSplitter(separator="", chunk_size=256, chunk_overlap=16)
    # split_documents already returns the chunks; no manual append loop needed.
    return list(splitter.split_documents(sources))
def generate_embedding(chunks: list):
    """Build a FAISS similarity-search index from document chunks.

    Embeds each chunk's text with the module-level llama.cpp embedding model
    and stores each chunk's metadata alongside its vector.

    Args:
        chunks: Chunked ``Document`` objects (as produced by ``split_chunks``).

    Returns:
        A FAISS vector store ready for similarity search.
    """
    contents = [chunk.page_content for chunk in chunks]
    metas = [chunk.metadata for chunk in chunks]
    return FAISS.from_texts(contents, embedding, metadatas=metas)
def similarity_search(query: str, index: FAISS, k: int = 4) -> tuple[list, list]:
    """Retrieve the documents most similar to *query* from a FAISS index.

    Args:
        query: The natural-language question to search for.
        index: The FAISS vector store to query.
        k: Number of nearest documents to return (default 4, preserving the
            previously hard-coded value).

    Returns:
        A ``(matched_docs, sources)`` tuple: the raw matched documents, and a
        parallel list of ``{"page_content", "metadata"}`` dicts suitable for
        serializing or citing sources.
    """
    matched_docs = index.similarity_search(query, k=k)
    # Flatten each document into a plain dict so callers can serialize it.
    sources = [
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in matched_docs
    ]
    return matched_docs, sources
# Build the retrieval pipeline: load -> chunk -> embed/index.
docs = loader.load()
chunks = split_chunks(docs)
embeddings = generate_embedding(chunks)  # NOTE: this is a FAISS index, despite the name
# Retrieve the chunks most relevant to the question.
question = "What are the use cases of LangChain?"
matched_docs, sources = similarity_search(question, embeddings)
# Prompt skeleton: {context} is pre-filled below via .partial(), leaving only
# {question} to be supplied when the chain runs.
template = """
Please use the following context to answer questions.
Context: {context}
---
Question: {question}
Answer: Let's think step by step."""

# Concatenate the retrieved chunks into a single context string.
context_text = "\n".join(doc.page_content for doc in matched_docs)

qa_prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
).partial(context=context_text)

# Run the local llama.cpp model over the filled-in prompt and print the answer.
print(LLMChain(prompt=qa_prompt, llm=llm).run(question))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment