Creating a private-data QA bot entirely with open-source LLM projects (LangChain + llama.cpp)

Created: April 9, 2023
  
        
  
    
from langchain import PromptTemplate, LLMChain
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings import LlamaCppEmbeddings
from langchain.llms import LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

# Load the LangChain docs page, and point both the embedding model and the
# LLM at a local quantized llama.cpp model file.
loader = UnstructuredHTMLLoader("langchain/docs/_build/html/index.html")
embedding = LlamaCppEmbeddings(model_path="path/models/ggml-model-q4_0.bin")
llm = LlamaCpp(model_path="path/models/ggml-model-q4_0.bin")


def split_chunks(sources: list) -> list:
    """Split the loaded documents into overlapping chunks small enough to embed."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=16)
    return splitter.split_documents(sources)


def generate_embedding(chunks: list) -> FAISS:
    """Embed every chunk and build a FAISS index over the results."""
    texts = [doc.page_content for doc in chunks]
    metadatas = [doc.metadata for doc in chunks]
    return FAISS.from_texts(texts, embedding, metadatas=metadatas)


def similarity_search(query: str, index: FAISS) -> tuple[list, list]:
    """Return the top-k matching documents plus a plain-dict view of them."""
    matched_docs = index.similarity_search(query, k=4)
    sources = [
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in matched_docs
    ]
    return matched_docs, sources


docs = loader.load()
chunks = split_chunks(docs)
index = generate_embedding(chunks)

question = "What are the use cases of LangChain?"
matched_docs, sources = similarity_search(question, index)

# Stuff the retrieved chunks into the prompt as context, then ask the local LLM.
template = """
Please use the following context to answer questions.
Context: {context}
---
Question: {question}
Answer: Let's think step by step."""

context = "\n".join(doc.page_content for doc in matched_docs)
prompt = PromptTemplate(template=template, input_variables=["context", "question"]).partial(context=context)

llm_chain = LLMChain(prompt=prompt, llm=llm)
print(llm_chain.run(question))
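
Embedding with a local llama.cpp model is slow, so if you query the same documents more than once it may be worth persisting the index instead of rebuilding it every run. A minimal sketch using the save_local/load_local helpers on LangChain's FAISS wrapper; the "faiss_index" directory name is an assumption, not part of the original gist:

# Persist the index once it is built, then reload it in a later session
# without re-running the embedding step. "faiss_index" is an assumed path.
index.save_local("faiss_index")
restored = FAISS.load_local("faiss_index", embedding)
matched_docs, sources = similarity_search(question, restored)

Note that vectors are only meaningful to the model that produced them, so the reloaded index must be paired with the same ggml-model-q4_0.bin used to build it.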
  