BVR++ | Reading a large text (a whole book, hundreds of pages!) using Best Vector Representation, LangChain, and LLMs
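The script below implements the approach: load a long text, split it into large overlapping chunks, embed every chunk, cluster the embeddings with K-means, and summarize only the chunks closest to each cluster centroid (plus the first and last chunks) with a local LLM; an Azure OpenAI model then combines those summaries and refines the result into a single concise summary.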
# Inspired by: https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/5%20Levels%20Of%20Summarization%20-%20Novice%20To%20Expert.ipynb
from typing import List

import numpy as np
from pymongo import MongoClient
from sklearn.cluster import KMeans

from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI, LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch
# Local LLM used for the per-chunk summaries (point model_path at your own local model)
llm = LlamaCpp(
    n_ctx=4096,
    temperature=0.1,
    model_path="/Users/fabian/dev/OPENSOURCE/models/GPT4All-13B-snoozy.ggmlv3.q4_1.bin",
)
# Azure OpenAI LLM used for the combine and refine steps (fill in your deployment details)
llm2 = AzureOpenAI(
    deployment_name="",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    temperature=0.1,
)
# Azure OpenAI embeddings used to vectorize the chunks (fill in your deployment details)
azureEmbeddings = OpenAIEmbeddings(
    deployment="",
    model="text-embedding-ada-002",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    openai_api_type="azure",
    chunk_size=1,
)
# Different LLMs, chunk sizes, and chunk counts will affect response quality
# and the number of tokens used.
# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}
# `collection` must be a pymongo collection from your own Atlas cluster
# (fill in your connection string, database, and collection names).
collection = MongoClient("<your-atlas-connection-string>")["<db>"]["<collection>"]
vectorstore = MongoDBAtlasVectorSearch(collection, azureEmbeddings)
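# The `vectorstore` above is built but never used by main(). As a minimal sketch
# (assuming an Atlas vector search index already exists on `collection`; the function
# name and arguments below are only illustrative), the chunks could be stored and queried:
def index_and_query(docs, query, k=4):
    vectorstore.add_documents(docs)  # embeds each chunk with azureEmbeddings and writes it to the collection
    return vectorstore.similarity_search(query, k=k)  # returns the k most similar chunks as Documents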
combine_prompt = """
You will be given a series of summaries from a text.
Your goal is to give a CONCISE summary of what happened in the text.
Think critically and analytically.
\n\n
```{text}```
\n\n
CONCISE SUMMARY:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])
stuff_chain = load_summarize_chain(
    llm=llm2,
    chain_type="stuff",
    prompt=combine_prompt_template,
    verbose=False,  # Set this to true if you want to see the inner workings
)
chunk_prompt = """
You will be given a text enclosed in triple backticks (```)
Your goal is to give a CONCISE summary of what happened in the text.
YOU MUST THINK CRITICALLY AND ANALYTICALLY.
```{text}```
CONCISE SUMMARY:
"""
chunk_prompt_template = PromptTemplate(template=chunk_prompt, input_variables=["text"])
chunk_chain = load_summarize_chain(
    llm=llm,
    chain_type="stuff",
    prompt=chunk_prompt_template,
    verbose=False,  # Set this to true if you want to see the inner workings
)
refine_prompt = """
You will be given a summary of a story enclosed in triple backticks (```)
IMPROVE THE QUALITY OF THE SUMMARY
```{text}```
CONCISE SUMMARY:
"""
refine_prompt_template = PromptTemplate(template=refine_prompt, input_variables=["text"])
refine_chain = load_summarize_chain(
    llm=llm2,
    chain_type="stuff",
    prompt=refine_prompt_template,
    verbose=False,  # Set this to true if you want to see the inner workings
)
def process_documents() -> List[Document]:
    """
    Load the source document and split it into chunks
    """
    print("Loading single document")
    documents = load_single_document("./docs/sample.txt")
    if not documents:
        print("No new documents to load")
        exit(0)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5500, chunk_overlap=500)
    texts = text_splitter.split_documents(documents)
    for count, value in enumerate(texts):
        print(count, value)
    return texts
def load_single_document(file_path: str) -> List[Document]:
    ext = "." + file_path.rsplit(".", 1)[-1]
    if ext in LOADER_MAPPING:
        loader_class, loader_args = LOADER_MAPPING[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()
    raise ValueError(f"Unsupported file extension '{ext}'")
def cluster_cleanup(docs, vectors, num_clusters):
    # Always summarize the very first chunk (the opening of the text)
    print("FIRST CHUNKY CHUNK")
    j = 0
    doc = docs[j]
    chunk_summary = chunk_chain.run([doc])
    print(f"Summary #{j} (chunk numero uno) - Preview: {chunk_summary} \n")
    # Perform K-means clustering on the chunk embeddings
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(vectors)
    # Find the embeddings closest to each cluster centroid
    closest_indices = []
    for i in range(num_clusters):
        # Distances from every chunk embedding to this cluster center
        distances = np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1)
        # Index of the closest chunk (argmin finds the smallest distance)
        closest_index = np.argmin(distances)
        closest_indices.append(closest_index)
    selected_indices = sorted(closest_indices)
    selected_docs = [docs[idx] for idx in selected_indices]
    # Collect the per-chunk summaries, starting with the first chunk's summary
    summary_list = [chunk_summary]
    # Summarize each representative chunk and append it to the list
    for x, doc in enumerate(selected_docs):
        chunk_summary = chunk_chain.run([doc])
        summary_list.append(chunk_summary)
        print(f"Summary #{x} (chunk #{selected_indices[x]}) - Preview: {chunk_summary} \n")
    # Always summarize the very last chunk (the ending of the text)
    print("LAST CHUNKY CHUNK")
    j = len(docs) - 1
    doc = docs[j]
    chunk_summary = chunk_chain.run([doc])
    print(f"Summary #{j} (chunk last) - Preview: {chunk_summary} \n")
    summary_list.append(chunk_summary)
    summaries = "\n".join(summary_list)
    # Convert the combined summaries back into a Document
    summaries = Document(page_content=summaries)
    print(f"Your total summary has {llm.get_num_tokens(summaries.page_content)} tokens")
    print(f"Summaries:{summaries.page_content}")
    return summaries
def main():
    docs = process_documents()
    # Embed every chunk so the chunks can be clustered
    vectors = azureEmbeddings.embed_documents([x.page_content for x in docs])
    # Summarize the representative chunks, combine the summaries, then refine the result
    output = stuff_chain.run([cluster_cleanup(docs, vectors, 4)])
    tmpDoc = Document(page_content=output, metadata={})
    output = refine_chain.run([tmpDoc])
    print("\n\n=====THE RESULT======\n\n")
    print(f"Final answer: {output}")


main()
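The comment inside LOADER_MAPPING invites adding loaders for other file types. As a minimal sketch, assuming the optional pypdf dependency is installed, a PDF entry could reuse LangChain's built-in PyPDFLoader (the mapping key and empty argument dict simply mirror the existing .txt entry):

from langchain.document_loaders import PyPDFLoader

# Hypothetical extra mapping: load_single_document() would then accept .pdf files as well
LOADER_MAPPING[".pdf"] = (PyPDFLoader, {})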
Example output: a concise summary of The Wizard of Oz
Dorothy and her dog Toto are transported to a strange land where they meet the Tin Woodman, the Scarecrow, and the Cowardly Lion. They journey to the Emerald City to meet the Wizard of Oz, who tells them they must kill the Wicked Witch of the West to return home. Dorothy uses a charm to call Winged Monkeys to help her, and they eventually return home with the help of Glinda the Good Witch of the North. Along the way, they encounter talking trees and a china wall, and the Scarecrow, Tin Woodman, and Lion are granted their wishes. Dorothy returns home with the help of the Silver Shoes.