@ranfysvalle02
Last active June 13, 2023 05:36
BVR++ | Reading a large text (a whole book! hundreds of pages!) Using Best Vector Representation, Langchain, And LLMs
# Inspired by: https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/5%20Levels%20Of%20Summarization%20-%20Novice%20To%20Expert.ipynb
from typing import Any, Dict, List
from multiprocessing import Pool

import numpy as np
from sklearn.cluster import KMeans
from tqdm import tqdm
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI, LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch

tqdm.pandas()
# Local GGML model served via llama.cpp (the model path is machine-specific)
llm = LlamaCpp(
    n_ctx=4096,
    temperature=0.1,
    model_path="/Users/fabian/dev/OPENSOURCE/models/GPT4All-13B-snoozy.ggmlv3.q4_1.bin",
)
llm2 = AzureOpenAI(
    deployment_name="",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    temperature=0.1,
)
azureEmbeddings = OpenAIEmbeddings(
    deployment="",
    model="text-embedding-ada-002",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    openai_api_type="azure",
    chunk_size=1,  # send one text per embedding request, which some Azure OpenAI deployments require
)
# Different LLMs, chunk sizes, and numbers of chunks will affect response quality
# and the number of tokens used.
# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed (see the example below)
}
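# As a hypothetical example of extending the mapping (assuming the `pypdf` package is
# installed), a PDF loader could be registered like this:
#
#   from langchain.document_loaders import PyPDFLoader
#   LOADER_MAPPING[".pdf"] = (PyPDFLoader, {})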
# NOTE: `collection` is not defined in this snippet; it is assumed to be a pymongo
# collection on a MongoDB Atlas cluster (e.g. obtained via pymongo.MongoClient).
vectorstore = MongoDBAtlasVectorSearch(collection, azureEmbeddings)
combine_prompt = """
You will be given a series of summaries from a text.
Your goal is to give a CONCISE summary of what happened in the text.
Think critically and analytically.
\n\n
```{text}```
\n\n
CONCISE SUMMARY:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])
stuff_chain = load_summarize_chain(
    llm=llm2,
    chain_type="stuff",
    prompt=combine_prompt_template,
    verbose=False,  # Set this to True if you want to see the inner workings
)
chunk_prompt = """
You will be given a text enclosed in triple backticks (```)
Your goal is to give a CONCISE summary of what happened in the text.
YOU MUST THINK CRITICALLY AND ANALYTICALLY.
```{text}```
CONCISE SUMMARY:
"""
chunk_prompt_template = PromptTemplate(template=chunk_prompt, input_variables=["text"])
chunk_chain = load_summarize_chain(
    llm=llm,
    chain_type="stuff",
    prompt=chunk_prompt_template,
    verbose=False,  # Set this to True if you want to see the inner workings
)
refine_prompt = """
You will be given a summary of a story enclosed in triple backticks (```)
IMPROVE THE QUALITY OF THE SUMMARY
```{text}```
CONCISE SUMMARY:
"""
refine_prompt_template = PromptTemplate(template=refine_prompt, input_variables=["text"])
refine_chain = load_summarize_chain(
    llm=llm2,
    chain_type="stuff",
    prompt=refine_prompt_template,
    verbose=False,  # Set this to True if you want to see the inner workings
)
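# How the three chains fit together (see main() below): chunk_chain summarizes each
# selected chunk with the local LlamaCpp model, stuff_chain combines those chunk
# summaries into a single summary with the Azure model, and refine_chain makes a
# final quality pass over that combined summary.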
def process_documents() -> List[Document]:
    """
    Load the source document and split it into chunks.
    """
    print("Loading single document")
    documents = load_single_document("./docs/sample.txt")
    if not documents:
        print("No new documents to load")
        exit(0)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5500, chunk_overlap=500)
    texts = text_splitter.split_documents(documents)
    for count, value in enumerate(texts):
        print(count, value)
        # texts[count].page_content = summarize_text(texts[count].page_content)
    return texts
def load_single_document(file_path: str) -> List[Document]:
    ext = "." + file_path.rsplit(".", 1)[-1]
    if ext in LOADER_MAPPING:
        loader_class, loader_args = LOADER_MAPPING[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()
    raise ValueError(f"Unsupported file extension '{ext}'")
def cluster_cleanup(docs, vectors, num_clusters):
    print("FIRST CHUNKY CHUNK")
    j = 0
    doc = docs[j]
    chunk_summary = chunk_chain.run([doc])
    print(f"Summary #{j} (chunk numero uno) - Preview: {chunk_summary} \n")
    # Perform K-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(vectors)
    # Find the closest embeddings to the centroids
    # Create an empty list that will hold your closest points
    closest_indices = []
    # Loop through the number of clusters you have
    for i in range(num_clusters):
        # Get the list of distances from that particular cluster center
        distances = np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1)
        # Find the list position of the closest one (using argmin to find the smallest distance)
        closest_index = np.argmin(distances)
        # Append that position to your closest indices list
        closest_indices.append(closest_index)
    selected_indices = sorted(closest_indices)
    selected_docs = [docs[idx] for idx in selected_indices]
    # Make an empty list to hold your summaries
    summary_list = []
    # Start the list with the summary of the first chunk
    summary_list.append(chunk_summary)
    # Loop through the selected (most representative) chunks
    for x, doc in enumerate(selected_docs):
        # Go get a summary of the chunk
        chunk_summary = chunk_chain.run([doc])
        # Append that summary to your list
        summary_list.append(chunk_summary)
        print(f"Summary #{x} (chunk #{selected_indices[x]}) - Preview: {chunk_summary} \n")
    # Go get a summary of the last chunk
    print("LAST CHUNKY CHUNK")
    j = len(docs) - 1
    doc = docs[j]
    chunk_summary = chunk_chain.run([doc])
    print(f"Summary #{j} (chunk last) - Preview: {chunk_summary} \n")
    # Append that summary to your list
    summary_list.append(chunk_summary)
    summaries = "\n".join(summary_list)
    # Convert it back to a document
    summaries = Document(page_content=summaries)
    print(f"Your total summary has {llm.get_num_tokens(summaries.page_content)} tokens")
    print(f"Summaries: {summaries.page_content}")
    return summaries
def main():
    docs = process_documents()
    vectors = azureEmbeddings.embed_documents([x.page_content for x in docs])
    output = stuff_chain.run([cluster_cleanup(docs, vectors, 4)])
    tmpDoc = Document(page_content=output, metadata={})
    output = refine_chain.run([tmpDoc])
    print("\n\n=====THE RESULT======\n\n")
    print(f"Final answer: {output}")
    exit()


main()
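# NOTE: the `vectorstore` created above is never actually queried in this script.
# As a rough, hypothetical sketch (assuming the Atlas collection has a vector search
# index compatible with MongoDBAtlasVectorSearch's defaults), the split chunks could
# be stored and retrieved later with a similarity search, e.g.:
#
#   vectorstore.add_documents(process_documents())
#   results = vectorstore.similarity_search("Who helps Dorothy get home?", k=3)
#   for doc in results:
#       print(doc.page_content[:200])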
@ranfysvalle02
Author

Another, more concise summary of The Wizard of Oz

Dorothy and her dog Toto are transported to a strange land where they meet the Tin Woodman, the Scarecrow, and the Cowardly Lion. They journey to the Emerald City to meet the Wizard of Oz, who tells them they must kill the Wicked Witch of the West to return home. Dorothy uses a charm to call Winged Monkeys to help her, and they eventually return home with the help of Glinda the Good Witch of the North. Along the way, they encounter talking trees and a china wall, and the Scarecrow, Tin Woodman, and Lion are granted their wishes. Dorothy returns home with the help of the Silver Shoes.
