Skip to content

Instantly share code, notes, and snippets.

@ranfysvalle02
Last active June 13, 2023 05:36
Show Gist options
  • Save ranfysvalle02/12d43d7ffd2e3936984ca96568564a3e to your computer and use it in GitHub Desktop.
Save ranfysvalle02/12d43d7ffd2e3936984ca96568564a3e to your computer and use it in GitHub Desktop.
BVR++ | Reading a large text (a whole book! hundreds of pages!) Using Best Vector Representation, Langchain, And LLMs
# Inspired by: https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/5%20Levels%20Of%20Summarization%20-%20Novice%20To%20Expert.ipynb
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI
import numpy as np
from multiprocessing import Pool
from tqdm import tqdm
from langchain.chains.summarize import load_summarize_chain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.document_loaders import TextLoader
from langchain.llms import LlamaCpp
from typing import Any, Dict, List
from langchain.docstore.document import Document
from langchain.document_loaders import (
TextLoader
)
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm
from langchain import PromptTemplate
tqdm.pandas()
from sklearn.cluster import KMeans
# Local model loaded through LlamaCpp; chunk_chain uses it for the per-chunk summaries.
llm = LlamaCpp(
    model_path="/Users/fabian/dev/OPENSOURCE/models/GPT4All-13B-snoozy.ggmlv3.q4_1.bin",
    n_ctx=4096,
    temperature=0.1,
)

# Azure OpenAI completion model; stuff_chain and refine_chain use it.
# The deployment name, endpoint host and API key are blank placeholders here.
llm2 = AzureOpenAI(
    temperature=0.1,
    deployment_name="",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
)

# Azure-hosted embedding client used by main() to embed every chunk.
azureEmbeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    deployment="",
    openai_api_type="azure",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    chunk_size=1,
)
# Different LLMs, chunk sizes, and numbers of chunks will affect response quality
# and the number of tokens used.
# Map file extensions to document loaders and their arguments.
# Each value is a (loader class, keyword-arguments dict) pair; load_single_document
# looks the pair up by extension and instantiates the loader with the file path.
LOADER_MAPPING = {
".txt": (TextLoader, {"encoding": "utf8"}),
# Add more mappings for other file extensions and loaders as needed
}
# NOTE(review): `collection` is not defined anywhere in the visible file, so this
# statement raises NameError at import time unless it is defined elsewhere.
# `vectorstore` is also not referenced by any visible code — confirm it is needed.
vectorstore = MongoDBAtlasVectorSearch(collection, azureEmbeddings)
# Prompt for the combine step: the joined chunk summaries are substituted for {text}.
combine_prompt = """
You will be given a series of summaries from a text.
Your goal is to give a CONCISE summary of what happened in the text.
Think critically and analytically.
\n\n
```{text}```
\n\n
CONCISE SUMMARY:
"""
combine_prompt_template = PromptTemplate(
    input_variables=["text"],
    template=combine_prompt,
)
# Summarization chain for the combine step, backed by the Azure model (llm2).
stuff_chain = load_summarize_chain(
    llm=llm2,
    chain_type="stuff",
    prompt=combine_prompt_template,
    verbose=False,  # Set this to True if you want to see the inner workings
)
# Prompt for summarizing one chunk: the chunk text is substituted for {text}.
chunk_prompt = """
You will be given a text enclosed in triple backticks (```)
Your goal is to give a CONCISE summary of what happened in the text.
YOU MUST THINK CRITICALLY AND ANALYTICALLY.
```{text}```
CONCISE SUMMARY:
"""
chunk_prompt_template = PromptTemplate(
    input_variables=["text"],
    template=chunk_prompt,
)
# Summarization chain for individual chunks, backed by the local model (llm).
chunk_chain = load_summarize_chain(
    llm=llm,
    chain_type="stuff",
    prompt=chunk_prompt_template,
    verbose=False,  # Set this to True if you want to see the inner workings
)
# Prompt for the final polishing pass: the combined summary is substituted for {text}.
refine_prompt = """
You will be given a summary of a story enclosed in triple backticks (```)
IMPROVE THE QUALITY OF THE SUMMARY
```{text}```
CONCISE SUMMARY:
"""
refine_prompt_template = PromptTemplate(
    input_variables=["text"],
    template=refine_prompt,
)
# Summarization chain for the polishing pass, backed by the Azure model (llm2).
refine_chain = load_summarize_chain(
    llm=llm2,
    chain_type="stuff",
    prompt=refine_prompt_template,
    verbose=False,  # Set this to True if you want to see the inner workings
)
def process_documents(
    file_path: str = "./docs/sample.txt",
    chunk_size: int = 5500,
    chunk_overlap: int = 500,
) -> List[Document]:
    """
    Load a single document and split it into chunks.

    Args:
        file_path: Path of the document to load. Defaults to the sample file
            this script originally hard-coded.
        chunk_size: ``chunk_size`` forwarded to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` forwarded to RecursiveCharacterTextSplitter.

    Returns:
        The chunks returned by ``split_documents``.

    Raises:
        SystemExit: With status 0 when the loader returns no documents.
        ValueError: Propagated from load_single_document when the file
            extension has no registered loader.
    """
    print("Loading single document")
    documents = load_single_document(file_path)
    if not documents:
        print("No new documents to load")
        # Equivalent to exit(0), but does not rely on the builtin that the
        # `site` module injects (absent under `python -S` or in frozen apps).
        raise SystemExit(0)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = text_splitter.split_documents(documents)
    # Echo every chunk with its index so the split can be inspected.
    for count, value in enumerate(texts):
        print(count, value)
    return texts
def load_single_document(file_path: str) -> List[Document]:
    """
    Load one file with the loader registered for its extension.

    The extension is taken from the final path component only and is matched
    case-insensitively against the keys of LOADER_MAPPING.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever the selected loader's ``load()`` returns.

    Raises:
        ValueError: If no loader is registered for the file's extension
            (including files that have no extension at all).
    """
    import os  # local import so this block does not depend on file-level imports

    # Only inspect the file name: splitting the whole path on "." would treat
    # the dot in "./docs/sample" as the start of an extension.
    file_name = os.path.basename(file_path)
    _, dot, suffix = file_name.rpartition(".")
    # Lower-case so that "BOOK.TXT" resolves to the ".txt" loader.
    ext = f".{suffix.lower()}" if dot else ""

    try:
        loader_class, loader_args = LOADER_MAPPING[ext]
    except KeyError:
        raise ValueError(f"Unsupported file extension '{ext}'") from None
    return loader_class(file_path, **loader_args).load()
def cluster_cleanup(docs, vectors, i):
    """
    Summarize a representative subset of chunks and join the summaries.

    The first and the last chunk are always summarized. In between, the chunk
    embeddings are grouped with K-means and, for each cluster, the chunk whose
    embedding lies closest to the cluster centre is summarized. A chunk is
    never summarized twice: a representative that is the first or the last
    chunk is skipped, and a single-chunk input gets one summary only.

    Args:
        docs: Chunks to choose from; must not be empty.
        vectors: One embedding per chunk, in the same order as ``docs``.
        i: Requested number of clusters. It is capped at the number of
            embeddings (K-means cannot fit more clusters than samples); a
            value below 1 selects no in-between chunks.

    Returns:
        A Document whose ``page_content`` is the newline-joined summaries:
        first chunk, cluster representatives in chunk order, last chunk.

    Raises:
        IndexError: If ``docs`` is empty.
    """
    if not docs:
        raise IndexError("cluster_cleanup needs at least one document")
    last_index = len(docs) - 1

    # The first chunk is always part of the summary.
    print("FIRST CHUNKY CHUNK")
    j = 0
    chunk_summary = chunk_chain.run([docs[j]])
    print (f"Summary #{j} (chunk numero uno) - Preview: {chunk_summary} \n")
    summary_list = [chunk_summary]

    # Convert once so the distance arithmetic below works on an ndarray.
    embeddings = np.asarray(vectors)
    n_clusters = min(i, len(embeddings))

    closest_indices = set()
    if n_clusters > 0:
        # Perform K-means clustering
        kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(embeddings)
        # For every cluster centre, pick the chunk whose embedding is nearest.
        for center in kmeans.cluster_centers_:
            distances = np.linalg.norm(embeddings - center, axis=1)
            closest_indices.add(int(np.argmin(distances)))

    # First and last chunk are handled explicitly, so drop them here to avoid
    # paying for the same summary twice and repeating it in the output.
    selected_indices = sorted(closest_indices - {0, last_index})

    for x, index in enumerate(selected_indices):
        chunk_summary = chunk_chain.run([docs[index]])
        summary_list.append(chunk_summary)
        print (f"Summary #{x} (chunk #{index}) - Preview: {chunk_summary} \n")

    # The last chunk is always part of the summary (unless it is also the first).
    if last_index != 0:
        print("LAST CHUNKY CHUNK")
        j = last_index
        chunk_summary = chunk_chain.run([docs[j]])
        print (f"Summary #{j} (chunk last) - Preview: {chunk_summary} \n")
        summary_list.append(chunk_summary)

    # Convert the joined summaries back to a document
    summaries = Document(page_content="\n".join(summary_list))
    print (f"Your total summary has {llm.get_num_tokens(summaries.page_content)} tokens")
    print (f"Summaries:{summaries.page_content}")
    return summaries
def main(n_clusters: int = 4):
    """
    Run the whole pipeline: load, chunk, embed, summarize, combine, refine.

    Args:
        n_clusters: Number of K-means clusters passed to cluster_cleanup,
            i.e. how many in-between chunks are summarized in addition to
            the first and last chunk. Defaults to the original value of 4.

    Returns:
        The refined summary text, which is also printed.
    """
    docs = process_documents()
    vectors = azureEmbeddings.embed_documents([x.page_content for x in docs])
    summaries = cluster_cleanup(docs, vectors, n_clusters)
    # Combine the individual chunk summaries into one summary ...
    output = stuff_chain.run([summaries])
    # ... then wrap it in a Document again for a final polishing pass.
    tmpDoc = Document(page_content=output, metadata={})
    output = refine_chain.run([tmpDoc])
    print("\n\n=====THE RESULT======\n\n")
    print(f"Final answer: {output}")
    return output


# Run the pipeline only when executed as a script, so importing this module
# does not trigger model calls.
if __name__ == "__main__":
    main()
@ranfysvalle02
Copy link
Author

Summary of the entire book of The Wizard of Oz

If you are wondering why it says "Silver Shoes", in this version of the story the slippers are indeed silver.

“The Silver Shoes,” said the Good Witch, “have wonderful powers. And one of the most curious things about them is that they can carry you to any place in the world in three steps, and each step will be made in the wink of an eye. All you have to do is to knock the heels together three times and command the shoes to carry you wherever you wish to go.”

Final answer:

  • Dorothy lives with her Aunt Em and Uncle Henry on a farm in Kansas
  • A cyclone hits and Dorothy and Toto are transported to Oz
  • Dorothy is told to see the Great Oz to get back home
  • Dorothy meets the Scarecrow, Tin Woodman, and Cowardly Lion on her journey to see the Wizard
  • They face several challenges along the way, including the Wicked Witch of the West
  • The group finally reaches the Wizard, who is revealed to be a humbug
  • Dorothy uses the Silver Shoes to return home to Kansas
  • The Silver Shoes are lost forever in the desert
  • Dorothy is reunited with Aunt Em and Uncle Henry

@ranfysvalle02
Copy link
Author

Change the number of chunks by updating:
output = stuff_chain.run([cluster_cleanup(docs,vectors,{ANY AMOUNT})])

Remember that everything needs to fit in the context window of the model being used.

Change the Chunk Size by updating:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=500)

The logic ALWAYS uses the first chunk and the last chunk, and then uses Best Vector Representation to capture the "juice" of the story in between as well as possible.

In this case I combine a local LLM and an AzureOpenAI LLM, but you can use whatever combination you prefer.

@ranfysvalle02
Copy link
Author

Process Used:

  • Always use first chunk
  • Use BVR (usually catches the "juice" in between) [bvr == best vector representation]
  • Select N chunks/vectors
  • Always use the last chunk
  • Force the response into a bullet-list structure
  • Use GPT4ALL to generate the individual summaries (local LLM to save on tokens, but can be Azure too)
  • Use Azure GPT3.5 Turbo to summarize the smaller summaries all together

@ranfysvalle02
Copy link
Author

Summary for Through the looking glass

* Alice is playing with her kittens and takes them outside to see the bonfire.
* Alice meets the White Queen and they discuss jam and time.
* Alice meets Humpty Dumpty and they discuss his fall from the wall.
* Alice meets the White Knight and they encounter a tin soldier with a box.
* Alice attends a dinner with the Red Queen and meets a leg of mutton and plum pudding.
* The speaker recounts a dream about fishes and poetry and reflects on the nature of dreams and life.

Summary for The Time Machine
The Time Traveller presents a time machine to his guests and disappears twice. The narrator travels through time and finds himself in a future where humans are divided into two species, the Eloi and the Morlocks. He befriends an Eloi named Weena and discovers that the Morlocks are cannibalistic and dangerous. The narrator eventually finds the Time Machine but is trapped by the Morlocks. The Time Traveller disappears again, leaving behind broken glass, and the narrator waits for his return.

@ranfysvalle02
Copy link
Author

Another version of Dorothy and the Wizard of Oz (Wizard in Oz?)

Dorothy and Zeb travel to Hugson's Ranch and experience fairy-like phenomena and talking piglets. They then travel to the Country of the Gargoyles and are transported to the Emerald City where the Wizard saves Eureka's life by tricking the jury. Zeb and Jim return to Kansas.

@ranfysvalle02
Copy link
Author

Another, more concise summary of The Wizard of Oz

Dorothy and her dog Toto are transported to a strange land where they meet the Tin Woodman, the Scarecrow, and the Cowardly Lion. They journey to the Emerald City to meet the Wizard of Oz, who tells them they must kill the Wicked Witch of the West to return home. Dorothy uses a charm to call Winged Monkeys to help her, and they eventually return home with the help of Glinda the Good Witch of the North. Along the way, they encounter talking trees and a china wall, and the Scarecrow, Tin Woodman, and Lion are granted their wishes. Dorothy returns home with the help of the Silver Shoes.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment