# Inspired by: https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/5%20Levels%20Of%20Summarization%20-%20Novice%20To%20Expert.ipynb
from typing import List

import numpy as np
from sklearn.cluster import KMeans

from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI, LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient

# Local LLM used to summarize the individual chunks
llm = LlamaCpp(
    n_ctx=4096,
    temperature=0.1,
    model_path="/Users/fabian/dev/OPENSOURCE/models/GPT4All-13B-snoozy.ggmlv3.q4_1.bin",
)
# Azure OpenAI LLM used to combine and refine the chunk summaries
llm2 = AzureOpenAI(
    deployment_name="",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    temperature=0.1,
)

# Azure OpenAI embeddings used to vectorize the chunks for clustering
azureEmbeddings = OpenAIEmbeddings(
    deployment="",
    model="text-embedding-ada-002",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    openai_api_type="azure",
    chunk_size=1,
)

# Different LLMs, chunk sizes, and chunk counts will affect response quality
# and the number of tokens used.

# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}

# MongoDB Atlas collection backing the vector store (the connection string,
# database, and collection names below are placeholders)
collection = MongoClient("<ATLAS_CONNECTION_STRING>")["<db>"]["<collection>"]
vectorstore = MongoDBAtlasVectorSearch(collection, azureEmbeddings)
# Prompt used to combine the selected chunk summaries into a single summary
combine_prompt = """
You will be given a series of summaries from a text.
Your goal is to give a CONCISE summary of what happened in the text.
Think critically and analytically.
\n\n
```{text}```
\n\n
CONCISE SUMMARY:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])
stuff_chain = load_summarize_chain(
    llm=llm2,
    chain_type="stuff",
    prompt=combine_prompt_template,
    verbose=False,  # Set this to True if you want to see the inner workings
)

# Prompt used to summarize each individual chunk with the local LLM
chunk_prompt = """
You will be given a text enclosed in triple backticks (```)
Your goal is to give a CONCISE summary of what happened in the text.
YOU MUST THINK CRITICALLY AND ANALYTICALLY.
```{text}```
CONCISE SUMMARY:
"""
chunk_prompt_template = PromptTemplate(template=chunk_prompt, input_variables=["text"])
chunk_chain = load_summarize_chain(
    llm=llm,
    chain_type="stuff",
    prompt=chunk_prompt_template,
    verbose=False,  # Set this to True if you want to see the inner workings
)

# Prompt used to polish the combined summary
refine_prompt = """
You will be given a summary of a story enclosed in triple backticks (```)
IMPROVE THE QUALITY OF THE SUMMARY
```{text}```
CONCISE SUMMARY:
"""
refine_prompt_template = PromptTemplate(template=refine_prompt, input_variables=["text"])
refine_chain = load_summarize_chain(
    llm=llm2,
    chain_type="stuff",
    prompt=refine_prompt_template,
    verbose=False,  # Set this to True if you want to see the inner workings
)
def process_documents() -> List[Document]:
    """
    Load documents and split them into chunks.
    """
    print("Loading single document")
    documents = load_single_document("./docs/sample.txt")
    if not documents:
        print("No new documents to load")
        exit(0)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5500, chunk_overlap=500)
    texts = text_splitter.split_documents(documents)
    for count, value in enumerate(texts):
        print(count, value)
        # texts[count].page_content = summarize_text(texts[count].page_content)
    return texts


def load_single_document(file_path: str) -> List[Document]:
    ext = "." + file_path.rsplit(".", 1)[-1]
    if ext in LOADER_MAPPING:
        loader_class, loader_args = LOADER_MAPPING[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()
    raise ValueError(f"Unsupported file extension '{ext}'")
def cluster_cleanup(docs, vectors, num_clusters):
    # Always summarize the first chunk
    print("FIRST CHUNKY CHUNK")
    j = 0
    doc = docs[j]
    chunk_summary = chunk_chain.run([doc])
    print(f"Summary #{j} (chunk numero uno) - Preview: {chunk_summary} \n")

    # Perform K-means clustering over the chunk embeddings
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(vectors)

    # Find the embeddings closest to the centroids
    # Create an empty list that will hold your closest points
    closest_indices = []
    # Loop through the number of clusters you have
    for i in range(num_clusters):
        # Get the list of distances from that particular cluster center
        distances = np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1)
        # Find the list position of the closest one (using argmin to find the smallest distance)
        closest_index = np.argmin(distances)
        # Append that position to your closest indices list
        closest_indices.append(closest_index)
    selected_indices = sorted(closest_indices)
    selected_docs = [docs[idx] for idx in selected_indices]

    # Make an empty list to hold your summaries
    summary_list = []
    # Start with the summary of the first chunk
    summary_list.append(chunk_summary)
    # Loop through your selected docs
    for x, doc in enumerate(selected_docs):
        # Go get a summary of the chunk
        chunk_summary = chunk_chain.run([doc])
        # Append that summary to your list
        summary_list.append(chunk_summary)
        print(f"Summary #{x} (chunk #{selected_indices[x]}) - Preview: {chunk_summary} \n")

    # Always summarize the last chunk
    print("LAST CHUNKY CHUNK")
    j = len(docs) - 1
    doc = docs[j]
    chunk_summary = chunk_chain.run([doc])
    print(f"Summary #{j} (chunk last) - Preview: {chunk_summary} \n")
    # Append that summary to your list
    summary_list.append(chunk_summary)

    summaries = "\n".join(summary_list)
    # Convert it back to a document
    summaries = Document(page_content=summaries)
    print(f"Your total summary has {llm.get_num_tokens(summaries.page_content)} tokens")
    print(f"Summaries:{summaries.page_content}")
    return summaries
def main():
    docs = process_documents()
    # Embed each chunk so it can be clustered
    vectors = azureEmbeddings.embed_documents([x.page_content for x in docs])
    # Summarize the first, last, and 4 "best vector representation" chunks, then combine
    output = stuff_chain.run([cluster_cleanup(docs, vectors, 4)])
    tmpDoc = Document(page_content=output, metadata={})
    output = refine_chain.run([tmpDoc])
    print("\n\n=====THE RESULT======\n\n")
    print(f"Final answer: {output}")
    exit()


main()
Change the number of chunks selected by BVR by updating:
output = stuff_chain.run([cluster_cleanup(docs,vectors,{ANY AMOUNT})])
Remember that everything needs to fit in the context window of the model being used.
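One way to sanity-check that before calling the combine chain (a sketch; fits_in_context and the 4096/512 numbers are my own assumptions, not part of the gist):

def fits_in_context(doc, model, max_ctx=4096, reserve=512):
    # Leave `reserve` tokens of headroom for the prompt template and the answer
    return model.get_num_tokens(doc.page_content) <= max_ctx - reserve

summaries_doc = cluster_cleanup(docs, vectors, 4)
if fits_in_context(summaries_doc, llm2):
    output = stuff_chain.run([summaries_doc])
else:
    print("Summaries too large: lower the chunk count or chunk_size")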
Change the chunk size by updating:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=500)
The logic ALWAYS uses the first chunk and the last chunk, and then uses Best Vector Representation to pull the "juice" of the story out of everything in between.
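Stripped down to its core, the BVR step looks roughly like this (a sketch; pick_representative_chunks is a name I made up, and it mirrors the clustering code in cluster_cleanup above):

import numpy as np
from sklearn.cluster import KMeans

def pick_representative_chunks(vectors, num_clusters):
    # Cluster the chunk embeddings and keep the chunk closest to each centroid
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(vectors)
    closest = []
    for center in kmeans.cluster_centers_:
        distances = np.linalg.norm(np.asarray(vectors) - center, axis=1)
        closest.append(int(np.argmin(distances)))
    # Sorted, de-duplicated chunk indices in document order
    return sorted(set(closest))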
In this case I combine a local LLM and an Azure OpenAI LLM, but you can use whatever combination you prefer.
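For example, to run the per-chunk summaries on Azure as well, you could simply re-point the existing chain (a sketch, using only objects already defined in the gist):

# Hypothetical: use the Azure model for the per-chunk summaries too
chunk_chain = load_summarize_chain(
    llm=llm2,
    chain_type="stuff",
    prompt=chunk_prompt_template,
)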
Process Used:
- Always use the first chunk
- Use BVR (best vector representation), which usually catches the "juice" in between
- Select N chunks/vectors that way
- Always use the last chunk
- Force the structure of the response into bullet lists (see the prompt sketch after this list)
- Use GPT4All to generate the individual summaries (a local LLM to save on tokens, but this can be Azure too)
- Use Azure GPT-3.5 Turbo to combine the smaller summaries into one
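One way to force that bullet-list structure is a small variant of the combine prompt (the wording below is my own sketch, not the prompt used in the gist above):

# Hypothetical variant of combine_prompt that forces a bulleted response
bullet_combine_prompt = """
You will be given a series of summaries from a text.
Your goal is to give a CONCISE summary of what happened in the text.
FORMAT YOUR ANSWER AS A BULLET LIST, ONE KEY EVENT PER BULLET.
```{text}```
CONCISE BULLET-LIST SUMMARY:
"""
bullet_combine_template = PromptTemplate(template=bullet_combine_prompt, input_variables=["text"])
bullet_stuff_chain = load_summarize_chain(llm=llm2, chain_type="stuff", prompt=bullet_combine_template)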
Summary for Through the Looking-Glass
* Alice is playing with her kittens and takes them outside to see the bonfire.
* Alice meets the White Queen and they discuss jam and time.
* Alice meets Humpty Dumpty and they discuss his fall from the wall.
* Alice meets the White Knight and they encounter a tin soldier with a box.
* Alice attends a dinner with the Red Queen and meets a leg of mutton and plum pudding.
* The speaker recounts a dream about fishes and poetry and reflects on the nature of dreams and life.
Summary for The Time Machine
The Time Traveller presents a time machine to his guests and disappears twice. The narrator travels through time and finds himself in a future where humans are divided into two species, the Eloi and the Morlocks. He befriends an Eloi named Weena and discovers that the Morlocks are cannibalistic and dangerous. The narrator eventually finds the Time Machine but is trapped by the Morlocks. The Time Traveller disappears again, leaving behind broken glass, and the narrator waits for his return.
Summary for Dorothy and the Wizard in Oz (a separate book in the Oz series, not another version of The Wizard of Oz)
Dorothy and Zeb travel to Hugson's Ranch and experience fairy-like phenomena and talking piglets. They then travel to the Country of the Gargoyles and are transported to the Emerald City where the Wizard saves Eureka's life by tricking the jury. Zeb and Jim return to Kansas.
Another, more concise summary of The Wizard of Oz
Dorothy and her dog Toto are transported to a strange land where they meet the Tin Woodman, the Scarecrow, and the Cowardly Lion. They journey to the Emerald City to meet the Wizard of Oz, who tells them they must kill the Wicked Witch of the West to return home. Dorothy uses a charm to call Winged Monkeys to help her, and they eventually return home with the help of Glinda the Good Witch of the North. Along the way, they encounter talking trees and a china wall, and the Scarecrow, Tin Woodman, and Lion are granted their wishes. Dorothy returns home with the help of the Silver Shoes.
Summary of the entire book of The Wizard of Oz
If you are wondering why it says "Silver Shoes": in this version of the story the slippers are indeed silver (the ruby slippers come from the film adaptation).
“The Silver Shoes,” said the Good Witch, “have wonderful powers. And one of the most curious things about them is that they can carry you to any place in the world in three steps, and each step will be made in the wink of an eye. All you have to do is to knock the heels together three times and command the shoes to carry you wherever you wish to go.”
Final answer: