BVR++ | Reading a large text (a whole book, hundreds of pages!) using Best Vector Representation, LangChain, and LLMs
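The script below implements the approach: load a long text, split it into large overlapping chunks, embed every chunk, cluster the embeddings with K-means, and summarize only the chunks closest to each cluster centroid (plus the first and last chunks) with a local LLM; an Azure OpenAI model then combines those summaries and refines the result into a single concise summary.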
# Inspired by: https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/5%20Levels%20Of%20Summarization%20-%20Novice%20To%20Expert.ipynb
from typing import List

import numpy as np
from pymongo import MongoClient
from sklearn.cluster import KMeans

from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI, LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch
# Local LLM used for the per-chunk summaries (point model_path at your own local model)
llm = LlamaCpp(
    n_ctx=4096,
    temperature=0.1,
    model_path="/Users/fabian/dev/OPENSOURCE/models/GPT4All-13B-snoozy.ggmlv3.q4_1.bin",
)
# Azure OpenAI LLM used for the combine and refine steps (fill in your deployment details)
llm2 = AzureOpenAI(
    deployment_name="",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    temperature=0.1,
)
# Azure OpenAI embeddings used to vectorize the chunks (fill in your deployment details)
azureEmbeddings = OpenAIEmbeddings(
    deployment="",
    model="text-embedding-ada-002",
    openai_api_base="https://.openai.azure.com/",
    openai_api_key="",
    openai_api_type="azure",
    chunk_size=1,
)
# Different LLMs, chunk sizes, and chunk counts will affect response quality
# and the number of tokens used.
# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}
# `collection` must be a pymongo collection from your own Atlas cluster
# (fill in your connection string, database, and collection names).
collection = MongoClient("<your-atlas-connection-string>")["<db>"]["<collection>"]
vectorstore = MongoDBAtlasVectorSearch(collection, azureEmbeddings)
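# The `vectorstore` above is built but never used by main(). As a minimal sketch
# (assuming an Atlas vector search index already exists on `collection`; the function
# name and arguments below are only illustrative), the chunks could be stored and queried:
def index_and_query(docs, query, k=4):
    vectorstore.add_documents(docs)  # embeds each chunk with azureEmbeddings and writes it to the collection
    return vectorstore.similarity_search(query, k=k)  # returns the k most similar chunks as Documents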
combine_prompt = """
You will be given a series of summaries from a text.
Your goal is to give a CONCISE summary of what happened in the text.
Think critically and analytically.
\n\n
```{text}```
\n\n
CONCISE SUMMARY:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])
stuff_chain = load_summarize_chain(
    llm=llm2,
    chain_type="stuff",
    prompt=combine_prompt_template,
    verbose=False,  # Set this to true if you want to see the inner workings
)
chunk_prompt = """
You will be given a text enclosed in triple backticks (```)
Your goal is to give a CONCISE summary of what happened in the text.
YOU MUST THINK CRITICALLY AND ANALYTICALLY.
```{text}```
CONCISE SUMMARY:
"""
chunk_prompt_template = PromptTemplate(template=chunk_prompt, input_variables=["text"])
chunk_chain = load_summarize_chain(
    llm=llm,
    chain_type="stuff",
    prompt=chunk_prompt_template,
    verbose=False,  # Set this to true if you want to see the inner workings
)
refine_prompt = """
You will be given a summary of a story enclosed in triple backticks (```)
IMPROVE THE QUALITY OF THE SUMMARY
```{text}```
CONCISE SUMMARY:
"""
refine_prompt_template = PromptTemplate(template=refine_prompt, input_variables=["text"])
refine_chain = load_summarize_chain(
    llm=llm2,
    chain_type="stuff",
    prompt=refine_prompt_template,
    verbose=False,  # Set this to true if you want to see the inner workings
)
def process_documents() -> List[Document]:
    """
    Load the source document and split it into chunks
    """
    print("Loading single document")
    documents = load_single_document("./docs/sample.txt")
    if not documents:
        print("No new documents to load")
        exit(0)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5500, chunk_overlap=500)
    texts = text_splitter.split_documents(documents)
    for count, value in enumerate(texts):
        print(count, value)
    return texts
def load_single_document(file_path: str) -> List[Document]:
    ext = "." + file_path.rsplit(".", 1)[-1]
    if ext in LOADER_MAPPING:
        loader_class, loader_args = LOADER_MAPPING[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()
    raise ValueError(f"Unsupported file extension '{ext}'")
def cluster_cleanup(docs, vectors, num_clusters):
    # Always summarize the very first chunk (the opening of the text)
    print("FIRST CHUNKY CHUNK")
    j = 0
    doc = docs[j]
    chunk_summary = chunk_chain.run([doc])
    print(f"Summary #{j} (chunk numero uno) - Preview: {chunk_summary} \n")
    # Perform K-means clustering on the chunk embeddings
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(vectors)
    # Find the embeddings closest to each cluster centroid
    closest_indices = []
    for i in range(num_clusters):
        # Distances from every chunk embedding to this cluster center
        distances = np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1)
        # Index of the closest chunk (argmin finds the smallest distance)
        closest_index = np.argmin(distances)
        closest_indices.append(closest_index)
    selected_indices = sorted(closest_indices)
    selected_docs = [docs[idx] for idx in selected_indices]
    # Collect the per-chunk summaries, starting with the first chunk's summary
    summary_list = [chunk_summary]
    # Summarize each representative chunk and append it to the list
    for x, doc in enumerate(selected_docs):
        chunk_summary = chunk_chain.run([doc])
        summary_list.append(chunk_summary)
        print(f"Summary #{x} (chunk #{selected_indices[x]}) - Preview: {chunk_summary} \n")
    # Always summarize the very last chunk (the ending of the text)
    print("LAST CHUNKY CHUNK")
    j = len(docs) - 1
    doc = docs[j]
    chunk_summary = chunk_chain.run([doc])
    print(f"Summary #{j} (chunk last) - Preview: {chunk_summary} \n")
    summary_list.append(chunk_summary)
    summaries = "\n".join(summary_list)
    # Convert the combined summaries back into a Document
    summaries = Document(page_content=summaries)
    print(f"Your total summary has {llm.get_num_tokens(summaries.page_content)} tokens")
    print(f"Summaries:{summaries.page_content}")
    return summaries
def main():
    docs = process_documents()
    # Embed every chunk so the chunks can be clustered
    vectors = azureEmbeddings.embed_documents([x.page_content for x in docs])
    # Summarize the representative chunks, combine the summaries, then refine the result
    output = stuff_chain.run([cluster_cleanup(docs, vectors, 4)])
    tmpDoc = Document(page_content=output, metadata={})
    output = refine_chain.run([tmpDoc])
    print("\n\n=====THE RESULT======\n\n")
    print(f"Final answer: {output}")


main()
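The comment inside LOADER_MAPPING invites adding loaders for other file types. As a minimal sketch, assuming the optional pypdf dependency is installed, a PDF entry could reuse LangChain's built-in PyPDFLoader (the mapping key and empty argument dict simply mirror the existing .txt entry):

from langchain.document_loaders import PyPDFLoader

# Hypothetical extra mapping: load_single_document() would then accept .pdf files as well
LOADER_MAPPING[".pdf"] = (PyPDFLoader, {})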
Example output: a concise summary of The Wizard of Oz
Dorothy and her dog Toto are transported to a strange land where they meet the Tin Woodman, the Scarecrow, and the Cowardly Lion. They journey to the Emerald City to meet the Wizard of Oz, who tells them they must kill the Wicked Witch of the West to return home. Dorothy uses a charm to call Winged Monkeys to help her, and they eventually return home with the help of Glinda the Good Witch of the North. Along the way, they encounter talking trees and a china wall, and the Scarecrow, Tin Woodman, and Lion are granted their wishes. Dorothy returns home with the help of the Silver Shoes.