#!/usr/bin/env python
# coding: utf-8
# In this notebook we automatically answer a set of synthetically generated evaluation questions about the wandb docs and grade each answer with a Cohere model
import os
import random
import re
import time

import openai
import wandb
from getpass import getpass
from tqdm.auto import tqdm
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import MarkdownTextSplitter

PROJECT = "wandbot_synth"
ENTITY = "wandbot"
def get_openai_key():
    if os.getenv("OPENAI_API_KEY") is None:
        if any(['VSCODE' in x for x in os.environ.keys()]):
            print('Please enter password in the VS Code prompt at the top of your VS Code window!')
        os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")
    openai.api_key = os.getenv("OPENAI_API_KEY")
    assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), "This doesn't look like a valid OpenAI API key"
    print("OpenAI API key configured")
cohere_api_key = ""  # set your Cohere API key here before running the evaluation step below
get_openai_key()
# # Answer Questions with WandBot
from typing import Any, Dict, List
import json

from wandb.sdk.lib.runid import generate_id
from wandb.integration.langchain import WandbTracer
from langchain import LLMChain
from langchain.chains import HypotheticalDocumentEmbedder, RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.vectorstores.base import VectorStoreRetriever
from langchain.callbacks import get_openai_callback
class VectorStoreRetrieverWithScore(VectorStoreRetriever):
    """Retriever that also surfaces the similarity score of each retrieved document."""
    def get_relevant_documents(self, query: str) -> List[Document]:
        if self.search_type == "similarity":
            docs_and_scores = self.vectorstore.similarity_search_with_score(
                query, **self.search_kwargs
            )
            docs = []
            for doc, score in docs_and_scores:
                # stash the similarity score on the document so it can be logged downstream
                doc.metadata["score"] = score
                docs.append(doc)
        elif self.search_type == "mmr":
            docs = self.vectorstore.max_marginal_relevance_search(
                query, **self.search_kwargs
            )
        else:
            raise ValueError(f"search_type of {self.search_type} not allowed.")
        return docs
class FAISSWithScore(FAISS):
    def as_retriever(self) -> VectorStoreRetrieverWithScore:
        return VectorStoreRetrieverWithScore(
            vectorstore=self,
            search_type="similarity",
            search_kwargs={"k": 10},
        )
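# A minimal sketch of how the score-aware retriever is used, assuming the FAISS index has been
# loaded into `vector_store` (done further below) and that indexed docs carry a "source" field;
# each returned Document exposes its similarity score via metadata (commented out, illustrative only):
# retriever = vector_store.as_retriever()
# docs = retriever.get_relevant_documents("How do I log a metric?")
# for doc in docs:
#     print(doc.metadata["source"], doc.metadata["score"])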
class RetrievalQAWithSourcesChainWithScore(RetrievalQAWithSourcesChain):
    reduce_k_below_max_tokens: bool = True
    max_tokens_limit: int = 2816
    def _get_docs(self, inputs: Dict[str, Any]) -> List[Document]:
        question = inputs[self.question_key]
        docs = self.retriever.get_relevant_documents(question)
        return self._reduce_tokens_below_limit(docs)
def load_artifacts(config):
    faiss_artifact = wandb.use_artifact(config.faiss_artifact, type="search_index")
    faiss_artifact_dir = faiss_artifact.download()
    hyde_prompt_artifact = wandb.use_artifact(
        config.hyde_prompt_artifact, type="prompt"
    )
    hyde_artifact_dir = hyde_prompt_artifact.download()
    hyde_prompt_file = f"{hyde_artifact_dir}/hyde_prompt.txt"
    chat_prompt_artifact = wandb.use_artifact(
        config.chat_prompt_artifact, type="prompt"
    )
    chat_artifact_dir = chat_prompt_artifact.download()
    chat_prompt_file = f"{chat_artifact_dir}/chat_prompt.txt"
    return {
        "faiss": faiss_artifact_dir,
        "hyde_prompt": hyde_prompt_file,
        "chat_prompt": chat_prompt_file,
    }
# In[10]:
def parse_source_documents(source_documents):
    source_docs_dict = {}
    for i, source_doc in enumerate(source_documents):
        source_docs_dict[f"source_doc_{i}"] = {
            "page_content": source_doc.page_content,
            "metadata": source_doc.metadata["source"],
            "lookup_index": source_doc.lookup_index,
            "lookup_str": source_doc.lookup_str,
        }
    return json.dumps(source_docs_dict)
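# For example, a single retrieved document would serialize to something like the following
# (values here are hypothetical, shown only to illustrate the JSON shape fed to the grader prompt):
# '{"source_doc_0": {"page_content": "Use W&B to track experiments...",
#   "metadata": "https://docs.wandb.ai/guide", "lookup_index": 0, "lookup_str": ""}}'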
# In[11]:
# qa_chain.json
# In[12]:
from types import SimpleNamespace
# login to openai with your api key
get_openai_key()
wandbot_config = SimpleNamespace(
    faiss_artifact="parambharat/wandb_docs_bot/faiss_store:latest",
    hyde_prompt_artifact="parambharat/wandb_docs_bot/hyde_prompt:latest",
    chat_prompt_artifact="parambharat/wandb_docs_bot/system_prompt:latest",
    model_name="gpt-3.5-turbo",
    eval_model="command-nightly",
    temperature=0,
    hyde_llm_temperature=0.3,
    command_llm_temperature=0.0,
    cohere_generate_cost_usd=0.0000025,  # cost per character (not token): $0.0025 per generation unit (1000 chars)
)
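# Worked example of the per-character pricing above (illustrative numbers): a grading call with a
# 3,600-character prompt and a 10-character completion covers 3,610 characters, so it costs roughly
# 3610 * 0.0000025 = $0.009 per evaluation.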
wandb.init(
    name="synth_answer_generation_test",
    project=PROJECT,
    entity=ENTITY,
    config=wandbot_config,
)
artifacts = load_artifacts(wandb.config)
# ### Prompts
# Load wandbot v1 prompts
# In[13]:
# LOAD DATA AND PROMPTS FROM ARTIFACTS
faiss_dir = artifacts["faiss"]
hyde_prompt_template = open(artifacts["hyde_prompt"]).read()
wandbot_v1_system_prompt_template = open(artifacts["chat_prompt"]).read()
human_message_prompt_template = "{question}"
# SETUP Hypothetical Document Embedder (HyDE)
hyde_messages = [
    SystemMessagePromptTemplate.from_template(hyde_prompt_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
hyde_prompt = ChatPromptTemplate.from_messages(hyde_messages)
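# HyDE in one line: instead of embedding the raw question, an LLM first writes a hypothetical
# answer document, and that document's embedding is what gets searched against the index.
# A minimal sketch of the equivalent two-step flow (commented out and illustrative; the real
# chain is assembled with HypotheticalDocumentEmbedder further below):
# hypothetical_doc = LLMChain(llm=ChatOpenAI(temperature=0.3), prompt=hyde_prompt).run(question="How do I log a metric?")
# query_vector = OpenAIEmbeddings().embed_query(hypothetical_doc)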
# ### Alternative System Prompts
#
# Create alternate wandbot prompts to test
# In[14]:
system_prompts = {}
system_prompts["wandbot_v1_few_shot"] = wandbot_v1_system_prompt_template
system_prompts["wandbot_v1_zero_shot"] = """
As an AI assistant for the open source library wandb, your task is to answer questions based on
the given extracted parts of a long document and the question. You can provide a conversational
answer with a hyperlink to the documentation only if it is explicitly listed as a source in the context.
Provide a code block directly from the documentation wherever possible. If you do not know the answer,
you can say "Hmm, I'm not sure." If the question is not related to wandb or Weights & Biases, politely
inform the user that you can only answer questions related to wandb. The documentation for wandb can be
found at https://docs.wandb.ai.
Begin:
================
Question: {question}
================
{summaries}
================
Final Answer in Markdown:
"""
system_prompts["default_langchain_qa"]= """ | |
Use the following pieces of context to answer the question at the end. If you don't know the answer, | |
just say that you don't know, don't try to make up an answer. | |
{summaries} | |
Question: {question} | |
Helpful Answer: | |
""" | |
# Get prompt token counts
# In[15]:
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4")
print(len(enc.encode("hello world")))  # quick sanity check of the tokenizer
for k in system_prompts.keys():
    print(f"{k} token count: {len(enc.encode(system_prompts[k]))}")
# ### Evaluation Prompt
# Cohere Command Grader Prompt
# In[16]:
human_prompt = "Human:"
assistant_prompt = "GRADER_RESPONSE:"
grade_command = """Grade the following WANDBOT_RESPONSE given the USER_QUESTION and SUPPORTING_DOCUMENTATION.
Grade the WANDBOT_RESPONSE based ONLY on its factual accuracy. It is OK if the WANDBOT_RESPONSE contains more information than in SUPPORTING_DOCUMENTATION, as long as it does not contain any conflicting statements.
Your GRADE should only be POSITIVE or NEGATIVE to indicate whether the WANDBOT_RESPONSE is accurate or not given the SUPPORTING_DOCUMENTATION, no other information is required.
If the WANDBOT_RESPONSE answers that there is no specific information provided in the context or that it doesn't know, then the GRADE is NEGATIVE.
Only respond with POSITIVE or NEGATIVE for GRADE."""
def command_eval_prompt_constructor(question, source_documents, answer, grade_command=grade_command):
    evaluation_prompts_template = f"""As an experienced software quality assurance tester, you are evaluating the quality of the response (WANDBOT_RESPONSE) from a
Weights & Biases (aka wandb, W&B) support bot called `wandbot`. Weights & Biases is a machine learning ops (MLOps) python library and app.
Supporting documentation (SUPPORTING_DOCUMENTATION) is provided to help you assess the quality of the response. Your job is to grade (GRADE) the response.
This is the example format of the input and a grade given to the `wandbot` support bot response:
=====================
USER_QUESTION: user question here
WANDBOT_RESPONSE: the response from the `wandbot` support bot here
SUPPORTING_DOCUMENTATION: retrieved documentation from the wandb docs here
{assistant_prompt} GRADE: POSITIVE or NEGATIVE here
=====================
This is a real example:
=====================
USER_QUESTION: How do I create a wandb sweep?
WANDBOT_RESPONSE: To create a W&B Artifact, you can use the wandb.Artifact class like so ```artifact = wandb.Artifact(name='bicycle-dataset', type='dataset')```
SUPPORTING_DOCUMENTATION: '{{"source_doc_0": {{"page_content": "Use Weights & Biases Sweeps to automate hyperparameter search and explore the space of possible models. Create a sweep with a few lines of code.", "metadata": "https://docs.wandb.ai/guide", "lookup_index": 0, "lookup_str": ""}}}}'
{assistant_prompt} GRADE: NEGATIVE
=====================
{grade_command}
USER_QUESTION: {question}
WANDBOT_RESPONSE: {answer}
SUPPORTING_DOCUMENTATION: {source_documents}
{assistant_prompt} GRADE:"""
    return evaluation_prompts_template
question = "what is wandb?" | |
answer = "Weights & Biases is a machine learning platform for teams." | |
source_documents = "[hey, ho]" | |
# print(command_eval_prompt_constructor(question, source_documents, answer, grade_command)) | |
# Cohere Command prompt template
# In[17]:
from tokenizers import Tokenizer
eval_grader_prompt_template = command_eval_prompt_constructor("", "", "")
command_nightly_tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
prompt_enc = command_nightly_tokenizer.encode(eval_grader_prompt_template)
print(f"Command prompt template token count: {len(prompt_enc.ids)}")
# Claude Grader Prompt
# In[18]:
# def claude_eval_prompt_constructor(question, source_documents, answer):
#     evaluation_prompts_template = f"""As an experienced software quality assurance tester, you are evaluating the quality of the response from a
# Weights & Biases (aka wandb, W&B) support bot called wandbot. Weights & Biases is a machine learning ops (MLOps) python library and app.
# Supporting documentation is provided to help you assess the quality of the response.
# Your feedback should only be "POSITIVE" or "NEGATIVE" to indicate whether the response is accurate or not,
# no other information is required. For example:
# {anthropic.HUMAN_PROMPT}
# =====================
# USER_QUESTION: What is wandb?
# SUPPORTING_DOCUMENTATION: '{{"source_doc_0": {{"page_content": "Weights & Biases is the machine learning platform for developers to build better models faster", "metadata": "https://docs.wandb.ai/guide", "lookup_index": 0, "lookup_str": ""}}}}'
# WANDBOT_RESPONSE: Weights & Biases is a machine learning platform for teams.
# {anthropic.AI_PROMPT}
# POSITIVE
# {anthropic.HUMAN_PROMPT}
# =====================
# USER_QUESTION: How do I create a wandb sweep?
# SUPPORTING_DOCUMENTATION: '{{"source_doc_0": {{"page_content": "Use Weights & Biases Sweeps to automate hyperparameter search and explore the space of possible models. Create a sweep with a few lines of code.", "metadata": "https://docs.wandb.ai/guide", "lookup_index": 0, "lookup_str": ""}}}}'
# WANDBOT_RESPONSE: To create a W&B Artifact, you can use the wandb.Artifact class like so ```artifact = wandb.Artifact(name='bicycle-dataset', type='dataset')```
# {anthropic.AI_PROMPT}
# NEGATIVE
# {anthropic.HUMAN_PROMPT}
# =====================
# USER_QUESTION: {question}
# SUPPORTING_DOCUMENTATION: {source_documents}
# WANDBOT_RESPONSE: {answer}
# {anthropic.AI_PROMPT}"""
#     return evaluation_prompts_template
# question = "what is wandb?"
# answer = "Weights & Biases is a machine learning platform for teams."
# print(claude_eval_prompt_constructor(question, source_documents, answer))
# ### Load Embeddings and Vector Store
# In[19]:
base_embeddings = OpenAIEmbeddings()
embeddings = HypotheticalDocumentEmbedder(
    llm_chain=LLMChain(
        llm=ChatOpenAI(temperature=wandb.config.hyde_llm_temperature),
        prompt=hyde_prompt,
    ),
    base_embeddings=base_embeddings,
    verbose=True,
)
# LOAD FAISS VECTOR STORE
vector_store = FAISSWithScore.load_local(faiss_dir, embeddings)
# In[20]:
# LOAD QA CHAINS FOR EACH SYSTEM PROMPT
def load_qa_chain(system_prompt_template, vector_store=vector_store, chain_type="stuff"):
    qa_messages = [
        SystemMessagePromptTemplate.from_template(system_prompt_template, input_variables=["context", "question"]),
        HumanMessagePromptTemplate.from_template(human_message_prompt_template),
    ]
    qa_prompt = ChatPromptTemplate.from_messages(qa_messages)
    llm = ChatOpenAI(
        model_name=wandb.config.model_name,
        temperature=wandb.config.temperature,
        request_timeout=20,
    )
    qa_chain = RetrievalQAWithSourcesChainWithScore.from_chain_type(
        llm=llm,
        chain_type=chain_type,
        retriever=vector_store.as_retriever(),
        chain_type_kwargs={"prompt": qa_prompt},
        return_source_documents=True,
        verbose=True,
    )
    return qa_chain
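# A quick smoke test of a single chain, commented out to keep the batch run cheap
# (the question is illustrative; usage mirrors the main loop below):
# test_chain = load_qa_chain(system_prompts["wandbot_v1_zero_shot"])
# result = test_chain({"question": "How do I log images to wandb?"}, return_only_outputs=False)
# print(result["answer"])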
# Create timestamps
# In[21]:
import pandas as pd
import numpy as np
def generate_timestamps(n=10000, start_date='2023-03-01', end_date='2023-05-31'):
    # Range of datetimes with 1-second intervals
    rng = pd.date_range(start_date, end_date, freq='S')
    # Create weights for all datetimes
    weights = pd.Series(1, index=rng)
    # Decrease weights for weekends
    weights[rng.to_series().dt.dayofweek > 4] *= 0.5
    # Decrease weight for Easter Sunday (2023-04-09)
    easter = pd.to_datetime('2023-04-09')
    weights[rng.to_series().between(easter, easter + pd.DateOffset(days=1))] *= 0.5
    # Decrease weight for Easter Monday (2023-04-10)
    easter_monday = pd.to_datetime('2023-04-10')
    weights[rng.to_series().between(easter_monday, easter_monday + pd.DateOffset(days=1))] *= 0.5
    # Increase weights for 8am-6pm on weekdays
    mask = ((rng.to_series().dt.hour >= 8) & (rng.to_series().dt.hour <= 18) & (rng.to_series().dt.dayofweek <= 4))
    weights[mask] *= 1.2
    # Increase weights for Tuesday, Wednesday, Thursday
    mask = ((rng.to_series().dt.dayofweek >= 1) & (rng.to_series().dt.dayofweek <= 3))
    weights[mask] *= 1.5
    # Normalize weights so they form a probability distribution
    weights /= weights.sum()
    # Sample n datetimes using the weights
    sampled_datetimes = np.random.choice(rng, size=n, p=weights)
    # Sort the datetimes
    sampled_datetimes.sort()
    return sampled_datetimes
timestamps = generate_timestamps()
print(len(timestamps))
# In[22]:
import matplotlib.pyplot as plt
def plot_timestamps(timestamps):
    plt.figure(figsize=(10, 6))
    plt.hist(timestamps, bins=100, alpha=0.5, color='blue')
    plt.xlabel('Datetime')
    plt.ylabel('Frequency')
    plt.title('Distribution of Timestamps')
    plt.show()
# plot_timestamps(timestamps)
# In[23]:
# WANDB LOGGING CONFIG
wandb_config = {"project": PROJECT, "entity": ENTITY}  # config for OpenAI autologger
table_cols = [
    "request_timestamp", "query_id", "query", "wandbot_answer", "retrieved_source_documents",
    "synth_user_feedback_signal", "elapsed_time_s",
    "prompt_tokens", "completion_tokens", "total_tokens",
    "answer_cost_usd", "successful_requests",
    "system_prompt_version", "system_prompt_template", "human_message_prompt_template", "hyde_prompt_template", "eval_prompt_template",
    "wandb_run_id", "wandbot_model", "wandbot_temperature", "hyde_llm_temperature",
    "eval_model", "eval_elapsed_time_s",
    "eval_total_chars", "eval_cost_usd", "eval_total_tokens", "eval_prompt_tokens", "eval_completion_tokens"
]
# ### Load Questions
# In[35]:
# import pandas as pd
# df = pd.read_csv('sythetic-user-questions_2023-05-14.csv')
# questions = df["question"].values
# questions = questions[:5]
# questions
artifact = wandb.use_artifact('wandbot/wandbot_synth/run-2cv1ao9n-generated_questions_table:v0', type='run_table')
# artifact_dir = artifact.download("data")
df = artifact.get("generated_questions_table").get_dataframe()
# with open('data/generated_questions_table.table.json') as f:
#     js = json.load(f)
# columns = js['columns']
# data = js['data']
# df = pd.DataFrame(data, columns=columns)
questions = df["question"].values
# shuffle the questions
np.random.shuffle(questions)
print(len(questions))
print(df.head())
# Setup Evaluation Model
# In[28]:
import cohere
def calculate_eval_tokens(eval_grader_prompt, eval_completion):
    prompt_enc = command_nightly_tokenizer.encode(eval_grader_prompt)
    prompt_tokens_count = len(prompt_enc.ids)
    completion_enc = command_nightly_tokenizer.encode(eval_completion)
    completion_token_count = len(completion_enc.ids)
    completion_total_tokens = prompt_tokens_count + completion_token_count
    return completion_total_tokens, prompt_tokens_count, completion_token_count
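# Quick sanity check of the token accounting above (hypothetical strings, commented out):
# total, n_prompt, n_completion = calculate_eval_tokens("grade this answer", "POSITIVE")
# assert total == n_prompt + n_completion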
co = cohere.Client(cohere_api_key)
# ## Run Synth WandBot
# Setup chain variants
# In[27]:
chains = {}
for system_prompt in system_prompts.keys():
    chains[system_prompt] = load_qa_chain(system_prompts[system_prompt], vector_store, chain_type="stuff")
# chains.keys()
# In[29]:
import traceback
import langchain
langchain.debug = False
for i_q, question in enumerate(questions):
    # USER QUERY
    query_id = generate_id(length=16)
    tstamp = timestamps[i_q]
    # RUN CHAIN: pick a system prompt variant at random for each question
    system_prompt = random.choice(list(chains.keys()))
    qa_chain = chains[system_prompt]
    try:
        start_time = time.time()
        with get_openai_callback() as openai_cb:
            response = qa_chain(
                {"question": question},
                callbacks=[WandbTracer(wandb_config)],
                return_only_outputs=False,
            )
        end_time = time.time()
        elapsed_time = end_time - start_time
        answer = response["answer"]
        # RETRIEVED DOCUMENTS
        source_docs = response["source_documents"]
        source_documents = parse_source_documents(source_docs)
        # TOKEN METRICS
        prompt_tokens = openai_cb.prompt_tokens
        completion_tokens = openai_cb.completion_tokens
        total_tokens = openai_cb.total_tokens
        total_cost = openai_cb.total_cost
        successful_requests = openai_cb.successful_requests
        # GENERATE SYNTHETIC USER FEEDBACK
        eval_grader_prompt = command_eval_prompt_constructor(question, source_documents, answer)
        eval_start_time = time.time()
        response = co.generate(
            model=wandb.config.eval_model,
            prompt=eval_grader_prompt,
            max_tokens=50,
            temperature=wandb.config.command_llm_temperature,
            stop_sequences=["====================="],
            truncate="end",
        )
        eval_end_time = time.time()
        eval_elapsed_time = eval_end_time - eval_start_time
        eval_completion = response.generations[0].text
        # Get eval token counts
        eval_total_tokens, eval_prompt_tokens, eval_completion_tokens = calculate_eval_tokens(eval_grader_prompt, eval_completion)
        eval_total_chars = len(eval_grader_prompt) + len(eval_completion)
        eval_cost = eval_total_chars * wandb.config.cohere_generate_cost_usd
        synth_user_feedback = "POSITIVE" if "positive" in eval_completion.lower() else "NEGATIVE"
        # synth_user_feedback = "POSITIVE"
        # LOG TO WANDB
        wandb_table = wandb.Table(columns=table_cols)
        wandb_table.add_data(tstamp, query_id, question, answer, source_documents,
                             synth_user_feedback, elapsed_time,
                             prompt_tokens, completion_tokens, total_tokens,
                             total_cost, successful_requests,
                             system_prompt, system_prompts[system_prompt], human_message_prompt_template, hyde_prompt_template, eval_grader_prompt_template,
                             wandb.run.id, wandb.config.model_name, wandb.config.temperature, wandb.config.hyde_llm_temperature,
                             wandb.config.eval_model, eval_elapsed_time,
                             eval_total_chars, eval_cost, eval_total_tokens, eval_prompt_tokens, eval_completion_tokens)
        wandb.log({"logs/qa_with_eval": wandb_table})
    except Exception as e:
        print(f"Question {i_q}, Error occurred: {e}")
        traceback.print_exc()
        # break
    if i_q % 20 == 0:
        print(i_q)
# In[27]:
print("DONE!")
# In[ ]:
# config_table_cols = ["query_id", "run_id", "system_prompt_template",
#                      "human_message_prompt_template", "hyde_prompt_template", "eval_grader_prompt_template"]
# config_table = wandb.Table(config_table_cols)
# config_table.add_data(query_id, wandb.run.id, system_prompts[system_prompt],
#                       human_message_prompt_template, hyde_prompt_template, eval_grader_prompt_template)
# wandb.log({"logs/config_table_test": config_table})
# In[ ]:
# # ANTHROPIC EVALUATION
# eval_model = "claude-v1.3-100k"  # "claude-v1",
# # anthropic_api = "XXX"
# # client = anthropic.Client(api_key=anthropic_api)
# # max_tokens_to_sample = 100000
# eval_prompt_template = claude_eval_prompt_constructor("", "", "")  # Just to log the eval prompt template
# # eval_prompt = claude_eval_prompt_constructor(question, source_documents, answer)
# eval_start_time = time.time()
# # resp = client.completion(
# #     prompt=eval_prompt,
# #     stop_sequences=[anthropic.HUMAN_PROMPT],
# #     model=eval_model,
# #     max_tokens_to_sample=max_tokens_to_sample,
# # )