Evaluate the output of Large Language Models based on how helpful the results are, from "Not helpful" to "Highly helpful", on a Likert scale.
import logging
import os
import pathlib
import re
import time
from typing import Any

import jsonlines
import openai
import torch
from rlhf_trl.args import PPOArgs
from rlhf_trl.evaluate.utils import load_models
from rlhf_trl.evaluate.utils import load_pipelines
from rlhf_trl.evaluate.utils import reduce_context_length
from rlhf_trl.predict import generate_with_model
from rlhf_trl.predict import generate_with_pipeline
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

CHATGPT_ANNOTATOR_PROMPT = """You are evaluating a response that has been submitted for a particular task, using a specific set of standards. Below is the data:
[BEGIN DATA]
***
[Task]: {}
***
[Submission]: {}
***
[Criterion]: helpfulness:
"1": "Not helpful - The generated text is completely irrelevant, unclear, or incomplete. It does not provide any useful information to the user."
"2": "Somewhat helpful - The generated text has some relevance to the user’s question, but it may be unclear or incomplete. It provides only
partial information, or the information provided may not be useful for the user’s needs."
"3": "Moderately helpful - The generated text is relevant to the user’s question, and it provides a clear and complete answer. However, it may
lack detail or explanation that would be helpful for the user."
"4": "Helpful - The generated text is quite relevant to the user’s question, and it provides a clear, complete, and detailed answer. It offers
additional information or explanations that are useful for the user. However, some of the points of the response are somewhat repetitive or could
be combined for greater clarity and concision."
"5": "Very helpful - The generated text is highly relevant to the user’s question, and it provides a clear, complete, and detailed answer. It offers
additional information, explanations, or analogies that are not only useful but also insightful and valuable to the user. However, the structure
of the response is not well-organized and there is no clear progression or logical sequence of different points in the response."
"6": "Highly helpful - The generated text provides a clear, complete, and detailed answer. It offers additional information or explanations that
are not only useful but also insightful and valuable to the user. The response is also organized in a logical and easy-to-follow manner by explicitly
using headings, bullet points, or numbered lists to break up the information and make it easier to read."
***
[END DATA]
Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your
conclusion is correct. Avoid simply stating the correct answers at the outset. Then print the choice only from “1, 2, 3, 4, 5, 6” (without quotes
or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the selected choice again by itself on a new line."""

# Regex pattern to match a single digit.
SINGLE_DIGIT_PATTERN = re.compile(r'\b\d\b')


def api_request(
    prompt: str,
    openai_model: str = 'gpt-3.5-turbo',
    max_tokens: int = 512,
) -> str:
    """Send a request to the OpenAI API and return the response.

    Args:
        prompt (str): The prompt to send to the API.
        openai_model (str, optional): The OpenAI model to use.
            Defaults to 'gpt-3.5-turbo'.
        max_tokens (int, optional): The maximum number of tokens to generate.
            Defaults to 512.

    Returns:
        str: The response from the API.

    """
    # Set the OpenAI API key.
    openai.api_key = os.getenv('OPENAI_API_KEY')

    # Send the request (uses the legacy, pre-1.0 `openai` SDK interface).
    output = openai.ChatCompletion.create(
        model=openai_model,
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0.7,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return output['choices'][0]['message']['content']

def chatgpt_output(
    prompt: str,
    openai_model: str = 'gpt-3.5-turbo',
    max_tokens: int = 512,
    retry_secs: int = 5,
) -> tuple[bool, str]:
    """Send a request to the OpenAI API and return the response.

    Args:
        prompt (str): The prompt to send to the API.
        openai_model (str, optional): The OpenAI model to use.
            Defaults to 'gpt-3.5-turbo'.
        max_tokens (int, optional): The maximum number of tokens to generate.
            Defaults to 512.
        retry_secs (int, optional): The number of seconds to wait before retrying.
            Defaults to 5.

    Returns:
        tuple[bool, str]: A tuple containing a boolean indicating whether
            the request was successful and the response.

    """
    success = False
    while True:
        try:
            output = api_request(
                prompt=prompt,
                openai_model=openai_model,
                max_tokens=max_tokens,
            )
            success = True
            break
        except openai.InvalidRequestError as e:
            logging.error(f'Failed to get response: {e}')
            logging.error(f'Error: {e.error["code"]}')
            if e.error['code'] == 'content_length_exceeded':
                # Return the error code so the caller can shrink the prompt and retry.
                output = e.error['code']
                break
            else:
                # Retry in `retry_secs` seconds.
                time.sleep(retry_secs)
                continue
    return success, output

def evaluate_with_chatgpt(  # noqa: C901
    args: PPOArgs,
    model_map: dict[str, str],
    tokenizer: AutoTokenizer,
    loader: DataLoader[Any],
    use_pipeline: bool = False,
    device: torch.device | str = 'cpu',
    **gen_kwargs: Any,
) -> None:
    """Evaluate the model with ChatGPT evaluation score.

    Args:
        args (PPOArgs): The script arguments.
        model_map (dict[str, str]): The model name to model mapping.
        tokenizer (AutoTokenizer): The tokenizer to use.
        loader (DataLoader): The data loader to use.
        use_pipeline (bool, optional): Whether to use the `pipeline` API.
            Defaults to False.
        device (torch.device | str): The device to use.
            Defaults to 'cpu'.
        gen_kwargs: Keyword arguments for the generation function.

    """
    if use_pipeline:
        logging.info('Using pipeline for evaluation.')
        models = load_pipelines(model_map, tokenizer=tokenizer)
    else:
        logging.info('Using model.generate() for evaluation.')
        models = load_models(model_map)

    # Get the eval_name and save_path.
    if args.eval_name is None:
        # Assume path is of format: experiments/<project_name>/<run_name>/model
        # and use <run_name> as the eval_name.
        try:
            eval_name = pathlib.Path(args.ppo_model_name).parent.name
        except Exception as e:
            logging.exception(f'{e}: Unable to get eval_name from {args.ppo_model_name}')
            logging.warning('Using default eval_name: reward-eval')
            eval_name = 'reward-eval'
    else:
        eval_name = args.eval_name

    # Create the save_path.
    os.makedirs(args.eval_save_path, exist_ok=True)
    save_path = os.path.join(args.eval_save_path, f'{eval_name}.jsonl')
    if os.path.isfile(save_path):
        logging.warning(f'{save_path} already exists. Overwriting...')

    # Write the data to the jsonl file.
    with jsonlines.open(save_path, 'w') as writer:
        for batch in tqdm(loader, desc='Evaluating'):
            # model_name -> [(query, model_output, model_score)]
            _mapping: dict[str, list[tuple[str, str, int]]] = {}
            for model_name, model in models.items():
                # Model output.
                if use_pipeline:
                    model_outputs = generate_with_pipeline(
                        pipeline=model,
                        texts=batch['query'],
                        **gen_kwargs,
                    )
                else:
                    model_outputs = generate_with_model(
                        model=model,
                        tokenizer=tokenizer,
                        input_ids=torch.stack(batch['input_ids'], dim=0).to(device),
                        **gen_kwargs,
                    )

                # Get ChatGPT helpfulness scores.
                results = get_chatgpt_scores(
                    queries=batch['query'],
                    outputs=model_outputs,
                    max_tokens=args.max_token,
                )
                _mapping[model_name] = results

            # One record per example: query plus each model's output and score.
            for query in batch['query']:
                _data: dict[str, Any] = {'query': query}
                for name, results in _mapping.items():
                    # Match on the query, since failed examples are skipped in
                    # `get_chatgpt_scores` and indices may not line up.
                    for q, output, score in results:
                        if q == query:
                            _data[f'{name}_output'] = output
                            _data[f'{name}_score'] = score
                            break
                logging.info(_data)
                # Write the record to the jsonl file.
                writer.write(_data)

def get_chatgpt_scores(
    queries: list[str],
    outputs: list[str],
    max_tokens: int = 512,
) -> list[tuple[str, str, int]]:
    """Get ChatGPT scores from (query, output) pairs.

    Args:
        queries (list[str]): The queries.
        outputs (list[str]): The list of outputs.
        max_tokens (int, optional): Maximum number of tokens to generate.
            Defaults to 512.

    Returns:
        list[tuple[str, str, int]]: Query, output & ChatGPT scores.

    """
    results = []
    for query, output in zip(queries, outputs):
        try:
            score = _get_score_from_chatgpt(query, output, max_tokens=max_tokens)
            results.append((query, output, score))
        except Exception as e:
            # Skip examples whose score could not be obtained or parsed.
            logging.exception(f'Error: {e}')
    return results

def _get_score_from_chatgpt(
    query: str,
    output: str,
    max_tokens: int = 512,
) -> int:
    """Get the score from ChatGPT.

    Args:
        query (str): The query.
        output (str): The output.
        max_tokens (int): Maximum number of tokens to generate.
            Defaults to 512.

    Raises:
        ValueError: Failed to get response or failed to parse score.

    Returns:
        int: The score.

    """
    prompt = CHATGPT_ANNOTATOR_PROMPT.format(query, output)
    success, response = chatgpt_output(prompt, max_tokens=max_tokens)
    if not success:
        if response == 'content_length_exceeded':
            # Reduce the context length of the query and try again.
            query = reduce_context_length(query)
            prompt = CHATGPT_ANNOTATOR_PROMPT.format(query, output)
            success, response = chatgpt_output(prompt, max_tokens=max_tokens)
        else:
            # Give up on this query.
            raise ValueError(f'Failed to get response: {response}')

    if success:
        # Parse the score from the last line of the response.
        match = SINGLE_DIGIT_PATTERN.search(response.strip('\n').split('\n')[-1])
        if match:
            return int(match.group())
        _score = response.strip('\n')
        logging.warning(f'\nScore needs to be read manually: {_score}')
    raise ValueError('Failed to parse score.')
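
For a quick manual check, a minimal smoke test along the following lines could be appended to the module. This is a hypothetical sketch, not part of the original script: it assumes OPENAI_API_KEY is set in the environment and that the legacy (pre-1.0) openai SDK used above is installed, and the demo query/output strings are invented for illustration. It exercises only get_chatgpt_scores, so no model checkpoints or DataLoader are needed.

if __name__ == '__main__':
    # Hypothetical smoke test: score a single query/output pair with the
    # ChatGPT annotator prompt defined above.
    logging.basicConfig(level=logging.INFO)

    demo_queries = ['How do I read a CSV file in Python?']
    demo_outputs = [
        'Use the csv module or pandas, e.g. `pd.read_csv("data.csv")`, '
        'then inspect the resulting DataFrame with `df.head()`.',
    ]

    # Each result is a (query, output, score) tuple; failed examples are skipped.
    for query, answer, score in get_chatgpt_scores(demo_queries, demo_outputs):
        logging.info('score=%d for query=%r', score, query)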