Evaluate the output of Large Language Models based on how helpful the results are, from "Not helpful" to "Highly helpful", on a Likert scale.
import logging
import os
import pathlib
import re
import time
from typing import Any

import jsonlines
import openai
import torch
from rlhf_trl.args import PPOArgs
from rlhf_trl.evaluate.utils import load_models
from rlhf_trl.evaluate.utils import load_pipelines
from rlhf_trl.evaluate.utils import reduce_context_length
from rlhf_trl.predict import generate_with_model
from rlhf_trl.predict import generate_with_pipeline
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

CHATGPT_ANNOTATOR_PROMPT = """You are evaluating a response that has been submitted for a particular task, using a specific set of standards. Below is the data:
[BEGIN DATA]
***
[Task]: {}
***
[Submission]: {}
***
[Criterion]: helpfulness:
"1": "Not helpful - The generated text is completely irrelevant, unclear, or incomplete. It does not provide any useful information to the user."
"2": "Somewhat helpful - The generated text has some relevance to the user’s question, but it may be unclear or incomplete. It provides only
partial information, or the information provided may not be useful for the user’s needs."
"3": "Moderately helpful - The generated text is relevant to the user’s question, and it provides a clear and complete answer. However, it may
lack detail or explanation that would be helpful for the user."
"4": "Helpful - The generated text is quite relevant to the user’s question, and it provides a clear, complete, and detailed answer. It offers
additional information or explanations that are useful for the user. However, some of the points of the response are somewhat repetitive or could
be combined for greater clarity and concision."
"5": "Very helpful - The generated text is highly relevant to the user’s question, and it provides a clear, complete, and detailed answer. It offers
additional information, explanations, or analogies that are not only useful but also insightful and valuable to the user. However, the structure
of the response is not well-organized and there is no clear progression or logical sequence of different points in the response."
"6": "Highly helpful - The generated text provides a clear, complete, and detailed answer. It offers additional information or explanations that
are not only useful but also insightful and valuable to the user. The response is also organized in a logical and easy-to-follow manner by explicitly
using headings, bullet points, or numbered lists to break up the information and make it easier to read."
***
[END DATA]
Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your
conclusion is correct. Avoid simply stating the correct answers at the outset. Then print the choice only from “1, 2, 3, 4, 5, 6” (without quotes
or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the selected choice again by itself on a new line."""

# Regex pattern to match a single digit.
SINGLE_DIGIT_PATTERN = re.compile(r'\b\d\b')


def api_request(
    prompt: str,
    openai_model: str = 'gpt-3.5-turbo',
    max_tokens: int = 512,
) -> str:
    """Send a request to the OpenAI API and return the response.

    Args:
        prompt (str): The prompt to send to the API.
        openai_model (str, optional): The OpenAI model to use.
            Defaults to 'gpt-3.5-turbo'.
        max_tokens (int, optional): The maximum number of tokens to generate.
            Defaults to 512.

    Returns:
        str: The response from the API.

    """
    # Set the OpenAI API key.
    openai.api_key = os.getenv('OPENAI_API_KEY')

    # Send the request (uses the legacy, pre-1.0 `openai` SDK interface).
    output = openai.ChatCompletion.create(
        model=openai_model,
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0.7,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return output['choices'][0]['message']['content']

def chatgpt_output(
    prompt: str,
    openai_model: str = 'gpt-3.5-turbo',
    max_tokens: int = 512,
    retry_secs: int = 5,
) -> tuple[bool, str]:
    """Send a request to the OpenAI API and return the response.

    Args:
        prompt (str): The prompt to send to the API.
        openai_model (str, optional): The OpenAI model to use.
            Defaults to 'gpt-3.5-turbo'.
        max_tokens (int, optional): The maximum number of tokens to generate.
            Defaults to 512.
        retry_secs (int, optional): The number of seconds to wait before retrying.
            Defaults to 5.

    Returns:
        tuple[bool, str]: A tuple containing a boolean indicating whether
            the request was successful and the response.

    """
    success = False
    while True:
        try:
            output = api_request(
                prompt=prompt,
                openai_model=openai_model,
                max_tokens=max_tokens,
            )
            success = True
            break
        except openai.InvalidRequestError as e:
            logging.error(f'Failed to get response: {e}')
            logging.error(f'Error: {e.error["code"]}')
            if e.error['code'] == 'content_length_exceeded':
                # Return the error code so the caller can shrink the prompt and retry.
                output = e.error['code']
                break
            else:
                # Retry in `retry_secs` seconds.
                time.sleep(retry_secs)
                continue
    return success, output

def evaluate_with_chatgpt(  # noqa: C901
    args: PPOArgs,
    model_map: dict[str, str],
    tokenizer: AutoTokenizer,
    loader: DataLoader[Any],
    use_pipeline: bool = False,
    device: torch.device | str = 'cpu',
    **gen_kwargs: Any,
) -> None:
    """Evaluate the model with ChatGPT evaluation score.

    Args:
        args (PPOArgs): The script arguments.
        model_map (dict[str, str]): The model name to model mapping.
        tokenizer (AutoTokenizer): The tokenizer to use.
        loader (DataLoader): The data loader to use.
        use_pipeline (bool, optional): Whether to use the `pipeline` API.
            Defaults to False.
        device (torch.device | str): The device to use.
            Defaults to 'cpu'.
        gen_kwargs: Keyword arguments for the generation function.

    """
    if use_pipeline:
        logging.info('Using pipeline for evaluation.')
        models = load_pipelines(model_map, tokenizer=tokenizer)
    else:
        logging.info('Using model.generate() for evaluation.')
        models = load_models(model_map)

    # Get the eval_name and save_path.
    if args.eval_name is None:
        # Assume path is of format: experiments/<project_name>/<run_name>/model
        # and use <run_name> as the eval_name.
        try:
            eval_name = pathlib.Path(args.ppo_model_name).parent.name
        except Exception as e:
            logging.exception(f'{e}: Unable to get eval_name from {args.ppo_model_name}')
            logging.warning('Using default eval_name: reward-eval')
            eval_name = 'reward-eval'
    else:
        eval_name = args.eval_name

    # Create the save_path.
    os.makedirs(args.eval_save_path, exist_ok=True)
    save_path = os.path.join(args.eval_save_path, f'{eval_name}.jsonl')
    if os.path.isfile(save_path):
        logging.warning(f'{save_path} already exists. Overwriting...')

    # Write the data to the jsonl file.
    with jsonlines.open(save_path, 'w') as writer:
        for batch in tqdm(loader, desc='Evaluating'):
            # model_name -> [(query, model_output, model_score)]
            _mapping: dict[str, list[tuple[str, str, int]]] = {}
            for model_name, model in models.items():
                # Model output.
                if use_pipeline:
                    model_outputs = generate_with_pipeline(
                        pipeline=model,
                        texts=batch['query'],
                        **gen_kwargs,
                    )
                else:
                    model_outputs = generate_with_model(
                        model=model,
                        tokenizer=tokenizer,
                        input_ids=torch.stack(batch['input_ids'], dim=0).to(device),
                        **gen_kwargs,
                    )

                # Get ChatGPT helpfulness scores.
                results = get_chatgpt_scores(
                    queries=batch['query'],
                    outputs=model_outputs,
                    max_tokens=args.max_token,
                )
                _mapping[model_name] = results

            # One record per example: query plus each model's output and score.
            for query in batch['query']:
                _data: dict[str, Any] = {'query': query}
                for name, results in _mapping.items():
                    # Match on the query, since failed examples are skipped in
                    # `get_chatgpt_scores` and indices may not line up.
                    for q, output, score in results:
                        if q == query:
                            _data[f'{name}_output'] = output
                            _data[f'{name}_score'] = score
                            break
                logging.info(_data)
                # Write the record to the jsonl file.
                writer.write(_data)

def get_chatgpt_scores(
    queries: list[str],
    outputs: list[str],
    max_tokens: int = 512,
) -> list[tuple[str, str, int]]:
    """Get ChatGPT scores from (query, output) pairs.

    Args:
        queries (list[str]): The queries.
        outputs (list[str]): The list of outputs.
        max_tokens (int, optional): Maximum number of tokens to generate.
            Defaults to 512.

    Returns:
        list[tuple[str, str, int]]: Query, output & ChatGPT scores.

    """
    results = []
    for query, output in zip(queries, outputs):
        try:
            score = _get_score_from_chatgpt(query, output, max_tokens=max_tokens)
            results.append((query, output, score))
        except Exception as e:
            # Skip examples whose score could not be obtained or parsed.
            logging.exception(f'Error: {e}')
    return results

def _get_score_from_chatgpt(
    query: str,
    output: str,
    max_tokens: int = 512,
) -> int:
    """Get the score from ChatGPT.

    Args:
        query (str): The query.
        output (str): The output.
        max_tokens (int): Maximum number of tokens to generate.
            Defaults to 512.

    Raises:
        ValueError: Failed to get response or failed to parse score.

    Returns:
        int: The score.

    """
    prompt = CHATGPT_ANNOTATOR_PROMPT.format(query, output)
    success, response = chatgpt_output(prompt, max_tokens=max_tokens)
    if not success:
        if response == 'content_length_exceeded':
            # Reduce the context length of the query and try again.
            query = reduce_context_length(query)
            prompt = CHATGPT_ANNOTATOR_PROMPT.format(query, output)
            success, response = chatgpt_output(prompt, max_tokens=max_tokens)
        else:
            # Give up on this query.
            raise ValueError(f'Failed to get response: {response}')

    if success:
        # Parse the score from the last line of the response.
        match = SINGLE_DIGIT_PATTERN.search(response.strip('\n').split('\n')[-1])
        if match:
            return int(match.group())
        _score = response.strip('\n')
        logging.warning(f'\nScore needs to be read manually: {_score}')
    raise ValueError('Failed to parse score.')
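
For a quick manual check, a minimal smoke test along the following lines could be appended to the module. This is a hypothetical sketch, not part of the original script: it assumes OPENAI_API_KEY is set in the environment and that the legacy (pre-1.0) openai SDK used above is installed, and the demo query/output strings are invented for illustration. It exercises only get_chatgpt_scores, so no model checkpoints or DataLoader are needed.

if __name__ == '__main__':
    # Hypothetical smoke test: score a single query/output pair with the
    # ChatGPT annotator prompt defined above.
    logging.basicConfig(level=logging.INFO)

    demo_queries = ['How do I read a CSV file in Python?']
    demo_outputs = [
        'Use the csv module or pandas, e.g. `pd.read_csv("data.csv")`, '
        'then inspect the resulting DataFrame with `df.head()`.',
    ]

    # Each result is a (query, output, score) tuple; failed examples are skipped.
    for query, answer, score in get_chatgpt_scores(demo_queries, demo_outputs):
        logging.info('score=%d for query=%r', score, query)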