Created March 15, 2024 03:17
# Import necessary modules
# !pip install openai==0.28
from decimal import Decimal
import anthropic
import openai
import re
import matplotlib.pyplot as plt
import random
import numpy as np
# Function to generate a prompt for the models
def generate_prompt(a, b):
    return f"What is {a} * {b}? No reasoning, just provide the result."
# Function to extract the result from model responses
def extract_result(text):
    text = text.replace(",", "")
    text = text.replace(".", "")
    # Find the last string of digits in the text
    match = re.findall(r"\d+", text)
    if match:
        return int(match[-1])
    else:
        return None
# Function to test a single model with a given prompt
def test_model(model_name, prompt, anthropic_client):
    while True:
        try:
            if model_name.startswith("claude"):
                response = anthropic_client.messages.create(
                    model=model_name,
                    max_tokens=1000,
                    temperature=0.0,
                    messages=[{"role": "user", "content": prompt}]
                )
                return response.content[0].text, extract_result(response.content[0].text)
            else:
                response = openai.ChatCompletion.create(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0
                )
                return response.choices[0].message.content, extract_result(response.choices[0].message.content)
        except openai.error.APIError as e:
            print(f"OpenAI API Error: {str(e)}")
            print("Retrying...")
        except anthropic.APIError as e:
            print(f"Anthropic API Error: {str(e)}")
            print("Retrying...")
            # time.sleep(5)
        except Exception as e:
            print(f"Unexpected Error: {str(e)}")
            print("Retrying...")
# Function to run the experiment across different models and digit lengths
def run_experiment(models, digit_lengths, anthropic_client, num_samples=100):
    results = {model: [] for model in models}
    for length in digit_lengths:
        print(f"\n\n\n\n ********** Digit Length: {length} **********")
        total_relative_error = {model: Decimal(0) for model in models}
        for sample in range(num_samples):
            a = random.randint(10 ** (length - 1), 10 ** length - 1)
            b = random.randint(10 ** (length - 1), 10 ** length - 1)
            expected_result = a * b
            prompt = generate_prompt(a, b)
            print(f"\n\nSample {sample + 1}:")
            # print(f"Prompt: {prompt}")
            print(f"Expected Result: {expected_result}")
            for model in models:
                model_output, predicted_result = test_model(model, prompt, anthropic_client)
                if predicted_result is not None:
                    relative_error = min(abs(Decimal(predicted_result - expected_result) / Decimal(expected_result)), Decimal(1))
                else:
                    # Count unparseable outputs as the maximum error; use Decimal
                    # (not a float) so the running total stays a Decimal
                    relative_error = Decimal(1)
                total_relative_error[model] += relative_error
                print(f"\nModel: {model}")
                print(f"Output: {model_output}")
                print(f"Predicted Result: {predicted_result}")
                print(f"\n Relative Error: {relative_error:.4f}")
        for model in models:
            avg_relative_error = total_relative_error[model] / num_samples
            results[model].append(avg_relative_error)
            print(f"\nModel: {model}")
            print(f"Average Relative Error: {avg_relative_error:.4f}")
    return results
# Function to plot the results of the experiment
def plot_results(results, digit_lengths):
    claude_colors = ['#FF0000', '#FF4500', '#FF6347']  # Shades of red for Claude models
    openai_colors = ['#0000FF', '#1E90FF', '#00BFFF']  # Shades of blue for OpenAI models
    markers = ['o', 's', '^', 'D', 'v', '<']  # Different markers for each model
    for i, (model, relative_errors) in enumerate(results.items()):
        # Create mask for missing data points
        mask = [True] * len(digit_lengths)
        for j, length in enumerate(digit_lengths):
            if j >= len(relative_errors):
                mask[j] = False
        # Plot only the available data points
        x = np.array(digit_lengths)[mask]
        y = np.array(relative_errors, dtype=float)  # Convert Decimal averages to float
        # Calculate the standard deviation across the curve's data points
        std_dev = np.std(y)
        # Determine the color based on the model type
        if model.startswith('claude'):
            color = claude_colors[i % len(claude_colors)]
        else:
            color = openai_colors[i % len(openai_colors)]
        # Plot the average curve with increased linewidth and marker
        plt.plot(x, y, label=model, color=color, linewidth=3, marker=markers[i % len(markers)], markersize=10)
        # Clip the error bars to be within the range [0, 1]
        lower_bound = np.clip(y - std_dev, 0, 1)
        upper_bound = np.clip(y + std_dev, 0, 1)
        # Fill the shaded region around the average curve
        plt.fill_between(x, lower_bound, upper_bound, color=color, alpha=0.2)
    # Increase font sizes
    plt.xlabel("Number of Digits", fontsize=16)
    plt.ylabel("Relative Error", fontsize=16)
    plt.title("Relative Error vs. Number of Digits", fontsize=18)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.legend(fontsize=12)
    plt.tight_layout()
# Replace with your actual API keys
anthropic_api_key = "your key buddy"
openai_api_key = "your key buddy"
# Initialize the Anthropic client
anthropic_client = anthropic.Anthropic(api_key=anthropic_api_key)
# Set the OpenAI API key
openai.api_key = openai_api_key
# List of models to test
models = ["claude-3-haiku-20240307", "claude-3-sonnet-20240229", "claude-3-opus-20240229", "gpt-3.5-turbo", "gpt-4-0314", "gpt-4-1106-preview"]
# Digit lengths to test: multiplication of 5-digit through 25-digit numbers
digit_lengths = range(5, 26, 1)
# Run the experiment
results = run_experiment(models, digit_lengths, anthropic_client)
# Plot the results
plot_results(results, digit_lengths)
plt.show()
@anadim Thank you for sharing this experiment! I think it may be interesting to control for the length of the predicted output (and perhaps compare the error rate only for the subset where the model gets the length of the predicted output right).

Here are two randomly generated 25-digit numbers where each digit is required to be an even number, and I asked `claude-3-haiku-20240307` for their multiplication. When you switch to `claude-3-sonnet-20240229`, the model response repeats `524` consecutively 997 times before reaching the max-token limit of 1000. And finally, when you use `claude-3-opus-20240229`, the model outputs a 50-digit response as expected, though it is the wrong answer.

p.s. I should do the control experiment, but just limiting the input digits to only even numbers shows a similar pattern of relative error rates across models. `gpt-4-0314` is not included because it is deprecated; I used `n=20` samples and `digit_lengths=range(5, 26, 4)` for quick iteration.
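For anyone who wants to try the control described above, here is a minimal sketch of both tweaks: drawing factors whose digits are all even, and restricting the error average to samples where the model's answer has the correct digit count. It reuses the gist's `test_model` and `generate_prompt`; the helper names `random_even_digit_number` and `digit_length_matches` are my own, not part of the original script.

```python
import random
from decimal import Decimal

def random_even_digit_number(length):
    # Draw the leading digit from {2, 4, 6, 8} so the number has exactly
    # `length` digits, then fill the rest from {0, 2, 4, 6, 8}.
    first = random.choice("2468")
    rest = "".join(random.choice("02468") for _ in range(length - 1))
    return int(first + rest)

def digit_length_matches(predicted, expected):
    # True when the model's answer at least has the right number of digits.
    return predicted is not None and len(str(predicted)) == len(str(expected))

# Usage inside the sampling loop (hypothetical variable names):
# a = random_even_digit_number(25)
# b = random_even_digit_number(25)
# expected = a * b
# _, predicted = test_model(model, generate_prompt(a, b), anthropic_client)
# if digit_length_matches(predicted, expected):
#     errors.append(min(abs(Decimal(predicted - expected) / Decimal(expected)), Decimal(1)))
```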