# Source: GitHub Gist vwxyzjn/ff9f5759444c02e4142d54e907d3f964 by @vwxyzjn,
# created November 22, 2024 04:25.
import json
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi, snapshot_download
from src.md import ABOUT_TEXT, BOTTOM_TEXT, TOP_TEXT
# Shared Hugging Face Hub client; restart_space() calls it.
api = HfApi()
# evals_repo = os.environ.get("EVALS_REPO")
# Hub repository the evaluation results are downloaded from.
evals_repo = "allenai/tulu-3-evals"
# Local directory the repository snapshot is written to.
repo_dir = "./evals/"
def restart_space():
    """Ask the Hugging Face Hub to restart the leaderboard Space."""
    space_id = "allenai/tulu-3-leaderboard"
    api.restart_space(repo_id=space_id)
def export_to_csv(dataframe):
    """Write ``dataframe`` to ``exported_results.csv`` without its index and return that path."""
    output_path = "exported_results.csv"
    dataframe.to_csv(path_or_buf=output_path, index=False)
    return output_path
print("Pulling evaluation results")
# Mirror the evals dataset repository into repo_dir before it is parsed below.
repo = snapshot_download(
    repo_id=evals_repo,
    repo_type="dataset",
    local_dir=repo_dir,
    ignore_patterns=[],
    tqdm_class=None,
    etag_timeout=30,
)
# now we have downloaded the dataset, walk through the results directory.
# each folder should be a model, and each file in the folder should be a result
results_dir = os.path.join(repo_dir, os.environ.get("RESULTS_DIR", "results"))
model_results = {}  # model folder -> {task name -> score}
model_metadata = {}  # model folder -> parsed metadata.json
timestamped_results = {}  # "<model>-<task>" -> timestamp of the result currently kept
for folder in os.listdir(results_dir):
    folder_path = os.path.join(results_dir, folder)
    if not os.path.isdir(folder_path):
        continue
    # try to load up model metadata
    metadata_path = os.path.join(folder_path, "metadata.json")
    if os.path.isfile(metadata_path):
        try:
            with open(metadata_path, "r", encoding="utf-8") as f:
                model_metadata[folder] = json.load(f)
        except json.decoder.JSONDecodeError:
            print(f"Error reading metadata for {folder}")
    model_results[folder] = {}
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        # metadata.json was handled above; it is not an evaluation result and
        # would otherwise be reported as an unreadable result file.
        if file == "metadata.json" or not os.path.isfile(file_path):
            continue
        # grab the timecode from the file name; format is <date>-<time>-<name...>
        timestamp = "-".join(file.split("-")[:2])
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                first_line = f.readline()
            data = json.loads(first_line)
        except (json.decoder.JSONDecodeError, UnicodeDecodeError):
            print(f"Error reading {file_path}")
            continue
        try:
            score = data["metrics"]["primary_score"]
            dataset_name = data["task_name"]
        except (KeyError, TypeError):
            # TypeError covers payloads that are valid JSON but not objects
            print(f"Error reading {file_path}, wrong format.")
            continue
        # only keep numeric, non-nan scores; np.isnan would raise on None or strings
        if not isinstance(score, (int, float)) or np.isnan(score):
            continue
        result_key = folder + "-" + dataset_name
        previous_timestamp = timestamped_results.get(result_key)
        # only update if the timestamp is more recent than the one already kept
        if previous_timestamp is not None and not previous_timestamp < timestamp:
            print(
                "skipping",
                result_key,
                "because",
                previous_timestamp,
                "is more recent than",
                timestamp,
            )
            continue
        model_results[folder][dataset_name] = score
        if dataset_name == "minerva_math::tulu" and "exact_match_flex_macro" in data["metrics"]:
            model_results[folder]["math::flex"] = data["metrics"]["exact_match_flex_macro"]
        timestamped_results[result_key] = timestamp
# clean: remove models that don't have any evals
models_to_remove = [model for model, results in model_results.items() if not results]
for model in models_to_remove:
    print(f"Removing {model} because it has no evals")
    del model_results[model]
    # metadata.json is optional, so the model may have no metadata entry;
    # a plain `del` would raise KeyError for such folders.
    model_metadata.pop(model, None)
# Attach the wandb path from the model's metadata, when present, as "link".
for model_name, scores in model_results.items():
    if model_name not in model_metadata:
        continue
    meta = model_metadata[model_name]
    if "wandb_path" in meta:
        scores['link'] = meta["wandb_path"]
# Every model must expose the same evals: collect the union of eval names
# ("link" is not an eval) and fill the gaps with nan.
evals = {name for scores in model_results.values() for name in scores}
evals.discard("link")
for scores in model_results.values():
    for name in evals:
        scores.setdefault(name, np.nan)
# now, turn into dataframe. Columns are evals, rows are models
df = pd.DataFrame(model_results).T
import wandb

wandb_api = wandb.Api()


def _metric_changed(old, new):
    """Return True when ``new`` differs from ``old``; two missing (NaN) values count as equal."""
    try:
        if pd.isna(old) and pd.isna(new):
            return False
    except (TypeError, ValueError):
        # non-scalar summary values cannot be NaN-checked; fall back to plain comparison
        pass
    return old != new


# Copy each model's scores into the summary of its linked wandb run under "oe-eval/<eval>".
for index, row in df.iterrows():
    if "link" not in row or not isinstance(row["link"], str):
        continue
    try:
        link = row["link"].replace("https://wandb.ai/", "")
        wandb_run = wandb_api.run(link)
        modified = False
        for key in row.keys():
            if key == "link":
                continue
            summary_key = f"oe-eval/{key}"
            if summary_key not in wandb_run.summary or _metric_changed(wandb_run.summary[summary_key], row[key]):
                wandb_run.summary[summary_key] = row[key]
                modified = True
        if not modified:
            print(f"Already logged metrics to {wandb_run.url}")
        else:
            wandb_run.update()
            print(f"Logged metrics to {wandb_run.url}")
    except Exception as e:
        # Best-effort sync: a failing run must not stop the leaderboard from
        # starting, so report it and move on (no interactive breakpoint here).
        print(f"Error logging metrics to {row['link']}: {e}")
# Convert scores to percentages. "link" is not a score and alpaca_eval is
# already in percentage, so both are left untouched.
not_scaled = ("link", "alpaca_eval")
for col in df.columns:
    if col not in not_scaled:
        df[col] = df[col] * 100
# Guarantee a "link" column (all nan when no model has one).
if "link" not in df.columns:
    df["link"] = np.nan
# Row-wise mean over every column except "link".
score_columns = df.drop(columns=["link"])
df["Average"] = score_columns.mean(axis=1)
# Move the model names from the index into a "Model" column.
df.index.name = "Model"
df = df.reset_index()
# Best average first; ties broken by model name.
df = df.sort_values(by=["Average", "Model"], ascending=[False, True])
def regex_table(dataframe, regex):
    """
    Return the rows of ``dataframe`` whose "Model" value matches ``regex``.

    ``regex`` is a comma-separated list of regular expressions (``None`` is
    treated as an empty search). A row is kept when any expression matches,
    case-insensitively. Empty entries (e.g. from a trailing comma) are ignored,
    and an entry that is not a valid regular expression is matched literally.
    An empty search keeps every row.

    The result is a new frame (the input is not modified) with a 1-based
    "Rank" column, "Average" and eval columns rounded to one decimal, missing
    eval cells shown as the string "NaN", and columns ordered as
    Rank, Model, link, Average, then the remaining columns alphabetically.
    ``dataframe`` must contain "Model", "link" and "Average" columns.
    """
    import re

    patterns = []
    for raw_pattern in (regex or "").split(","):
        pattern = raw_pattern.strip()
        if not pattern:
            # an empty alternative would match every model name
            continue
        try:
            re.compile(pattern)
        except re.error:
            # half-typed or invalid expression: search for it verbatim
            pattern = re.escape(pattern)
        patterns.append(pattern)
    # Join the patterns with '|' acting as OR
    combined_regex = "|".join(patterns)
    # Copy the filtered rows so the column assignments below never write into a view
    matches = dataframe["Model"].str.contains(combined_regex, case=False, na=False)
    data = dataframe[matches].copy()
    data.reset_index(drop=True, inplace=True)
    data["Rank"] = np.arange(1, 1 + len(data))
    if "Average" in data.columns:
        data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
    # replace blank cells with 'NaN' and round all others to 1 decimal
    for col in data.columns:
        if col not in ["Model", "Average", "Rank", "link"]:
            values = np.array(data[col].replace("", np.nan).values).astype(float)
            rounded = pd.Series(np.round(values, 1), index=data.index)
            data[col] = rounded.astype(object).where(rounded.notna(), "NaN")
    # put rank, model, link and average first
    leading = ["Rank", "Model", "link", "Average"]
    data = data[leading + [col for col in sorted(data.columns) if col not in leading]]
    return data
# pieces of text that map to a certain base model
# Keys are the base-model choices offered in the UI; each value lists extra
# substrings that update_table() accepts as identifying that base model.
base_model_mapping = {
    "llama-3-8b": ["Llama-3-8B", "llama-3-8b", "llama_3.0_tulu_2_8b", "llama-3-8b", "llama_3_8b-"],
    "llama-3.1-8b": ["Llama-3.1-8B", "llama_3.1_tulu_2_8b", "llama_31_tulu_2_8b"],
    "llama-3.1-70b": [],
    "olmo-1.7": [],
    "olmoe": [],
    "llama-2-7b": [],
    "pythia": [],
    "other": [],
}
# Training-stage choices offered in the UI.
stages = ["sft", "merge", "rlhf"]
# Within this file, only referenced from commented-out code in update_table().
rlhf_tags = ["rlhf", "dpo", "reject", "ppo", "online-dpo"]
def update_table(show_columns, base_models, selected_stages, regex):
    """
    Build the styled leaderboard table for the current UI selections.

    Args:
        show_columns: eval column names to display; names that are not columns
            of the table are ignored.
        base_models: selected keys of ``base_model_mapping``; "other" selects
            models that match no known base model.
        selected_stages: selected entries of ``stages``.
        regex: comma-separated model search string, passed to ``regex_table``.

    Returns:
        A pandas Styler over the filtered table. "Average" is recomputed over
        the displayed eval columns, rows are ranked by it, and missing cells
        are highlighted.
    """
    import html

    filtered_df = regex_table(df.copy(), regex)
    # Preset buttons may name evals that were never reported; skip them
    # instead of failing with a KeyError.
    fixed_columns = ["Rank", "Model", "link"]
    available = set(filtered_df.columns)
    columns_to_display = fixed_columns + [
        col for col in sorted(show_columns) if col in available and col not in fixed_columns
    ]
    filtered_df = filtered_df[columns_to_display].copy()

    # Filter model names by the base_models selected: the mapping entries and
    # the key itself are the permitted patterns.
    permitted_models = []
    include_other = False
    for base_model in base_models:
        if base_model.lower() == "other":
            include_other = True
        else:
            permitted_models.extend(base_model_mapping.get(base_model.lower(), []))
            # include the key name
            permitted_models.append(base_model)
    if permitted_models or include_other:
        permitted_lower = [pattern.lower() for pattern in permitted_models]
        # Patterns of every specific base model (key names included), used to
        # decide whether a model counts as "other".
        known_lower = [
            pattern.lower()
            for key, patterns in base_model_mapping.items()
            if key != "other"
            for pattern in [key, *patterns]
        ]

        def model_filter(model_name):
            # if no base model name is found in the metadata, use the model name itself
            base_model_name = model_metadata.get(model_name, {}).get("base_model", "") or model_name
            haystack = base_model_name.lower()
            if any(pattern in haystack for pattern in permitted_lower):
                return True
            return include_other and not any(pattern in haystack for pattern in known_lower)

        # An explicit bool dtype keeps an empty mask from being read as a list of column labels.
        base_mask = pd.Series(
            [model_filter(name) for name in filtered_df["Model"]],
            index=filtered_df.index,
            dtype=bool,
        )
        filtered_df = filtered_df[base_mask].copy()

    # filtering stages, based on the "model_type" recorded in the model metadata
    # sft_list = filtered_df["Model"].apply(lambda x: "merge" not in x.lower() and not any(tag in x for tag in rlhf_tags))
    # merge_list = filtered_df["Model"].apply(lambda x: "merge" in x.lower())
    # rlhf_list = filtered_df["Model"].apply(lambda x: any(tag in x for tag in rlhf_tags))
    if set(stages) <= set(selected_stages):
        # every stage selected: no need to filter
        pass
    elif selected_stages:
        wanted_types = []
        if "sft" in selected_stages:
            wanted_types.append("sft")
        if "merge" in selected_stages:
            wanted_types.append("merge")
        if "rlhf" in selected_stages:
            wanted_types.extend(["ppo", "dpo"])
        stage_mask = pd.Series(
            [model_metadata.get(name, {}).get("model_type", "") in wanted_types for name in filtered_df["Model"]],
            index=filtered_df.index,
            dtype=bool,
        )
        filtered_df = filtered_df[stage_mask].copy()
    else:
        # no models
        filtered_df = filtered_df.iloc[0:0].copy()

    def highlight_nan(val):
        if val == "NaN" or pd.isna(val):
            return "background-color: #89CFF0"
        else:
            return ""

    # Recompute Average over the displayed evals only; the previous "Average"
    # column covers all evals and must not be averaged in.
    eval_columns = [col for col in filtered_df.columns[3:] if col != "Average"]
    numeric = pd.DataFrame(
        {col: pd.to_numeric(filtered_df[col], errors="coerce") for col in eval_columns},
        index=filtered_df.index,
    )
    if eval_columns:
        average = numeric.mean(axis=1)
    else:
        average = pd.Series(np.nan, index=filtered_df.index, dtype=float)
    filtered_df["Average"] = average
    # Sort while Average is still numeric: the formatted strings would sort
    # lexicographically ("9.50" above "10.00"). Stable sort keeps prior order on ties.
    filtered_df = filtered_df.sort_values(by="Average", ascending=False, kind="stable")
    # Show one decimal place for evals, two for the average, 'NaN' for missing values
    for col in eval_columns:
        filtered_df[col] = numeric[col].map(lambda x: f"{x:.1f}" if pd.notna(x) else "NaN")
    filtered_df["Average"] = filtered_df["Average"].map(lambda x: f"{x:.2f}" if pd.notna(x) else "NaN")
    # update rank
    filtered_df["Rank"] = np.arange(1, 1 + len(filtered_df))
    # Highlight everything after the Rank / Model / link columns
    columns_to_highlight = filtered_df.columns[3:]

    # replace model name with link if available
    def apply_link_name(link, model_name):
        # names and links come from repository data: escape them before building HTML
        return "<a href='" + html.escape(str(link), quote=True) + "'>" + html.escape(str(model_name)) + "</a>"

    filtered_df["Model"] = [
        apply_link_name(link, name) if not pd.isna(link) else name
        for link, name in zip(filtered_df["link"], filtered_df["Model"])
    ]
    # drop link
    filtered_df = filtered_df.drop(columns=["link"])
    # Apply the styling only to the selected columns. Styler.map replaces the
    # deprecated Styler.applymap in newer pandas; use whichever exists.
    styler = filtered_df.style
    style_cells = getattr(styler, "map", None) or styler.applymap
    return style_cells(highlight_nan, subset=columns_to_highlight)
# Preset column selections returned to the evaluation checkbox group.
reasoning_evals = ["Average", "gsm8k", "minerva_math::llama3", "oi_MATH_cot", "oi_bbh_cot"]
code_evals = ["Average", "codex_humaneval", "codex_humanevalplus"]


def select_all_columns():
    """Return the module-level ``columns`` list (every selectable column)."""
    return columns


def select_reasoning_columns():
    """Return the reasoning preset."""
    return reasoning_evals


def show_code_columns():
    """Return the code preset."""
    return code_evals


def show_math_and_code_columns():
    """Return the reasoning preset followed by the code evals (without a second "Average")."""
    return [*reasoning_evals, *code_evals[1:]]


def show_instruction_columns():
    """Return the instruction-following preset."""
    instruction_evals = ["Average", "drop", "ifeval", "oi_alpaca_eval", "oi_alpaca_eval_2", "alpaca_eval"]
    return instruction_evals


def select_no_columns():
    """Return an empty selection."""
    return list()
total_models = len(df)
# Selectable columns, alphabetically sorted; Rank, Model and link are always
# shown by update_table() and therefore not offered as choices.
columns = sorted(col for col in df.columns if col not in ("Rank", "Model", "link"))
# NOTE(review): the theme string below looks mangled by e-mail obfuscation in
# the page this file was copied from; it is reproduced unchanged - confirm the
# intended theme id. The nesting of the layout is reconstructed from the
# statement order - confirm against the deployed app.
with gr.Blocks(theme="allenai/[email protected]") as app:
    # Header: intro text next to the logo.
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(TOP_TEXT)
        with gr.Column(scale=4):
            gr.Markdown(
                """
![](file/src/logo.png)
"""
            )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 Tulu 3 Leaderboard"):
            with gr.Row():
                search_1 = gr.Textbox(
                    label="Model Search (delimit with , )",
                    placeholder="Model Search (delimit with , )",
                    show_label=False,
                )
            with gr.Accordion("Additional Options", open=False):
                with gr.Row():
                    checkboxes = gr.CheckboxGroup(
                        label="Select evaluations to display",
                        choices=columns,
                        value=columns,  # default all columns selected
                        show_label=True,
                    )
                with gr.Row():
                    # All preset buttons share the same size and CSS class.
                    small_button = dict(size="sm", elem_classes="smaller-font-button")
                    show_all_button = gr.Button("Show All Evals", variant="primary", **small_button)
                    show_none_button = gr.Button("Show No Evals", variant="primary", **small_button)
                    show_reasoning_button = gr.Button("Show Reasoning Evals", variant="secondary", **small_button)
                    show_code_button = gr.Button("Show Code Evals", variant="secondary", **small_button)
                    show_code_and_math_button = gr.Button(
                        "Show Math and Code Evals", variant="secondary", **small_button
                    )
                    show_instruction_button = gr.Button("Show Instruction Evals", variant="secondary", **small_button)
                with gr.Row():
                    with gr.Column(scale=6):
                        # checkbox group to select base models
                        model_options = [key.lower() for key in base_model_mapping.keys()]
                        base_model_checkboxes = gr.CheckboxGroup(
                            label="Select base models",
                            choices=model_options,
                            value=model_options,
                            show_label=True,
                        )
                    with gr.Column(scale=3):
                        # checkbox group for training stages
                        training_stages = gr.CheckboxGroup(
                            label="Select training stages",
                            choices=stages,
                            value=stages,
                            show_label=True,
                        )
            with gr.Row():
                # Initial table rendered from the default values of the controls.
                initial_table = update_table(
                    checkboxes.value, base_model_checkboxes.value, training_stages.value, search_1.value
                )
                tulu_3_table = gr.Dataframe(
                    initial_table,
                    height=1000,
                    datatype="markdown"
                )
            with gr.Row():
                gr.Markdown(BOTTOM_TEXT)
            # Re-render the table whenever the search box or any checkbox group changes.
            for control in (search_1, checkboxes, base_model_checkboxes, training_stages):
                control.change(
                    fn=update_table,
                    inputs=[checkboxes, base_model_checkboxes, training_stages, search_1],
                    outputs=tulu_3_table,
                )
            # Preset buttons only rewrite the evaluation checkbox selection.
            for button, preset in (
                (show_all_button, select_all_columns),
                (show_none_button, select_no_columns),
                (show_reasoning_button, select_reasoning_columns),
                (show_code_and_math_button, show_math_and_code_columns),
                (show_code_button, show_code_columns),
                (show_instruction_button, show_instruction_columns),
            ):
                button.click(fn=preset, outputs=checkboxes)
        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)
    with gr.Row():
        export_button = gr.Button("Export to CSV", size="sm", variant="primary")
        csv_output = gr.File(label="Download CSV")
    # Export the full results frame (not the filtered view) as a CSV download.
    export_button.click(fn=lambda: export_to_csv(df), outputs=csv_output)
# Restart the Space on a fixed interval (every 3h) from a background scheduler, then serve the app.
restart_interval_seconds = 3 * 60 * 60
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=restart_interval_seconds)
scheduler.start()
app.launch(allowed_paths=["src/"])
import json
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi, snapshot_download
from src.md import ABOUT_TEXT, BOTTOM_TEXT, TOP_TEXT
# Shared Hugging Face Hub client; restart_space() calls it.
api = HfApi()
# evals_repo = os.environ.get("EVALS_REPO")
# Hub repository the evaluation results are downloaded from.
evals_repo = "allenai/tulu-3-evals"
# Local directory the repository snapshot is written to.
repo_dir = "./evals/"
def restart_space():
    """Ask the Hugging Face Hub to restart the leaderboard Space."""
    space_id = "allenai/tulu-3-leaderboard"
    api.restart_space(repo_id=space_id)
def export_to_csv(dataframe):
    """Write ``dataframe`` to ``exported_results.csv`` without its index and return that path."""
    output_path = "exported_results.csv"
    dataframe.to_csv(path_or_buf=output_path, index=False)
    return output_path
print("Pulling evaluation results")
# Mirror the evals dataset repository into repo_dir before it is parsed below.
repo = snapshot_download(
    repo_id=evals_repo,
    repo_type="dataset",
    local_dir=repo_dir,
    ignore_patterns=[],
    tqdm_class=None,
    etag_timeout=30,
)
# now we have downloaded the dataset, walk through the results directory.
# each folder should be a model, and each file in the folder should be a result
results_dir = os.path.join(repo_dir, os.environ.get("RESULTS_DIR", "results"))
model_results = {}  # model folder -> {task name -> score}
model_metadata = {}  # model folder -> parsed metadata.json
timestamped_results = {}  # "<model>-<task>" -> timestamp of the result currently kept
for folder in os.listdir(results_dir):
    folder_path = os.path.join(results_dir, folder)
    if not os.path.isdir(folder_path):
        continue
    # try to load up model metadata
    metadata_path = os.path.join(folder_path, "metadata.json")
    if os.path.isfile(metadata_path):
        try:
            with open(metadata_path, "r", encoding="utf-8") as f:
                model_metadata[folder] = json.load(f)
        except json.decoder.JSONDecodeError:
            print(f"Error reading metadata for {folder}")
    model_results[folder] = {}
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        # metadata.json was handled above; it is not an evaluation result and
        # would otherwise be reported as an unreadable result file.
        if file == "metadata.json" or not os.path.isfile(file_path):
            continue
        # grab the timecode from the file name; format is <date>-<time>-<name...>
        timestamp = "-".join(file.split("-")[:2])
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                first_line = f.readline()
            data = json.loads(first_line)
        except (json.decoder.JSONDecodeError, UnicodeDecodeError):
            print(f"Error reading {file_path}")
            continue
        try:
            score = data["metrics"]["primary_score"]
            dataset_name = data["task_name"]
        except (KeyError, TypeError):
            # TypeError covers payloads that are valid JSON but not objects
            print(f"Error reading {file_path}, wrong format.")
            continue
        # only keep numeric, non-nan scores; np.isnan would raise on None or strings
        if not isinstance(score, (int, float)) or np.isnan(score):
            continue
        result_key = folder + "-" + dataset_name
        previous_timestamp = timestamped_results.get(result_key)
        # only update if the timestamp is more recent than the one already kept
        if previous_timestamp is not None and not previous_timestamp < timestamp:
            print(
                "skipping",
                result_key,
                "because",
                previous_timestamp,
                "is more recent than",
                timestamp,
            )
            continue
        model_results[folder][dataset_name] = score
        if dataset_name == "minerva_math::tulu" and "exact_match_flex_macro" in data["metrics"]:
            model_results[folder]["math::flex"] = data["metrics"]["exact_match_flex_macro"]
        timestamped_results[result_key] = timestamp
# clean: remove models that don't have any evals
models_to_remove = [model for model, results in model_results.items() if not results]
for model in models_to_remove:
    print(f"Removing {model} because it has no evals")
    del model_results[model]
    # metadata.json is optional, so the model may have no metadata entry;
    # a plain `del` would raise KeyError for such folders.
    model_metadata.pop(model, None)
# Attach the wandb path from the model's metadata, when present, as "link".
for model_name, scores in model_results.items():
    if model_name not in model_metadata:
        continue
    meta = model_metadata[model_name]
    if "wandb_path" in meta:
        scores['link'] = meta["wandb_path"]
# Every model must expose the same evals: collect the union of eval names
# ("link" is not an eval) and fill the gaps with nan.
evals = {name for scores in model_results.values() for name in scores}
evals.discard("link")
for scores in model_results.values():
    for name in evals:
        scores.setdefault(name, np.nan)
# now, turn into dataframe. Columns are evals, rows are models
df = pd.DataFrame(model_results).T
import wandb

wandb_api = wandb.Api()


def _metric_changed(old, new):
    """Return True when ``new`` differs from ``old``; two missing (NaN) values count as equal."""
    try:
        if pd.isna(old) and pd.isna(new):
            return False
    except (TypeError, ValueError):
        # non-scalar summary values cannot be NaN-checked; fall back to plain comparison
        pass
    return old != new


# Copy each model's scores into the summary of its linked wandb run under "oe-eval/<eval>".
for index, row in df.iterrows():
    if "link" not in row or not isinstance(row["link"], str):
        continue
    try:
        link = row["link"].replace("https://wandb.ai/", "")
        wandb_run = wandb_api.run(link)
        modified = False
        for key in row.keys():
            if key == "link":
                continue
            summary_key = f"oe-eval/{key}"
            if summary_key not in wandb_run.summary or _metric_changed(wandb_run.summary[summary_key], row[key]):
                wandb_run.summary[summary_key] = row[key]
                modified = True
        if not modified:
            print(f"Already logged metrics to {wandb_run.url}")
        else:
            wandb_run.update()
            print(f"Logged metrics to {wandb_run.url}")
    except Exception as e:
        # Best-effort sync: a failing run must not stop the leaderboard from
        # starting, so report it and move on (no interactive breakpoint here).
        print(f"Error logging metrics to {row['link']}: {e}")
# Convert scores to percentages. "link" is not a score and alpaca_eval is
# already in percentage, so both are left untouched.
not_scaled = ("link", "alpaca_eval")
for col in df.columns:
    if col not in not_scaled:
        df[col] = df[col] * 100
# Guarantee a "link" column (all nan when no model has one).
if "link" not in df.columns:
    df["link"] = np.nan
# Row-wise mean over every column except "link".
score_columns = df.drop(columns=["link"])
df["Average"] = score_columns.mean(axis=1)
# Move the model names from the index into a "Model" column.
df.index.name = "Model"
df = df.reset_index()
# Best average first; ties broken by model name.
df = df.sort_values(by=["Average", "Model"], ascending=[False, True])
def regex_table(dataframe, regex):
    """
    Return the rows of ``dataframe`` whose "Model" value matches ``regex``.

    ``regex`` is a comma-separated list of regular expressions (``None`` is
    treated as an empty search). A row is kept when any expression matches,
    case-insensitively. Empty entries (e.g. from a trailing comma) are ignored,
    and an entry that is not a valid regular expression is matched literally.
    An empty search keeps every row.

    The result is a new frame (the input is not modified) with a 1-based
    "Rank" column, "Average" and eval columns rounded to one decimal, missing
    eval cells shown as the string "NaN", and columns ordered as
    Rank, Model, link, Average, then the remaining columns alphabetically.
    ``dataframe`` must contain "Model", "link" and "Average" columns.
    """
    import re

    patterns = []
    for raw_pattern in (regex or "").split(","):
        pattern = raw_pattern.strip()
        if not pattern:
            # an empty alternative would match every model name
            continue
        try:
            re.compile(pattern)
        except re.error:
            # half-typed or invalid expression: search for it verbatim
            pattern = re.escape(pattern)
        patterns.append(pattern)
    # Join the patterns with '|' acting as OR
    combined_regex = "|".join(patterns)
    # Copy the filtered rows so the column assignments below never write into a view
    matches = dataframe["Model"].str.contains(combined_regex, case=False, na=False)
    data = dataframe[matches].copy()
    data.reset_index(drop=True, inplace=True)
    data["Rank"] = np.arange(1, 1 + len(data))
    if "Average" in data.columns:
        data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
    # replace blank cells with 'NaN' and round all others to 1 decimal
    for col in data.columns:
        if col not in ["Model", "Average", "Rank", "link"]:
            values = np.array(data[col].replace("", np.nan).values).astype(float)
            rounded = pd.Series(np.round(values, 1), index=data.index)
            data[col] = rounded.astype(object).where(rounded.notna(), "NaN")
    # put rank, model, link and average first
    leading = ["Rank", "Model", "link", "Average"]
    data = data[leading + [col for col in sorted(data.columns) if col not in leading]]
    return data
# pieces of text that map to a certain base model
# Keys are the base-model choices offered in the UI; each value lists extra
# substrings that update_table() accepts as identifying that base model.
base_model_mapping = {
    "llama-3-8b": ["Llama-3-8B", "llama-3-8b", "llama_3.0_tulu_2_8b", "llama-3-8b", "llama_3_8b-"],
    "llama-3.1-8b": ["Llama-3.1-8B", "llama_3.1_tulu_2_8b", "llama_31_tulu_2_8b"],
    "llama-3.1-70b": [],
    "olmo-1.7": [],
    "olmoe": [],
    "llama-2-7b": [],
    "pythia": [],
    "other": [],
}
# Training-stage choices offered in the UI.
stages = ["sft", "merge", "rlhf"]
# Within this file, only referenced from commented-out code in update_table().
rlhf_tags = ["rlhf", "dpo", "reject", "ppo", "online-dpo"]
def update_table(show_columns, base_models, selected_stages, regex):
    """
    Build the styled leaderboard table for the current UI selections.

    Args:
        show_columns: eval column names to display; names that are not columns
            of the table are ignored.
        base_models: selected keys of ``base_model_mapping``; "other" selects
            models that match no known base model.
        selected_stages: selected entries of ``stages``.
        regex: comma-separated model search string, passed to ``regex_table``.

    Returns:
        A pandas Styler over the filtered table. "Average" is recomputed over
        the displayed eval columns, rows are ranked by it, and missing cells
        are highlighted.
    """
    import html

    filtered_df = regex_table(df.copy(), regex)
    # Preset buttons may name evals that were never reported; skip them
    # instead of failing with a KeyError.
    fixed_columns = ["Rank", "Model", "link"]
    available = set(filtered_df.columns)
    columns_to_display = fixed_columns + [
        col for col in sorted(show_columns) if col in available and col not in fixed_columns
    ]
    filtered_df = filtered_df[columns_to_display].copy()

    # Filter model names by the base_models selected: the mapping entries and
    # the key itself are the permitted patterns.
    permitted_models = []
    include_other = False
    for base_model in base_models:
        if base_model.lower() == "other":
            include_other = True
        else:
            permitted_models.extend(base_model_mapping.get(base_model.lower(), []))
            # include the key name
            permitted_models.append(base_model)
    if permitted_models or include_other:
        permitted_lower = [pattern.lower() for pattern in permitted_models]
        # Patterns of every specific base model (key names included), used to
        # decide whether a model counts as "other".
        known_lower = [
            pattern.lower()
            for key, patterns in base_model_mapping.items()
            if key != "other"
            for pattern in [key, *patterns]
        ]

        def model_filter(model_name):
            # if no base model name is found in the metadata, use the model name itself
            base_model_name = model_metadata.get(model_name, {}).get("base_model", "") or model_name
            haystack = base_model_name.lower()
            if any(pattern in haystack for pattern in permitted_lower):
                return True
            return include_other and not any(pattern in haystack for pattern in known_lower)

        # An explicit bool dtype keeps an empty mask from being read as a list of column labels.
        base_mask = pd.Series(
            [model_filter(name) for name in filtered_df["Model"]],
            index=filtered_df.index,
            dtype=bool,
        )
        filtered_df = filtered_df[base_mask].copy()

    # filtering stages, based on the "model_type" recorded in the model metadata
    # sft_list = filtered_df["Model"].apply(lambda x: "merge" not in x.lower() and not any(tag in x for tag in rlhf_tags))
    # merge_list = filtered_df["Model"].apply(lambda x: "merge" in x.lower())
    # rlhf_list = filtered_df["Model"].apply(lambda x: any(tag in x for tag in rlhf_tags))
    if set(stages) <= set(selected_stages):
        # every stage selected: no need to filter
        pass
    elif selected_stages:
        wanted_types = []
        if "sft" in selected_stages:
            wanted_types.append("sft")
        if "merge" in selected_stages:
            wanted_types.append("merge")
        if "rlhf" in selected_stages:
            wanted_types.extend(["ppo", "dpo"])
        stage_mask = pd.Series(
            [model_metadata.get(name, {}).get("model_type", "") in wanted_types for name in filtered_df["Model"]],
            index=filtered_df.index,
            dtype=bool,
        )
        filtered_df = filtered_df[stage_mask].copy()
    else:
        # no models
        filtered_df = filtered_df.iloc[0:0].copy()

    def highlight_nan(val):
        if val == "NaN" or pd.isna(val):
            return "background-color: #89CFF0"
        else:
            return ""

    # Recompute Average over the displayed evals only; the previous "Average"
    # column covers all evals and must not be averaged in.
    eval_columns = [col for col in filtered_df.columns[3:] if col != "Average"]
    numeric = pd.DataFrame(
        {col: pd.to_numeric(filtered_df[col], errors="coerce") for col in eval_columns},
        index=filtered_df.index,
    )
    if eval_columns:
        average = numeric.mean(axis=1)
    else:
        average = pd.Series(np.nan, index=filtered_df.index, dtype=float)
    filtered_df["Average"] = average
    # Sort while Average is still numeric: the formatted strings would sort
    # lexicographically ("9.50" above "10.00"). Stable sort keeps prior order on ties.
    filtered_df = filtered_df.sort_values(by="Average", ascending=False, kind="stable")
    # Show one decimal place for evals, two for the average, 'NaN' for missing values
    for col in eval_columns:
        filtered_df[col] = numeric[col].map(lambda x: f"{x:.1f}" if pd.notna(x) else "NaN")
    filtered_df["Average"] = filtered_df["Average"].map(lambda x: f"{x:.2f}" if pd.notna(x) else "NaN")
    # update rank
    filtered_df["Rank"] = np.arange(1, 1 + len(filtered_df))
    # Highlight everything after the Rank / Model / link columns
    columns_to_highlight = filtered_df.columns[3:]

    # replace model name with link if available
    def apply_link_name(link, model_name):
        # names and links come from repository data: escape them before building HTML
        return "<a href='" + html.escape(str(link), quote=True) + "'>" + html.escape(str(model_name)) + "</a>"

    filtered_df["Model"] = [
        apply_link_name(link, name) if not pd.isna(link) else name
        for link, name in zip(filtered_df["link"], filtered_df["Model"])
    ]
    # drop link
    filtered_df = filtered_df.drop(columns=["link"])
    # Apply the styling only to the selected columns. Styler.map replaces the
    # deprecated Styler.applymap in newer pandas; use whichever exists.
    styler = filtered_df.style
    style_cells = getattr(styler, "map", None) or styler.applymap
    return style_cells(highlight_nan, subset=columns_to_highlight)
# Preset column selections returned to the evaluation checkbox group.
reasoning_evals = ["Average", "gsm8k", "minerva_math::llama3", "oi_MATH_cot", "oi_bbh_cot"]
code_evals = ["Average", "codex_humaneval", "codex_humanevalplus"]


def select_all_columns():
    """Return the module-level ``columns`` list (every selectable column)."""
    return columns


def select_reasoning_columns():
    """Return the reasoning preset."""
    return reasoning_evals


def show_code_columns():
    """Return the code preset."""
    return code_evals


def show_math_and_code_columns():
    """Return the reasoning preset followed by the code evals (without a second "Average")."""
    return [*reasoning_evals, *code_evals[1:]]


def show_instruction_columns():
    """Return the instruction-following preset."""
    instruction_evals = ["Average", "drop", "ifeval", "oi_alpaca_eval", "oi_alpaca_eval_2", "alpaca_eval"]
    return instruction_evals


def select_no_columns():
    """Return an empty selection."""
    return list()
total_models = len(df)
# Selectable columns, alphabetically sorted; Rank, Model and link are always
# shown by update_table() and therefore not offered as choices.
columns = sorted(col for col in df.columns if col not in ("Rank", "Model", "link"))
# NOTE(review): the theme string below looks mangled by e-mail obfuscation in
# the page this file was copied from; it is reproduced unchanged - confirm the
# intended theme id. The nesting of the layout is reconstructed from the
# statement order - confirm against the deployed app.
with gr.Blocks(theme="allenai/[email protected]") as app:
    # Header: intro text next to the logo.
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(TOP_TEXT)
        with gr.Column(scale=4):
            gr.Markdown(
                """
![](file/src/logo.png)
"""
            )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 Tulu 3 Leaderboard"):
            with gr.Row():
                search_1 = gr.Textbox(
                    label="Model Search (delimit with , )",
                    placeholder="Model Search (delimit with , )",
                    show_label=False,
                )
            with gr.Accordion("Additional Options", open=False):
                with gr.Row():
                    checkboxes = gr.CheckboxGroup(
                        label="Select evaluations to display",
                        choices=columns,
                        value=columns,  # default all columns selected
                        show_label=True,
                    )
                with gr.Row():
                    # All preset buttons share the same size and CSS class.
                    small_button = dict(size="sm", elem_classes="smaller-font-button")
                    show_all_button = gr.Button("Show All Evals", variant="primary", **small_button)
                    show_none_button = gr.Button("Show No Evals", variant="primary", **small_button)
                    show_reasoning_button = gr.Button("Show Reasoning Evals", variant="secondary", **small_button)
                    show_code_button = gr.Button("Show Code Evals", variant="secondary", **small_button)
                    show_code_and_math_button = gr.Button(
                        "Show Math and Code Evals", variant="secondary", **small_button
                    )
                    show_instruction_button = gr.Button("Show Instruction Evals", variant="secondary", **small_button)
                with gr.Row():
                    with gr.Column(scale=6):
                        # checkbox group to select base models
                        model_options = [key.lower() for key in base_model_mapping.keys()]
                        base_model_checkboxes = gr.CheckboxGroup(
                            label="Select base models",
                            choices=model_options,
                            value=model_options,
                            show_label=True,
                        )
                    with gr.Column(scale=3):
                        # checkbox group for training stages
                        training_stages = gr.CheckboxGroup(
                            label="Select training stages",
                            choices=stages,
                            value=stages,
                            show_label=True,
                        )
            with gr.Row():
                # Initial table rendered from the default values of the controls.
                initial_table = update_table(
                    checkboxes.value, base_model_checkboxes.value, training_stages.value, search_1.value
                )
                tulu_3_table = gr.Dataframe(
                    initial_table,
                    height=1000,
                    datatype="markdown"
                )
            with gr.Row():
                gr.Markdown(BOTTOM_TEXT)
            # Re-render the table whenever the search box or any checkbox group changes.
            for control in (search_1, checkboxes, base_model_checkboxes, training_stages):
                control.change(
                    fn=update_table,
                    inputs=[checkboxes, base_model_checkboxes, training_stages, search_1],
                    outputs=tulu_3_table,
                )
            # Preset buttons only rewrite the evaluation checkbox selection.
            for button, preset in (
                (show_all_button, select_all_columns),
                (show_none_button, select_no_columns),
                (show_reasoning_button, select_reasoning_columns),
                (show_code_and_math_button, show_math_and_code_columns),
                (show_code_button, show_code_columns),
                (show_instruction_button, show_instruction_columns),
            ):
                button.click(fn=preset, outputs=checkboxes)
        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)
    with gr.Row():
        export_button = gr.Button("Export to CSV", size="sm", variant="primary")
        csv_output = gr.File(label="Download CSV")
    # Export the full results frame (not the filtered view) as a CSV download.
    export_button.click(fn=lambda: export_to_csv(df), outputs=csv_output)
# Restart the Space on a fixed interval (every 3h) from a background scheduler, then serve the app.
restart_interval_seconds = 3 * 60 * 60
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=restart_interval_seconds)
scheduler.start()
app.launch(allowed_paths=["src/"])
# (End of gist; the remainder of the page was the GitHub comment sign-in prompt.)