import json
import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi, snapshot_download

from src.md import ABOUT_TEXT, BOTTOM_TEXT, TOP_TEXT

api = HfApi()
# evals_repo = os.environ.get("EVALS_REPO")
evals_repo = "allenai/tulu-3-evals"
repo_dir = "./evals/"


def restart_space():
    api.restart_space(repo_id="allenai/tulu-3-leaderboard")


def export_to_csv(dataframe):
    """Export the given dataframe to a CSV file and return the file path."""
    csv_file = "exported_results.csv"
    dataframe.to_csv(csv_file, index=False)
    return csv_file
print("Pulling evaluation results") | |
repo = snapshot_download( | |
local_dir=repo_dir, | |
ignore_patterns=[], | |
repo_id=evals_repo, | |
tqdm_class=None, | |
etag_timeout=30, | |
repo_type="dataset", | |
) | |
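# Expected dataset layout (inferred from the walk below, not guaranteed):
#   results/
#     <model-name>/
#       metadata.json                # optional: base_model, model_type, wandb_path
#       <date>-<time>-<task>.jsonl   # first line: {"task_name": ..., "metrics": {"primary_score": ...}}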
# Now that we have downloaded the dataset, walk through the results directory.
# Each folder should be a model, and each file in the folder should be a result.
results_dir = os.path.join(repo_dir, os.environ.get("RESULTS_DIR", "results"))

model_results = {}
model_metadata = {}
timestamped_results = {}
for folder in os.listdir(results_dir):
    folder_path = os.path.join(results_dir, folder)
    if os.path.isdir(folder_path):
        # try to load the model metadata
        if os.path.isfile(os.path.join(folder_path, "metadata.json")):
            try:
                with open(os.path.join(folder_path, "metadata.json"), "r", encoding="utf-8") as f:
                    metadata = json.load(f)
                model_metadata[folder] = metadata
            except json.decoder.JSONDecodeError:
                print(f"Error reading metadata for {folder}")
        model_results[folder] = {}
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            # grab the timestamp from the file name; format is <date>-<time>-<name...>
            timestamp = "-".join(file.split("-")[:2])
            if os.path.isfile(file_path):
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        first_line = f.readline()
                        data = json.loads(first_line)
                except json.decoder.JSONDecodeError:
                    print(f"Error reading {file_path}")
                    continue
                try:
                    score = data["metrics"]["primary_score"]
                    dataset_name = data["task_name"]
                except KeyError:
                    print(f"Error reading {file_path}, wrong format.")
                    continue
                # only update if the score is non-NaN and the timestamp is more recent
                if not np.isnan(score):
                    if (
                        folder + "-" + dataset_name not in timestamped_results
                        or timestamped_results[folder + "-" + dataset_name] < timestamp
                    ):
                        model_results[folder][dataset_name] = score
                        if dataset_name == "minerva_math::tulu":
                            if "exact_match_flex_macro" in data["metrics"]:
                                model_results[folder]["math::flex"] = data["metrics"]["exact_match_flex_macro"]
                        timestamped_results[folder + "-" + dataset_name] = timestamp
                    else:
                        print(
                            "skipping",
                            folder + "-" + dataset_name,
                            "because",
                            timestamped_results[folder + "-" + dataset_name],
                            "is more recent than",
                            timestamp,
                        )
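# NOTE: the "more recent" check above compares the <date>-<time> prefixes as
# plain strings, which is only correct while the filenames use a fixed-width,
# lexicographically sortable format (e.g. zero-padded dates and times).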
# clean: remove models that don't have any evals
models_to_remove = []
for model, results in model_results.items():
    if not results:
        print(f"Removing {model} because it has no evals")
        models_to_remove.append(model)
for model in models_to_remove:
    del model_results[model]
    # a model may have no metadata.json (or a broken one), so pop defensively
    model_metadata.pop(model, None)

# add link metadata
for model in model_results:
    if model in model_metadata and "wandb_path" in model_metadata[model]:
        model_results[model]["link"] = model_metadata[model]["wandb_path"]

# clean data: all models should have the same evals;
# for any model that doesn't have an eval, add a NaN
eval_names = set()
for model in model_results:
    for eval_name in model_results[model]:
        eval_names.add(eval_name)
if "link" in eval_names:
    eval_names.remove("link")
for model in model_results:
    for eval_name in eval_names:
        if eval_name not in model_results[model]:
            model_results[model][eval_name] = np.nan

# now, turn into a dataframe: columns are evals, rows are models
df = pd.DataFrame(model_results).T
import wandb

wandb_api = wandb.Api()
for index, row in df.iterrows():
    if "link" not in row or not isinstance(row["link"], str):
        continue
    try:
        link = row["link"].replace("https://wandb.ai/", "")
        wandb_run = wandb_api.run(link)
        modified = False
        for key in row.keys():
            if key == "link":
                continue
            if f"oe-eval/{key}" not in wandb_run.summary:
                wandb_run.summary[f"oe-eval/{key}"] = row[key]
                modified = True
            elif wandb_run.summary[f"oe-eval/{key}"] != row[key]:
                wandb_run.summary[f"oe-eval/{key}"] = row[key]
                modified = True
        if not modified:
            print(f"Already logged metrics to {wandb_run.url}")
        else:
            wandb_run.update()
            print(f"Logged metrics to {wandb_run.url}")
    except Exception as e:
        print(f"Error logging metrics to {row['link']}: {e}")
        # breakpoint()  # debug leftover: would hang a deployed Space, so keep it disabled
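# As written, the loop only calls wandb_run.update() when a summary value
# actually changed, so runs that are already up to date skip the write to the
# W&B backend entirely.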
# multiply by 100 to get percentages
for col in df.columns:
    if col == "link":
        continue
    if col == "alpaca_eval":  # alpaca_eval is already a percentage
        continue
    df[col] = df[col] * 100

# add a link column if not present, all NaN
if "link" not in df.columns:
    df["link"] = np.nan

# add the average column
df["Average"] = df.drop(columns=["link"]).mean(axis=1)

df.index.name = "Model"
df.reset_index(inplace=True)

# sort by average, breaking ties by model name
df = df.sort_values(by=["Average", "Model"], ascending=[False, True])
def regex_table(dataframe, regex):
    """Take a comma-separated list of regexes and return only the rows whose model name matches one of them."""
    # Ensure regex is not None
    if regex is None:
        regex = ""
    # Split the regex statement by comma and trim whitespace around each pattern
    regex_list = [x.strip() for x in regex.split(",")]
    # Join the list into a single regex pattern with '|' acting as OR
    combined_regex = "|".join(regex_list)
    # Keep only the rows whose 'Model' contains any of the regex patterns
    # (.copy() avoids pandas SettingWithCopy warnings on the assignments below)
    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)].copy()
    data.reset_index(drop=True, inplace=True)
    # add a count/rank column
    data["Rank"] = np.arange(1, 1 + len(data))
    if "Average" in data.columns:
        data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
    # replace blank cells with 'NaN' and round all others to 1 decimal
    for col in data.columns:
        if col not in ["Model", "Average", "Rank", "link"]:
            data[col] = data[col].replace("", np.nan)
            data[col] = np.round(np.array(data[col].values).astype(float), 1)
            data[col] = data[col].replace(np.nan, "NaN")
    # put Rank, Model, link, and Average in the first four columns
    data = data[
        ["Rank", "Model", "link", "Average"]
        + [col for col in sorted(data.columns) if col not in ["Rank", "Model", "Average", "link"]]
    ]
    return data
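# For example, a search box value of "tulu, llama" is combined into the
# pattern "tulu|llama", so any model whose name contains either substring
# (case-insensitively) is kept.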
# pieces of text that map to a certain base model
base_model_mapping = {
    "llama-3-8b": ["Llama-3-8B", "llama-3-8b", "llama_3.0_tulu_2_8b", "llama_3_8b-"],
    "llama-3.1-8b": ["Llama-3.1-8B", "llama_3.1_tulu_2_8b", "llama_31_tulu_2_8b"],
    "llama-3.1-70b": [],
    "olmo-1.7": [],
    "olmoe": [],
    "llama-2-7b": [],
    "pythia": [],
    "other": [],
}
stages = ["sft", "merge", "rlhf"]
rlhf_tags = ["rlhf", "dpo", "reject", "ppo", "online-dpo"]
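# rlhf_tags is only referenced by the commented-out, name-based stage filters
# inside update_table below; the live filtering relies on the "model_type"
# field from each model's metadata.json instead.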
def update_table(show_columns, base_models, selected_stages, regex):
    # Sort the columns alphabetically before displaying
    columns_to_display = ["Rank", "Model", "link"] + sorted(show_columns)
    filtered_df = regex_table(df.copy(), regex)
    # Filter columns based on the selected checkboxes
    filtered_df = filtered_df[columns_to_display]

    # Filter model names by the selected base models.
    # First, take the name patterns from base_model_mapping (these are the permitted patterns).
    permitted_models = []
    include_other = False
    for base_model in base_models:
        if base_model.lower() == "other":
            include_other = True
        else:
            permitted_models.extend(base_model_mapping.get(base_model.lower(), []))
            # include the key name itself as a pattern
            permitted_models.append(base_model)

    if permitted_models or include_other:
        # a more precise filtering function
        def model_filter(model_name):
            base_model_name = model_metadata.get(model_name, {}).get("base_model", "")
            # if no base model name is found, fall back to the model name itself
            if not base_model_name:
                base_model_name = model_name
            if include_other:
                # If "other" is selected, include models that don't match any specific
                # base model, plus models matching the explicitly selected ones.
                return not any(
                    any(permitted.lower() in base_model_name.lower() for permitted in base_model_mapping[key])
                    for key in base_model_mapping
                    if key != "other"
                ) or any(permitted.lower() in base_model_name.lower() for permitted in permitted_models)
            else:
                # Otherwise, only include models that match the selected base models.
                return any(permitted.lower() in base_model_name.lower() for permitted in permitted_models)

        # Apply the filter
        filtered_df = filtered_df[filtered_df["Model"].apply(model_filter)]

    # Filter by training stage, using the "model_type" field from metadata.
    # (Older, name-based filters kept for reference:)
    # sft_list = filtered_df["Model"].apply(lambda x: "merge" not in x.lower() and not any(tag in x for tag in rlhf_tags))
    # merge_list = filtered_df["Model"].apply(lambda x: "merge" in x.lower())
    # rlhf_list = filtered_df["Model"].apply(lambda x: any(tag in x for tag in rlhf_tags))
    if len(selected_stages) == 3:
        # all stages selected, no need to filter
        pass
    elif selected_stages:

        def is_stage_model(model, stage):
            return model_metadata.get(model, {}).get("model_type", "") == stage

        sft_mask = filtered_df["Model"].apply(lambda x: is_stage_model(x, "sft"))
        merge_mask = filtered_df["Model"].apply(lambda x: is_stage_model(x, "merge"))
        rlhf_mask = filtered_df["Model"].apply(lambda x: is_stage_model(x, "ppo") or is_stage_model(x, "dpo"))
        stage_mask = pd.Series(False, index=filtered_df.index)
        if "sft" in selected_stages:
            stage_mask |= sft_mask
        if "merge" in selected_stages:
            stage_mask |= merge_mask
        if "rlhf" in selected_stages:
            stage_mask |= rlhf_mask
        filtered_df = filtered_df[stage_mask]
    else:
        # no stages selected: show no models
        filtered_df = filtered_df.iloc[0:0]

    def highlight_nan(val):
        if val == "NaN" or pd.isna(val):
            return "background-color: #89CFF0"
        else:
            return ""

    # Ensure all numeric columns show one decimal place and replace NaN with 'NaN'
    for col in filtered_df.columns[3:]:
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors="coerce").map(
            lambda x: f"{x:.1f}" if pd.notna(x) else "NaN"
        )
    # recompute Average over the currently displayed eval columns with nanmean,
    # excluding the stale Average column itself so it doesn't bias the mean
    eval_cols = [col for col in filtered_df.columns[3:] if col != "Average"]
    filtered_df["Average"] = filtered_df[eval_cols].apply(
        lambda row: np.nanmean([float(x) for x in row]), axis=1
    )
    # sort by Average while it is still numeric
    # (sorting the formatted strings would put "9.50" above "10.20")
    filtered_df = filtered_df.sort_values(by="Average", ascending=False)
    # round to 2 decimal places for display
    filtered_df["Average"] = filtered_df["Average"].map(lambda x: f"{x:.2f}")
    # update rank
    filtered_df["Rank"] = np.arange(1, 1 + len(filtered_df))
    # the columns to highlight start after Rank, Model, and link
    columns_to_highlight = filtered_df.columns[3:]

    # replace the model name with a link if one is available
    def apply_link_name(link, model_name):
        return "<a href='" + link + "'>" + model_name + "</a>"

    filtered_df["Model"] = filtered_df.apply(
        lambda x: apply_link_name(x["link"], x["Model"]) if not pd.isna(x["link"]) else x["Model"], axis=1
    )
    # drop the link column
    filtered_df = filtered_df.drop(columns=["link"])
    # Apply the NaN highlighting only to the selected columns
    filtered_df = filtered_df.style.applymap(highlight_nan, subset=columns_to_highlight)
    return filtered_df
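# update_table is called once below to seed the initial gr.Dataframe and is
# also registered as the change handler for the search box and every checkbox
# group, so all four controls re-filter the same cached `df`.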
reasoning_evals = ["Average", "gsm8k", "minerva_math::llama3", "oi_MATH_cot", "oi_bbh_cot"]
code_evals = ["Average", "codex_humaneval", "codex_humanevalplus"]


def select_all_columns():
    return columns


def select_reasoning_columns():
    return reasoning_evals


def show_code_columns():
    return code_evals


def show_math_and_code_columns():
    # skip code_evals[0] ("Average"), which reasoning_evals already includes
    return reasoning_evals + code_evals[1:]


def show_instruction_columns():
    return ["Average", "drop", "ifeval", "oi_alpaca_eval", "oi_alpaca_eval_2", "alpaca_eval"]


def select_no_columns():
    return []


total_models = len(df)
# Sort columns alphabetically to match the column order in the DataFrame
columns = sorted([col for col in df.columns if col not in ["Rank", "Model", "link"]])
with gr.Blocks(theme="allenai/[email protected]") as app:
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(TOP_TEXT)
        with gr.Column(scale=4):
            gr.Markdown(
                """
                
                """
            )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 Tulu 3 Leaderboard"):
            with gr.Row():
                search_1 = gr.Textbox(
                    label="Model Search (delimit with , )",
                    placeholder="Model Search (delimit with , )",
                    show_label=False,
                )
            with gr.Accordion("Additional Options", open=False):
                with gr.Row():
                    checkboxes = gr.CheckboxGroup(
                        label="Select evaluations to display",
                        choices=columns,
                        value=columns,  # all columns selected by default
                        show_label=True,
                    )
                with gr.Row():
                    show_all_button = gr.Button(
                        "Show All Evals", size="sm", variant="primary", elem_classes="smaller-font-button"
                    )
                    show_none_button = gr.Button(
                        "Show No Evals", size="sm", variant="primary", elem_classes="smaller-font-button"
                    )
                    show_reasoning_button = gr.Button(
                        "Show Reasoning Evals", size="sm", variant="secondary", elem_classes="smaller-font-button"
                    )
                    show_code_button = gr.Button(
                        "Show Code Evals", size="sm", variant="secondary", elem_classes="smaller-font-button"
                    )
                    show_code_and_math_button = gr.Button(
                        "Show Math and Code Evals", size="sm", variant="secondary", elem_classes="smaller-font-button"
                    )
                    show_instruction_button = gr.Button(
                        "Show Instruction Evals", size="sm", variant="secondary", elem_classes="smaller-font-button"
                    )
                with gr.Row():
                    with gr.Column(scale=6):
                        # checkbox group to select base models
                        model_options = list(map(str.lower, base_model_mapping.keys()))
                        base_model_checkboxes = gr.CheckboxGroup(
                            label="Select base models",
                            choices=model_options,
                            value=model_options,
                            show_label=True,
                        )
                    with gr.Column(scale=3):
                        # checkboxes for training stages
                        training_stages = gr.CheckboxGroup(
                            label="Select training stages",
                            choices=stages,
                            value=stages,
                            show_label=True,
                        )
            with gr.Row():
                tulu_3_table = gr.Dataframe(
                    update_table(checkboxes.value, base_model_checkboxes.value, training_stages.value, search_1.value),
                    height=1000,
                    datatype="markdown",
                )
            with gr.Row():
                gr.Markdown(BOTTOM_TEXT)

            # Update the table when the search box or any checkbox group changes
            search_1.change(
                fn=update_table,
                inputs=[checkboxes, base_model_checkboxes, training_stages, search_1],
                outputs=tulu_3_table,
            )
            checkboxes.change(
                fn=update_table,
                inputs=[checkboxes, base_model_checkboxes, training_stages, search_1],
                outputs=tulu_3_table,
            )
            base_model_checkboxes.change(
                fn=update_table,
                inputs=[checkboxes, base_model_checkboxes, training_stages, search_1],
                outputs=tulu_3_table,
            )
            training_stages.change(
                fn=update_table,
                inputs=[checkboxes, base_model_checkboxes, training_stages, search_1],
                outputs=tulu_3_table,
            )
            # Handle the column-preset buttons
            show_all_button.click(fn=select_all_columns, outputs=checkboxes)
            show_none_button.click(fn=select_no_columns, outputs=checkboxes)
            show_reasoning_button.click(fn=select_reasoning_columns, outputs=checkboxes)
            show_code_and_math_button.click(fn=show_math_and_code_columns, outputs=checkboxes)
            show_code_button.click(fn=show_code_columns, outputs=checkboxes)
            show_instruction_button.click(fn=show_instruction_columns, outputs=checkboxes)
        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)
            with gr.Row():
                export_button = gr.Button("Export to CSV", size="sm", variant="primary")
                csv_output = gr.File(label="Download CSV")
            export_button.click(fn=lambda: export_to_csv(df), outputs=csv_output)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=10800)  # restart every 3 hours
scheduler.start()

app.launch(allowed_paths=["src/"])