vwxyzjn · November 22, 2024 04:25
diff --git a/app.py b/app.py
 import json
 import os
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

 import gradio as gr
 import numpy as np
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import HfApi, snapshot_download

 from src.md import ABOUT_TEXT, BOTTOM_TEXT, TOP_TEXT

 # Where the evaluation results come from: the Hub dataset to pull and the
 # local directory it is downloaded into. The repo id is hard-coded; the
 # EVALS_REPO environment variable is currently not consulted.
 repo_dir = "./evals/"
 evals_repo = "allenai/tulu-3-evals"

 # Hub client used to restart the Space.
 api = HfApi()


 def restart_space():
    """Ask the Hugging Face Hub to restart the leaderboard Space."""
    space_id = "allenai/tulu-3-leaderboard"
    api.restart_space(repo_id=space_id)


 def export_to_csv(dataframe):
    """Write ``dataframe`` (without its index) to ``exported_results.csv`` and return that path."""
    output_path = "exported_results.csv"
    dataframe.to_csv(output_path, index=False)
    return output_path


 # Fetch the evaluation results dataset from the Hub into repo_dir.
 print("Pulling evaluation results")
 repo = snapshot_download(
    repo_id=evals_repo,
    repo_type="dataset",
    local_dir=repo_dir,
    ignore_patterns=[],
    etag_timeout=30,
    tqdm_class=None,
 )

 # The dataset snapshot is on disk now; walk through its results directory.
 # Each folder is one model. Inside it, "metadata.json" describes the model and
 # every other file is one eval result, named <date>-<time>-<name...>, whose
 # first line is a JSON record.
 results_dir = os.path.join(repo_dir, os.environ.get("RESULTS_DIR", "results"))
 model_results = {}  # model folder -> {task name: score}
 model_metadata = {}  # model folder -> parsed metadata.json
 timestamped_results = {}  # "<folder>-<task>" -> timestamp of the result currently kept
 for folder in os.listdir(results_dir):
    folder_path = os.path.join(results_dir, folder)
    if not os.path.isdir(folder_path):
        continue
    # try to load up model metadata
    metadata_path = os.path.join(folder_path, "metadata.json")
    if os.path.isfile(metadata_path):
        try:
            with open(metadata_path, "r", encoding="utf-8") as f:
                metadata = json.load(f)
                model_metadata[folder] = metadata
        except json.decoder.JSONDecodeError:
            print(f"Error reading metadata for {folder}")
    model_results[folder] = {}
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        # metadata.json was handled above; parsing it as a result would only
        # log a bogus error
        if file == "metadata.json" or not os.path.isfile(file_path):
            continue
        # grab the timecode from the file name; format is <date>-<time>-<name...>
        timestamp = "-".join(file.split("-")[:2])
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                first_line = f.readline()
                data = json.loads(first_line)
        except json.decoder.JSONDecodeError:
            print(f"Error reading {file_path}")
            continue
        try:
            score = data["metrics"]["primary_score"]
            dataset_name = data["task_name"]
            score_is_nan = bool(np.isnan(score))
        except (KeyError, TypeError):
            # missing keys, a JSON line that is not an object, or a null /
            # non-numeric score
            print(f"Error reading {file_path}, wrong format.")
            continue
        # only update if non-nan and timestamp is more recent
        if score_is_nan:
            continue
        result_key = folder + "-" + dataset_name
        if result_key in timestamped_results and not timestamped_results[result_key] < timestamp:
            print(
                "skipping",
                result_key,
                "because",
                timestamped_results[result_key],
                "is more recent than",
                timestamp,
            )
            continue
        model_results[folder][dataset_name] = score
        # minerva_math::tulu additionally reports a flexible-match score
        if dataset_name == "minerva_math::tulu" and "exact_match_flex_macro" in data["metrics"]:
            model_results[folder]["math::flex"] = data["metrics"]["exact_match_flex_macro"]
        timestamped_results[result_key] = timestamp

 # clean: remove models that don't have any evals
 models_to_remove = [model for model, results in model_results.items() if not results]
 for model in models_to_remove:
    print(f"Removing {model} because it has no evals")
    del model_results[model]
    # metadata is only stored when metadata.json exists and parses, so the
    # key may legitimately be absent
    model_metadata.pop(model, None)

 # add link metadata
 for model in model_results:
    if model in model_metadata and "wandb_path" in model_metadata[model]:
        model_results[model]["link"] = model_metadata[model]["wandb_path"]

 # clean data: all models should have the same evals
 # for any model that doesn't have an eval, add a nan
 evals = {name for results in model_results.values() for name in results}
 evals.discard("link")  # "link" is metadata, not an eval
 for results in model_results.values():
    for name in evals:
        results.setdefault(name, np.nan)


 # now, turn into dataframe. Columns are evals, rows are models
 df = pd.DataFrame(model_results).T

 # Mirror every model's scores into the summary of its linked W&B run, under
 # "oe-eval/<eval name>". Failures for one run are reported and do not stop
 # the others.
 import wandb
 wandb_api = wandb.Api()
 for index, row in df.iterrows():
    if "link" not in row or not isinstance(row["link"], str):
        continue

    try:
        link = row["link"].replace("https://wandb.ai/", "")
        wandb_run = wandb_api.run(link)
        modified = False
        for key, value in row.items():
            # Skip the link itself and missing evals: NaN never compares equal,
            # so it would be re-uploaded on every start.
            if key == "link" or pd.isna(value):
                continue
            summary_key = f"oe-eval/{key}"
            if summary_key not in wandb_run.summary or wandb_run.summary[summary_key] != value:
                wandb_run.summary[summary_key] = value
                modified = True
        if not modified:
            print(f"Already logged metrics to {wandb_run.url}")
        else:
            wandb_run.update()
            print(f"Logged metrics to {wandb_run.url}")
    except Exception as e:
        print(f"Error logging metrics to {row['link']}: {e}")

 # multiply by 100 to get percentage
 for col in df.columns:
    # "link" is not a score; alpaca_eval is already in percentage
    if col in ("link", "alpaca_eval"):
        continue
    df[col] = df[col] * 100
 # add link column if not present, all nan
 if "link" not in df.columns:
    df["link"] = np.nan
 # add average column
 df["Average"] = df.drop(columns=["link"]).mean(axis=1)
 df.index.name = "Model"
 df.reset_index(inplace=True)
 # sort by average and model
 df = df.sort_values(by=["Average", "Model"], ascending=[False, True])


 def regex_table(dataframe, regex):
    """
    Return the rows of ``dataframe`` whose "Model" matches ``regex``, ranked and rounded.

    ``regex`` is a comma-separated list of case-insensitive patterns; a row is
    kept when its model name contains any of them. ``None``, an empty string,
    or only commas/whitespace keeps every row. Empty items (such as the
    trailing comma in "llama,") are ignored instead of becoming an empty
    pattern that matches everything. If the combined pattern is not a valid
    regular expression, the items are matched as literal text.

    The result has a fresh 0..n-1 index, a 1-based "Rank" column, "Average"
    and every eval column rounded to 1 decimal (missing evals become the
    string "NaN"), and columns ordered Rank, Model, link, Average, then the
    remaining columns alphabetically. The input frame is not modified.
    """
    import re

    patterns = [item.strip() for item in (regex or "").split(",")]
    patterns = [item for item in patterns if item]
    if patterns:
        # Join the patterns with '|' acting as OR
        combined_regex = "|".join(patterns)
        try:
            re.compile(combined_regex)
        except re.error:
            # e.g. an unbalanced "(" typed into the search box: match literally
            combined_regex = "|".join(re.escape(item) for item in patterns)
        matches = dataframe["Model"].str.contains(combined_regex, case=False, na=False)
        data = dataframe[matches]
    else:
        data = dataframe
    # reset_index returns a new frame, so the assignments below never write
    # into the caller's data (or into a slice of it)
    data = data.reset_index(drop=True)
    data["Rank"] = np.arange(1, 1 + len(data))
    if "Average" in data.columns:
        data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
    fixed_columns = ["Rank", "Model", "link", "Average"]
    eval_columns = sorted(col for col in data.columns if col not in fixed_columns)
    # replace blank cells with 'NaN' and round all others to 1 decimal
    for col in eval_columns:
        values = data[col].replace("", np.nan)
        rounded = pd.Series(np.round(np.array(values.values).astype(float), 1), index=data.index)
        data[col] = rounded.replace(np.nan, "NaN")
    # rank, model, link and average first, then the evals alphabetically
    leading = [col for col in fixed_columns if col in data.columns]
    return data[leading + eval_columns]


 # Maps each selectable base model to the substrings that identify it.
 base_model_mapping = {
    "llama-3-8b": [
        "Llama-3-8B",
        "llama-3-8b",
        "llama_3.0_tulu_2_8b",
        "llama-3-8b",
        "llama_3_8b-",
    ],
    "llama-3.1-8b": [
        "Llama-3.1-8B",
        "llama_3.1_tulu_2_8b",
        "llama_31_tulu_2_8b",
    ],
 }
 # The remaining base models have no extra substrings of their own.
 base_model_mapping.update(
    {name: [] for name in ("llama-3.1-70b", "olmo-1.7", "olmoe", "llama-2-7b", "pythia", "other")}
 )

 # Training stages offered in the UI, and name tags associated with RLHF.
 stages = ["sft", "merge", "rlhf"]
 rlhf_tags = ["rlhf", "dpo", "reject", "ppo", "online-dpo"]


 def update_table(show_columns, base_models, selected_stages, regex):
    """
    Build the styled leaderboard table for the current UI selections.

    Args:
        show_columns: eval column names to display; names that are not columns
            of the table are ignored.
        base_models: selected keys of ``base_model_mapping``. "other" selects
            models that match none of the known base-model patterns. If nothing
            is selected, no base-model filtering is applied.
        selected_stages: subset of ``stages``. "rlhf" covers the "ppo" and
            "dpo" model types; an empty selection yields an empty table.
        regex: comma-separated model-name patterns, forwarded to ``regex_table``.

    Returns:
        A pandas ``Styler`` over the filtered table (without the "link"
        column) that highlights missing values. "Average" is the mean of the
        displayed eval columns; rows are sorted by it numerically, best first.
    """
    filtered_df = regex_table(df.copy(), regex)

    # Keep only requested columns that exist, sorted alphabetically
    fixed_columns = ["Rank", "Model", "link"]
    requested = sorted(
        col for col in (show_columns or []) if col in filtered_df.columns and col not in fixed_columns
    )
    # .copy() so later column assignments do not write into a slice
    filtered_df = filtered_df[fixed_columns + requested].copy()

    # Filter model names by the base_models selected
    # First, take the model names from the base_model_mapping (these will be permitted patterns)
    permitted_models = []
    include_other = False
    for base_model in base_models or []:
        if base_model.lower() == "other":
            include_other = True
        else:
            permitted_models.extend(base_model_mapping.get(base_model.lower(), []))
            # include the key name
            permitted_models.append(base_model)

    if permitted_models or include_other:
        permitted_lower = [pattern.lower() for pattern in permitted_models]
        # Every pattern of a specific base model, key names included, so that
        # "other" really means "matches no known base model".
        known_lower = [
            pattern.lower()
            for key, aliases in base_model_mapping.items()
            if key != "other"
            for pattern in [key, *aliases]
        ]

        def model_filter(model_name):
            # match on the metadata's base model, falling back to the model name
            base_model_name = model_metadata.get(model_name, {}).get("base_model", "") or model_name
            name = base_model_name.lower()
            if any(pattern in name for pattern in permitted_lower):
                return True
            return include_other and not any(pattern in name for pattern in known_lower)

        # astype(bool) keeps this a boolean mask even when the table is empty
        filtered_df = filtered_df[filtered_df["Model"].apply(model_filter).astype(bool)]

    # Filter by training stage, using the "model_type" recorded in the metadata
    selected_stages = selected_stages or []
    if set(stages) <= set(selected_stages):
        # every stage selected: no need to filter
        pass
    elif selected_stages:
        stage_model_types = {"sft": ["sft"], "merge": ["merge"], "rlhf": ["ppo", "dpo"]}
        wanted_types = [t for stage in selected_stages for t in stage_model_types.get(stage, [])]
        stage_mask = filtered_df["Model"].apply(
            lambda model: model_metadata.get(model, {}).get("model_type", "") in wanted_types
        ).astype(bool)
        filtered_df = filtered_df[stage_mask]
    else:
        # no models
        filtered_df = filtered_df.iloc[0:0]
    filtered_df = filtered_df.copy()

    def highlight_nan(val):
        if val == "NaN" or pd.isna(val):
            return "background-color: #89CFF0"
        else:
            return ""

    # Numeric view of the value columns, used for averaging and sorting
    value_columns = list(filtered_df.columns[3:])
    numeric = pd.DataFrame(
        {col: pd.to_numeric(filtered_df[col], errors="coerce") for col in value_columns},
        index=filtered_df.index,
    )

    # Ensure all numeric columns show one decimal place and replace NaN with 'NaN'
    for col in value_columns:
        filtered_df[col] = numeric[col].map(lambda x: f"{x:.1f}" if pd.notna(x) else "NaN")

    # Recompute Average over the displayed eval columns, ignoring missing
    # values; the previous Average is not an eval and must not be averaged in.
    eval_columns = [col for col in value_columns if col != "Average"]
    if eval_columns:
        average = numeric[eval_columns].mean(axis=1, skipna=True)
    elif "Average" in numeric.columns:
        # only "Average" is displayed: keep the overall average
        average = numeric["Average"]
    else:
        average = pd.Series(np.nan, index=filtered_df.index, dtype=float)
    filtered_df["Average"] = average
    # Sort while Average is still numeric (as text, "9.50" would rank above
    # "85.20"); rows without any score go last.
    filtered_df = filtered_df.sort_values(
        by="Average", ascending=False, na_position="last", kind="mergesort"
    )
    # show 2 decimal places
    filtered_df["Average"] = filtered_df["Average"].map(lambda x: f"{x:.2f}" if pd.notna(x) else "NaN")
    # update rank
    filtered_df["Rank"] = np.arange(1, 1 + len(filtered_df))

    # Everything after Rank, Model and link gets the NaN highlighting
    columns_to_highlight = filtered_df.columns[3:]

    # replace model name with link if available
    def apply_link_name(link, model_name):
        return "<a href='" + link + "'>" + model_name + "</a>"

    # built as a list so that an empty table does not break the assignment
    filtered_df["Model"] = [
        model_name if pd.isna(link) else apply_link_name(link, model_name)
        for link, model_name in zip(filtered_df["link"], filtered_df["Model"])
    ]

    # drop link
    filtered_df = filtered_df.drop(columns=["link"])

    # Apply the styling only to the selected columns; Styler.map replaces the
    # deprecated Styler.applymap in newer pandas
    styler = filtered_df.style
    style_cells = getattr(styler, "map", None) or styler.applymap
    return style_cells(highlight_nan, subset=columns_to_highlight)


 # Column presets behind the "Show ... Evals" buttons.
 reasoning_evals = [
    "Average",
    "gsm8k",
    "minerva_math::llama3",
    "oi_MATH_cot",
    "oi_bbh_cot",
 ]
 code_evals = [
    "Average",
    "codex_humaneval",
    "codex_humanevalplus",
 ]


 def select_all_columns():
    """Return every selectable column."""
    return columns


 def select_reasoning_columns():
    """Return the reasoning preset."""
    return reasoning_evals


 def show_code_columns():
    """Return the code preset."""
    return code_evals


 def show_math_and_code_columns():
    """Return the reasoning preset followed by the code evals (without a second "Average")."""
    return [*reasoning_evals, *code_evals[1:]]


 def show_instruction_columns():
    """Return the instruction-following preset."""
    instruction_evals = ["Average", "drop", "ifeval", "oi_alpaca_eval", "oi_alpaca_eval_2", "alpaca_eval"]
    return instruction_evals


 def select_no_columns():
    """Return an empty selection."""
    return list()


 # Selectable columns: every DataFrame column except Rank, Model and link,
 # in alphabetical order.
 columns = sorted(filter(lambda col: col not in ("Rank", "Model", "link"), df.columns))
 total_models = len(df.index)


 with gr.Blocks(theme="allenai/[email protected]") as app:
    # NOTE(review): the theme id above looks garbled (possibly an e-mail
    # obfuscation artefact) — confirm the intended theme name.
    # Page header: introduction text next to the logo.
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(TOP_TEXT)
        with gr.Column(scale=4):
            gr.Markdown(
                """
                        ![](file/src/logo.png)
                        """
            )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 Tulu 3 Leaderboard"):
            with gr.Row():
                search_1 = gr.Textbox(
                    placeholder="Model Search (delimit with , )",
                    label="Model Search (delimit with , )",
                    show_label=False,
                )
            with gr.Accordion("Additional Options", open=False):
                with gr.Row():
                    checkboxes = gr.CheckboxGroup(
                        choices=columns,
                        value=columns,  # default all columns selected
                        label="Select evaluations to display",
                        show_label=True,
                    )
                # Preset buttons for the eval selection
                with gr.Row():
                    show_all_button = gr.Button(
                        "Show All Evals", variant="primary", size="sm", elem_classes="smaller-font-button"
                    )
                    show_none_button = gr.Button(
                        "Show No Evals", variant="primary", size="sm", elem_classes="smaller-font-button"
                    )
                    show_reasoning_button = gr.Button(
                        "Show Reasoning Evals", variant="secondary", size="sm", elem_classes="smaller-font-button"
                    )
                    show_code_button = gr.Button(
                        "Show Code Evals", variant="secondary", size="sm", elem_classes="smaller-font-button"
                    )
                    show_code_and_math_button = gr.Button(
                        "Show Math and Code Evals", variant="secondary", size="sm", elem_classes="smaller-font-button"
                    )
                    show_instruction_button = gr.Button(
                        "Show Instruction Evals", variant="secondary", size="sm", elem_classes="smaller-font-button"
                    )
                with gr.Row():
                    with gr.Column(scale=6):
                        # checkbox group to select base models
                        model_options = [key.lower() for key in base_model_mapping]
                        base_model_checkboxes = gr.CheckboxGroup(
                            choices=model_options,
                            value=model_options,
                            label="Select base models",
                            show_label=True,
                        )
                    with gr.Column(scale=3):
                        # checkbox group for training stages
                        training_stages = gr.CheckboxGroup(
                            choices=stages,
                            value=stages,
                            label="Select training stages",
                            show_label=True,
                        )

            with gr.Row():
                # initial table, rendered from the default widget values
                tulu_3_table = gr.Dataframe(
                    update_table(checkboxes.value, base_model_checkboxes.value, training_stages.value, search_1.value),
                    height=1000,
                    datatype="markdown"
                )
            with gr.Row():
                gr.Markdown(BOTTOM_TEXT)

            # Re-render the table whenever the search box or a checkbox group changes
            for control in (search_1, checkboxes, base_model_checkboxes, training_stages):
                control.change(
                    fn=update_table,
                    inputs=[checkboxes, base_model_checkboxes, training_stages, search_1],
                    outputs=tulu_3_table,
                )

            # Each preset button replaces the eval selection
            for button, preset in (
                (show_all_button, select_all_columns),
                (show_none_button, select_no_columns),
                (show_reasoning_button, select_reasoning_columns),
                (show_code_and_math_button, show_math_and_code_columns),
                (show_code_button, show_code_columns),
                (show_instruction_button, show_instruction_columns),
            ):
                button.click(fn=preset, outputs=checkboxes)

        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        # CSV export of the full results DataFrame
        with gr.Row():
            export_button = gr.Button("Export to CSV", size="sm", variant="primary")
            csv_output = gr.File(label="Download CSV")
            export_button.click(fn=lambda: export_to_csv(df), outputs=csv_output)

 # Restart the Space every 3 hours in the background, then serve the app.
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", hours=3)
 scheduler.start()
 app.launch(allowed_paths=["src/"])
 import json
 import os
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

 import gradio as gr
 import numpy as np
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import HfApi, snapshot_download

 from src.md import ABOUT_TEXT, BOTTOM_TEXT, TOP_TEXT

 # Where the evaluation results come from: the Hub dataset to pull and the
 # local directory it is downloaded into. The repo id is hard-coded; the
 # EVALS_REPO environment variable is currently not consulted.
 repo_dir = "./evals/"
 evals_repo = "allenai/tulu-3-evals"

 # Hub client used to restart the Space.
 api = HfApi()


 def restart_space():
    """Ask the Hugging Face Hub to restart the leaderboard Space."""
    space_id = "allenai/tulu-3-leaderboard"
    api.restart_space(repo_id=space_id)


 def export_to_csv(dataframe):
    """Write ``dataframe`` (without its index) to ``exported_results.csv`` and return that path."""
    output_path = "exported_results.csv"
    dataframe.to_csv(output_path, index=False)
    return output_path


 # Fetch the evaluation results dataset from the Hub into repo_dir.
 print("Pulling evaluation results")
 repo = snapshot_download(
    repo_id=evals_repo,
    repo_type="dataset",
    local_dir=repo_dir,
    ignore_patterns=[],
    etag_timeout=30,
    tqdm_class=None,
 )

 # The dataset snapshot is on disk now; walk through its results directory.
 # Each folder is one model. Inside it, "metadata.json" describes the model and
 # every other file is one eval result, named <date>-<time>-<name...>, whose
 # first line is a JSON record.
 results_dir = os.path.join(repo_dir, os.environ.get("RESULTS_DIR", "results"))
 model_results = {}  # model folder -> {task name: score}
 model_metadata = {}  # model folder -> parsed metadata.json
 timestamped_results = {}  # "<folder>-<task>" -> timestamp of the result currently kept
 for folder in os.listdir(results_dir):
    folder_path = os.path.join(results_dir, folder)
    if not os.path.isdir(folder_path):
        continue
    # try to load up model metadata
    metadata_path = os.path.join(folder_path, "metadata.json")
    if os.path.isfile(metadata_path):
        try:
            with open(metadata_path, "r", encoding="utf-8") as f:
                metadata = json.load(f)
                model_metadata[folder] = metadata
        except json.decoder.JSONDecodeError:
            print(f"Error reading metadata for {folder}")
    model_results[folder] = {}
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        # metadata.json was handled above; parsing it as a result would only
        # log a bogus error
        if file == "metadata.json" or not os.path.isfile(file_path):
            continue
        # grab the timecode from the file name; format is <date>-<time>-<name...>
        timestamp = "-".join(file.split("-")[:2])
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                first_line = f.readline()
                data = json.loads(first_line)
        except json.decoder.JSONDecodeError:
            print(f"Error reading {file_path}")
            continue
        try:
            score = data["metrics"]["primary_score"]
            dataset_name = data["task_name"]
            score_is_nan = bool(np.isnan(score))
        except (KeyError, TypeError):
            # missing keys, a JSON line that is not an object, or a null /
            # non-numeric score
            print(f"Error reading {file_path}, wrong format.")
            continue
        # only update if non-nan and timestamp is more recent
        if score_is_nan:
            continue
        result_key = folder + "-" + dataset_name
        if result_key in timestamped_results and not timestamped_results[result_key] < timestamp:
            print(
                "skipping",
                result_key,
                "because",
                timestamped_results[result_key],
                "is more recent than",
                timestamp,
            )
            continue
        model_results[folder][dataset_name] = score
        # minerva_math::tulu additionally reports a flexible-match score
        if dataset_name == "minerva_math::tulu" and "exact_match_flex_macro" in data["metrics"]:
            model_results[folder]["math::flex"] = data["metrics"]["exact_match_flex_macro"]
        timestamped_results[result_key] = timestamp

 # clean: remove models that don't have any evals
 models_to_remove = [model for model, results in model_results.items() if not results]
 for model in models_to_remove:
    print(f"Removing {model} because it has no evals")
    del model_results[model]
    # metadata is only stored when metadata.json exists and parses, so the
    # key may legitimately be absent
    model_metadata.pop(model, None)

 # add link metadata
 for model in model_results:
    if model in model_metadata and "wandb_path" in model_metadata[model]:
        model_results[model]["link"] = model_metadata[model]["wandb_path"]

 # clean data: all models should have the same evals
 # for any model that doesn't have an eval, add a nan
 evals = {name for results in model_results.values() for name in results}
 evals.discard("link")  # "link" is metadata, not an eval
 for results in model_results.values():
    for name in evals:
        results.setdefault(name, np.nan)


 # now, turn into dataframe. Columns are evals, rows are models
 df = pd.DataFrame(model_results).T

 # Mirror every model's scores into the summary of its linked W&B run, under
 # "oe-eval/<eval name>". Failures for one run are reported and do not stop
 # the others.
 import wandb
 wandb_api = wandb.Api()
 for index, row in df.iterrows():
    if "link" not in row or not isinstance(row["link"], str):
        continue

    try:
        link = row["link"].replace("https://wandb.ai/", "")
        wandb_run = wandb_api.run(link)
        modified = False
        for key, value in row.items():
            # Skip the link itself and missing evals: NaN never compares equal,
            # so it would be re-uploaded on every start.
            if key == "link" or pd.isna(value):
                continue
            summary_key = f"oe-eval/{key}"
            if summary_key not in wandb_run.summary or wandb_run.summary[summary_key] != value:
                wandb_run.summary[summary_key] = value
                modified = True
        if not modified:
            print(f"Already logged metrics to {wandb_run.url}")
        else:
            wandb_run.update()
            print(f"Logged metrics to {wandb_run.url}")
    except Exception as e:
        print(f"Error logging metrics to {row['link']}: {e}")

 # multiply by 100 to get percentage
 for col in df.columns:
    # "link" is not a score; alpaca_eval is already in percentage
    if col in ("link", "alpaca_eval"):
        continue
    df[col] = df[col] * 100
 # add link column if not present, all nan
 if "link" not in df.columns:
    df["link"] = np.nan
 # add average column
 df["Average"] = df.drop(columns=["link"]).mean(axis=1)
 df.index.name = "Model"
 df.reset_index(inplace=True)
 # sort by average and model
 df = df.sort_values(by=["Average", "Model"], ascending=[False, True])


 def regex_table(dataframe, regex):
    """
    Return the rows of ``dataframe`` whose "Model" matches ``regex``, ranked and rounded.

    ``regex`` is a comma-separated list of case-insensitive patterns; a row is
    kept when its model name contains any of them. ``None``, an empty string,
    or only commas/whitespace keeps every row. Empty items (such as the
    trailing comma in "llama,") are ignored instead of becoming an empty
    pattern that matches everything. If the combined pattern is not a valid
    regular expression, the items are matched as literal text.

    The result has a fresh 0..n-1 index, a 1-based "Rank" column, "Average"
    and every eval column rounded to 1 decimal (missing evals become the
    string "NaN"), and columns ordered Rank, Model, link, Average, then the
    remaining columns alphabetically. The input frame is not modified.
    """
    import re

    patterns = [item.strip() for item in (regex or "").split(",")]
    patterns = [item for item in patterns if item]
    if patterns:
        # Join the patterns with '|' acting as OR
        combined_regex = "|".join(patterns)
        try:
            re.compile(combined_regex)
        except re.error:
            # e.g. an unbalanced "(" typed into the search box: match literally
            combined_regex = "|".join(re.escape(item) for item in patterns)
        matches = dataframe["Model"].str.contains(combined_regex, case=False, na=False)
        data = dataframe[matches]
    else:
        data = dataframe
    # reset_index returns a new frame, so the assignments below never write
    # into the caller's data (or into a slice of it)
    data = data.reset_index(drop=True)
    data["Rank"] = np.arange(1, 1 + len(data))
    if "Average" in data.columns:
        data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
    fixed_columns = ["Rank", "Model", "link", "Average"]
    eval_columns = sorted(col for col in data.columns if col not in fixed_columns)
    # replace blank cells with 'NaN' and round all others to 1 decimal
    for col in eval_columns:
        values = data[col].replace("", np.nan)
        rounded = pd.Series(np.round(np.array(values.values).astype(float), 1), index=data.index)
        data[col] = rounded.replace(np.nan, "NaN")
    # rank, model, link and average first, then the evals alphabetically
    leading = [col for col in fixed_columns if col in data.columns]
    return data[leading + eval_columns]


 # Maps each selectable base model to the substrings that identify it.
 base_model_mapping = {
    "llama-3-8b": [
        "Llama-3-8B",
        "llama-3-8b",
        "llama_3.0_tulu_2_8b",
        "llama-3-8b",
        "llama_3_8b-",
    ],
    "llama-3.1-8b": [
        "Llama-3.1-8B",
        "llama_3.1_tulu_2_8b",
        "llama_31_tulu_2_8b",
    ],
 }
 # The remaining base models have no extra substrings of their own.
 base_model_mapping.update(
    {name: [] for name in ("llama-3.1-70b", "olmo-1.7", "olmoe", "llama-2-7b", "pythia", "other")}
 )

 # Training stages offered in the UI, and name tags associated with RLHF.
 stages = ["sft", "merge", "rlhf"]
 rlhf_tags = ["rlhf", "dpo", "reject", "ppo", "online-dpo"]


 def update_table(show_columns, base_models, selected_stages, regex):
    # Sort the columns alphabetically before displaying
    columns_to_display = ["Rank", "Model", "link"] + sorted(show_columns)
    filtered_df = regex_table(df.copy(), regex)

    # Filter columns based on selected checkboxes
    filtered_df = filtered_df[columns_to_display]

    # Filter model names by the base_models selected
    # First, take the model names from the base_model_mapping (these will be permitted patterns)
    permitted_models = []
    include_other = False
    for base_model in base_models:
        if base_model.lower() == "other":
            include_other = True
        else:
            permitted_models.extend(base_model_mapping.get(base_model.lower(), []))
            # include the key name
            permitted_models.append(base_model)

    if permitted_models or include_other:
        # Create a more precise filtering function
        def model_filter(model_name):
            base_model_name = model_metadata.get(model_name, {}).get("base_model", "")
            # if no base model name is found, use the model name itself
            if not base_model_name:
                base_model_name = model_name
            if include_other:
                # If "other" is selected, include models that don't match any specific base model
                return not any(
                    any(permitted.lower() in base_model_name.lower() for permitted in base_model_mapping[key])
                    for key in base_model_mapping
                    if key != "other"
                ) or any(permitted.lower() in base_model_name.lower() for permitted in permitted_models)
            else:
                # If "other" is not selected, only include models that match the selected base models
                return any(permitted.lower() in base_model_name.lower() for permitted in permitted_models)

        # Apply the filter
        filtered_df = filtered_df[filtered_df["Model"].apply(model_filter)]

    # filtering stages
    # filter 1: if sft is selected, model names do not have "merge" or any of rlhf_tags
    # filter 2: if merge is not selected, is models without merge
    # sft_list = filtered_df["Model"].apply(lambda x: "merge" not in x.lower() and not any(tag in x for tag in rlhf_tags))
    # merge_list = filtered_df["Model"].apply(lambda x: "merge" in x.lower())
    # rlhf_list = filtered_df["Model"].apply(lambda x: any(tag in x for tag in rlhf_tags))

    if len(selected_stages) == 3:
        # no need to filter
        pass
    elif selected_stages:
        def is_stage_model(model, stage):
            return model_metadata.get(model, {}).get("model_type", "") == stage

        sft_mask = filtered_df["Model"].apply(lambda x: bool(is_stage_model(x, "sft")))
        merge_mask = filtered_df["Model"].apply(lambda x: bool(is_stage_model(x, "merge")))
        rlhf_mask = filtered_df["Model"].apply(lambda x: bool(is_stage_model(x, "ppo") or is_stage_model(x, "dpo")))

        stage_mask = pd.Series(False, index=filtered_df.index)
        if "sft" in selected_stages:
            stage_mask |= sft_mask
        if "merge" in selected_stages:
            stage_mask |= merge_mask
        if "rlhf" in selected_stages:
            stage_mask |= rlhf_mask

        filtered_df = filtered_df[stage_mask]
    else:
        # no models
        filtered_df = filtered_df[filtered_df["Model"].apply(lambda x: False)]

    def highlight_nan(val):
        if val == "NaN" or pd.isna(val):
            return "background-color: #89CFF0"
        else:
            return ""

    # Ensure all numeric columns show one decimal place and replace NaN with 'NaN'
    for col in filtered_df.columns[3:]:
        filtered_df[col] = pd.to_numeric(filtered_df[col], errors="coerce").map(
            lambda x: f"{x:.1f}" if pd.notna(x) else "NaN"
        )

    # recompute Average with current columns and nanmean
    filtered_df["Average"] = filtered_df.iloc[:, 3:].apply(lambda row: np.nanmean([float(x) for x in row]), axis=1)
    # round to 2 decimal places
    filtered_df["Average"] = filtered_df["Average"].map(lambda x: f"{x:.2f}")
    # sort by Average
    filtered_df = filtered_df.sort_values(by="Average", ascending=False)
    # update rank
    filtered_df["Rank"] = np.arange(1, 1 + len(filtered_df))

    # Columns to style: everything from the fourth column onward (index 3+)
    columns_to_highlight = filtered_df.columns[3:]

    # replace model name with link if available
    def apply_link_name(link, model_name):
        return "<a href='" + link + "'>" + model_name + "</a>"
    filtered_df["Model"] = filtered_df.apply(lambda x: apply_link_name(x['link'], x['Model']) if not pd.isna(x["link"]) else x["Model"], axis=1)

    # drop link
    filtered_df = filtered_df.drop(columns=["link"])

    # Apply the styling only to the selected columns
    filtered_df = filtered_df.style.applymap(highlight_nan, subset=columns_to_highlight)

    return filtered_df


 # Preset column selections offered by the "Show ... Evals" buttons.
 # Both presets start with "Average"; show_math_and_code_columns skips the first
 # entry of code_evals when it concatenates the two lists.
 reasoning_evals = [
    "Average",
    "gsm8k",
    "minerva_math::llama3",
    "oi_MATH_cot",
    "oi_bbh_cot",
 ]
 code_evals = [
    "Average",
    "codex_humaneval",
    "codex_humanevalplus",
 ]


 def select_all_columns():
    """Return the module-level ``columns`` list, i.e. every selectable column.

    Used as the click handler of the "Show All Evals" button, whose output is
    the evaluation checkbox group.
    """
    every_column = columns
    return every_column


 def select_reasoning_columns():
    """Return the reasoning preset (the module-level ``reasoning_evals`` list).

    Used as the click handler of the "Show Reasoning Evals" button.
    """
    preset = reasoning_evals
    return preset


 def show_code_columns():
    """Return the code preset (the module-level ``code_evals`` list).

    Used as the click handler of the "Show Code Evals" button.
    """
    preset = code_evals
    return preset


 def show_math_and_code_columns():
    """Return the reasoning preset followed by the code preset.

    The first entry of ``code_evals`` is skipped because it repeats the
    "Average" entry that ``reasoning_evals`` already starts with. A new list
    is built on every call; neither preset list is modified.
    """
    code_only = code_evals[1:]
    return [*reasoning_evals, *code_only]


 def show_instruction_columns():
    """Return the instruction-following preset as a fresh list.

    Used as the click handler of the "Show Instruction Evals" button.
    """
    instruction_evals = [
        "Average",
        "drop",
        "ifeval",
        "oi_alpaca_eval",
        "oi_alpaca_eval_2",
        "alpaca_eval",
    ]
    return instruction_evals


 def select_no_columns():
    """Return an empty selection (a new empty list on every call).

    Used as the click handler of the "Show No Evals" button.
    """
    return list()


 # Number of rows in the full results frame.
 total_models = len(df)
 # Selectable columns: every DataFrame column except the bookkeeping ones,
 # sorted alphabetically to match the order of columns in the DataFrame.
 columns = sorted(
    column_name
    for column_name in df.columns
    if column_name not in ["Rank", "Model", "link"]
 )


 # Build the leaderboard UI. Names assigned inside these ``with`` blocks are
 # module-level, so the widgets remain reachable for the event wiring further down.
 # NOTE(review): the theme string contains "[email protected]", which looks like an
 # e-mail-obfuscation artifact rather than a real theme id -- confirm the intended value.
 with gr.Blocks(theme="allenai/[email protected]") as app:
    # Header: intro text on the left, logo on the right.
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(TOP_TEXT)
        with gr.Column(scale=4):
            gr.Markdown(
                """
                        ![](file/src/logo.png)
                        """
            )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 Tulu 3 Leaderboard"):
            # Free-text model search.
            with gr.Row():
                search_1 = gr.Textbox(
                    show_label=False,
                    label="Model Search (delimit with , )",
                    placeholder="Model Search (delimit with , )",
                )

            with gr.Accordion("Additional Options", open=False):
                # Which evaluation columns to display.
                with gr.Row():
                    checkboxes = gr.CheckboxGroup(
                        choices=columns,
                        value=columns,  # default all columns selected
                        label="Select evaluations to display",
                        show_label=True,
                    )

                # Preset buttons; they all share size and CSS class.
                _preset_button_kwargs = {"size": "sm", "elem_classes": "smaller-font-button"}
                with gr.Row():
                    show_all_button = gr.Button("Show All Evals", variant="primary", **_preset_button_kwargs)
                    show_none_button = gr.Button("Show No Evals", variant="primary", **_preset_button_kwargs)
                    show_reasoning_button = gr.Button(
                        "Show Reasoning Evals", variant="secondary", **_preset_button_kwargs
                    )
                    show_code_button = gr.Button("Show Code Evals", variant="secondary", **_preset_button_kwargs)
                    show_code_and_math_button = gr.Button(
                        "Show Math and Code Evals", variant="secondary", **_preset_button_kwargs
                    )
                    show_instruction_button = gr.Button(
                        "Show Instruction Evals", variant="secondary", **_preset_button_kwargs
                    )

                with gr.Row():
                    with gr.Column(scale=6):
                        # add checkbox group to select base models
                        model_options = [name.lower() for name in base_model_mapping.keys()]
                        base_model_checkboxes = gr.CheckboxGroup(
                            choices=model_options,
                            value=model_options,
                            label="Select base models",
                            show_label=True,
                        )
                    with gr.Column(scale=3):
                        # checkbox for training stages
                        training_stages = gr.CheckboxGroup(
                            choices=stages,
                            value=stages,
                            label="Select training stages",
                            show_label=True,
                        )

            # Results table, pre-populated from the widgets' default values.
            with gr.Row():
                _initial_table = update_table(
                    checkboxes.value,
                    base_model_checkboxes.value,
                    training_stages.value,
                    search_1.value,
                )
                tulu_3_table = gr.Dataframe(_initial_table, height=1000, datatype="markdown")
            with gr.Row():
                gr.Markdown(BOTTOM_TEXT)

            # Update the table when search box or checkboxes change.
            # Registration order is kept: search box, evals, base models, stages.
            for _filter_widget in (search_1, checkboxes, base_model_checkboxes, training_stages):
                _filter_widget.change(
                    fn=update_table,
                    inputs=[checkboxes, base_model_checkboxes, training_stages, search_1],
                    outputs=tulu_3_table,
                )

            # Handle the show all / show none / preset buttons: each one just
            # replaces the selection of the evaluation checkbox group.
            for _preset_button, _preset_fn in (
                (show_all_button, select_all_columns),
                (show_none_button, select_no_columns),
                (show_reasoning_button, select_reasoning_columns),
                (show_code_and_math_button, show_math_and_code_columns),
                (show_code_button, show_code_columns),
                (show_instruction_button, show_instruction_columns),
            ):
                _preset_button.click(fn=_preset_fn, outputs=checkboxes)

        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        # CSV export of the full (unfiltered) ``df``.
        with gr.Row():
            export_button = gr.Button("Export to CSV", size="sm", variant="primary")
            csv_output = gr.File(label="Download CSV")
            export_button.click(fn=lambda: export_to_csv(df), outputs=csv_output)

 # Schedule a periodic restart of the Space in the background, then serve the app.
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, trigger="interval", seconds=3 * 60 * 60)  # restarted every 3h
 scheduler.start()
 # Only files under "src/" are exposed by the server.
 app.launch(allowed_paths=["src/"])