| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| leaderboard | N/A | none | 0 | acc | ↑ | 0.4464 | ± | 0.0045 |
|  |  | none | 0 | acc_norm | ↑ | 0.5778 | ± | 0.0051 |
|  |  | none | 0 | exact_match | ↑ | 0.1992 | ± | 0.0055 |
|  |  | none | 0 | inst_level_loose_acc | ↑ | 0.8297 | ± | N/A |
|  |  | none | 0 | inst_level_strict_acc | ↑ | 0.8058 | ± | N/A |
|  |  | none | 0 | prompt_level_loose_acc | ↑ | 0.7671 | ± | 0.0182 |
|  |  | none | 0 | prompt_level_strict_acc | ↑ | 0.7338 | ± | 0.0190 |
| - leaderboard_bbh | N/A | none | 3 | acc_norm | ↑ | 0.6454 | ± | 0.0057 |
| - leaderboard_bbh_boolean_expressions | 0 | none | 3 | acc_norm | ↑ | 0.8160 | ± | 0.0246 |
| - leaderboard_bbh_causal_judgement | 0 | none | 3 | acc_norm | ↑ | 0.6578 | ± | 0.0348 |
| - leaderboard_bbh_date_understanding | 0 | none | 3 | acc_norm | ↑ | 0.7480 | ± | 0.0275 |
| - leaderboard_bbh_disambiguation_qa | 0 | none | 3 | acc_norm | ↑ | 0.6480 | ± | 0.0303 |
| - leaderboard_bbh_formal_fallacies | 0 | none | 3 | acc_norm | ↑ | 0.6320 | ± | 0.0306 |
| - leaderboard_bbh_geometric_shapes | 0 | none | 3 | acc_norm | ↑ | 0.4760 | ± | 0.0316 |
| - leaderboard_bbh_hyperbaton | 0 | none | 3 | acc_norm | ↑ | 0.8080 | ± | 0.0250 |
| - leaderboard_bbh_logical_deduction_five_objects | 0 | none | 3 | acc_norm | ↑ | 0.6000 | ± | 0.0310 |
| - leaderboard_bbh_logical_deduction_seven_objects | 0 | none | 3 | acc_norm | ↑ | 0.5880 | ± | 0.0312 |
| - leaderboard_bbh_logical_deduction_three_objects | 0 | none | 3 | acc_norm | ↑ | 0.8360 | ± | 0.0235 |
| - leaderboard_bbh_movie_recommendation | 0 | none | 3 | acc_norm | ↑ | 0.7960 | ± | 0.0255 |
| - leaderboard_bbh_navigate | 0 | none | 3 | acc_norm | ↑ | 0.7000 | ± | 0.0290 |
| - leaderboard_bbh_object_counting | 0 | none | 3 | acc_norm | ↑ | 0.4280 | ± | 0.0314 |
| - leaderboard_bbh_penguins_in_a_table | 0 | none | 3 | acc_norm | ↑ | 0.7534 | ± | 0.0358 |
| - leaderboard_bbh_reasoning_about_colored_objects | 0 | none | 3 | acc_norm | ↑ | 0.8040 | ± | 0.0252 |
| - leaderboard_bbh_ruin_names | 0 | none | 3 | acc_norm | ↑ | 0.8720 | ± | 0.0212 |
| - leaderboard_bbh_salient_translation_error_detection | 0 | none | 3 | acc_norm | ↑ | 0.6160 | ± | 0.0308 |
| - leaderboard_bbh_snarks | 0 | none | 3 | acc_norm | ↑ | 0.7584 | ± | 0.0322 |
| - leaderboard_bbh_sports_understanding | 0 | none | 3 | acc_norm | ↑ | 0.8280 | ± | 0.0239 |
| - leaderboard_bbh_temporal_sequences | 0 | none | 3 | acc_norm | ↑ | 0.9760 | ± | 0.0097 |
| - leaderboard_bbh_tracking_shuffled_objects_five_objects | 0 | none | 3 | acc_norm | ↑ | 0.2160 | ± | 0.0261 |
| - leaderboard_bbh_tracking_shuffled_objects_seven_objects | 0 | none | 3 | acc_norm | ↑ | 0.1960 | ± | 0.0252 |
| - leaderboard_bbh_tracking_shuffled_objects_three_objects | 0 | none | 3 | acc_norm | ↑ | 0.3200 | ± | 0.0296 |
| - leaderboard_bbh_web_of_lies | 0 | none | 3 | acc_norm | ↑ | 0.4960 | ± | 0.0317 |
| - leaderboard_gpqa | N/A | none | 0 | acc_norm | ↑ | 0.3565 | ± | 0.0139 |
| - leaderboard_gpqa_diamond | 1 | none | 0 | acc_norm | ↑ | 0.3283 | ± | 0.0335 |
| - leaderboard_gpqa_extended | 1 | none | 0 | acc_norm | ↑ | 0.3553 | ± | 0.0205 |
| - leaderboard_gpqa_main | 1 | none | 0 | acc_norm | ↑ | 0.3705 | ± | 0.0228 |
| - leaderboard_ifeval | 2 | none | 0 | inst_level_loose_acc | ↑ | 0.8297 | ± | N/A |
|  |  | none | 0 | inst_level_strict_acc | ↑ | 0.8058 | ± | N/A |
|  |  | none | 0 | prompt_level_loose_acc | ↑ | 0.7671 | ± | 0.0182 |
|  |  | none | 0 | prompt_level_strict_acc | ↑ | 0.7338 | ± | 0.0190 |
| - leaderboard_math_algebra_hard | 1 | none | 4 | exact_match | ↑ | 0.2612 | ± | 0.0128 |
| - leaderboard_math_counting_and_prob_hard | 1 | none | 4 | exact_match | ↑ | 0.2131 | ± | 0.0188 |
| - leaderboard_math_geometry_hard | 1 | none | 4 | exact_match | ↑ | 0.2109 | ± | 0.0187 |
| - leaderboard_math_hard | N/A | none | 4 | exact_match | ↑ | 0.1992 | ± | 0.0055 |
| - leaderboard_math_intermediate_algebra_hard | 1 | none | 4 | exact_match | ↑ | 0.1030 | ± | 0.0101 |
| - leaderboard_math_num_theory_hard | 1 | none | 4 | exact_match | ↑ | 0.1185 | ± | 0.0139 |
| - leaderboard_math_prealgebra_hard | 1 | none | 4 | exact_match | ↑ | 0.3031 | ± | 0.0156 |
| - leaderboard_math_precalculus_hard | 1 | none | 4 | exact_match | ↑ | 0.1154 | ± | 0.0137 |
| - leaderboard_mmlu_pro | 0.1 | none | 5 | acc | ↑ | 0.4464 | ± | 0.0045 |
| - leaderboard_musr | N/A | none | 0 | acc_norm | ↑ | 0.4114 | ± | 0.0176 |
| - leaderboard_musr_murder_mysteries | 1 | none | 0 | acc_norm | ↑ | 0.5240 | ± | 0.0316 |
| - leaderboard_musr_object_placements | 1 | none | 0 | acc_norm | ↑ | 0.2852 | ± | 0.0283 |
| - leaderboard_musr_team_allocation | 1 | none | 0 | acc_norm | ↑ | 0.4280 | ± | 0.0314 |
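
The table above follows the output format of EleutherAI's lm-evaluation-harness `leaderboard` task group (the Open LLM Leaderboard v2 suite: BBH, GPQA, IFEval, MATH-hard, MMLU-Pro, MuSR). A minimal sketch of how a table like this can be reproduced with the harness's Python API, assuming `my-org/my-model` as a placeholder for the evaluated checkpoint:

```python
# Minimal reproduction sketch using EleutherAI's lm-evaluation-harness
# (pip install lm-eval). "my-org/my-model" is a placeholder, not the
# actual model evaluated here.
import lm_eval
from lm_eval.utils import make_table

results = lm_eval.simple_evaluate(
    model="hf",                               # Hugging Face transformers backend
    model_args="pretrained=my-org/my-model",  # placeholder checkpoint
    tasks=["leaderboard"],                    # Open LLM Leaderboard v2 task group
    batch_size="auto",
)

# Renders the Tasks / Version / Filter / n-shot / Metric / Value / Stderr
# table in the same layout as shown above.
print(make_table(results))
```

The equivalent CLI call is `lm_eval --model hf --model_args pretrained=my-org/my-model --tasks leaderboard --batch_size auto`. The per-task n-shot settings in the table (e.g., 3-shot BBH, 4-shot MATH-hard, 5-shot MMLU-Pro) come from the task configs themselves rather than from a user-supplied flag.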