Skip to content

Instantly share code, notes, and snippets.

@Helw150
Last active May 13, 2026 20:25
Show Gist options
  • Select an option

  • Save Helw150/a06d335d41cbe7121ded948c89dd3f36 to your computer and use it in GitHub Desktop.

Select an option

Save Helw150/a06d335d41cbe7121ded948c89dd3f36 to your computer and use it in GitHub Desktop.
Grug MoE data-mixture comparison (v0/v2/v3/v4) across compute scales — full lm-eval results across 17 tasks
[
{
"mix": "v0",
"hidden_dim": 512,
"budget": 2.19e+17,
"tasks": {
"mmlu_sl_verb_0shot": {
"bpb,none": 0.844007877090332,
"bpb_stderr,none": 0.004281199742552705,
"acc_norm,none": 0.27332288847742486,
"acc_norm_stderr,none": 0.003753850490600821,
"choice_logprob,none": -3.043445214997677,
"choice_logprob_stderr,none": 0.026555210423050656,
"choice_prob_norm,none": 0.25426655551896116,
"choice_prob_norm_stderr,none": 0.00045542617738300654,
"choice_logprob_norm,none": -1.3982650663285163,
"choice_logprob_norm_stderr,none": 0.001983432015974189,
"acc,none": 0.25751317476143,
"acc_stderr,none": 0.0036830955102596413,
"logprob,none": -12.398654076514974,
"logprob_stderr,none": 0.04022890901630262
},
"mmlu_sl_verb_5shot": {
"logprob,none": -9.150586650434162,
"logprob_stderr,none": 0.0437860436375247,
"choice_prob_norm,none": 0.2533328393394025,
"choice_prob_norm_stderr,none": 0.00038756810709663183,
"choice_logprob,none": -3.0488093627343145,
"choice_logprob_stderr,none": 0.027576481710546295,
"acc_norm,none": 0.27795185870958555,
"acc_norm_stderr,none": 0.003768336194386366,
"bpb,none": 0.548681596735387,
"bpb_stderr,none": 0.0027853622786252513,
"choice_logprob_norm,none": -1.3931666061972847,
"choice_logprob_norm_stderr,none": 0.0016667230544346093,
"acc,none": 0.25801167924797036,
"acc_stderr,none": 0.0036777670843517758
},
"arc_challenge_5shot": {
"acc,none": 0.18344709897610922,
"acc_stderr,none": 0.011310170179554538,
"acc_norm,none": 0.2235494880546075,
"acc_norm_stderr,none": 0.012174896631202612,
"bpb,none": 1.4939646127332267,
"bpb_stderr,none": 0.026052531422912324,
"logprob,none": -25.201515757183166,
"logprob_stderr,none": 0.40587633676351703,
"choice_logprob,none": -6.179149515974148,
"choice_logprob_stderr,none": 0.19074901798103436,
"choice_prob_norm,none": 0.24421896079372202,
"choice_prob_norm_stderr,none": 0.0020540497334768444,
"choice_logprob_norm,none": -1.4612485921546,
"choice_logprob_norm_stderr,none": 0.010459092921839218
},
"arc_easy_5shot": {
"acc,none": 0.39941077441077444,
"acc_stderr,none": 0.010050018228742123,
"acc_norm,none": 0.38215488215488214,
"acc_norm_stderr,none": 0.009970747281292443,
"bpb,none": 1.3642281627798005,
"bpb_stderr,none": 0.013579442555016155,
"logprob,none": -18.200637711820377,
"logprob_stderr,none": 0.24737452991476383,
"choice_logprob,none": -3.2206430668231834,
"choice_logprob_stderr,none": 0.09855560033006468,
"choice_prob_norm,none": 0.27701670445333143,
"choice_prob_norm_stderr,none": 0.0017801352285700502,
"choice_logprob_norm,none": -1.3367244872650283,
"choice_logprob_norm_stderr,none": 0.007127254716244113
},
"boolq_10shot": {
"acc,none": 0.535474006116208,
"acc_stderr,none": 0.00872301756568373,
"bpb,none": 0.6421441648309578,
"bpb_stderr,none": 0.007033838483279408,
"logprob,none": -1.078191109170243,
"logprob_stderr,none": 0.009527333827094765,
"choice_logprob,none": -0.7567635277470782,
"choice_logprob_stderr,none": 0.008415952419342955,
"choice_prob_norm,none": 0.5259153499375072,
"choice_prob_norm_stderr,none": 0.002922074311745619,
"choice_logprob_norm,none": -0.7032777721755389,
"choice_logprob_norm_stderr,none": 0.006455724758507388
},
"boolq_sl_verb_10shot": {
"acc,none": 0.6073394495412844,
"acc_stderr,none": 0.008541161248702911,
"acc_norm,none": 0.6100917431192661,
"acc_norm_stderr,none": 0.00853043797286262,
"bpb,none": 0.781249138328381,
"bpb_stderr,none": 0.01792698664529871,
"logprob,none": -1.1282020455106683,
"logprob_stderr,none": 0.02461400496836941,
"choice_logprob,none": -1.1095931076643404,
"choice_logprob_stderr,none": 0.024625304958620156,
"choice_prob_norm,none": 0.5761914163699757,
"choice_prob_norm_stderr,none": 0.005965626139627796,
"choice_logprob_norm,none": -0.8994806084977474,
"choice_logprob_norm_stderr,none": 0.017654638360913667
},
"copa_0shot": {
"acc,none": 0.54,
"acc_stderr,none": 0.05009082659620333,
"bpb,none": 1.932529378886802,
"bpb_stderr,none": 0.050196515588808964,
"logprob,none": -36.46297584533691,
"logprob_stderr,none": 0.6275435679451141,
"choice_logprob,none": -1.5684399778005398,
"choice_logprob_stderr,none": 0.25810873920107935,
"choice_prob_norm,none": 0.5008998882565759,
"choice_prob_norm_stderr,none": 0.008403353777918801,
"choice_logprob_norm,none": -0.7073998440685311,
"choice_logprob_norm_stderr,none": 0.018757463416034615
},
"csqa_5shot": {
"acc,none": 0.21867321867321868,
"acc_stderr,none": 0.011834072858346475,
"bpb,none": 3.3944986407019795,
"bpb_stderr,none": 0.07036943089405387,
"logprob,none": -2.352887162217149,
"logprob_stderr,none": 0.048776372621821336,
"choice_logprob,none": -2.2934845489053415,
"choice_logprob_stderr,none": 0.04875833998131373,
"choice_prob_norm,none": 0.21005525824056398,
"choice_prob_norm_stderr,none": 0.00573610313594625,
"choice_logprob_norm,none": -2.7836692402555645,
"choice_logprob_norm_stderr,none": 0.07056143439586685
},
"csqa_sl_verb_5shot": {
"acc,none": 0.21785421785421785,
"acc_stderr,none": 0.011818079981132525,
"acc_norm,none": 0.2497952497952498,
"acc_norm_stderr,none": 0.012393731327126517,
"bpb,none": 1.0283069448483697,
"bpb_stderr,none": 0.011691902814805668,
"logprob,none": -8.32615207786154,
"logprob_stderr,none": 0.06789450079528266,
"choice_logprob,none": -2.9300014365243117,
"choice_logprob_stderr,none": 0.06343665511527985,
"choice_prob_norm,none": 0.21264917554927573,
"choice_prob_norm_stderr,none": 0.001895513866004499,
"choice_logprob_norm,none": -1.604318958879433,
"choice_logprob_norm_stderr,none": 0.010183666364888024
},
"hellaswag_0shot": {
"acc,none": 0.2755427205735909,
"acc_stderr,none": 0.004458742356237827,
"acc_norm,none": 0.2842063333997212,
"acc_norm_stderr,none": 0.00450113789523078,
"bpb,none": 1.059523145316047,
"bpb_stderr,none": 0.0022444255991642737,
"logprob,none": -97.3619536872025,
"logprob_stderr,none": 0.48363229578906214,
"choice_logprob,none": -27.102028974702925,
"choice_logprob_stderr,none": 0.31415916304164965,
"choice_prob_norm,none": 0.2561454675384467,
"choice_prob_norm_stderr,none": 0.0004366764395449189,
"choice_logprob_norm,none": -1.3767884583825902,
"choice_logprob_norm_stderr,none": 0.0017318776455852686
},
"hellaswag_5shot": {
"acc,none": 0.2731527584146584,
"acc_stderr,none": 0.004446680081493803,
"acc_norm,none": 0.2832105158334993,
"acc_norm_stderr,none": 0.0044963697421320625,
"bpb,none": 1.0621285463880195,
"bpb_stderr,none": 0.0022168107518051694,
"logprob,none": -97.87679240248208,
"logprob_stderr,none": 0.48800701290965154,
"choice_logprob,none": -27.41035264155543,
"choice_logprob_stderr,none": 0.3170651738314153,
"choice_prob_norm,none": 0.255918238323607,
"choice_prob_norm_stderr,none": 0.00043183433091282055,
"choice_logprob_norm,none": -1.3773365329893177,
"choice_logprob_norm_stderr,none": 0.0017110710547015664
},
"openbookqa_0shot": {
"acc,none": 0.156,
"acc_stderr,none": 0.01624363602839109,
"acc_norm,none": 0.276,
"acc_norm_stderr,none": 0.02001121929807354,
"bpb,none": 2.0457006929799872,
"bpb_stderr,none": 0.05234425573331846,
"logprob,none": -21.423335208892823,
"logprob_stderr,none": 0.5723682593644139,
"choice_logprob,none": -6.946740517766322,
"choice_logprob_stderr,none": 0.3398069620244233,
"choice_prob_norm,none": 0.2559159542838489,
"choice_prob_norm_stderr,none": 0.005456659411254479,
"choice_logprob_norm,none": -1.509851229757715,
"choice_logprob_norm_stderr,none": 0.03220580301083027
},
"piqa_5shot": {
"acc,none": 0.6186071817192601,
"acc_stderr,none": 0.01133285040652868,
"acc_norm,none": 0.6006528835690969,
"acc_norm_stderr,none": 0.011427006685027255,
"bpb,none": 1.3454683630407622,
"bpb_stderr,none": 0.010830648783079756,
"logprob,none": -80.70319272696129,
"logprob_stderr,none": 1.6881639018523786,
"choice_logprob,none": -3.7250006152484225,
"choice_logprob_stderr,none": 0.24830992680518993,
"choice_prob_norm,none": 0.5057906922894322,
"choice_prob_norm_stderr,none": 0.00141780632702078,
"choice_logprob_norm,none": -0.689898526159878,
"choice_logprob_norm_stderr,none": 0.0031762510346056622
},
"winogrande_5shot": {
"acc,none": 0.5082872928176796,
"acc_stderr,none": 0.014050555322824194,
"bpb,none": 0.48343625036531535,
"bpb_stderr,none": 0.026419434099997325,
"logprob,none": -21.73852891101551,
"logprob_stderr,none": 0.31737259551999136,
"choice_logprob,none": -0.7682591176586984,
"choice_logprob_stderr,none": 0.013940856014081878,
"choice_prob_norm,none": 0.5008961398172945,
"choice_prob_norm_stderr,none": 0.0007634802237262012,
"choice_logprob_norm,none": -0.6934601876237907,
"choice_logprob_norm_stderr,none": 0.002288066806855179
},
"wsc273_0shot": {
"acc,none": 0.5238095238095238,
"acc_stderr,none": 0.03028256065887908,
"bpb,none": 0.7321154207777446,
"bpb_stderr,none": 0.02406740256267047,
"logprob,none": -30.54474130916945,
"logprob_stderr,none": 0.5608394280435071,
"choice_logprob,none": -0.8329238946298883,
"choice_logprob_stderr,none": 0.04411749135415132,
"choice_prob_norm,none": 0.5007932504499706,
"choice_prob_norm_stderr,none": 0.0012283331704642,
"choice_logprob_norm,none": -0.692377059436893,
"choice_logprob_norm_stderr,none": 0.0024497244389824267
},
"medmcqa_5shot": {
"acc,none": 0.2577097776715276,
"acc_stderr,none": 0.00676332572657815,
"acc_norm,none": 0.2577097776715276,
"acc_norm_stderr,none": 0.00676332572657815,
"logprob,none": -1.6676009283264146,
"logprob_stderr,none": 0.01211629223566799
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.21778627779105905,
"acc_stderr,none": 0.006382428483871769,
"acc_norm,none": 0.25077695433899116,
"acc_norm_stderr,none": 0.006702816982619845,
"bpb,none": 0.5672725270755324,
"bpb_stderr,none": 0.00544400700742468,
"logprob,none": -7.468935673868944,
"logprob_stderr,none": 0.04929996698112747,
"choice_logprob,none": -3.156554204167492,
"choice_logprob_stderr,none": 0.04064688133216061,
"choice_prob_norm,none": 0.24998755186001656,
"choice_prob_norm_stderr,none": 0.0008006475335414557,
"choice_logprob_norm,none": -1.4105696716118796,
"choice_logprob_norm_stderr,none": 0.0036161209895869753
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.23745410036719705,
"acc_stderr,none": 0.014896277441041843,
"logprob,none": -27.668901143488423,
"logprob_stderr,none": 0.5155153998573488
},
"logprob_gsm8k_5shot": {
"nll,none": 249.00128224150893,
"nll_stderr,none": 2.7224087755745114,
"bpb,none": 1.2938561600269785,
"bpb_stderr,none": 0.008077388733314489
},
"logprob_humaneval_10shot": {
"nll,none": 95.12898236367761,
"nll_stderr,none": 4.950599241344104,
"bpb,none": 0.8647876570287482,
"bpb_stderr,none": 0.027305095746882137
}
}
},
{
"mix": "v2",
"hidden_dim": 512,
"budget": 2.19e+17,
"tasks": {
"mmlu_sl_verb_0shot": {
"choice_logprob_norm,none": -1.395105808725625,
"choice_logprob_norm_stderr,none": 0.0018874264609554715,
"bpb,none": 0.7086432691399365,
"bpb_stderr,none": 0.0037642409695114263,
"acc_norm,none": 0.2804443811422874,
"acc_norm_stderr,none": 0.0037814795663111355,
"choice_prob_norm,none": 0.25430199441097195,
"choice_prob_norm_stderr,none": 0.0004345215762368066,
"acc,none": 0.2664150405925082,
"acc_stderr,none": 0.003720863676213897,
"logprob,none": -10.879439385268903,
"logprob_stderr,none": 0.04589588515139443,
"choice_logprob,none": -3.26679195925276,
"choice_logprob_stderr,none": 0.030529247948099983
},
"mmlu_sl_verb_5shot": {
"acc_norm,none": 0.2745335422304515,
"acc_norm_stderr,none": 0.003753912793560197,
"choice_prob_norm,none": 0.25438074961280066,
"choice_prob_norm_stderr,none": 0.0004152774493322435,
"bpb,none": 0.555535466797542,
"bpb_stderr,none": 0.0031185170868660004,
"choice_logprob,none": -3.5104746791472374,
"choice_logprob_stderr,none": 0.03519841089032272,
"choice_logprob_norm,none": -1.3945937913065722,
"choice_logprob_norm_stderr,none": 0.001874030058258482,
"acc,none": 0.26370887337986043,
"acc_stderr,none": 0.003707121434082123,
"logprob,none": -9.198550460823393,
"logprob_stderr,none": 0.05378404592279555
},
"arc_challenge_5shot": {
"acc,none": 0.19112627986348124,
"acc_stderr,none": 0.011490055292778599,
"acc_norm,none": 0.22098976109215018,
"acc_norm_stderr,none": 0.012124929206818258,
"bpb,none": 1.489832873492701,
"bpb_stderr,none": 0.02717879408473882,
"logprob,none": -24.8621712874227,
"logprob_stderr,none": 0.39980296901624135,
"choice_logprob,none": -6.160979878220389,
"choice_logprob_stderr,none": 0.1906458394945994,
"choice_prob_norm,none": 0.24613135192540953,
"choice_prob_norm_stderr,none": 0.002183901046334659,
"choice_logprob_norm,none": -1.4617512614955517,
"choice_logprob_norm_stderr,none": 0.011962603981041372
},
"arc_easy_5shot": {
"acc,none": 0.42297979797979796,
"acc_stderr,none": 0.010137328382209088,
"acc_norm,none": 0.406986531986532,
"acc_norm_stderr,none": 0.010080695355466598,
"bpb,none": 1.387534733610468,
"bpb_stderr,none": 0.01537529127210736,
"logprob,none": -18.011766900317838,
"logprob_stderr,none": 0.2406318292539621,
"choice_logprob,none": -3.110095427498679,
"choice_logprob_stderr,none": 0.10149254975214446,
"choice_prob_norm,none": 0.28301377885231027,
"choice_prob_norm_stderr,none": 0.0019324969155231323,
"choice_logprob_norm,none": -1.3236154616516096,
"choice_logprob_norm_stderr,none": 0.007749568127906316
},
"boolq_10shot": {
"acc,none": 0.5666666666666667,
"acc_stderr,none": 0.008666972565214514,
"bpb,none": 0.8127147770859842,
"bpb_stderr,none": 0.01077108724649005,
"logprob,none": -1.3609937103516465,
"logprob_stderr,none": 0.01643642626382858,
"choice_logprob,none": -0.7703883397780966,
"choice_logprob_stderr,none": 0.011417959600312522,
"choice_prob_norm,none": 0.5482880534919546,
"choice_prob_norm_stderr,none": 0.003682284394153467,
"choice_logprob_norm,none": -0.6999610769423285,
"choice_logprob_norm_stderr,none": 0.008559105338723531
},
"boolq_sl_verb_10shot": {
"acc,none": 0.5327217125382263,
"acc_stderr,none": 0.008726308038444403,
"acc_norm,none": 0.5434250764525994,
"acc_norm_stderr,none": 0.008712010793695301,
"bpb,none": 0.601618129293495,
"bpb_stderr,none": 0.01109298411838269,
"logprob,none": -0.9971367144803388,
"logprob_stderr,none": 0.01710603442884549,
"choice_logprob,none": -0.966932997884124,
"choice_logprob_stderr,none": 0.016961873963066938,
"choice_prob_norm,none": 0.5295111556301993,
"choice_prob_norm_stderr,none": 0.004306882748278512,
"choice_logprob_norm,none": -0.7885458839494792,
"choice_logprob_norm_stderr,none": 0.010834385112369352
},
"copa_0shot": {
"acc,none": 0.6,
"acc_stderr,none": 0.04923659639173309,
"bpb,none": 1.9200667700736944,
"bpb_stderr,none": 0.05129172565603094,
"logprob,none": -36.307588214874265,
"logprob_stderr,none": 0.7025852589241688,
"choice_logprob,none": -1.6349282082433967,
"choice_logprob_stderr,none": 0.2563397538643638,
"choice_prob_norm,none": 0.5045399053052483,
"choice_prob_norm_stderr,none": 0.009019915694046634,
"choice_logprob_norm,none": -0.7030432609585621,
"choice_logprob_norm_stderr,none": 0.02061245573401139
},
"csqa_5shot": {
"acc,none": 0.19656019656019655,
"acc_stderr,none": 0.011377439773964,
"bpb,none": 3.700175443457485,
"bpb_stderr,none": 0.09103222386766453,
"logprob,none": -2.564766176209696,
"logprob_stderr,none": 0.06309872931397345,
"choice_logprob,none": -2.5384716222597494,
"choice_logprob_stderr,none": 0.06310537788715918,
"choice_prob_norm,none": 0.2011090417277458,
"choice_prob_norm_stderr,none": 0.004951753153655882,
"choice_logprob_norm,none": -3.1040275132391195,
"choice_logprob_norm_stderr,none": 0.09112662467956259
},
"csqa_sl_verb_5shot": {
"acc,none": 0.21621621621621623,
"acc_stderr,none": 0.01178588917548664,
"acc_norm,none": 0.23587223587223588,
"acc_norm_stderr,none": 0.012154622820781504,
"bpb,none": 1.0214474073668882,
"bpb_stderr,none": 0.012401567172941753,
"logprob,none": -8.22377738944826,
"logprob_stderr,none": 0.07575049549544435,
"choice_logprob,none": -3.0526434544044316,
"choice_logprob_stderr,none": 0.06789605144455163,
"choice_prob_norm,none": 0.21230096957013608,
"choice_prob_norm_stderr,none": 0.0020234177041320695,
"choice_logprob_norm,none": -1.6128310731104252,
"choice_logprob_norm_stderr,none": 0.010728376678642517
},
"hellaswag_0shot": {
"acc,none": 0.2818163712407887,
"acc_stderr,none": 0.004489648865080841,
"acc_norm,none": 0.29575781716789484,
"acc_norm_stderr,none": 0.004554499409290671,
"bpb,none": 1.0604522626751571,
"bpb_stderr,none": 0.002347232941483141,
"logprob,none": -97.45293985764549,
"logprob_stderr,none": 0.4877105995743941,
"choice_logprob,none": -27.044610263316372,
"choice_logprob_stderr,none": 0.3163294608459877,
"choice_prob_norm,none": 0.25792808333865613,
"choice_prob_norm_stderr,none": 0.0004617017000080293,
"choice_logprob_norm,none": -1.3713845923943457,
"choice_logprob_norm_stderr,none": 0.0018215144098603715
},
"hellaswag_5shot": {
"acc,none": 0.27972515435172274,
"acc_stderr,none": 0.004479467619464835,
"acc_norm,none": 0.2927703644692292,
"acc_norm_stderr,none": 0.004541039698729829,
"bpb,none": 1.052438515268161,
"bpb_stderr,none": 0.0022855055454450374,
"logprob,none": -96.76464004902174,
"logprob_stderr,none": 0.48497949747468927,
"choice_logprob,none": -27.207802913947848,
"choice_logprob_stderr,none": 0.3170761167400707,
"choice_prob_norm,none": 0.2572310812638424,
"choice_prob_norm_stderr,none": 0.00044265170655079846,
"choice_logprob_norm,none": -1.3728559573518737,
"choice_logprob_norm_stderr,none": 0.0017515763916904391
},
"openbookqa_0shot": {
"acc,none": 0.166,
"acc_stderr,none": 0.01665661687653113,
"acc_norm,none": 0.268,
"acc_norm_stderr,none": 0.019827714859587592,
"bpb,none": 2.0430624431901023,
"bpb_stderr,none": 0.05348166345275381,
"logprob,none": -21.434500136375426,
"logprob_stderr,none": 0.5811476660213469,
"choice_logprob,none": -7.1119697689300665,
"choice_logprob_stderr,none": 0.3474521931093143,
"choice_prob_norm,none": 0.2596508704253059,
"choice_prob_norm_stderr,none": 0.005753092038542383,
"choice_logprob_norm,none": -1.4996703821450863,
"choice_logprob_norm_stderr,none": 0.03237955248457045
},
"piqa_5shot": {
"acc,none": 0.6126224156692056,
"acc_stderr,none": 0.01136603808343591,
"acc_norm,none": 0.5984766050054406,
"acc_norm_stderr,none": 0.011437324373397844,
"bpb,none": 1.346865107049732,
"bpb_stderr,none": 0.011462353078616861,
"logprob,none": -80.50279071048243,
"logprob_stderr,none": 1.6973949430845747,
"choice_logprob,none": -3.8368748542399627,
"choice_logprob_stderr,none": 0.2499631337880358,
"choice_prob_norm,none": 0.5057863173560802,
"choice_prob_norm_stderr,none": 0.0014744863002791257,
"choice_logprob_norm,none": -0.6904674005767536,
"choice_logprob_norm_stderr,none": 0.0032460385380844265
},
"winogrande_5shot": {
"acc,none": 0.5146014206787688,
"acc_stderr,none": 0.014046492383275842,
"bpb,none": 0.4855374598470294,
"bpb_stderr,none": 0.025742442598531194,
"logprob,none": -21.923497182700046,
"logprob_stderr,none": 0.3158033457422442,
"choice_logprob,none": -0.8000682439938214,
"choice_logprob_stderr,none": 0.016809219923152076,
"choice_prob_norm,none": 0.5009294975632409,
"choice_prob_norm_stderr,none": 0.0007856596659935949,
"choice_logprob_norm,none": -0.693629638764659,
"choice_logprob_norm_stderr,none": 0.0024759192573889106
},
"wsc273_0shot": {
"acc,none": 0.5091575091575091,
"acc_stderr,none": 0.030311867945261854,
"bpb,none": 0.7423261934631866,
"bpb_stderr,none": 0.02451552761399825,
"logprob,none": -30.840766480554155,
"logprob_stderr,none": 0.5852038618254907,
"choice_logprob,none": -0.9987486646253337,
"choice_logprob_stderr,none": 0.06145714611343547,
"choice_prob_norm,none": 0.5002619838156802,
"choice_prob_norm_stderr,none": 0.001284676402507507,
"choice_logprob_norm,none": -0.6935330445997154,
"choice_logprob_norm_stderr,none": 0.002600351355440756
},
"medmcqa_5shot": {
"acc,none": 0.2763566818073153,
"acc_stderr,none": 0.006915208017087954,
"acc_norm,none": 0.2763566818073153,
"acc_norm_stderr,none": 0.006915208017087954,
"logprob,none": -1.4470207297343145,
"logprob_stderr,none": 0.007964545144399966
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.22687066698541716,
"acc_stderr,none": 0.006476244480762607,
"acc_norm,none": 0.2534066459478843,
"acc_norm_stderr,none": 0.00672603380780031,
"bpb,none": 0.4870434199938374,
"bpb_stderr,none": 0.005712604826753254,
"logprob,none": -6.335990024017655,
"logprob_stderr,none": 0.05467469291586012,
"choice_logprob,none": -3.1816474104731958,
"choice_logprob_stderr,none": 0.045359823837981435,
"choice_prob_norm,none": 0.25134192350728396,
"choice_prob_norm_stderr,none": 0.0008110848923755798,
"choice_logprob_norm,none": -1.4059551486234891,
"choice_logprob_norm_stderr,none": 0.0036537496170158316
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.22643818849449204,
"acc_stderr,none": 0.014651337324602592,
"logprob,none": -26.759819291066947,
"logprob_stderr,none": 0.5077560946554257
},
"logprob_gsm8k_5shot": {
"nll,none": 254.650383417854,
"nll_stderr,none": 2.865119070723374,
"bpb,none": 1.3215150142333845,
"bpb_stderr,none": 0.008789955135988231
},
"logprob_humaneval_10shot": {
"nll,none": 107.93575578201109,
"nll_stderr,none": 5.431641005740183,
"bpb,none": 0.9908253818862274,
"bpb_stderr,none": 0.03155326583552453
}
}
},
{
"mix": "v3",
"hidden_dim": 512,
"budget": 2.19e+17,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc,none": 0.26684233015239994,
"acc_stderr,none": 0.0037260168667051403,
"acc_norm,none": 0.2812989602620709,
"acc_norm_stderr,none": 0.0037901415430174196,
"bpb,none": 0.6269147047636995,
"bpb_stderr,none": 0.0032322057653284024,
"logprob,none": -9.742547235739172,
"logprob_stderr,none": 0.039023499834109354,
"choice_logprob,none": -2.8547423022152967,
"choice_logprob_stderr,none": 0.025801330341823882,
"choice_prob_norm,none": 0.2541127371908383,
"choice_prob_norm_stderr,none": 0.00041266809020224345,
"choice_logprob_norm,none": -1.3928761929921796,
"choice_logprob_norm_stderr,none": 0.0017974926944226569,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.2634240136732659,
"acc_stderr,none": 0.003714353925489619,
"acc_norm,none": 0.27994587665574705,
"acc_norm_stderr,none": 0.003786695065744675,
"bpb,none": 0.3349757325611144,
"bpb_stderr,none": 0.0018413122245872007,
"logprob,none": -6.265405864100078,
"logprob_stderr,none": 0.040860611694026196,
"choice_logprob,none": -2.696346618180892,
"choice_logprob_stderr,none": 0.026354033252580806,
"choice_prob_norm,none": 0.2525867221345686,
"choice_prob_norm_stderr,none": 0.00028995850438336346,
"choice_logprob_norm,none": -1.3874403115669558,
"choice_logprob_norm_stderr,none": 0.0012565002854746177,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"arc_challenge_5shot": {
"acc,none": 0.18003412969283278,
"acc_stderr,none": 0.011227856729050042,
"acc_norm,none": 0.2090443686006826,
"acc_norm_stderr,none": 0.011882746987406448,
"bpb,none": 1.5056673895489818,
"bpb_stderr,none": 0.026601531332319416,
"logprob,none": -25.544686375217633,
"logprob_stderr,none": 0.41765748907984457,
"choice_logprob,none": -6.292516564525333,
"choice_logprob_stderr,none": 0.18918113367678352,
"choice_prob_norm,none": 0.24443493452430692,
"choice_prob_norm_stderr,none": 0.002000694071827269,
"choice_logprob_norm,none": -1.4584900106904373,
"choice_logprob_norm_stderr,none": 0.010473236534207186
},
"arc_easy_5shot": {
"acc,none": 0.41203703703703703,
"acc_stderr,none": 0.010099765857562759,
"acc_norm,none": 0.3939393939393939,
"acc_norm_stderr,none": 0.010026305355981802,
"bpb,none": 1.366618370072764,
"bpb_stderr,none": 0.013590783840504918,
"logprob,none": -18.46069774623672,
"logprob_stderr,none": 0.257249295863158,
"choice_logprob,none": -3.3198551903359,
"choice_logprob_stderr,none": 0.10477930078908752,
"choice_prob_norm,none": 0.28026239543277925,
"choice_prob_norm_stderr,none": 0.001861688811683428,
"choice_logprob_norm,none": -1.3284425372742612,
"choice_logprob_norm_stderr,none": 0.007358125735955402
},
"boolq_10shot": {
"acc,none": 0.5296636085626911,
"acc_stderr,none": 0.008729651343606142,
"bpb,none": 0.7080951594868731,
"bpb_stderr,none": 0.007534277740155469,
"logprob,none": -1.22105823525595,
"logprob_stderr,none": 0.01234856754062047,
"choice_logprob,none": -0.7337582696269975,
"choice_logprob_stderr,none": 0.006584067693744113,
"choice_prob_norm,none": 0.5249368286929101,
"choice_prob_norm_stderr,none": 0.00239055660626695,
"choice_logprob_norm,none": -0.6828626460422125,
"choice_logprob_norm_stderr,none": 0.005038215177149247
},
"boolq_sl_verb_10shot": {
"acc,none": 0.5446483180428134,
"acc_stderr,none": 0.008710119143608424,
"acc_norm,none": 0.5602446483180428,
"acc_norm_stderr,none": 0.008681343983423958,
"bpb,none": 0.5584264300582101,
"bpb_stderr,none": 0.010294208995737797,
"logprob,none": -0.8867521860548481,
"logprob_stderr,none": 0.01445560999137545,
"choice_logprob,none": -0.8766496772391723,
"choice_logprob_stderr,none": 0.014457699509682795,
"choice_prob_norm,none": 0.5335762395809978,
"choice_prob_norm_stderr,none": 0.004063768694464024,
"choice_logprob_norm,none": -0.7611293301293726,
"choice_logprob_norm_stderr,none": 0.010069680463397658
},
"copa_0shot": {
"acc,none": 0.55,
"acc_stderr,none": 0.05,
"bpb,none": 1.9412281989491103,
"bpb_stderr,none": 0.053503958123955586,
"logprob,none": -36.64149988174439,
"logprob_stderr,none": 0.7121939038766281,
"choice_logprob,none": -1.8378197154960296,
"choice_logprob_stderr,none": 0.30618391834273084,
"choice_prob_norm,none": 0.4962514280310157,
"choice_prob_norm_stderr,none": 0.008901716274512847,
"choice_logprob_norm,none": -0.7193142676418873,
"choice_logprob_norm_stderr,none": 0.020288767729933287
},
"csqa_5shot": {
"acc,none": 0.20638820638820637,
"acc_stderr,none": 0.011586881879177835,
"bpb,none": 3.3802579735168834,
"bpb_stderr,none": 0.07427730102694352,
"logprob,none": -2.3430162839085034,
"logprob_stderr,none": 0.05148510178642825,
"choice_logprob,none": -2.3370383041004765,
"choice_logprob_stderr,none": 0.05148720286997365,
"choice_prob_norm,none": 0.20358528776009918,
"choice_prob_norm_stderr,none": 0.0047617388299911545,
"choice_logprob_norm,none": -2.81402997057312,
"choice_logprob_norm_stderr,none": 0.07434291950311715
},
"csqa_sl_verb_5shot": {
"acc,none": 0.24078624078624078,
"acc_stderr,none": 0.012241029737913613,
"acc_norm,none": 0.2596232596232596,
"acc_norm_stderr,none": 0.012552154236305981,
"bpb,none": 0.6344186175567482,
"bpb_stderr,none": 0.010779995969509551,
"logprob,none": -5.138235038269943,
"logprob_stderr,none": 0.07413193757799576,
"choice_logprob,none": -3.060806680465147,
"choice_logprob_stderr,none": 0.07139531567057507,
"choice_prob_norm,none": 0.21106069650439113,
"choice_prob_norm_stderr,none": 0.0018090940948582233,
"choice_logprob_norm,none": -1.610079916779593,
"choice_logprob_norm_stderr,none": 0.010101732954993387
},
"hellaswag_0shot": {
"acc,none": 0.27434773949412466,
"acc_stderr,none": 0.004452731272840582,
"acc_norm,none": 0.2811192989444334,
"acc_norm_stderr,none": 0.004486268470666261,
"bpb,none": 1.0900296120083375,
"bpb_stderr,none": 0.0023101684356920922,
"logprob,none": -100.17899374039709,
"logprob_stderr,none": 0.49890266118413285,
"choice_logprob,none": -28.179910831945875,
"choice_logprob_stderr,none": 0.325081453192061,
"choice_prob_norm,none": 0.25565736368719405,
"choice_prob_norm_stderr,none": 0.0004470412811279136,
"choice_logprob_norm,none": -1.3794781373392433,
"choice_logprob_norm_stderr,none": 0.0017778156873739898
},
"hellaswag_5shot": {
"acc,none": 0.2704640509858594,
"acc_stderr,none": 0.004432917403755067,
"acc_norm,none": 0.28121888070105555,
"acc_norm_stderr,none": 0.004486752200430311,
"bpb,none": 1.0971178635470091,
"bpb_stderr,none": 0.0022757577536049486,
"logprob,none": -100.99335159730256,
"logprob_stderr,none": 0.5037734921466325,
"choice_logprob,none": -28.430880582977448,
"choice_logprob_stderr,none": 0.3272217991027028,
"choice_prob_norm,none": 0.25518029494158306,
"choice_prob_norm_stderr,none": 0.00043631586852072534,
"choice_logprob_norm,none": -1.3806908926720352,
"choice_logprob_norm_stderr,none": 0.0017406953068043025
},
"openbookqa_0shot": {
"acc,none": 0.148,
"acc_stderr,none": 0.015896458561251246,
"acc_norm,none": 0.246,
"acc_norm_stderr,none": 0.019279819056352555,
"bpb,none": 2.071987712869857,
"bpb_stderr,none": 0.05143296580703173,
"logprob,none": -21.7383246717453,
"logprob_stderr,none": 0.5877514820689698,
"choice_logprob,none": -7.230179626940098,
"choice_logprob_stderr,none": 0.3540939204939283,
"choice_prob_norm,none": 0.2544992299744627,
"choice_prob_norm_stderr,none": 0.005523972794841713,
"choice_logprob_norm,none": -1.5166920137611337,
"choice_logprob_norm_stderr,none": 0.030278205322549843
},
"piqa_5shot": {
"acc,none": 0.6137105549510338,
"acc_stderr,none": 0.011360138833823683,
"acc_norm,none": 0.6164309031556039,
"acc_norm_stderr,none": 0.011345128734116278,
"bpb,none": 1.3502708496117517,
"bpb_stderr,none": 0.011122066203128382,
"logprob,none": -81.08814221964307,
"logprob_stderr,none": 1.7058425758276525,
"choice_logprob,none": -3.843236403699351,
"choice_logprob_stderr,none": 0.2579128186990966,
"choice_prob_norm,none": 0.5057770048799564,
"choice_prob_norm_stderr,none": 0.0014346697565567714,
"choice_logprob_norm,none": -0.6897870863103704,
"choice_logprob_norm_stderr,none": 0.0030871781803235997
},
"winogrande_5shot": {
"acc,none": 0.4909234411996843,
"acc_stderr,none": 0.0140501700944977,
"bpb,none": 0.4946571271106702,
"bpb_stderr,none": 0.02708889395838287,
"logprob,none": -22.255206053522315,
"logprob_stderr,none": 0.3224974639590515,
"choice_logprob,none": -0.7829697673753819,
"choice_logprob_stderr,none": 0.014830317845139945,
"choice_prob_norm,none": 0.5007496740254456,
"choice_prob_norm_stderr,none": 0.0007808958933147997,
"choice_logprob_norm,none": -0.6940473917226799,
"choice_logprob_norm_stderr,none": 0.0025529763168751843
},
"wsc273_0shot": {
"acc,none": 0.5128205128205128,
"acc_stderr,none": 0.030306985365626097,
"bpb,none": 0.7274675068738532,
"bpb_stderr,none": 0.02522797824744406,
"logprob,none": -30.15777778276157,
"logprob_stderr,none": 0.5934826212390054,
"choice_logprob,none": -0.9212171118300828,
"choice_logprob_stderr,none": 0.05261533627178825,
"choice_prob_norm,none": 0.5004322248767205,
"choice_prob_norm_stderr,none": 0.001356269558879158,
"choice_logprob_norm,none": -0.6932931798321386,
"choice_logprob_norm_stderr,none": 0.002740215457699106
},
"medmcqa_5shot": {
"acc,none": 0.2749223045661009,
"acc_stderr,none": 0.006904070961661391,
"acc_norm,none": 0.2749223045661009,
"acc_norm_stderr,none": 0.006904070961661391,
"logprob,none": -1.525614637009641,
"logprob_stderr,none": 0.009778845593370562
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.24695194836241932,
"acc_stderr,none": 0.006668460128724211,
"acc_norm,none": 0.265598852498207,
"acc_norm_stderr,none": 0.006829481867963804,
"bpb,none": 0.3154387533014714,
"bpb_stderr,none": 0.003722807714106041,
"logprob,none": -4.373024704067039,
"logprob_stderr,none": 0.04474834589222865,
"choice_logprob,none": -2.523247571724084,
"choice_logprob_stderr,none": 0.034800071950167516,
"choice_prob_norm,none": 0.2513992180444905,
"choice_prob_norm_stderr,none": 0.0005814317939933852,
"choice_logprob_norm,none": -1.3936370368957913,
"choice_logprob_norm_stderr,none": 0.0026294029779488307
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.2386780905752754,
"acc_stderr,none": 0.014922629695456416,
"logprob,none": -27.703576085617083,
"logprob_stderr,none": 0.5364291512585702
},
"logprob_gsm8k_5shot": {
"nll,none": 236.30206623612435,
"nll_stderr,none": 2.81112604378555,
"bpb,none": 1.2134874700959222,
"bpb_stderr,none": 0.008180005640304246
},
"logprob_humaneval_10shot": {
"nll,none": 79.22695929829668,
"nll_stderr,none": 4.277470061189917,
"bpb,none": 0.7462923271430614,
"bpb_stderr,none": 0.02737976564104958
}
}
},
{
"mix": "v4",
"hidden_dim": 512,
"budget": 2.19e+17,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc,none": 0.2553055120353226,
"acc_stderr,none": 0.00367838581462502,
"acc_norm,none": 0.27125765560461473,
"acc_norm_stderr,none": 0.0037488085614330816,
"bpb,none": 0.6390266847406983,
"bpb_stderr,none": 0.003566320582268053,
"logprob,none": -9.013411016877344,
"logprob_stderr,none": 0.03638521570634543,
"choice_logprob,none": -2.6363229740059633,
"choice_logprob_stderr,none": 0.022939767755543034,
"choice_prob_norm,none": 0.2524952111006773,
"choice_prob_norm_stderr,none": 0.00037153168615355745,
"choice_logprob_norm,none": -1.3953207683721074,
"choice_logprob_norm_stderr,none": 0.0016240531970126697,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.26114513602050987,
"acc_stderr,none": 0.0036996431729517452,
"acc_norm,none": 0.2709015809713716,
"acc_norm_stderr,none": 0.0037445679826478536,
"bpb,none": 0.34944660629584556,
"bpb_stderr,none": 0.0018626168249877765,
"logprob,none": -6.3097955451112515,
"logprob_stderr,none": 0.04402366277082722,
"choice_logprob,none": -2.7861677576228936,
"choice_logprob_stderr,none": 0.02728436697932576,
"choice_prob_norm,none": 0.2527074983287065,
"choice_prob_norm_stderr,none": 0.0002808249583230201,
"choice_logprob_norm,none": -1.3858056394730192,
"choice_logprob_norm_stderr,none": 0.0012062018737212107,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"arc_challenge_5shot": {
"acc,none": 0.17320819112627986,
"acc_stderr,none": 0.011058694183280345,
"acc_norm,none": 0.21843003412969283,
"acc_norm_stderr,none": 0.012074291605700971,
"bpb,none": 1.515098615907457,
"bpb_stderr,none": 0.02647828888168984,
"logprob,none": -25.406775207649726,
"logprob_stderr,none": 0.4010915752813438,
"choice_logprob,none": -6.128682172554726,
"choice_logprob_stderr,none": 0.18670629478233425,
"choice_prob_norm,none": 0.24424003177704995,
"choice_prob_norm_stderr,none": 0.0019909469925435603,
"choice_logprob_norm,none": -1.4608414972698616,
"choice_logprob_norm_stderr,none": 0.010678727697787002
},
"arc_easy_5shot": {
"acc,none": 0.41624579124579125,
"acc_stderr,none": 0.01011481940450087,
"acc_norm,none": 0.39057239057239057,
"acc_norm_stderr,none": 0.01001105911206425,
"bpb,none": 1.4087741651802528,
"bpb_stderr,none": 0.013796311572408584,
"logprob,none": -18.62068775486866,
"logprob_stderr,none": 0.24609795377486243,
"choice_logprob,none": -3.277522570024079,
"choice_logprob_stderr,none": 0.10122211999497388,
"choice_prob_norm,none": 0.27907478198842045,
"choice_prob_norm_stderr,none": 0.0018462385567067831,
"choice_logprob_norm,none": -1.3317550554033473,
"choice_logprob_norm_stderr,none": 0.007316569507089113
},
"boolq_10shot": {
"acc,none": 0.5137614678899083,
"acc_stderr,none": 0.008741742106878657,
"bpb,none": 0.7286610122314143,
"bpb_stderr,none": 0.008914918894223323,
"logprob,none": -1.273001033742129,
"logprob_stderr,none": 0.01486321445527341,
"choice_logprob,none": -0.7926264912596379,
"choice_logprob_stderr,none": 0.0090788009785167,
"choice_prob_norm,none": 0.5205144023555821,
"choice_prob_norm_stderr,none": 0.0026616239644261415,
"choice_logprob_norm,none": -0.7016038938416879,
"choice_logprob_norm_stderr,none": 0.005709866492914998
},
"boolq_sl_verb_10shot": {
"acc,none": 0.5357798165137615,
"acc_stderr,none": 0.008722635482201091,
"acc_norm,none": 0.5602446483180428,
"acc_norm_stderr,none": 0.008681343983423956,
"bpb,none": 0.5509832646849105,
"bpb_stderr,none": 0.009810966500804727,
"logprob,none": -0.9067304967011152,
"logprob_stderr,none": 0.014519104928619731,
"choice_logprob,none": -0.8898638427319436,
"choice_logprob_stderr,none": 0.014521917030274745,
"choice_prob_norm,none": 0.527889529733078,
"choice_prob_norm_stderr,none": 0.0038568469994045276,
"choice_logprob_norm,none": -0.7594750535008054,
"choice_logprob_norm_stderr,none": 0.009572287082251565
},
"copa_0shot": {
"acc,none": 0.58,
"acc_stderr,none": 0.049604496374885836,
"bpb,none": 1.919532833929384,
"bpb_stderr,none": 0.05030447484178161,
"logprob,none": -36.31590324401856,
"logprob_stderr,none": 0.7022728333295246,
"choice_logprob,none": -1.5237651283051747,
"choice_logprob_stderr,none": 0.2516922344221307,
"choice_prob_norm,none": 0.499464094404323,
"choice_prob_norm_stderr,none": 0.008018648398832776,
"choice_logprob_norm,none": -0.7090264651354934,
"choice_logprob_norm_stderr,none": 0.018050767048602083
},
"csqa_5shot": {
"acc,none": 0.20475020475020475,
"acc_stderr,none": 0.011552714477876671,
"bpb,none": 3.148213720923554,
"bpb_stderr,none": 0.058978042609837675,
"logprob,none": -2.1821754644582936,
"logprob_stderr,none": 0.04088046394995327,
"choice_logprob,none": -2.164364806547257,
"choice_logprob_stderr,none": 0.04087718298862214,
"choice_prob_norm,none": 0.20467033120522585,
"choice_prob_norm_stderr,none": 0.00534799300109999,
"choice_logprob_norm,none": -2.581631593711548,
"choice_logprob_norm_stderr,none": 0.05906931912929533
},
"csqa_sl_verb_5shot": {
"acc,none": 0.21703521703521703,
"acc_stderr,none": 0.011802018846529998,
"acc_norm,none": 0.2244062244062244,
"acc_norm_stderr,none": 0.011944134676023543,
"bpb,none": 0.7256812573105682,
"bpb_stderr,none": 0.00928482014813608,
"logprob,none": -5.9137141026989735,
"logprob_stderr,none": 0.06319895391742075,
"choice_logprob,none": -2.7346849452204633,
"choice_logprob_stderr,none": 0.05809857353035866,
"choice_prob_norm,none": 0.20792358055004104,
"choice_prob_norm_stderr,none": 0.0015422881711299034,
"choice_logprob_norm,none": -1.6081352514053808,
"choice_logprob_norm_stderr,none": 0.008189810299965718
},
"hellaswag_0shot": {
"acc,none": 0.27365066719776937,
"acc_stderr,none": 0.0044492062959223385,
"acc_norm,none": 0.28281218880701053,
"acc_norm_stderr,none": 0.004494454911844578,
"bpb,none": 1.0740874480201377,
"bpb_stderr,none": 0.0022677168676193547,
"logprob,none": -98.78564271754037,
"logprob_stderr,none": 0.492517482756744,
"choice_logprob,none": -27.512362821253223,
"choice_logprob_stderr,none": 0.319339602879014,
"choice_prob_norm,none": 0.2558516005274605,
"choice_prob_norm_stderr,none": 0.0004393847946353529,
"choice_logprob_norm,none": -1.3781728465369107,
"choice_logprob_norm_stderr,none": 0.001746737742556392
},
"hellaswag_5shot": {
"acc,none": 0.2717586138219478,
"acc_stderr,none": 0.0044395694474074245,
"acc_norm,none": 0.2826130252937662,
"acc_norm_stderr,none": 0.004493495872000052,
"bpb,none": 1.0736875169267772,
"bpb_stderr,none": 0.0022517433643217404,
"logprob,none": -98.90815309641538,
"logprob_stderr,none": 0.4948650918629855,
"choice_logprob,none": -27.860797247308327,
"choice_logprob_stderr,none": 0.3223766978580821,
"choice_prob_norm,none": 0.2555068692757805,
"choice_prob_norm_stderr,none": 0.00043432782926097806,
"choice_logprob_norm,none": -1.3791684810533902,
"choice_logprob_norm_stderr,none": 0.0017251494480918924
},
"openbookqa_0shot": {
"acc,none": 0.12,
"acc_stderr,none": 0.014547276256845681,
"acc_norm,none": 0.262,
"acc_norm_stderr,none": 0.019684688820194727,
"bpb,none": 2.079080085689421,
"bpb_stderr,none": 0.05234480486774542,
"logprob,none": -21.64301951789856,
"logprob_stderr,none": 0.5691098132930806,
"choice_logprob,none": -7.156216684167616,
"choice_logprob_stderr,none": 0.3352144353973983,
"choice_prob_norm,none": 0.25144437273949805,
"choice_prob_norm_stderr,none": 0.005423994663229431,
"choice_logprob_norm,none": -1.5301603545658207,
"choice_logprob_norm_stderr,none": 0.030627603288517418
},
"piqa_5shot": {
"acc,none": 0.6175190424374319,
"acc_stderr,none": 0.011339019654272347,
"acc_norm,none": 0.6235038084874864,
"acc_norm_stderr,none": 0.011304341550126724,
"bpb,none": 1.319102970924556,
"bpb_stderr,none": 0.010677881732896393,
"logprob,none": -79.11812885282349,
"logprob_stderr,none": 1.6524612666055258,
"choice_logprob,none": -3.7497750724473944,
"choice_logprob_stderr,none": 0.25588934103428335,
"choice_prob_norm,none": 0.5070240070215941,
"choice_prob_norm_stderr,none": 0.0014095207300394968,
"choice_logprob_norm,none": -0.6867061286650964,
"choice_logprob_norm_stderr,none": 0.002925559867739968
},
"winogrande_5shot": {
"acc,none": 0.5067087608524072,
"acc_stderr,none": 0.014051220692330346,
"bpb,none": 0.4880828489628066,
"bpb_stderr,none": 0.026688288906278192,
"logprob,none": -21.954895537014355,
"logprob_stderr,none": 0.3190544626321982,
"choice_logprob,none": -0.7630275153842117,
"choice_logprob_stderr,none": 0.012693712932140106,
"choice_prob_norm,none": 0.5008606085032802,
"choice_prob_norm_stderr,none": 0.0007794560275754764,
"choice_logprob_norm,none": -0.6937036703519631,
"choice_logprob_norm_stderr,none": 0.002436890803632395
},
"wsc273_0shot": {
"acc,none": 0.5311355311355311,
"acc_stderr,none": 0.030258116236228354,
"bpb,none": 0.7306337300105169,
"bpb_stderr,none": 0.026023760734723574,
"logprob,none": -30.255369392506804,
"logprob_stderr,none": 0.5934678978621671,
"choice_logprob,none": -0.8493739626737397,
"choice_logprob_stderr,none": 0.04461467499844924,
"choice_prob_norm,none": 0.5004867547282129,
"choice_prob_norm_stderr,none": 0.0013390217607164177,
"choice_logprob_norm,none": -0.6931541682543674,
"choice_logprob_norm_stderr,none": 0.0026953318225662716
},
"medmcqa_5shot": {
"acc,none": 0.254601960315563,
"acc_stderr,none": 0.006736479313671978,
"acc_norm,none": 0.254601960315563,
"acc_norm_stderr,none": 0.006736479313671978,
"logprob,none": -1.5643893120364656,
"logprob_stderr,none": 0.009425238897616482
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.23595505617977527,
"acc_stderr,none": 0.006565715840345552,
"acc_norm,none": 0.2660769782452785,
"acc_norm_stderr,none": 0.006833400751317717,
"bpb,none": 0.3124402539296528,
"bpb_stderr,none": 0.0037842713576437255,
"logprob,none": -4.155512506757203,
"logprob_stderr,none": 0.03843810575569772,
"choice_logprob,none": -2.290517659898563,
"choice_logprob_stderr,none": 0.030926033553213488,
"choice_prob_norm,none": 0.25180879174081117,
"choice_prob_norm_stderr,none": 0.0005218437168622727,
"choice_logprob_norm,none": -1.3893679942965198,
"choice_logprob_norm_stderr,none": 0.0023539446344596877
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.24112607099143207,
"acc_stderr,none": 0.014974827279752339,
"logprob,none": -27.578771195475884,
"logprob_stderr,none": 0.5337992699494261
},
"logprob_gsm8k_5shot": {
"nll,none": 230.45470771182207,
"nll_stderr,none": 2.6424109037706884,
"bpb,none": 1.1896377127095594,
"bpb_stderr,none": 0.007719148528220743
},
"logprob_humaneval_10shot": {
"nll,none": 81.68491005316014,
"nll_stderr,none": 4.32215653040701,
"bpb,none": 0.7464391807207046,
"bpb_stderr,none": 0.026115364473812668
}
}
},
{
"mix": "v0",
"hidden_dim": 768,
"budget": 1.7e+18,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc_norm,none": 0.2775957840763424,
"acc_norm_stderr,none": 0.0037750876153391203,
"choice_logprob_norm,none": -1.391611070771956,
"choice_logprob_norm_stderr,none": 0.0015553923744926368,
"choice_logprob,none": -2.3037084751387655,
"choice_logprob_stderr,none": 0.017418539987674318,
"choice_prob_norm,none": 0.2530658875682952,
"choice_prob_norm_stderr,none": 0.0003613764085044182,
"bpb,none": 0.6721383592481129,
"bpb_stderr,none": 0.0034005418561082676,
"acc,none": 0.26150121065375304,
"acc_stderr,none": 0.0037041449232690232,
"logprob,none": -9.8900194814625,
"logprob_stderr,none": 0.02629969002592281
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.2701182167782367,
"acc_stderr,none": 0.0037393188843305473,
"acc_norm,none": 0.2822959692351517,
"acc_norm_stderr,none": 0.0037926419826868423,
"choice_prob_norm,none": 0.253361590100185,
"choice_prob_norm_stderr,none": 0.00031179648016030603,
"choice_logprob_norm,none": -1.3862776793159397,
"choice_logprob_norm_stderr,none": 0.0013342697128226242,
"bpb,none": 0.36913150000490247,
"bpb_stderr,none": 0.0021225757010844147,
"logprob,none": -6.0357810376667285,
"logprob_stderr,none": 0.028751212649447598,
"choice_logprob,none": -2.560116508422288,
"choice_logprob_stderr,none": 0.020859660753929612
},
"arc_challenge_5shot": {
"acc,none": 0.1945392491467577,
"acc_stderr,none": 0.011567709174648728,
"acc_norm,none": 0.25,
"acc_norm_stderr,none": 0.012653835621466646,
"bpb,none": 1.2990533200623655,
"bpb_stderr,none": 0.02389440741664436,
"logprob,none": -21.88834087026811,
"logprob_stderr,none": 0.3619037308298327,
"choice_logprob,none": -5.4782723607587895,
"choice_logprob_stderr,none": 0.16721914210574626,
"choice_prob_norm,none": 0.24918523792736294,
"choice_prob_norm_stderr,none": 0.0021066020166690346,
"choice_logprob_norm,none": -1.4422897685462663,
"choice_logprob_norm_stderr,none": 0.010874403351557026
},
"arc_easy_5shot": {
"acc,none": 0.49326599326599324,
"acc_stderr,none": 0.010258852980991825,
"acc_norm,none": 0.4654882154882155,
"acc_norm_stderr,none": 0.010235314238969392,
"bpb,none": 1.1177755064573227,
"bpb_stderr,none": 0.011837585105800234,
"logprob,none": -15.169280452158315,
"logprob_stderr,none": 0.22071591731508317,
"choice_logprob,none": -2.576503623539023,
"choice_logprob_stderr,none": 0.08702443636440085,
"choice_prob_norm,none": 0.2924226290313906,
"choice_prob_norm_stderr,none": 0.0017791272246730923,
"choice_logprob_norm,none": -1.2745287621863055,
"choice_logprob_norm_stderr,none": 0.006427102488010312
},
"boolq_10shot": {
"acc,none": 0.5850152905198777,
"acc_stderr,none": 0.008617716361921568,
"bpb,none": 0.49409554146117635,
"bpb_stderr,none": 0.007473232864219822,
"logprob,none": -0.7884021228241993,
"logprob_stderr,none": 0.00996583463958122,
"choice_logprob,none": -0.7369341656711315,
"choice_logprob_stderr,none": 0.009847674123941044,
"choice_prob_norm,none": 0.5399909671321121,
"choice_prob_norm_stderr,none": 0.003280597434464875,
"choice_logprob_norm,none": -0.6926188752292329,
"choice_logprob_norm_stderr,none": 0.007338771601016744
},
"boolq_sl_verb_10shot": {
"acc,none": 0.5602446483180428,
"acc_stderr,none": 0.008681343983423961,
"acc_norm,none": 0.5807339449541284,
"acc_norm_stderr,none": 0.008630302070999095,
"bpb,none": 0.5007309178926469,
"bpb_stderr,none": 0.00840140955320915,
"logprob,none": -0.8160195627708319,
"logprob_stderr,none": 0.012073004117876136,
"choice_logprob,none": -0.805065733835213,
"choice_logprob_stderr,none": 0.012071066960501553,
"choice_prob_norm,none": 0.535390097067654,
"choice_prob_norm_stderr,none": 0.0035521285398966668,
"choice_logprob_norm,none": -0.7187861770233629,
"choice_logprob_norm_stderr,none": 0.00826172120852258
},
"copa_0shot": {
"acc,none": 0.67,
"acc_stderr,none": 0.04725815626252606,
"bpb,none": 1.745047501847046,
"bpb_stderr,none": 0.04642315348274156,
"logprob,none": -32.88267251968384,
"logprob_stderr,none": 0.5847845172179112,
"choice_logprob,none": -1.217226976845902,
"choice_logprob_stderr,none": 0.2178711669542397,
"choice_prob_norm,none": 0.5100176054638571,
"choice_prob_norm_stderr,none": 0.008069449235766888,
"choice_logprob_norm,none": -0.6875663877071697,
"choice_logprob_norm_stderr,none": 0.01768362645271402
},
"csqa_5shot": {
"acc,none": 0.20884520884520885,
"acc_stderr,none": 0.011637590576063055,
"bpb,none": 2.7082703285716865,
"bpb_stderr,none": 0.03715734041556212,
"logprob,none": -1.8772299424436227,
"logprob_stderr,none": 0.025755505746153002,
"choice_logprob,none": -1.8585579291608698,
"choice_logprob_stderr,none": 0.02580663121801978,
"choice_prob_norm,none": 0.20429292250736647,
"choice_prob_norm_stderr,none": 0.004053738224061062,
"choice_logprob_norm,none": -2.067910764468694,
"choice_logprob_norm_stderr,none": 0.03749094489139384
},
"csqa_sl_verb_5shot": {
"acc,none": 0.21048321048321048,
"acc_stderr,none": 0.011671038436522901,
"acc_norm,none": 0.23996723996723995,
"acc_norm_stderr,none": 0.012226783409751469,
"bpb,none": 0.6441823880814893,
"bpb_stderr,none": 0.008549812822294335,
"logprob,none": -5.222800996828822,
"logprob_stderr,none": 0.05706464555613182,
"choice_logprob,none": -2.5830368086363924,
"choice_logprob_stderr,none": 0.05420669315677992,
"choice_prob_norm,none": 0.21245686706248418,
"choice_prob_norm_stderr,none": 0.0015317316633513673,
"choice_logprob_norm,none": -1.5843735654273707,
"choice_logprob_norm_stderr,none": 0.007915773259093957
},
"hellaswag_0shot": {
"acc,none": 0.30103565026887075,
"acc_stderr,none": 0.004577707025031402,
"acc_norm,none": 0.33608842859988053,
"acc_norm_stderr,none": 0.004714041652598645,
"bpb,none": 0.9569241910847363,
"bpb_stderr,none": 0.0021455688918908702,
"logprob,none": -87.91369925982163,
"logprob_stderr,none": 0.43935017760698675,
"choice_logprob,none": -23.19143379806059,
"choice_logprob_stderr,none": 0.2796031552364701,
"choice_prob_norm,none": 0.26202436456375305,
"choice_prob_norm_stderr,none": 0.00042843709347952896,
"choice_logprob_norm,none": -1.352910072370103,
"choice_logprob_norm_stderr,none": 0.0016609235613495087
},
"hellaswag_5shot": {
"acc,none": 0.2982473610834495,
"acc_stderr,none": 0.004565536808632522,
"acc_norm,none": 0.3325034853614818,
"acc_norm_stderr,none": 0.004701474865207039,
"bpb,none": 0.9521061871979787,
"bpb_stderr,none": 0.0021221993555018753,
"logprob,none": -87.59647112200399,
"logprob_stderr,none": 0.439894342974009,
"choice_logprob,none": -23.552198224947265,
"choice_logprob_stderr,none": 0.2827691640786014,
"choice_prob_norm,none": 0.2615570232697049,
"choice_prob_norm_stderr,none": 0.0004246979309501359,
"choice_logprob_norm,none": -1.3544657472921613,
"choice_logprob_norm_stderr,none": 0.0016456457725755095
},
"openbookqa_0shot": {
"acc,none": 0.18,
"acc_stderr,none": 0.017198592476314264,
"acc_norm,none": 0.27,
"acc_norm_stderr,none": 0.01987435483128749,
"bpb,none": 1.9446477187944575,
"bpb_stderr,none": 0.052677841494031055,
"logprob,none": -20.190474004745482,
"logprob_stderr,none": 0.5390781181062633,
"choice_logprob,none": -6.4651767189708655,
"choice_logprob_stderr,none": 0.3134653299574563,
"choice_prob_norm,none": 0.2648284760112857,
"choice_prob_norm_stderr,none": 0.005724075145660683,
"choice_logprob_norm,none": -1.4763545745536883,
"choice_logprob_norm_stderr,none": 0.03181823190384659
},
"piqa_5shot": {
"acc,none": 0.6550598476605005,
"acc_stderr,none": 0.011090670102993161,
"acc_norm,none": 0.6512513601741022,
"acc_norm_stderr,none": 0.011119263056159588,
"bpb,none": 1.1999875195553047,
"bpb_stderr,none": 0.011284952406624285,
"logprob,none": -71.1018480836891,
"logprob_stderr,none": 1.4948048391986262,
"choice_logprob,none": -3.265468970186985,
"choice_logprob_stderr,none": 0.22136651025327092,
"choice_prob_norm,none": 0.5102461993314247,
"choice_prob_norm_stderr,none": 0.0014059441446873727,
"choice_logprob_norm,none": -0.6809067069938138,
"choice_logprob_norm_stderr,none": 0.003156800336821187
},
"winogrande_5shot": {
"acc,none": 0.5114443567482242,
"acc_stderr,none": 0.014048804199859329,
"bpb,none": 0.45609870059930196,
"bpb_stderr,none": 0.025146910655114735,
"logprob,none": -20.520925116520186,
"logprob_stderr,none": 0.29678939016817496,
"choice_logprob,none": -0.7704629461893834,
"choice_logprob_stderr,none": 0.015193384863767105,
"choice_prob_norm,none": 0.5010707054381249,
"choice_prob_norm_stderr,none": 0.0007834292997275223,
"choice_logprob_norm,none": -0.6936490525307388,
"choice_logprob_norm_stderr,none": 0.0027829346803724215
},
"wsc273_0shot": {
"acc,none": 0.5531135531135531,
"acc_stderr,none": 0.030145416591160438,
"bpb,none": 0.67698671023787,
"bpb_stderr,none": 0.021979327889394554,
"logprob,none": -28.21373940562154,
"logprob_stderr,none": 0.5021699672178789,
"choice_logprob,none": -0.8130615044760181,
"choice_logprob_stderr,none": 0.04625245472538832,
"choice_prob_norm,none": 0.5016862376228498,
"choice_prob_norm_stderr,none": 0.0012715073257390224,
"choice_logprob_norm,none": -0.6906336423309668,
"choice_logprob_norm_stderr,none": 0.0024950331619603837
},
"medmcqa_5shot": {
"acc,none": 0.25197226870666983,
"acc_stderr,none": 0.00671341057274834,
"acc_norm,none": 0.25197226870666983,
"acc_norm_stderr,none": 0.00671341057274834,
"logprob,none": -1.4492818664598204,
"logprob_stderr,none": 0.0056398661877203465
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.24049725077695433,
"acc_stderr,none": 0.006608877757906801,
"acc_norm,none": 0.2550800860626345,
"acc_norm_stderr,none": 0.006740638791982925,
"bpb,none": 0.36389366195139783,
"bpb_stderr,none": 0.004394280237268685,
"logprob,none": -4.67044810487299,
"logprob_stderr,none": 0.03853183344816826,
"choice_logprob,none": -2.5214551319843004,
"choice_logprob_stderr,none": 0.03263217231457442,
"choice_prob_norm,none": 0.25215177191095767,
"choice_prob_norm_stderr,none": 0.0006548805042050642,
"choice_logprob_norm,none": -1.3941606684728476,
"choice_logprob_norm_stderr,none": 0.0029943180744198277
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.2215422276621787,
"acc_stderr,none": 0.014537867601301145,
"logprob,none": -24.17374606114998,
"logprob_stderr,none": 0.45471216502872974
},
"logprob_gsm8k_5shot": {
"nll,none": 187.34562071134323,
"nll_stderr,none": 2.254213084999866,
"bpb,none": 0.9638362974839736,
"bpb_stderr,none": 0.0069839552336510066
},
"logprob_humaneval_10shot": {
"nll,none": 75.958770810104,
"nll_stderr,none": 3.869064462254324,
"bpb,none": 0.7154369386112661,
"bpb_stderr,none": 0.024701782506346564
}
}
},
{
"mix": "v2",
"hidden_dim": 768,
"budget": 1.7e+18,
"tasks": {
"mmlu_sl_verb_0shot": {
"choice_prob_norm,none": 0.2540078438483089,
"choice_prob_norm_stderr,none": 0.0003950146977538393,
"bpb,none": 0.630836113763738,
"bpb_stderr,none": 0.0034420608674845072,
"logprob,none": -9.216035056737201,
"logprob_stderr,none": 0.028407621684874686,
"acc,none": 0.26605896595926504,
"acc_stderr,none": 0.0037265302413822345,
"choice_logprob,none": -2.5552756392952287,
"choice_logprob_stderr,none": 0.02051436813616077,
"choice_logprob_norm,none": -1.3921231673770733,
"choice_logprob_norm_stderr,none": 0.001696175570576605,
"acc_norm,none": 0.2751032616436405,
"acc_norm_stderr,none": 0.003757557237347936
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.2741774675972084,
"acc_stderr,none": 0.003762954740555449,
"choice_prob_norm,none": 0.25372568733411754,
"choice_prob_norm_stderr,none": 0.00031626526844855867,
"bpb,none": 0.3520098493411551,
"bpb_stderr,none": 0.0020665687953088357,
"acc_norm,none": 0.2803731662156388,
"acc_norm_stderr,none": 0.00378647862035118,
"logprob,none": -6.0582327865717485,
"logprob_stderr,none": 0.035340315604244055,
"choice_logprob,none": -2.756019169963866,
"choice_logprob_stderr,none": 0.025075966501954407,
"choice_logprob_norm,none": -1.384894680459617,
"choice_logprob_norm_stderr,none": 0.0013349007829572429
},
"arc_challenge_5shot": {
"acc,none": 0.2175767918088737,
"acc_stderr,none": 0.012057262020972502,
"acc_norm,none": 0.2645051194539249,
"acc_norm_stderr,none": 0.012889272949313364,
"bpb,none": 1.2767585656683982,
"bpb_stderr,none": 0.023838389186886518,
"logprob,none": -21.650589166647745,
"logprob_stderr,none": 0.36685339341096324,
"choice_logprob,none": -5.38573060146569,
"choice_logprob_stderr,none": 0.1690200550739846,
"choice_prob_norm,none": 0.2517412157952101,
"choice_prob_norm_stderr,none": 0.0020922538941133286,
"choice_logprob_norm,none": -1.4322964730580594,
"choice_logprob_norm_stderr,none": 0.010889640242400956
},
"arc_easy_5shot": {
"acc,none": 0.5109427609427609,
"acc_stderr,none": 0.010257326131172875,
"acc_norm,none": 0.5004208754208754,
"acc_norm_stderr,none": 0.010259779886094418,
"bpb,none": 1.0780598841198599,
"bpb_stderr,none": 0.012181664239511076,
"logprob,none": -14.719918382488919,
"logprob_stderr,none": 0.2191182331259823,
"choice_logprob,none": -2.4034967520442434,
"choice_logprob_stderr,none": 0.08495732381111372,
"choice_prob_norm,none": 0.2998081729503958,
"choice_prob_norm_stderr,none": 0.0018599082559222627,
"choice_logprob_norm,none": -1.2522716570330117,
"choice_logprob_norm_stderr,none": 0.0066828004516811205
},
"boolq_10shot": {
"acc,none": 0.5241590214067279,
"acc_stderr,none": 0.008734840763194161,
"bpb,none": 0.6066002273508516,
"bpb_stderr,none": 0.007455694237622892,
"logprob,none": -1.0340861801707417,
"logprob_stderr,none": 0.011358614622538975,
"choice_logprob,none": -0.8071693390587505,
"choice_logprob_stderr,none": 0.010142921993201668,
"choice_prob_norm,none": 0.5222511487380298,
"choice_prob_norm_stderr,none": 0.0030723125576453416,
"choice_logprob_norm,none": -0.7173137661181964,
"choice_logprob_norm_stderr,none": 0.006834398042458719
},
"boolq_sl_verb_10shot": {
"acc,none": 0.5697247706422018,
"acc_stderr,none": 0.008659608602932495,
"acc_norm,none": 0.5837920489296636,
"acc_norm_stderr,none": 0.008621380519419278,
"bpb,none": 0.7037882765719319,
"bpb_stderr,none": 0.015542286079910459,
"logprob,none": -1.0663110018505597,
"logprob_stderr,none": 0.021257464026123718,
"choice_logprob,none": -1.0127293218081876,
"choice_logprob_stderr,none": 0.02130786713780054,
"choice_prob_norm,none": 0.5533353622550738,
"choice_prob_norm_stderr,none": 0.005088416740268586,
"choice_logprob_norm,none": -0.8457833515832397,
"choice_logprob_norm_stderr,none": 0.015181862190279869
},
"copa_0shot": {
"acc,none": 0.65,
"acc_stderr,none": 0.047937248544110196,
"bpb,none": 1.8546625683366378,
"bpb_stderr,none": 0.05305699917413002,
"logprob,none": -34.73192947387695,
"logprob_stderr,none": 0.5707587662603286,
"choice_logprob,none": -1.2919342387281028,
"choice_logprob_stderr,none": 0.22408305328415662,
"choice_prob_norm,none": 0.5085437035242247,
"choice_prob_norm_stderr,none": 0.009136998278660999,
"choice_logprob_norm,none": -0.6943084074677344,
"choice_logprob_norm_stderr,none": 0.019840236643233748
},
"csqa_5shot": {
"acc,none": 0.21212121212121213,
"acc_stderr,none": 0.01170420281420025,
"bpb,none": 3.500847134179842,
"bpb_stderr,none": 0.081985779112656,
"logprob,none": -2.426602320628123,
"logprob_stderr,none": 0.056828211637947934,
"choice_logprob,none": -2.414209093293303,
"choice_logprob_stderr,none": 0.05684788904568409,
"choice_prob_norm,none": 0.2026777514576224,
"choice_prob_norm_stderr,none": 0.005205199439650168,
"choice_logprob_norm,none": -2.9333657572689336,
"choice_logprob_norm_stderr,none": 0.08218007673061352
},
"csqa_sl_verb_5shot": {
"acc,none": 0.27764127764127766,
"acc_stderr,none": 0.012821491901599497,
"acc_norm,none": 0.28992628992628994,
"acc_norm_stderr,none": 0.012990192906601208,
"bpb,none": 0.7278105251167664,
"bpb_stderr,none": 0.009948200510483247,
"logprob,none": -5.857854351852879,
"logprob_stderr,none": 0.06200791056605534,
"choice_logprob,none": -2.4607794690097635,
"choice_logprob_stderr,none": 0.05734824369698029,
"choice_prob_norm,none": 0.21963906264811586,
"choice_prob_norm_stderr,none": 0.0017687153334317231,
"choice_logprob_norm,none": -1.560420362611322,
"choice_logprob_norm_stderr,none": 0.009020672665039387
},
"hellaswag_0shot": {
"acc,none": 0.3200557657837084,
"acc_stderr,none": 0.004655442766599458,
"acc_norm,none": 0.3679545907189803,
"acc_norm_stderr,none": 0.004812633280078266,
"bpb,none": 0.9540877214137002,
"bpb_stderr,none": 0.00222792153316327,
"logprob,none": -87.33792356147492,
"logprob_stderr,none": 0.43654116368740237,
"choice_logprob,none": -22.551195968341176,
"choice_logprob_stderr,none": 0.2775152461199856,
"choice_prob_norm,none": 0.2656253688728796,
"choice_prob_norm_stderr,none": 0.00044812549442936736,
"choice_logprob_norm,none": -1.3402022360881427,
"choice_logprob_norm_stderr,none": 0.0017198324351090767
},
"hellaswag_5shot": {
"acc,none": 0.3104959171479785,
"acc_stderr,none": 0.004617510423156671,
"acc_norm,none": 0.3575980880302729,
"acc_norm_stderr,none": 0.004783133725599507,
"bpb,none": 0.9459726089030255,
"bpb_stderr,none": 0.00214501632085492,
"logprob,none": -86.8965860186666,
"logprob_stderr,none": 0.43671983617678317,
"choice_logprob,none": -22.94937022533771,
"choice_logprob_stderr,none": 0.27954745144555293,
"choice_prob_norm,none": 0.26374843093315214,
"choice_prob_norm_stderr,none": 0.0004292208136610329,
"choice_logprob_norm,none": -1.3462078377996887,
"choice_logprob_norm_stderr,none": 0.0016516130788471967
},
"openbookqa_0shot": {
"acc,none": 0.202,
"acc_stderr,none": 0.017973260031288258,
"acc_norm,none": 0.304,
"acc_norm_stderr,none": 0.020591649571224932,
"bpb,none": 1.9421223179926048,
"bpb_stderr,none": 0.05390181468108013,
"logprob,none": -20.149716567993163,
"logprob_stderr,none": 0.5456166117411405,
"choice_logprob,none": -6.528613793204991,
"choice_logprob_stderr,none": 0.32731071815575646,
"choice_prob_norm,none": 0.2695584061430335,
"choice_prob_norm_stderr,none": 0.005978992778579398,
"choice_logprob_norm,none": -1.4669640208829184,
"choice_logprob_norm_stderr,none": 0.032866052905030904
},
"piqa_5shot": {
"acc,none": 0.6588683351468988,
"acc_stderr,none": 0.011061289443962714,
"acc_norm,none": 0.6577801958650707,
"acc_norm_stderr,none": 0.011069764658685451,
"bpb,none": 1.2017750498066977,
"bpb_stderr,none": 0.011273415908386208,
"logprob,none": -71.33160145835336,
"logprob_stderr,none": 1.510308364798975,
"choice_logprob,none": -3.2394468434463275,
"choice_logprob_stderr,none": 0.22018840222417524,
"choice_prob_norm,none": 0.5110515190521396,
"choice_prob_norm_stderr,none": 0.0013972988334892084,
"choice_logprob_norm,none": -0.6788681723766132,
"choice_logprob_norm_stderr,none": 0.0029807177483003942
},
"winogrande_5shot": {
"acc,none": 0.4940805051302289,
"acc_stderr,none": 0.014051500838485807,
"bpb,none": 0.4533295910393437,
"bpb_stderr,none": 0.0246540642680002,
"logprob,none": -20.42467473940108,
"logprob_stderr,none": 0.29524299652390007,
"choice_logprob,none": -0.7792085946040049,
"choice_logprob_stderr,none": 0.01540477769465132,
"choice_prob_norm,none": 0.5012321330767487,
"choice_prob_norm_stderr,none": 0.0008050836840195189,
"choice_logprob_norm,none": -0.6927810087161707,
"choice_logprob_norm_stderr,none": 0.002203592803854555
},
"wsc273_0shot": {
"acc,none": 0.5421245421245421,
"acc_stderr,none": 0.03020916805173445,
"bpb,none": 0.69385733165508,
"bpb_stderr,none": 0.022998563698878597,
"logprob,none": -28.90814703637427,
"logprob_stderr,none": 0.524553202970914,
"choice_logprob,none": -0.9073719422206339,
"choice_logprob_stderr,none": 0.05850329381819185,
"choice_prob_norm,none": 0.5017320241407558,
"choice_prob_norm_stderr,none": 0.001213596529043583,
"choice_logprob_norm,none": -0.6904815685479141,
"choice_logprob_norm_stderr,none": 0.002413960436212978
},
"medmcqa_5shot": {
"acc,none": 0.2634472866363854,
"acc_stderr,none": 0.006811719684474193,
"acc_norm,none": 0.2634472866363854,
"acc_norm_stderr,none": 0.006811719684474193,
"logprob,none": -1.540723278338391,
"logprob_stderr,none": 0.0093866109633236
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.23691130767391824,
"acc_stderr,none": 0.006574888462196943,
"acc_norm,none": 0.2589050920392063,
"acc_norm_stderr,none": 0.006773532148760628,
"bpb,none": 0.33043938542739426,
"bpb_stderr,none": 0.0041065256563731015,
"logprob,none": -4.312089565636409,
"logprob_stderr,none": 0.04196129456888169,
"choice_logprob,none": -2.507792368013145,
"choice_logprob_stderr,none": 0.03505141101309658,
"choice_prob_norm,none": 0.2521027067325027,
"choice_prob_norm_stderr,none": 0.0006165004483683178,
"choice_logprob_norm,none": -1.3921941747791449,
"choice_logprob_norm_stderr,none": 0.002789274572546021
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.19951040391676866,
"acc_stderr,none": 0.013989929967559664,
"logprob,none": -23.7913622867813,
"logprob_stderr,none": 0.45205070626438093
},
"logprob_gsm8k_5shot": {
"nll,none": 203.36205847322987,
"nll_stderr,none": 2.37812013888117,
"bpb,none": 1.0524536086094196,
"bpb_stderr,none": 0.007615089454244769
},
"logprob_humaneval_10shot": {
"nll,none": 82.56886346747235,
"nll_stderr,none": 4.131968135247329,
"bpb,none": 0.7633049076553656,
"bpb_stderr,none": 0.025037153623458777
}
}
},
{
"mix": "v3",
"hidden_dim": 768,
"budget": 1.7e+18,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc,none": 0.2649195271328871,
"acc_stderr,none": 0.0037155548914596776,
"acc_norm,none": 0.2806580259222333,
"acc_norm_stderr,none": 0.0037854715595279447,
"bpb,none": 0.6024695207378018,
"bpb_stderr,none": 0.00320163813255778,
"logprob,none": -8.978868199945595,
"logprob_stderr,none": 0.028158187532551462,
"choice_logprob,none": -2.334509456091557,
"choice_logprob_stderr,none": 0.018004150410301928,
"choice_prob_norm,none": 0.25327204781743756,
"choice_prob_norm_stderr,none": 0.00036052901475016897,
"choice_logprob_norm,none": -1.3915630382334927,
"choice_logprob_norm_stderr,none": 0.0015694241062031151,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.27075915111807436,
"acc_stderr,none": 0.003744294461398903,
"acc_norm,none": 0.27610027061672127,
"acc_norm_stderr,none": 0.003765766814329832,
"bpb,none": 0.26312100151367407,
"bpb_stderr,none": 0.0016353854951214155,
"logprob,none": -4.2898202062336175,
"logprob_stderr,none": 0.02519748498054928,
"choice_logprob,none": -2.230520127314357,
"choice_logprob_stderr,none": 0.018307220488809502,
"choice_prob_norm,none": 0.25252420092599626,
"choice_prob_norm_stderr,none": 0.00024856771120859505,
"choice_logprob_norm,none": -1.3858647229236738,
"choice_logprob_norm_stderr,none": 0.0010940303113457382,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"arc_challenge_5shot": {
"acc,none": 0.19795221843003413,
"acc_stderr,none": 0.011643990971573379,
"acc_norm,none": 0.2440273037542662,
"acc_norm_stderr,none": 0.012551447627856255,
"bpb,none": 1.3090066714226691,
"bpb_stderr,none": 0.022716803418791277,
"logprob,none": -22.191338637583083,
"logprob_stderr,none": 0.36999834908122636,
"choice_logprob,none": -5.6414561788964805,
"choice_logprob_stderr,none": 0.17537006077048314,
"choice_prob_norm,none": 0.24977816951595155,
"choice_prob_norm_stderr,none": 0.002139670595246792,
"choice_logprob_norm,none": -1.439082310977836,
"choice_logprob_norm_stderr,none": 0.010576117473324044
},
"arc_easy_5shot": {
"acc,none": 0.494949494949495,
"acc_stderr,none": 0.01025926010256586,
"acc_norm,none": 0.46296296296296297,
"acc_norm_stderr,none": 0.010231597249131053,
"bpb,none": 1.1585534970490599,
"bpb_stderr,none": 0.012481991142278126,
"logprob,none": -15.496999463047644,
"logprob_stderr,none": 0.22045258447123706,
"choice_logprob,none": -2.602005149650746,
"choice_logprob_stderr,none": 0.08816401484489347,
"choice_prob_norm,none": 0.29242443961219655,
"choice_prob_norm_stderr,none": 0.001805858994633604,
"choice_logprob_norm,none": -1.277943804823616,
"choice_logprob_norm_stderr,none": 0.006795895723581999
},
"boolq_10shot": {
"acc,none": 0.5470948012232416,
"acc_stderr,none": 0.00870617688583775,
"bpb,none": 0.6959641183652738,
"bpb_stderr,none": 0.011316618445433812,
"logprob,none": -1.1740724734210093,
"logprob_stderr,none": 0.017778192949909743,
"choice_logprob,none": -0.7711060924984204,
"choice_logprob_stderr,none": 0.009861549612757899,
"choice_prob_norm,none": 0.5320772519733842,
"choice_prob_norm_stderr,none": 0.003225951917275769,
"choice_logprob_norm,none": -0.7075314292595454,
"choice_logprob_norm_stderr,none": 0.007441305705339594
},
"boolq_sl_verb_10shot": {
"acc,none": 0.5443425076452599,
"acc_stderr,none": 0.008710597021081264,
"acc_norm,none": 0.5773700305810398,
"acc_norm_stderr,none": 0.008639722698719017,
"bpb,none": 0.5048590515072109,
"bpb_stderr,none": 0.0066582364068149165,
"logprob,none": -0.8384363474831305,
"logprob_stderr,none": 0.009256347318990573,
"choice_logprob,none": -0.762414644839067,
"choice_logprob_stderr,none": 0.009190905530342953,
"choice_prob_norm,none": 0.5267977251387381,
"choice_prob_norm_stderr,none": 0.002942421159012307,
"choice_logprob_norm,none": -0.702744399639691,
"choice_logprob_norm_stderr,none": 0.006554018005252461
},
"copa_0shot": {
"acc,none": 0.65,
"acc_stderr,none": 0.0479372485441102,
"bpb,none": 1.7791294591456646,
"bpb_stderr,none": 0.045845252353962085,
"logprob,none": -33.65744636535644,
"logprob_stderr,none": 0.6226578097302043,
"choice_logprob,none": -1.3726728936669463,
"choice_logprob_stderr,none": 0.2457164596192268,
"choice_prob_norm,none": 0.507246365763789,
"choice_prob_norm_stderr,none": 0.008340205133308472,
"choice_logprob_norm,none": -0.6939474969184765,
"choice_logprob_norm_stderr,none": 0.0181700017282806
},
"csqa_5shot": {
"acc,none": 0.19164619164619165,
"acc_stderr,none": 0.01126862497880163,
"bpb,none": 3.671707202868784,
"bpb_stderr,none": 0.08557388817365925,
"logprob,none": -2.545033495510142,
"logprob_stderr,none": 0.059315299317124,
"choice_logprob,none": -2.536059170760092,
"choice_logprob_stderr,none": 0.059330788905025834,
"choice_prob_norm,none": 0.19702426764674388,
"choice_prob_norm_stderr,none": 0.00564354409779329,
"choice_logprob_norm,none": -3.1414222594330394,
"choice_logprob_norm_stderr,none": 0.08585929446901473
},
"csqa_sl_verb_5shot": {
"acc,none": 0.21457821457821458,
"acc_stderr,none": 0.011753423094216849,
"acc_norm,none": 0.22522522522522523,
"acc_norm_stderr,none": 0.01195959122428623,
"bpb,none": 0.5689145201033557,
"bpb_stderr,none": 0.010213968008008441,
"logprob,none": -4.55398947854788,
"logprob_stderr,none": 0.0671578478652312,
"choice_logprob,none": -2.818725038008329,
"choice_logprob_stderr,none": 0.06353769805138083,
"choice_prob_norm,none": 0.20872767447697946,
"choice_prob_norm_stderr,none": 0.0016339148543048907,
"choice_logprob_norm,none": -1.6126542167411066,
"choice_logprob_norm_stderr,none": 0.009423999621414866
},
"hellaswag_0shot": {
"acc,none": 0.29725154351722766,
"acc_stderr,none": 0.004561141293448453,
"acc_norm,none": 0.32483569010157337,
"acc_norm_stderr,none": 0.004673563250946118,
"bpb,none": 0.9796705149258281,
"bpb_stderr,none": 0.0022091598069014645,
"logprob,none": -89.82898136321066,
"logprob_stderr,none": 0.44818314862513037,
"choice_logprob,none": -24.144845588658104,
"choice_logprob_stderr,none": 0.288170037808976,
"choice_prob_norm,none": 0.2609134652793668,
"choice_prob_norm_stderr,none": 0.00043502952684702746,
"choice_logprob_norm,none": -1.3577094881716545,
"choice_logprob_norm_stderr,none": 0.001694800352566026
},
"hellaswag_5shot": {
"acc,none": 0.2952599083847839,
"acc_stderr,none": 0.004552272447071751,
"acc_norm,none": 0.3247361083449512,
"acc_norm_stderr,none": 0.004673191423861227,
"bpb,none": 0.9765488031232005,
"bpb_stderr,none": 0.0021804283141941315,
"logprob,none": -89.67450358449308,
"logprob_stderr,none": 0.4494154799653304,
"choice_logprob,none": -24.543052017087717,
"choice_logprob_stderr,none": 0.2923068946123268,
"choice_prob_norm,none": 0.2602805317169324,
"choice_prob_norm_stderr,none": 0.00042882265701986254,
"choice_logprob_norm,none": -1.3597682409940002,
"choice_logprob_norm_stderr,none": 0.0016715171896852109
},
"openbookqa_0shot": {
"acc,none": 0.186,
"acc_stderr,none": 0.01741880678058396,
"acc_norm,none": 0.292,
"acc_norm_stderr,none": 0.020354375480530068,
"bpb,none": 1.9419830728944796,
"bpb_stderr,none": 0.050665847295894746,
"logprob,none": -20.289521512031556,
"logprob_stderr,none": 0.5450531296339609,
"choice_logprob,none": -6.617104865474552,
"choice_logprob_stderr,none": 0.3279247788040061,
"choice_prob_norm,none": 0.2641189282767381,
"choice_prob_norm_stderr,none": 0.0055841283469181225,
"choice_logprob_norm,none": -1.4721363660895315,
"choice_logprob_norm_stderr,none": 0.03031027603400177
},
"piqa_5shot": {
"acc,none": 0.6637649619151251,
"acc_stderr,none": 0.011022346708970227,
"acc_norm,none": 0.6561479869423286,
"acc_norm_stderr,none": 0.011082356277961393,
"bpb,none": 1.2075411435249774,
"bpb_stderr,none": 0.01135770003852137,
"logprob,none": -71.33046385097815,
"logprob_stderr,none": 1.4856433422319442,
"choice_logprob,none": -3.257656613888533,
"choice_logprob_stderr,none": 0.2216319564386613,
"choice_prob_norm,none": 0.5105612330678693,
"choice_prob_norm_stderr,none": 0.0014126100718653543,
"choice_logprob_norm,none": -0.6798722842751997,
"choice_logprob_norm_stderr,none": 0.0029750502872840567
},
"winogrande_5shot": {
"acc,none": 0.5351223362273086,
"acc_stderr,none": 0.014017773120881575,
"bpb,none": 0.4623989922607736,
"bpb_stderr,none": 0.02552736166876556,
"logprob,none": -20.793666890407042,
"logprob_stderr,none": 0.30062464277421497,
"choice_logprob,none": -0.7703221832931528,
"choice_logprob_stderr,none": 0.01604737295976134,
"choice_prob_norm,none": 0.5008314524445316,
"choice_prob_norm_stderr,none": 0.0007409387016776521,
"choice_logprob_norm,none": -0.6939116611648893,
"choice_logprob_norm_stderr,none": 0.00265109409506179
},
"wsc273_0shot": {
"acc,none": 0.5567765567765568,
"acc_stderr,none": 0.030120860870184642,
"bpb,none": 0.6722615630816129,
"bpb_stderr,none": 0.023688048767935867,
"logprob,none": -27.885882580236636,
"logprob_stderr,none": 0.5264396316213983,
"choice_logprob,none": -0.8601103994979528,
"choice_logprob_stderr,none": 0.04981320319007168,
"choice_prob_norm,none": 0.5010490999408292,
"choice_prob_norm_stderr,none": 0.0011558725038424912,
"choice_logprob_norm,none": -0.6917752194669604,
"choice_logprob_norm_stderr,none": 0.002311264655129336
},
"medmcqa_5shot": {
"acc,none": 0.27157542433660053,
"acc_stderr,none": 0.006877736197092466,
"acc_norm,none": 0.27157542433660053,
"acc_norm_stderr,none": 0.006877736197092466,
"logprob,none": -1.5242668319882715,
"logprob_stderr,none": 0.01075254837566572
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.254601960315563,
"acc_stderr,none": 0.006736479313671978,
"acc_norm,none": 0.2794644991632799,
"acc_norm_stderr,none": 0.006939033764961425,
"bpb,none": 0.23384956925224545,
"bpb_stderr,none": 0.0031576904180103636,
"logprob,none": -2.9715185612261994,
"logprob_stderr,none": 0.028759614619079072,
"choice_logprob,none": -1.8962299105082905,
"choice_logprob_stderr,none": 0.022391609953139855,
"choice_prob_norm,none": 0.2534780753988292,
"choice_prob_norm_stderr,none": 0.0004656791959382843,
"choice_logprob_norm,none": -1.3804754707606355,
"choice_logprob_norm_stderr,none": 0.002065318251034982
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.21664626682986537,
"acc_stderr,none": 0.014421468452506974,
"logprob,none": -24.193991212833176,
"logprob_stderr,none": 0.4860067499388812
},
"logprob_gsm8k_5shot": {
"nll,none": 189.70925658069115,
"nll_stderr,none": 2.365351023976718,
"bpb,none": 0.9738379584517999,
"bpb_stderr,none": 0.007469410805016519
},
"logprob_humaneval_10shot": {
"nll,none": 58.51965685879312,
"nll_stderr,none": 3.2771923459713928,
"bpb,none": 0.5533597291127494,
"bpb_stderr,none": 0.02333208984243271
}
}
},
{
"mix": "v4",
"hidden_dim": 768,
"budget": 1.7e+18,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc,none": 0.2545933627688364,
"acc_stderr,none": 0.0036675964285558024,
"acc_norm,none": 0.28087167070217917,
"acc_norm_stderr,none": 0.0037837947167598256,
"bpb,none": 0.604019403266609,
"bpb_stderr,none": 0.0030674671657727527,
"logprob,none": -9.094732472075744,
"logprob_stderr,none": 0.0270249902313662,
"choice_logprob,none": -2.301985849848546,
"choice_logprob_stderr,none": 0.01685994759971927,
"choice_prob_norm,none": 0.2526754526360594,
"choice_prob_norm_stderr,none": 0.0003488590114592353,
"choice_logprob_norm,none": -1.3922591756508496,
"choice_logprob_norm_stderr,none": 0.0014986315590676327,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.2659165361059678,
"acc_stderr,none": 0.0037246265371234447,
"acc_norm,none": 0.27958980202250394,
"acc_norm_stderr,none": 0.003782069677219712,
"bpb,none": 0.3021447949633298,
"bpb_stderr,none": 0.0016727105647384023,
"logprob,none": -5.318975468008762,
"logprob_stderr,none": 0.028990564850629445,
"choice_logprob,none": -2.3954261775916654,
"choice_logprob_stderr,none": 0.01977525158101873,
"choice_prob_norm,none": 0.2525448278556143,
"choice_prob_norm_stderr,none": 0.00024976017772156427,
"choice_logprob_norm,none": -1.3847116447076364,
"choice_logprob_norm_stderr,none": 0.0010683410918954095,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"arc_challenge_5shot": {
"acc,none": 0.20051194539249148,
"acc_stderr,none": 0.011700318050499396,
"acc_norm,none": 0.23122866894197952,
"acc_norm_stderr,none": 0.012320858834772293,
"bpb,none": 1.2587017012693331,
"bpb_stderr,none": 0.02289648048420843,
"logprob,none": -21.649527869940616,
"logprob_stderr,none": 0.37035561791551175,
"choice_logprob,none": -5.5860482295659715,
"choice_logprob_stderr,none": 0.1702262251043129,
"choice_prob_norm,none": 0.24933040742664814,
"choice_prob_norm_stderr,none": 0.001958966551226811,
"choice_logprob_norm,none": -1.4331417127089947,
"choice_logprob_norm_stderr,none": 0.009684509320890152
},
"arc_easy_5shot": {
"acc,none": 0.4739057239057239,
"acc_stderr,none": 0.010245801990240049,
"acc_norm,none": 0.4722222222222222,
"acc_norm_stderr,none": 0.010243938285881118,
"bpb,none": 1.0766069451676723,
"bpb_stderr,none": 0.011474280694012995,
"logprob,none": -14.897179114296781,
"logprob_stderr,none": 0.2224682584651731,
"choice_logprob,none": -2.596018089293899,
"choice_logprob_stderr,none": 0.08624427779609865,
"choice_prob_norm,none": 0.29303412859039996,
"choice_prob_norm_stderr,none": 0.0017927597251220823,
"choice_logprob_norm,none": -1.272503910085061,
"choice_logprob_norm_stderr,none": 0.0064001036716539285
},
"boolq_10shot": {
"acc,none": 0.5850152905198777,
"acc_stderr,none": 0.008617716361921567,
"bpb,none": 0.5296847318017859,
"bpb_stderr,none": 0.007708238146470792,
"logprob,none": -0.8481250299226254,
"logprob_stderr,none": 0.010126202956885915,
"choice_logprob,none": -0.7330099208405059,
"choice_logprob_stderr,none": 0.010079692335643068,
"choice_prob_norm,none": 0.5438853674999767,
"choice_prob_norm_stderr,none": 0.0033707419980152737,
"choice_logprob_norm,none": -0.6902830310649932,
"choice_logprob_norm_stderr,none": 0.007631074159632701
},
"boolq_sl_verb_10shot": {
"acc,none": 0.5483180428134556,
"acc_stderr,none": 0.00870412620615935,
"acc_norm,none": 0.5678899082568807,
"acc_norm_stderr,none": 0.008664067354619373,
"bpb,none": 0.5092157903518371,
"bpb_stderr,none": 0.007616494586568837,
"logprob,none": -0.8522696880025601,
"logprob_stderr,none": 0.011782420028001834,
"choice_logprob,none": -0.8200578351735449,
"choice_logprob_stderr,none": 0.011809232810617742,
"choice_prob_norm,none": 0.5242103560881676,
"choice_prob_norm_stderr,none": 0.0032533043980025127,
"choice_logprob_norm,none": -0.7248779872506149,
"choice_logprob_norm_stderr,none": 0.007467578574808926
},
"copa_0shot": {
"acc,none": 0.67,
"acc_stderr,none": 0.04725815626252606,
"bpb,none": 1.7509401713984518,
"bpb_stderr,none": 0.0465471025808557,
"logprob,none": -33.0514147567749,
"logprob_stderr,none": 0.6232918344655725,
"choice_logprob,none": -1.195383127628822,
"choice_logprob_stderr,none": 0.1977600254862864,
"choice_prob_norm,none": 0.5106011755889054,
"choice_prob_norm_stderr,none": 0.008237084944626733,
"choice_logprob_norm,none": -0.6864939070298057,
"choice_logprob_norm_stderr,none": 0.01756788250117356
},
"csqa_5shot": {
"acc,none": 0.20393120393120392,
"acc_stderr,none": 0.011535521334313653,
"bpb,none": 2.7750737736040763,
"bpb_stderr,none": 0.038710211216076984,
"logprob,none": -1.9235345620195168,
"logprob_stderr,none": 0.026831873763303756,
"choice_logprob,none": -1.8990875694210565,
"choice_logprob_stderr,none": 0.026864032994106866,
"choice_prob_norm,none": 0.2027864242551805,
"choice_prob_norm_stderr,none": 0.004342440431344874,
"choice_logprob_norm,none": -2.1417477392256643,
"choice_logprob_norm_stderr,none": 0.03895586531751517
},
"csqa_sl_verb_5shot": {
"acc,none": 0.22113022113022113,
"acc_stderr,none": 0.011881644696037889,
"acc_norm,none": 0.23505323505323505,
"acc_norm_stderr,none": 0.01214000336728023,
"bpb,none": 0.5032101150756438,
"bpb_stderr,none": 0.007832177722936925,
"logprob,none": -4.08857823394538,
"logprob_stderr,none": 0.055516030423002266,
"choice_logprob,none": -2.4585863408814546,
"choice_logprob_stderr,none": 0.052941311436914426,
"choice_prob_norm,none": 0.2089841912558642,
"choice_prob_norm_stderr,none": 0.0013420515634436365,
"choice_logprob_norm,none": -1.5949569298775694,
"choice_logprob_norm_stderr,none": 0.007336206390518743
},
"hellaswag_0shot": {
"acc,none": 0.29924317864967137,
"acc_stderr,none": 0.004569906485090238,
"acc_norm,none": 0.3366859191396136,
"acc_norm_stderr,none": 0.00471610647590507,
"bpb,none": 0.967633860035596,
"bpb_stderr,none": 0.0021558406116817135,
"logprob,none": -88.81399176838245,
"logprob_stderr,none": 0.4432641787400844,
"choice_logprob,none": -23.550237747733163,
"choice_logprob_stderr,none": 0.2832279744664909,
"choice_prob_norm,none": 0.26149831224242764,
"choice_prob_norm_stderr,none": 0.00042612890446360705,
"choice_logprob_norm,none": -1.3548133586125368,
"choice_logprob_norm_stderr,none": 0.0016534233664985678
},
"hellaswag_5shot": {
"acc,none": 0.3004381597291376,
"acc_stderr,none": 0.004575116093931934,
"acc_norm,none": 0.3314080860386377,
"acc_norm_stderr,none": 0.0046975739621694145,
"bpb,none": 0.9676567958492561,
"bpb_stderr,none": 0.0021420263053160524,
"logprob,none": -88.96766037096708,
"logprob_stderr,none": 0.44590889583776167,
"choice_logprob,none": -23.964510189108076,
"choice_logprob_stderr,none": 0.28711618611636736,
"choice_prob_norm,none": 0.2610185154732678,
"choice_prob_norm_stderr,none": 0.00042339348751737,
"choice_logprob_norm,none": -1.3564887555532543,
"choice_logprob_norm_stderr,none": 0.001642669815486361
},
"openbookqa_0shot": {
"acc,none": 0.16,
"acc_stderr,none": 0.016411540980502327,
"acc_norm,none": 0.298,
"acc_norm_stderr,none": 0.020475118092988957,
"bpb,none": 1.9540992975212328,
"bpb_stderr,none": 0.05169578694391548,
"logprob,none": -20.303072726249695,
"logprob_stderr,none": 0.5378084724355116,
"choice_logprob,none": -6.459549452866858,
"choice_logprob_stderr,none": 0.3142409273698422,
"choice_prob_norm,none": 0.26374395201506906,
"choice_prob_norm_stderr,none": 0.005601387312775557,
"choice_logprob_norm,none": -1.476509569316706,
"choice_logprob_norm_stderr,none": 0.031568098906027714
},
"piqa_5shot": {
"acc,none": 0.6621327529923831,
"acc_stderr,none": 0.011035474307853841,
"acc_norm,none": 0.6556039173014145,
"acc_norm_stderr,none": 0.011086521237125621,
"bpb,none": 1.2014970353837593,
"bpb_stderr,none": 0.011135509083329038,
"logprob,none": -70.90089691320883,
"logprob_stderr,none": 1.4719156911259526,
"choice_logprob,none": -3.234416235332625,
"choice_logprob_stderr,none": 0.22608919598066995,
"choice_prob_norm,none": 0.5103922109540393,
"choice_prob_norm_stderr,none": 0.0013955248134373478,
"choice_logprob_norm,none": -0.6800897269184378,
"choice_logprob_norm_stderr,none": 0.0029647712439736297
},
"winogrande_5shot": {
"acc,none": 0.5201262825572218,
"acc_stderr,none": 0.014041096664344327,
"bpb,none": 0.45711397512724894,
"bpb_stderr,none": 0.024559125218020894,
"logprob,none": -20.652021914649968,
"logprob_stderr,none": 0.2970189774087888,
"choice_logprob,none": -0.7666224429549149,
"choice_logprob_stderr,none": 0.014726534968563916,
"choice_prob_norm,none": 0.5009876300221876,
"choice_prob_norm_stderr,none": 0.0007727968633400879,
"choice_logprob_norm,none": -0.6936139196733536,
"choice_logprob_norm_stderr,none": 0.0026118340421311267
},
"wsc273_0shot": {
"acc,none": 0.5347985347985348,
"acc_stderr,none": 0.03024344009911826,
"bpb,none": 0.6968475485504413,
"bpb_stderr,none": 0.0242763587065663,
"logprob,none": -28.82327291817019,
"logprob_stderr,none": 0.5332043092143391,
"choice_logprob,none": -0.8000338807144458,
"choice_logprob_stderr,none": 0.04046173201444223,
"choice_prob_norm,none": 0.5009830256732977,
"choice_prob_norm_stderr,none": 0.0012731116653940625,
"choice_logprob_norm,none": -0.6920630078957928,
"choice_logprob_norm_stderr,none": 0.0025503091583774632
},
"medmcqa_5shot": {
"acc,none": 0.2744441788190294,
"acc_stderr,none": 0.006900338771605978,
"acc_norm,none": 0.2744441788190294,
"acc_norm_stderr,none": 0.006900338771605978,
"logprob,none": -1.468579701194061,
"logprob_stderr,none": 0.008180923658803626
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.2414535022710973,
"acc_stderr,none": 0.006617833617335594,
"acc_norm,none": 0.2701410470953861,
"acc_norm_stderr,none": 0.0068662995339351775,
"bpb,none": 0.2768836852870189,
"bpb_stderr,none": 0.003417423547440866,
"logprob,none": -3.6754608847380768,
"logprob_stderr,none": 0.035808642115232145,
"choice_logprob,none": -2.1497944553173047,
"choice_logprob_stderr,none": 0.02742157118106784,
"choice_prob_norm,none": 0.2514483693484317,
"choice_prob_norm_stderr,none": 0.0004934200964202407,
"choice_logprob_norm,none": -1.389670532257503,
"choice_logprob_norm_stderr,none": 0.0022202315152482794
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.23255813953488372,
"acc_stderr,none": 0.014789157531080517,
"logprob,none": -24.057734922504775,
"logprob_stderr,none": 0.4731599285534115
},
"logprob_gsm8k_5shot": {
"nll,none": 168.23937684320157,
"nll_stderr,none": 2.173767361575589,
"bpb,none": 0.8570287201881884,
"bpb_stderr,none": 0.006734835930797166
},
"logprob_humaneval_10shot": {
"nll,none": 64.84557540823774,
"nll_stderr,none": 3.481535350353007,
"bpb,none": 0.6030907083166783,
"bpb_stderr,none": 0.022326441103826027
}
}
},
{
"mix": "v0",
"hidden_dim": 1024,
"budget": 9e+18,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc_norm,none": 0.28913260219341974,
"acc_norm_stderr,none": 0.0038170505239232483,
"logprob,none": -6.674078064848997,
"logprob_stderr,none": 0.01901140521686992,
"choice_logprob,none": -1.8773911852811065,
"choice_logprob_stderr,none": 0.012410207247699801,
"choice_prob_norm,none": 0.25286112945785577,
"choice_prob_norm_stderr,none": 0.0002782327599307251,
"bpb,none": 0.4661269626913522,
"bpb_stderr,none": 0.002521564276281313,
"choice_logprob_norm,none": -1.3855179171485157,
"choice_logprob_norm_stderr,none": 0.0011732520315487474,
"acc,none": 0.2718985899444524,
"acc_stderr,none": 0.003750371502062022
},
"mmlu_sl_verb_5shot": {
"acc_norm,none": 0.290414470873095,
"acc_norm_stderr,none": 0.003817983230903859,
"choice_logprob_norm,none": -1.3832265879984036,
"choice_logprob_norm_stderr,none": 0.0008903141103261133,
"acc,none": 0.2820111095285572,
"acc_stderr,none": 0.0037857344481307753,
"logprob,none": -3.8444525264711373,
"logprob_stderr,none": 0.020050404178067915,
"bpb,none": 0.24216502968259027,
"bpb_stderr,none": 0.0015688729102161052,
"choice_prob_norm,none": 0.25242225038820676,
"choice_prob_norm_stderr,none": 0.00021197420717174455,
"choice_logprob,none": -1.955011404596652,
"choice_logprob_stderr,none": 0.013779524804021928
},
"arc_challenge_5shot": {
"acc,none": 0.24061433447098976,
"acc_stderr,none": 0.012491468532390575,
"acc_norm,none": 0.2627986348122867,
"acc_norm_stderr,none": 0.012862523175351331,
"bpb,none": 1.1303670351920045,
"bpb_stderr,none": 0.021460296068113123,
"logprob,none": -19.27878961058607,
"logprob_stderr,none": 0.3348281147526389,
"choice_logprob,none": -4.806931849095136,
"choice_logprob_stderr,none": 0.15549723707544602,
"choice_prob_norm,none": 0.25551085220212294,
"choice_prob_norm_stderr,none": 0.0019557571360019656,
"choice_logprob_norm,none": -1.4043355010952636,
"choice_logprob_norm_stderr,none": 0.009062425260127323
},
"arc_easy_5shot": {
"acc,none": 0.5555555555555556,
"acc_stderr,none": 0.010196254838691668,
"acc_norm,none": 0.5324074074074074,
"acc_norm_stderr,none": 0.010238210368801891,
"bpb,none": 0.9382912461299037,
"bpb_stderr,none": 0.010856005058823528,
"logprob,none": -12.965371599301745,
"logprob_stderr,none": 0.1984302457062909,
"choice_logprob,none": -2.026691862536917,
"choice_logprob_stderr,none": 0.07339547348687801,
"choice_prob_norm,none": 0.3044992148390953,
"choice_prob_norm_stderr,none": 0.0017970120063283165,
"choice_logprob_norm,none": -1.2303368635102365,
"choice_logprob_norm_stderr,none": 0.006143988115956957
},
"boolq_10shot": {
"acc,none": 0.45718654434250766,
"acc_stderr,none": 0.008712936764296237,
"bpb,none": 0.45735726848110914,
"bpb_stderr,none": 0.0033240133886141203,
"logprob,none": -0.8455457207624336,
"logprob_stderr,none": 0.007151816427775643,
"choice_logprob,none": -0.7991114588856746,
"choice_logprob_stderr,none": 0.0071367340988007675,
"choice_prob_norm,none": 0.5017328079660853,
"choice_prob_norm_stderr,none": 0.0016066156315061744,
"choice_logprob_norm,none": -0.7071844266861197,
"choice_logprob_norm_stderr,none": 0.003322330090811105
},
"boolq_sl_verb_10shot": {
"acc,none": 0.598776758409786,
"acc_stderr,none": 0.008572708337178991,
"acc_norm,none": 0.6116207951070336,
"acc_norm_stderr,none": 0.008524357307908797,
"bpb,none": 0.5487674562348573,
"bpb_stderr,none": 0.009659175369762825,
"logprob,none": -0.8471939702281893,
"logprob_stderr,none": 0.012795218556042685,
"choice_logprob,none": -0.7946344378664244,
"choice_logprob_stderr,none": 0.012830967641596133,
"choice_prob_norm,none": 0.5445438769833885,
"choice_prob_norm_stderr,none": 0.003920664405608299,
"choice_logprob_norm,none": -0.7270175501136007,
"choice_logprob_norm_stderr,none": 0.00948407759703137
},
"copa_0shot": {
"acc,none": 0.7,
"acc_stderr,none": 0.046056618647183814,
"bpb,none": 1.688417207231075,
"bpb_stderr,none": 0.046443549576117016,
"logprob,none": -31.785984535217285,
"logprob_stderr,none": 0.574583828532445,
"choice_logprob,none": -0.9573435664452496,
"choice_logprob_stderr,none": 0.19609870862864015,
"choice_prob_norm,none": 0.5165112756384823,
"choice_prob_norm_stderr,none": 0.008333286364378283,
"choice_logprob_norm,none": -0.6757736663288648,
"choice_logprob_norm_stderr,none": 0.018314384433538264
},
"csqa_5shot": {
"acc,none": 0.2022932022932023,
"acc_stderr,none": 0.01150091452526044,
"bpb,none": 3.308068562033664,
"bpb_stderr,none": 0.06991929962338578,
"logprob,none": -2.2929783968726305,
"logprob_stderr,none": 0.04846436540067588,
"choice_logprob,none": -2.281958129242245,
"choice_logprob_stderr,none": 0.048469082192047826,
"choice_prob_norm,none": 0.20019674030611143,
"choice_prob_norm_stderr,none": 0.004763466308994383,
"choice_logprob_norm,none": -2.7287219537684475,
"choice_logprob_norm_stderr,none": 0.07008232562791067
},
"csqa_sl_verb_5shot": {
"acc,none": 0.19000819000819,
"acc_stderr,none": 0.01123172751912786,
"acc_norm,none": 0.20147420147420148,
"acc_norm_stderr,none": 0.011483500195202903,
"bpb,none": 0.429949664417547,
"bpb_stderr,none": 0.008076315515335691,
"logprob,none": -3.451736507603226,
"logprob_stderr,none": 0.05611321867763557,
"choice_logprob,none": -2.630758405642532,
"choice_logprob_stderr,none": 0.054967332321790414,
"choice_prob_norm,none": 0.2038328543979333,
"choice_prob_norm_stderr,none": 0.001346731979327511,
"choice_logprob_norm,none": -1.62221360042132,
"choice_logprob_norm_stderr,none": 0.007619295229946782
},
"hellaswag_0shot": {
"acc,none": 0.3452499502091217,
"acc_stderr,none": 0.004744780201276638,
"acc_norm,none": 0.4170483967337184,
"acc_norm_stderr,none": 0.004920633227844459,
"bpb,none": 0.8823173304353232,
"bpb_stderr,none": 0.0020461410545454532,
"logprob,none": -80.86246913710669,
"logprob_stderr,none": 0.4031797743881049,
"choice_logprob,none": -19.395296864862598,
"choice_logprob_stderr,none": 0.2488205743300131,
"choice_prob_norm,none": 0.27011729593392875,
"choice_prob_norm_stderr,none": 0.00042040904737482244,
"choice_logprob_norm,none": -1.3211981977293266,
"choice_logprob_norm_stderr,none": 0.0015788492100577963
},
"hellaswag_5shot": {
"acc,none": 0.344353714399522,
"acc_stderr,none": 0.004741859753178431,
"acc_norm,none": 0.4140609440350528,
"acc_norm_stderr,none": 0.004915524600627973,
"bpb,none": 0.8678022635536807,
"bpb_stderr,none": 0.002027430788617163,
"logprob,none": -79.66558491758701,
"logprob_stderr,none": 0.4004767695182609,
"choice_logprob,none": -19.67994947421911,
"choice_logprob_stderr,none": 0.2510734934995068,
"choice_prob_norm,none": 0.2699317383112573,
"choice_prob_norm_stderr,none": 0.00041977307256955434,
"choice_logprob_norm,none": -1.3218253788095968,
"choice_logprob_norm_stderr,none": 0.001573985153351605
},
"openbookqa_0shot": {
"acc,none": 0.192,
"acc_stderr,none": 0.017632180454360984,
"acc_norm,none": 0.31,
"acc_norm_stderr,none": 0.020704041021724805,
"bpb,none": 1.8607658739262547,
"bpb_stderr,none": 0.04956693790865995,
"logprob,none": -19.349410093307494,
"logprob_stderr,none": 0.5175460946269681,
"choice_logprob,none": -6.027359120533246,
"choice_logprob_stderr,none": 0.30074657863828946,
"choice_prob_norm,none": 0.27147872999523615,
"choice_prob_norm_stderr,none": 0.005653136250223714,
"choice_logprob_norm,none": -1.4341392043181795,
"choice_logprob_norm_stderr,none": 0.028065683186773445
},
"piqa_5shot": {
"acc,none": 0.691512513601741,
"acc_stderr,none": 0.010776164678037155,
"acc_norm,none": 0.6849836779107725,
"acc_norm_stderr,none": 0.010838072746240652,
"bpb,none": 1.1001333707203211,
"bpb_stderr,none": 0.010781655681221258,
"logprob,none": -64.95884351553933,
"logprob_stderr,none": 1.3648116902702927,
"choice_logprob,none": -2.8290212149038094,
"choice_logprob_stderr,none": 0.1978043337372253,
"choice_prob_norm,none": 0.5143110704617275,
"choice_prob_norm_stderr,none": 0.0013428686481482728,
"choice_logprob_norm,none": -0.6716928977102089,
"choice_logprob_norm_stderr,none": 0.0027974047765665882
},
"winogrande_5shot": {
"acc,none": 0.4972375690607735,
"acc_stderr,none": 0.014052271211616438,
"bpb,none": 0.44075114363447215,
"bpb_stderr,none": 0.023351630480024383,
"logprob,none": -19.941748224663378,
"logprob_stderr,none": 0.28624009285998814,
"choice_logprob,none": -0.7768202090598731,
"choice_logprob_stderr,none": 0.01582238063994806,
"choice_prob_norm,none": 0.5011534614182027,
"choice_prob_norm_stderr,none": 0.0007615166601860404,
"choice_logprob_norm,none": -0.6926689003880863,
"choice_logprob_norm_stderr,none": 0.0020130585439979316
},
"wsc273_0shot": {
"acc,none": 0.5714285714285714,
"acc_stderr,none": 0.030006001800600198,
"bpb,none": 0.6548505138835954,
"bpb_stderr,none": 0.021204852809512176,
"logprob,none": -27.373171830788635,
"logprob_stderr,none": 0.4851443088103239,
"choice_logprob,none": -0.781759254624432,
"choice_logprob_stderr,none": 0.046038949062997735,
"choice_prob_norm,none": 0.5022969315343694,
"choice_prob_norm_stderr,none": 0.0012249598100978827,
"choice_logprob_norm,none": -0.68936630940301,
"choice_logprob_norm_stderr,none": 0.002427985612751233
},
"medmcqa_5shot": {
"acc,none": 0.2749223045661009,
"acc_stderr,none": 0.006904070961661394,
"acc_norm,none": 0.2749223045661009,
"acc_norm_stderr,none": 0.006904070961661394,
"logprob,none": -1.4320607341535634,
"logprob_stderr,none": 0.00599742894108014
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.2409753765240258,
"acc_stderr,none": 0.006613361336468529,
"acc_norm,none": 0.27779105904852974,
"acc_norm_stderr,none": 0.0069262561348971755,
"bpb,none": 0.1980862205268872,
"bpb_stderr,none": 0.002585096103962977,
"logprob,none": -2.4686314965616765,
"logprob_stderr,none": 0.022424759315674673,
"choice_logprob,none": -1.7409690527301998,
"choice_logprob_stderr,none": 0.01762798312325864,
"choice_prob_norm,none": 0.2516423134955927,
"choice_prob_norm_stderr,none": 0.00036149155722740135,
"choice_logprob_norm,none": -1.3846224187693892,
"choice_logprob_norm_stderr,none": 0.0016056848013486905
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.22766217870257038,
"acc_stderr,none": 0.014679255032111064,
"logprob,none": -20.182860770744966,
"logprob_stderr,none": 0.4159737876273053
},
"logprob_gsm8k_5shot": {
"nll,none": 151.70901034617623,
"nll_stderr,none": 1.9142194686108658,
"bpb,none": 0.7785776724099326,
"bpb_stderr,none": 0.006136763749738495
},
"logprob_humaneval_10shot": {
"nll,none": 61.2485516100395,
"nll_stderr,none": 3.2214641436356963,
"bpb,none": 0.5741438524912152,
"bpb_stderr,none": 0.020655639762877293
}
}
},
{
"mix": "v2",
"hidden_dim": 1024,
"budget": 9e+18,
"tasks": {
"mmlu_sl_verb_0shot": {
"choice_logprob_norm,none": -1.3887148443048396,
"choice_logprob_norm_stderr,none": 0.0011612383410942707,
"choice_prob_norm,none": 0.251948878066459,
"choice_prob_norm_stderr,none": 0.0002738137208502262,
"bpb,none": 0.44218048678046323,
"bpb_stderr,none": 0.002349499737618632,
"acc_norm,none": 0.27602905569007263,
"acc_norm_stderr,none": 0.003764639707646851,
"logprob,none": -6.584575940159366,
"logprob_stderr,none": 0.021564959282463818,
"choice_logprob,none": -1.9591102267126992,
"choice_logprob_stderr,none": 0.01261867725547403,
"acc,none": 0.25210084033613445,
"acc_stderr,none": 0.003656009837643968
},
"mmlu_sl_verb_5shot": {
"choice_logprob_norm,none": -1.3846107514653117,
"choice_logprob_norm_stderr,none": 0.000752846654571709,
"acc,none": 0.26741204956558895,
"acc_stderr,none": 0.0037274963630969824,
"bpb,none": 0.20057953440157156,
"bpb_stderr,none": 0.0013000184012252752,
"choice_prob_norm,none": 0.25151363104162694,
"choice_prob_norm_stderr,none": 0.00017700146029745345,
"logprob,none": -3.14127250357916,
"logprob_stderr,none": 0.017217903509997766,
"choice_logprob,none": -1.8302228602650503,
"choice_logprob_stderr,none": 0.012720584641536284,
"acc_norm,none": 0.28607036034752886,
"acc_norm_stderr,none": 0.0038083312053686508
},
"arc_challenge_5shot": {
"acc,none": 0.27047781569965873,
"acc_stderr,none": 0.012980954547659554,
"acc_norm,none": 0.30631399317406144,
"acc_norm_stderr,none": 0.013470584417276513,
"bpb,none": 1.0935280258436462,
"bpb_stderr,none": 0.02220483330833206,
"logprob,none": -18.868265470547072,
"logprob_stderr,none": 0.33899786997642567,
"choice_logprob,none": -4.699882626065459,
"choice_logprob_stderr,none": 0.15638782844904006,
"choice_prob_norm,none": 0.2615960345029809,
"choice_prob_norm_stderr,none": 0.002213300520616238,
"choice_logprob_norm,none": -1.3886084230955638,
"choice_logprob_norm_stderr,none": 0.009964575574973286
},
"arc_easy_5shot": {
"acc,none": 0.5997474747474747,
"acc_stderr,none": 0.010053550119896114,
"acc_norm,none": 0.5909090909090909,
"acc_norm_stderr,none": 0.010088775152615782,
"bpb,none": 0.875158962470022,
"bpb_stderr,none": 0.011381731804273681,
"logprob,none": -12.395110425724324,
"logprob_stderr,none": 0.20011342776901275,
"choice_logprob,none": -1.8528819599043764,
"choice_logprob_stderr,none": 0.07185139483537852,
"choice_prob_norm,none": 0.3189237129071374,
"choice_prob_norm_stderr,none": 0.002016674957278432,
"choice_logprob_norm,none": -1.1895055985152354,
"choice_logprob_norm_stderr,none": 0.006533736773321884
},
"boolq_10shot": {
"acc,none": 0.5168195718654435,
"acc_stderr,none": 0.008740105658763946,
"bpb,none": 0.541989411025595,
"bpb_stderr,none": 0.004312307826813873,
"logprob,none": -0.9524213738397721,
"logprob_stderr,none": 0.006870908275453281,
"choice_logprob,none": -0.7383123585052149,
"choice_logprob_stderr,none": 0.006468265480150814,
"choice_prob_norm,none": 0.5183710765429542,
"choice_prob_norm_stderr,none": 0.0019750440851862504,
"choice_logprob_norm,none": -0.6830070819392396,
"choice_logprob_norm_stderr,none": 0.004108129300805658
},
"boolq_sl_verb_10shot": {
"acc,none": 0.6180428134556575,
"acc_stderr,none": 0.008497851998427189,
"acc_norm,none": 0.6201834862385321,
"acc_norm_stderr,none": 0.008488668235778606,
"bpb,none": 0.6635063310144024,
"bpb_stderr,none": 0.011312178604720991,
"logprob,none": -1.0069598921212946,
"logprob_stderr,none": 0.014705390940278371,
"choice_logprob,none": -0.8202281864002191,
"choice_logprob_stderr,none": 0.014695636042857963,
"choice_prob_norm,none": 0.5610461036675835,
"choice_prob_norm_stderr,none": 0.004633673864166954,
"choice_logprob_norm,none": -0.74378769719765,
"choice_logprob_norm_stderr,none": 0.011175862199800225
},
"copa_0shot": {
"acc,none": 0.68,
"acc_stderr,none": 0.046882617226215034,
"bpb,none": 1.7431494759115915,
"bpb_stderr,none": 0.05228713019448322,
"logprob,none": -32.63967601776123,
"logprob_stderr,none": 0.5909717686742535,
"choice_logprob,none": -1.1414016145947221,
"choice_logprob_stderr,none": 0.20693242642507131,
"choice_prob_norm,none": 0.517273855120282,
"choice_prob_norm_stderr,none": 0.009460633953763339,
"choice_logprob_norm,none": -0.6785414600198394,
"choice_logprob_norm_stderr,none": 0.020700769324077977
},
"csqa_5shot": {
"acc,none": 0.20884520884520885,
"acc_stderr,none": 0.011637590576063046,
"bpb,none": 3.107354595439875,
"bpb_stderr,none": 0.048339534030756526,
"logprob,none": -2.1538540768291403,
"logprob_stderr,none": 0.0335064117230004,
"choice_logprob,none": -2.1230699300379263,
"choice_logprob_stderr,none": 0.033539420460243184,
"choice_prob_norm,none": 0.19609029022568736,
"choice_prob_norm_stderr,none": 0.006244693248187209,
"choice_logprob_norm,none": -2.550141192036063,
"choice_logprob_norm_stderr,none": 0.049664187241009225
},
"csqa_sl_verb_5shot": {
"acc,none": 0.19246519246519248,
"acc_stderr,none": 0.011286955409752617,
"acc_norm,none": 0.20147420147420148,
"acc_norm_stderr,none": 0.011483500195202903,
"bpb,none": 0.39968402829407823,
"bpb_stderr,none": 0.009492562523787662,
"logprob,none": -3.1958935750887885,
"logprob_stderr,none": 0.06880739818028928,
"choice_logprob,none": -2.792026057863224,
"choice_logprob_stderr,none": 0.06670431115893402,
"choice_prob_norm,none": 0.20375478798874827,
"choice_prob_norm_stderr,none": 0.0014768601460400197,
"choice_logprob_norm,none": -1.6309924149932398,
"choice_logprob_norm_stderr,none": 0.008771916183210268
},
"hellaswag_0shot": {
"acc,none": 0.3768173670583549,
"acc_stderr,none": 0.004835981632401603,
"acc_norm,none": 0.46922923720374426,
"acc_norm_stderr,none": 0.0049803234000310795,
"bpb,none": 0.8846169915295451,
"bpb_stderr,none": 0.00216054708286405,
"logprob,none": -80.74439490636823,
"logprob_stderr,none": 0.40232680893402256,
"choice_logprob,none": -18.21099782421507,
"choice_logprob_stderr,none": 0.2436303915764476,
"choice_prob_norm,none": 0.27606623257420826,
"choice_prob_norm_stderr,none": 0.0004449742055465229,
"choice_logprob_norm,none": -1.3003765092331951,
"choice_logprob_norm_stderr,none": 0.0016418664757832538
},
"hellaswag_5shot": {
"acc,none": 0.3639713204540928,
"acc_stderr,none": 0.0048015720289208,
"acc_norm,none": 0.45130452101175067,
"acc_norm_stderr,none": 0.0049660609953150634,
"bpb,none": 0.8645088399901097,
"bpb_stderr,none": 0.0020838005037428706,
"logprob,none": -79.09350363089017,
"logprob_stderr,none": 0.39670599272200674,
"choice_logprob,none": -18.72502364044871,
"choice_logprob_stderr,none": 0.24594117569880022,
"choice_prob_norm,none": 0.27352034044036333,
"choice_prob_norm_stderr,none": 0.00042830633098770517,
"choice_logprob_norm,none": -1.3088219496509341,
"choice_logprob_norm_stderr,none": 0.0015876999111474938
},
"openbookqa_0shot": {
"acc,none": 0.204,
"acc_stderr,none": 0.018039369104138645,
"acc_norm,none": 0.33,
"acc_norm_stderr,none": 0.021049612166134817,
"bpb,none": 1.875484662709983,
"bpb_stderr,none": 0.05179042887158007,
"logprob,none": -19.307575354576112,
"logprob_stderr,none": 0.510450513559666,
"choice_logprob,none": -6.008570079409671,
"choice_logprob_stderr,none": 0.3063816766265774,
"choice_prob_norm,none": 0.27933651070111226,
"choice_prob_norm_stderr,none": 0.006200815553283834,
"choice_logprob_norm,none": -1.4222343139005449,
"choice_logprob_norm_stderr,none": 0.02937963571185174
},
"piqa_5shot": {
"acc,none": 0.6974972796517954,
"acc_stderr,none": 0.010717199698083893,
"acc_norm,none": 0.6985854189336235,
"acc_norm_stderr,none": 0.01070624824275376,
"bpb,none": 1.1024937948409286,
"bpb_stderr,none": 0.010867658597802586,
"logprob,none": -64.97100728743743,
"logprob_stderr,none": 1.37324001166038,
"choice_logprob,none": -2.8151085032374,
"choice_logprob_stderr,none": 0.20151047037802025,
"choice_prob_norm,none": 0.515620895724506,
"choice_prob_norm_stderr,none": 0.0013739474599856348,
"choice_logprob_norm,none": -0.6694607389999877,
"choice_logprob_norm_stderr,none": 0.0028669015157053446
},
"winogrande_5shot": {
"acc,none": 0.5122336227308603,
"acc_stderr,none": 0.01404827882040562,
"bpb,none": 0.4461208230255092,
"bpb_stderr,none": 0.024572691862973322,
"logprob,none": -20.093352986962756,
"logprob_stderr,none": 0.289626758603389,
"choice_logprob,none": -0.7990905808599074,
"choice_logprob_stderr,none": 0.016693379995914515,
"choice_prob_norm,none": 0.5011409500177034,
"choice_prob_norm_stderr,none": 0.0007905641599567411,
"choice_logprob_norm,none": -0.6934659670650414,
"choice_logprob_norm_stderr,none": 0.0027187399193643737
},
"wsc273_0shot": {
"acc,none": 0.5567765567765568,
"acc_stderr,none": 0.030120860870184635,
"bpb,none": 0.6567250997319227,
"bpb_stderr,none": 0.0219496167384627,
"logprob,none": -27.465879419347743,
"logprob_stderr,none": 0.4973342251798536,
"choice_logprob,none": -0.9953870161772929,
"choice_logprob_stderr,none": 0.07727446773738116,
"choice_prob_norm,none": 0.5022875866783845,
"choice_prob_norm_stderr,none": 0.0013912789346927528,
"choice_logprob_norm,none": -0.6896150353015345,
"choice_logprob_norm_stderr,none": 0.002752556408081628
},
"medmcqa_5shot": {
"acc,none": 0.3131723643318193,
"acc_stderr,none": 0.007171724237526972,
"acc_norm,none": 0.3131723643318193,
"acc_norm_stderr,none": 0.007171724237526972,
"logprob,none": -1.4007173108906372,
"logprob_stderr,none": 0.005397008119825809
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.30695672961989,
"acc_stderr,none": 0.007132253005986364,
"acc_norm,none": 0.32106143915849866,
"acc_norm_stderr,none": 0.007219669176206489,
"bpb,none": 0.15485459632136042,
"bpb_stderr,none": 0.002271082787996014,
"logprob,none": -1.873889418159009,
"logprob_stderr,none": 0.019455744132788375,
"choice_logprob,none": -1.5687109944568915,
"choice_logprob_stderr,none": 0.01690239468821199,
"choice_prob_norm,none": 0.25316229322677253,
"choice_prob_norm_stderr,none": 0.0003259097891512063,
"choice_logprob_norm,none": -1.3778143596538146,
"choice_logprob_norm_stderr,none": 0.0014879400524400613
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.22643818849449204,
"acc_stderr,none": 0.014651337324602597,
"logprob,none": -20.73370366405828,
"logprob_stderr,none": 0.4278143769070544
},
"logprob_gsm8k_5shot": {
"nll,none": 149.62058111571108,
"nll_stderr,none": 1.9986798444447733,
"bpb,none": 0.7666060799050025,
"bpb_stderr,none": 0.007104338540057894
},
"logprob_humaneval_10shot": {
"nll,none": 66.28788003398151,
"nll_stderr,none": 3.5592787545629223,
"bpb,none": 0.6243391427736511,
"bpb_stderr,none": 0.023415837672337732
}
}
},
{
"mix": "v3",
"hidden_dim": 1024,
"budget": 9e+18,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc,none": 0.2778806437829369,
"acc_stderr,none": 0.0037734484032089055,
"acc_norm,none": 0.2879219484403931,
"acc_norm_stderr,none": 0.0038099083678867296,
"bpb,none": 0.42117007170812004,
"bpb_stderr,none": 0.002349339982343761,
"logprob,none": -6.128808188783855,
"logprob_stderr,none": 0.019478551572906597,
"choice_logprob,none": -1.8214619232173475,
"choice_logprob_stderr,none": 0.012090375450305921,
"choice_prob_norm,none": 0.252465882369846,
"choice_prob_norm_stderr,none": 0.0002595426976148667,
"choice_logprob_norm,none": -1.3860591506283386,
"choice_logprob_norm_stderr,none": 0.001087801969607964,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.2716849451645065,
"acc_stderr,none": 0.0037496055191951496,
"acc_norm,none": 0.2837202677681242,
"acc_norm_stderr,none": 0.0038024293601329213,
"bpb,none": 0.2026213677068112,
"bpb_stderr,none": 0.0013709593473205442,
"logprob,none": -3.228287653153062,
"logprob_stderr,none": 0.017556417333451688,
"choice_logprob,none": -1.7620633090352307,
"choice_logprob_stderr,none": 0.011662024024558518,
"choice_prob_norm,none": 0.2516173104824868,
"choice_prob_norm_stderr,none": 0.00016909057142734156,
"choice_logprob_norm,none": -1.3838629310970294,
"choice_logprob_norm_stderr,none": 0.0007211045140684807,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"arc_challenge_5shot": {
"acc,none": 0.25597269624573377,
"acc_stderr,none": 0.012753013241244534,
"acc_norm,none": 0.2883959044368601,
"acc_norm_stderr,none": 0.013238394422428171,
"bpb,none": 1.1221728382537852,
"bpb_stderr,none": 0.022419709106874387,
"logprob,none": -19.341666728563276,
"logprob_stderr,none": 0.34589975499680453,
"choice_logprob,none": -4.940120011255016,
"choice_logprob_stderr,none": 0.15933995231359616,
"choice_prob_norm,none": 0.25794012617660156,
"choice_prob_norm_stderr,none": 0.0022300508930906348,
"choice_logprob_norm,none": -1.4061200054289258,
"choice_logprob_norm_stderr,none": 0.010740217642775685
},
"arc_easy_5shot": {
"acc,none": 0.5601851851851852,
"acc_stderr,none": 0.01018518518518532,
"acc_norm,none": 0.5467171717171717,
"acc_norm_stderr,none": 0.01021490151673162,
"bpb,none": 0.9151864377579918,
"bpb_stderr,none": 0.010967587948958146,
"logprob,none": -12.89686324339523,
"logprob_stderr,none": 0.20518227800584848,
"choice_logprob,none": -2.1136297973276794,
"choice_logprob_stderr,none": 0.07708881527753612,
"choice_prob_norm,none": 0.30940160257013594,
"choice_prob_norm_stderr,none": 0.001909619515456115,
"choice_logprob_norm,none": -1.2187653110451044,
"choice_logprob_norm_stderr,none": 0.006600255376899415
},
"boolq_10shot": {
"acc,none": 0.5330275229357798,
"acc_stderr,none": 0.008725955605686028,
"bpb,none": 0.4524451726921877,
"bpb_stderr,none": 0.004517110600767529,
"logprob,none": -0.788279603818141,
"logprob_stderr,none": 0.006898990789787937,
"choice_logprob,none": -0.7320842558469366,
"choice_logprob_stderr,none": 0.006712690836080821,
"choice_prob_norm,none": 0.5193000834523307,
"choice_prob_norm_stderr,none": 0.0020481414090304526,
"choice_logprob_norm,none": -0.6840219491412365,
"choice_logprob_norm_stderr,none": 0.004384797259024088
},
"boolq_sl_verb_10shot": {
"acc,none": 0.43241590214067277,
"acc_stderr,none": 0.008664798701065792,
"acc_norm,none": 0.4614678899082569,
"acc_norm_stderr,none": 0.008719048328810805,
"bpb,none": 0.49222531517767704,
"bpb_stderr,none": 0.005039794067753739,
"logprob,none": -0.9121814133559527,
"logprob_stderr,none": 0.009741947195775053,
"choice_logprob,none": -0.8860730057583283,
"choice_logprob_stderr,none": 0.009767192859571366,
"choice_prob_norm,none": 0.4919903165538794,
"choice_prob_norm_stderr,none": 0.002204641670153114,
"choice_logprob_norm,none": -0.7457144675567828,
"choice_logprob_norm_stderr,none": 0.004954534668786731
},
"copa_0shot": {
"acc,none": 0.64,
"acc_stderr,none": 0.048241815132442176,
"bpb,none": 1.6599713126166298,
"bpb_stderr,none": 0.04758606810439861,
"logprob,none": -31.17845386505127,
"logprob_stderr,none": 0.5851072656066011,
"choice_logprob,none": -1.1896302615316923,
"choice_logprob_stderr,none": 0.22459360269658862,
"choice_prob_norm,none": 0.5117218312450332,
"choice_prob_norm_stderr,none": 0.008392263694597767,
"choice_logprob_norm,none": -0.6860098596291105,
"choice_logprob_norm_stderr,none": 0.019012778968729667
},
"csqa_5shot": {
"acc,none": 0.18837018837018837,
"acc_stderr,none": 0.011194511993535688,
"bpb,none": 3.6411961247457723,
"bpb_stderr,none": 0.08525874829781832,
"logprob,none": -2.523884827733333,
"logprob_stderr,none": 0.05909686100070277,
"choice_logprob,none": -2.5189072357432103,
"choice_logprob_stderr,none": 0.05909666810824806,
"choice_prob_norm,none": 0.19650949851356697,
"choice_prob_norm_stderr,none": 0.005258433505175625,
"choice_logprob_norm,none": -3.102625967819365,
"choice_logprob_norm_stderr,none": 0.08544437807964977
},
"csqa_sl_verb_5shot": {
"acc,none": 0.20638820638820637,
"acc_stderr,none": 0.011586881879177828,
"acc_norm,none": 0.22276822276822278,
"acc_norm_stderr,none": 0.011913022964039571,
"bpb,none": 0.452215524904344,
"bpb_stderr,none": 0.009584694071551986,
"logprob,none": -3.6264870981033663,
"logprob_stderr,none": 0.06957873167229361,
"choice_logprob,none": -2.9544891951847396,
"choice_logprob_stderr,none": 0.068277529701088,
"choice_prob_norm,none": 0.2046961509793604,
"choice_prob_norm_stderr,none": 0.0016226983561465022,
"choice_logprob_norm,none": -1.6328321858109678,
"choice_logprob_norm_stderr,none": 0.009266670851355841
},
"hellaswag_0shot": {
"acc,none": 0.3324039036048596,
"acc_stderr,none": 0.00470112142180543,
"acc_norm,none": 0.4012148974307907,
"acc_norm_stderr,none": 0.004891426533390623,
"bpb,none": 0.9009375876364222,
"bpb_stderr,none": 0.0021055138981136206,
"logprob,none": -82.36120062808881,
"logprob_stderr,none": 0.40982338735684054,
"choice_logprob,none": -20.272721049937964,
"choice_logprob_stderr,none": 0.2566580062434852,
"choice_prob_norm,none": 0.2685254519429319,
"choice_prob_norm_stderr,none": 0.00042581267754788663,
"choice_logprob_norm,none": -1.3276102234515903,
"choice_logprob_norm_stderr,none": 0.0016122223661406815
},
"hellaswag_5shot": {
"acc,none": 0.3298147779326827,
"acc_stderr,none": 0.004691848665399074,
"acc_norm,none": 0.3972316271659032,
"acc_norm_stderr,none": 0.0048832465794966745,
"bpb,none": 0.8905966079556564,
"bpb_stderr,none": 0.002093729404640619,
"logprob,none": -81.59669353016533,
"logprob_stderr,none": 0.40979959770883,
"choice_logprob,none": -20.699868645589074,
"choice_logprob_stderr,none": 0.26058429885032486,
"choice_prob_norm,none": 0.26802887392660346,
"choice_prob_norm_stderr,none": 0.00042519100330753356,
"choice_logprob_norm,none": -1.3294290716882005,
"choice_logprob_norm_stderr,none": 0.0016093713491937313
},
"openbookqa_0shot": {
"acc,none": 0.198,
"acc_stderr,none": 0.017838958963847237,
"acc_norm,none": 0.314,
"acc_norm_stderr,none": 0.020776701920308997,
"bpb,none": 1.879972734852229,
"bpb_stderr,none": 0.05180831152549998,
"logprob,none": -19.369384078025817,
"logprob_stderr,none": 0.5081540346015109,
"choice_logprob,none": -6.169701454154714,
"choice_logprob_stderr,none": 0.3067755780073137,
"choice_prob_norm,none": 0.2716123029309807,
"choice_prob_norm_stderr,none": 0.005826471946227696,
"choice_logprob_norm,none": -1.4490867505465432,
"choice_logprob_norm_stderr,none": 0.031429373141996994
},
"piqa_5shot": {
"acc,none": 0.6898803046789989,
"acc_stderr,none": 0.01079187656684305,
"acc_norm,none": 0.6822633297062024,
"acc_norm_stderr,none": 0.010863133246569286,
"bpb,none": 1.1267278287839135,
"bpb_stderr,none": 0.011498569348231697,
"logprob,none": -65.66394310858875,
"logprob_stderr,none": 1.361309501469568,
"choice_logprob,none": -2.8842120102243687,
"choice_logprob_stderr,none": 0.20003484149747763,
"choice_prob_norm,none": 0.5143371099470178,
"choice_prob_norm_stderr,none": 0.0014191589687290998,
"choice_logprob_norm,none": -0.672651949677491,
"choice_logprob_norm_stderr,none": 0.003040303448085757
},
"winogrande_5shot": {
"acc,none": 0.500394632991318,
"acc_stderr,none": 0.014052481306049512,
"bpb,none": 0.44746967272692195,
"bpb_stderr,none": 0.02345581834077092,
"logprob,none": -20.28466144733384,
"logprob_stderr,none": 0.2879659049558859,
"choice_logprob,none": -0.8118696336354043,
"choice_logprob_stderr,none": 0.017177796509284464,
"choice_prob_norm,none": 0.5009580522227739,
"choice_prob_norm_stderr,none": 0.0007615390247721247,
"choice_logprob_norm,none": -0.6933535946515011,
"choice_logprob_norm_stderr,none": 0.0023013588988485354
},
"wsc273_0shot": {
"acc,none": 0.5567765567765568,
"acc_stderr,none": 0.03012086087018464,
"bpb,none": 0.6474761978959332,
"bpb_stderr,none": 0.02276077704193253,
"logprob,none": -26.934241563845905,
"logprob_stderr,none": 0.5106632336755854,
"choice_logprob,none": -0.8604588058829552,
"choice_logprob_stderr,none": 0.0579314769585508,
"choice_prob_norm,none": 0.5023677852150373,
"choice_prob_norm_stderr,none": 0.001311220814064811,
"choice_logprob_norm,none": -0.689345064121781,
"choice_logprob_norm_stderr,none": 0.0026084407923643274
},
"medmcqa_5shot": {
"acc,none": 0.271097298589529,
"acc_stderr,none": 0.0068739340426092,
"acc_norm,none": 0.271097298589529,
"acc_norm_stderr,none": 0.0068739340426092,
"logprob,none": -1.464151024846919,
"logprob_stderr,none": 0.008543787761274825
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.2644035381305283,
"acc_stderr,none": 0.006819639738871822,
"acc_norm,none": 0.2902223284723882,
"acc_norm_stderr,none": 0.007018342582039129,
"bpb,none": 0.15479395761995557,
"bpb_stderr,none": 0.0021831761941513003,
"logprob,none": -1.8923888203062111,
"logprob_stderr,none": 0.017482812769489587,
"choice_logprob,none": -1.6034572713356525,
"choice_logprob_stderr,none": 0.015064619654782398,
"choice_prob_norm,none": 0.2527971935551483,
"choice_prob_norm_stderr,none": 0.0003179931281454456,
"choice_logprob_norm,none": -1.379051484358927,
"choice_logprob_norm_stderr,none": 0.0014580276768533052
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.22031823745410037,
"acc_stderr,none": 0.01450904517148729,
"logprob,none": -20.957387032386286,
"logprob_stderr,none": 0.448160644790156
},
"logprob_gsm8k_5shot": {
"nll,none": 145.7956747267624,
"nll_stderr,none": 1.9411249646364972,
"bpb,none": 0.747191230526963,
"bpb_stderr,none": 0.006610166624908301
},
"logprob_humaneval_10shot": {
"nll,none": 49.127946647202094,
"nll_stderr,none": 2.8058110154980107,
"bpb,none": 0.4534054835306593,
"bpb_stderr,none": 0.019236755302746454
}
}
},
{
"mix": "v4",
"hidden_dim": 1024,
"budget": 9e+18,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc,none": 0.2774533542230451,
"acc_stderr,none": 0.0037723646961444024,
"acc_norm,none": 0.28642643498077197,
"acc_norm_stderr,none": 0.003807535783758128,
"bpb,none": 0.3803269122433914,
"bpb_stderr,none": 0.0020280858919267198,
"logprob,none": -5.844794198463459,
"logprob_stderr,none": 0.018316805879275064,
"choice_logprob,none": -1.8287491768128128,
"choice_logprob_stderr,none": 0.011684516189729936,
"choice_prob_norm,none": 0.2531198595132669,
"choice_prob_norm_stderr,none": 0.00026485418404609127,
"choice_logprob_norm,none": -1.3833328282413355,
"choice_logprob_norm_stderr,none": 0.0011203301238816258,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.2789488676826663,
"acc_stderr,none": 0.003779197707879068,
"acc_norm,none": 0.2879931633670417,
"acc_norm_stderr,none": 0.0038097458146897248,
"bpb,none": 0.21992023018916243,
"bpb_stderr,none": 0.0013923877356860114,
"logprob,none": -3.564730589500739,
"logprob_stderr,none": 0.017863782973941433,
"choice_logprob,none": -1.8814392643084947,
"choice_logprob_stderr,none": 0.012669429676574627,
"choice_prob_norm,none": 0.25269851613168215,
"choice_prob_norm_stderr,none": 0.00020693702420684866,
"choice_logprob_norm,none": -1.3814515889802632,
"choice_logprob_norm_stderr,none": 0.0008816679758781082,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"arc_challenge_5shot": {
"acc,none": 0.23720136518771331,
"acc_stderr,none": 0.012430399829260825,
"acc_norm,none": 0.2627986348122867,
"acc_norm_stderr,none": 0.012862523175351331,
"bpb,none": 1.122671016532738,
"bpb_stderr,none": 0.02156659298873546,
"logprob,none": -19.1114692114319,
"logprob_stderr,none": 0.33233501302887325,
"choice_logprob,none": -4.86906685657334,
"choice_logprob_stderr,none": 0.15676936362090027,
"choice_prob_norm,none": 0.25520528693887606,
"choice_prob_norm_stderr,none": 0.0020432570152471912,
"choice_logprob_norm,none": -1.4106325795742098,
"choice_logprob_norm_stderr,none": 0.009764167523830716
},
"arc_easy_5shot": {
"acc,none": 0.5660774410774411,
"acc_stderr,none": 0.010169795770462108,
"acc_norm,none": 0.5475589225589226,
"acc_norm_stderr,none": 0.010213265860171399,
"bpb,none": 0.9042355726644368,
"bpb_stderr,none": 0.010423249883679883,
"logprob,none": -12.602477428487656,
"logprob_stderr,none": 0.19648418817108307,
"choice_logprob,none": -2.0388860933192974,
"choice_logprob_stderr,none": 0.07320207165571463,
"choice_prob_norm,none": 0.30685700242403374,
"choice_prob_norm_stderr,none": 0.0018183540756247286,
"choice_logprob_norm,none": -1.2231684410890977,
"choice_logprob_norm_stderr,none": 0.006201465599955083
},
"boolq_10shot": {
"acc,none": 0.5636085626911315,
"acc_stderr,none": 0.008674000467432084,
"bpb,none": 0.46067848498842306,
"bpb_stderr,none": 0.005325747676230887,
"logprob,none": -0.7747850852639668,
"logprob_stderr,none": 0.007246572773184094,
"choice_logprob,none": -0.7037545072531656,
"choice_logprob_stderr,none": 0.007217177294864038,
"choice_prob_norm,none": 0.5325794186447117,
"choice_prob_norm_stderr,none": 0.0024977724222479975,
"choice_logprob_norm,none": -0.6716482700585745,
"choice_logprob_norm_stderr,none": 0.005302324269465953
},
"boolq_sl_verb_10shot": {
"acc,none": 0.41406727828746176,
"acc_stderr,none": 0.008614932353134944,
"acc_norm,none": 0.42691131498470947,
"acc_norm_stderr,none": 0.0086511190696438,
"bpb,none": 0.534590995082232,
"bpb_stderr,none": 0.005468438995924815,
"logprob,none": -1.0194541011017033,
"logprob_stderr,none": 0.011602714762655012,
"choice_logprob,none": -0.9879484043502937,
"choice_logprob_stderr,none": 0.011650295516937031,
"choice_prob_norm,none": 0.47919425468082766,
"choice_prob_norm_stderr,none": 0.0024771178854644835,
"choice_logprob_norm,none": -0.7815703185560569,
"choice_logprob_norm_stderr,none": 0.005400403531963753
},
"copa_0shot": {
"acc,none": 0.65,
"acc_stderr,none": 0.04793724854411019,
"bpb,none": 1.7019083597213787,
"bpb_stderr,none": 0.04710704897624362,
"logprob,none": -32.042115364074704,
"logprob_stderr,none": 0.5908285100905646,
"choice_logprob,none": -1.0528322688365988,
"choice_logprob_stderr,none": 0.1874360472540367,
"choice_prob_norm,none": 0.5120526610271129,
"choice_prob_norm_stderr,none": 0.008026570580489788,
"choice_logprob_norm,none": -0.6830492104032645,
"choice_logprob_norm_stderr,none": 0.017254894455359437
},
"csqa_5shot": {
"acc,none": 0.19901719901719903,
"acc_stderr,none": 0.011430809442838396,
"bpb,none": 3.3766833013009983,
"bpb_stderr,none": 0.07009656072885559,
"logprob,none": -2.3405385099406324,
"logprob_stderr,none": 0.048587233436155265,
"choice_logprob,none": -2.3306571442623456,
"choice_logprob_stderr,none": 0.04859626749409556,
"choice_prob_norm,none": 0.20053994555124022,
"choice_prob_norm_stderr,none": 0.005603288735735603,
"choice_logprob_norm,none": -2.8393055229802457,
"choice_logprob_norm_stderr,none": 0.07042486754199812
},
"csqa_sl_verb_5shot": {
"acc,none": 0.2031122031122031,
"acc_stderr,none": 0.0115182547936341,
"acc_norm,none": 0.21457821457821458,
"acc_norm_stderr,none": 0.01175342309421685,
"bpb,none": 0.4511414834609372,
"bpb_stderr,none": 0.007965861823980845,
"logprob,none": -3.6473876377483747,
"logprob_stderr,none": 0.05557672894814405,
"choice_logprob,none": -2.5809087316161494,
"choice_logprob_stderr,none": 0.0532600056694442,
"choice_prob_norm,none": 0.20605941344511416,
"choice_prob_norm_stderr,none": 0.0013607278866299014,
"choice_logprob_norm,none": -1.6102343051125687,
"choice_logprob_norm_stderr,none": 0.007427904090546415
},
"hellaswag_0shot": {
"acc,none": 0.3437562238597889,
"acc_stderr,none": 0.004739902411944531,
"acc_norm,none": 0.40898227444732127,
"acc_norm_stderr,none": 0.0049064119844767886,
"bpb,none": 0.8910177605378113,
"bpb_stderr,none": 0.0020666095537175736,
"logprob,none": -81.55465481666378,
"logprob_stderr,none": 0.4062560138921686,
"choice_logprob,none": -19.694015247380943,
"choice_logprob_stderr,none": 0.2527695825289582,
"choice_prob_norm,none": 0.2694226770198269,
"choice_prob_norm_stderr,none": 0.000422535399658907,
"choice_logprob_norm,none": -1.323975923843565,
"choice_logprob_norm_stderr,none": 0.0015923103921380202
},
"hellaswag_5shot": {
"acc,none": 0.3406691894045011,
"acc_stderr,none": 0.004729656826803945,
"acc_norm,none": 0.41047600079665403,
"acc_norm_stderr,none": 0.004909148239488271,
"bpb,none": 0.8784322372959009,
"bpb_stderr,none": 0.0020547834420918203,
"logprob,none": -80.53157815399504,
"logprob_stderr,none": 0.4046564191726301,
"choice_logprob,none": -20.014971737923123,
"choice_logprob_stderr,none": 0.25572990843559773,
"choice_prob_norm,none": 0.26936698050878854,
"choice_prob_norm_stderr,none": 0.0004218675347711045,
"choice_logprob_norm,none": -1.3241177544374372,
"choice_logprob_norm_stderr,none": 0.001587492822164877
},
"openbookqa_0shot": {
"acc,none": 0.188,
"acc_stderr,none": 0.01749067888034625,
"acc_norm,none": 0.292,
"acc_norm_stderr,none": 0.02035437548053006,
"bpb,none": 1.8626723078338552,
"bpb_stderr,none": 0.04876361788417803,
"logprob,none": -19.33404068660736,
"logprob_stderr,none": 0.5159979719157372,
"choice_logprob,none": -6.121661900622359,
"choice_logprob_stderr,none": 0.3037587927046394,
"choice_prob_norm,none": 0.2706156420052292,
"choice_prob_norm_stderr,none": 0.005846910147542166,
"choice_logprob_norm,none": -1.4455013820083449,
"choice_logprob_norm_stderr,none": 0.028513878219289402
},
"piqa_5shot": {
"acc,none": 0.7034820457018498,
"acc_stderr,none": 0.010656078922661138,
"acc_norm,none": 0.7067464635473341,
"acc_norm_stderr,none": 0.010621818421101926,
"bpb,none": 1.0952820472603881,
"bpb_stderr,none": 0.010897094627065809,
"logprob,none": -64.0548716683642,
"logprob_stderr,none": 1.329642784085942,
"choice_logprob,none": -2.7923947350166616,
"choice_logprob_stderr,none": 0.20186784502783026,
"choice_prob_norm,none": 0.5152923353674727,
"choice_prob_norm_stderr,none": 0.0013546668938055843,
"choice_logprob_norm,none": -0.6697296600130525,
"choice_logprob_norm_stderr,none": 0.0027620892141826114
},
"winogrande_5shot": {
"acc,none": 0.5193370165745856,
"acc_stderr,none": 0.014041972733712977,
"bpb,none": 0.4385273228372553,
"bpb_stderr,none": 0.02343280298510262,
"logprob,none": -19.836543400078753,
"logprob_stderr,none": 0.2874090450194781,
"choice_logprob,none": -0.780198803559572,
"choice_logprob_stderr,none": 0.016388003586523526,
"choice_prob_norm,none": 0.5012282444770617,
"choice_prob_norm_stderr,none": 0.00082343001258754,
"choice_logprob_norm,none": -0.6929080730440892,
"choice_logprob_norm_stderr,none": 0.002277345281768227
},
"wsc273_0shot": {
"acc,none": 0.5604395604395604,
"acc_stderr,none": 0.030094646016767413,
"bpb,none": 0.6676645055346196,
"bpb_stderr,none": 0.02273293542937676,
"logprob,none": -27.731201325580752,
"logprob_stderr,none": 0.5024998047380301,
"choice_logprob,none": -0.7743965635875873,
"choice_logprob_stderr,none": 0.04520265330703877,
"choice_prob_norm,none": 0.5022263467462416,
"choice_prob_norm_stderr,none": 0.0012645326760995672,
"choice_logprob_norm,none": -0.6895570740323064,
"choice_logprob_norm_stderr,none": 0.0025022121928476066
},
"medmcqa_5shot": {
"acc,none": 0.29046139134592397,
"acc_stderr,none": 0.007020050049541257,
"acc_norm,none": 0.29046139134592397,
"acc_norm_stderr,none": 0.007020050049541257,
"logprob,none": -1.482259517818326,
"logprob_stderr,none": 0.010804905175993704
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.27348792732488647,
"acc_stderr,none": 0.006892844537516325,
"acc_norm,none": 0.28448481950753046,
"acc_norm_stderr,none": 0.006976650388273283,
"bpb,none": 0.1889180090057425,
"bpb_stderr,none": 0.002490507290640879,
"logprob,none": -2.366434458480222,
"logprob_stderr,none": 0.022541561684512896,
"choice_logprob,none": -1.7610643124913015,
"choice_logprob_stderr,none": 0.01969345898307532,
"choice_prob_norm,none": 0.25339439814986003,
"choice_prob_norm_stderr,none": 0.0004121143028173064,
"choice_logprob_norm,none": -1.3792005120618667,
"choice_logprob_norm_stderr,none": 0.0018467409714330132
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.23011015911872704,
"acc_stderr,none": 0.014734557959807756,
"logprob,none": -19.767552736485456,
"logprob_stderr,none": 0.4362608490818928
},
"logprob_gsm8k_5shot": {
"nll,none": 124.9868801672229,
"nll_stderr,none": 1.7562178141715343,
"bpb,none": 0.6312360680966751,
"bpb_stderr,none": 0.005629445208024275
},
"logprob_humaneval_10shot": {
"nll,none": 50.14906453214041,
"nll_stderr,none": 2.7024869472844553,
"bpb,none": 0.4774618551645616,
"bpb_stderr,none": 0.02005287009799475
}
}
},
{
"mix": "v0",
"hidden_dim": 1280,
"budget": 2.83e+19,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc_norm,none": 0.29226605896595925,
"acc_norm_stderr,none": 0.003829748537931336,
"logprob,none": -6.209622505894744,
"logprob_stderr,none": 0.015489851959268889,
"choice_logprob,none": -1.666521599179239,
"choice_logprob_stderr,none": 0.00986827146754822,
"bpb,none": 0.4410641939573321,
"bpb_stderr,none": 0.002417360467048572,
"choice_logprob_norm,none": -1.3843110721639518,
"choice_logprob_norm_stderr,none": 0.001109748013176671,
"acc,none": 0.26883634809856144,
"acc_stderr,none": 0.003735640929041117,
"choice_prob_norm,none": 0.2527257228896304,
"choice_prob_norm_stderr,none": 0.00026333469686721627
},
"mmlu_sl_verb_5shot": {
"logprob,none": -3.3674200068996916,
"logprob_stderr,none": 0.016813833012880218,
"bpb,none": 0.21742171317815961,
"bpb_stderr,none": 0.0014461482395902168,
"acc_norm,none": 0.2828656886483407,
"acc_norm_stderr,none": 0.0037943694482700015,
"choice_logprob_norm,none": -1.3838840680471238,
"choice_logprob_norm_stderr,none": 0.0007215059248168945,
"acc,none": 0.26627261073921094,
"acc_stderr,none": 0.0037251087618102646,
"choice_logprob,none": -1.7229442209787504,
"choice_logprob_stderr,none": 0.010439823186440269,
"choice_prob_norm,none": 0.25157403125961547,
"choice_prob_norm_stderr,none": 0.0001703963440441645
},
"arc_challenge_5shot": {
"acc,none": 0.27303754266211605,
"acc_stderr,none": 0.013019332762635744,
"acc_norm,none": 0.3054607508532423,
"acc_norm_stderr,none": 0.013460080478002505,
"bpb,none": 1.0283761589598304,
"bpb_stderr,none": 0.019987480088041555,
"logprob,none": -17.726086085261745,
"logprob_stderr,none": 0.3161510775591788,
"choice_logprob,none": -4.381043890471866,
"choice_logprob_stderr,none": 0.14355991812557373,
"choice_prob_norm,none": 0.26016375124616575,
"choice_prob_norm_stderr,none": 0.0020313690468874773,
"choice_logprob_norm,none": -1.3857153166892604,
"choice_logprob_norm_stderr,none": 0.008830226241080263
},
"arc_easy_5shot": {
"acc,none": 0.6031144781144782,
"acc_stderr,none": 0.010039236800583207,
"acc_norm,none": 0.5946969696969697,
"acc_norm_stderr,none": 0.010074093589739192,
"bpb,none": 0.8148919918668881,
"bpb_stderr,none": 0.0102402340029807,
"logprob,none": -11.559560100438217,
"logprob_stderr,none": 0.1870772112182125,
"choice_logprob,none": -1.7572758329717766,
"choice_logprob_stderr,none": 0.06667789995280643,
"choice_prob_norm,none": 0.3174056212233518,
"choice_prob_norm_stderr,none": 0.0018724297881361682,
"choice_logprob_norm,none": -1.1876341516951332,
"choice_logprob_norm_stderr,none": 0.005962337814397002
},
"boolq_10shot": {
"acc,none": 0.5807339449541284,
"acc_stderr,none": 0.008630302070999098,
"bpb,none": 0.42498367106450263,
"bpb_stderr,none": 0.005208711650660825,
"logprob,none": -0.7036019508627211,
"logprob_stderr,none": 0.00691061536637038,
"choice_logprob,none": -0.6884317218506175,
"choice_logprob_stderr,none": 0.006896944270013674,
"choice_prob_norm,none": 0.5353019746155919,
"choice_prob_norm_stderr,none": 0.002496901667984246,
"choice_logprob_norm,none": -0.6657104660137152,
"choice_logprob_norm_stderr,none": 0.005225205186616344
},
"boolq_sl_verb_10shot": {
"acc,none": 0.6198776758409786,
"acc_stderr,none": 0.0084899909189892,
"acc_norm,none": 0.6214067278287462,
"acc_norm_stderr,none": 0.008483341718024479,
"bpb,none": 0.5999890705120567,
"bpb_stderr,none": 0.01207801489576029,
"logprob,none": -0.8783978572314668,
"logprob_stderr,none": 0.016216410708147073,
"choice_logprob,none": -0.8516348796961529,
"choice_logprob_stderr,none": 0.016231381752290774,
"choice_prob_norm,none": 0.565980565018701,
"choice_prob_norm_stderr,none": 0.004850349282933368,
"choice_logprob_norm,none": -0.7542167961257918,
"choice_logprob_norm_stderr,none": 0.011944739409233355
},
"copa_0shot": {
"acc,none": 0.73,
"acc_stderr,none": 0.0446196043338474,
"bpb,none": 1.595618454194437,
"bpb_stderr,none": 0.043333507469277076,
"logprob,none": -30.05348129272461,
"logprob_stderr,none": 0.5365979835667709,
"choice_logprob,none": -0.8766987854590171,
"choice_logprob_stderr,none": 0.16336826657602965,
"choice_prob_norm,none": 0.5214883684868503,
"choice_prob_norm_stderr,none": 0.008448802370021472,
"choice_logprob_norm,none": -0.665533557235146,
"choice_logprob_norm_stderr,none": 0.01764556442591378
},
"csqa_5shot": {
"acc,none": 0.21703521703521703,
"acc_stderr,none": 0.01180201884653,
"bpb,none": 2.725079745532547,
"bpb_stderr,none": 0.0409573250841188,
"logprob,none": -1.8888813424168993,
"logprob_stderr,none": 0.028389454405334053,
"choice_logprob,none": -1.8783109756143825,
"choice_logprob_stderr,none": 0.028404217902383927,
"choice_prob_norm,none": 0.20074997076219486,
"choice_prob_norm_stderr,none": 0.003299421449442018,
"choice_logprob_norm,none": -2.077638700704776,
"choice_logprob_norm_stderr,none": 0.04117720478426129
},
"csqa_sl_verb_5shot": {
"acc,none": 0.22522522522522523,
"acc_stderr,none": 0.011959591224286234,
"acc_norm,none": 0.22358722358722358,
"acc_norm_stderr,none": 0.011928612008761174,
"bpb,none": 0.3175998808584069,
"bpb_stderr,none": 0.005524235826180418,
"logprob,none": -2.521169312658318,
"logprob_stderr,none": 0.03560094846882005,
"choice_logprob,none": -1.9576231799793016,
"choice_logprob_stderr,none": 0.03470973076012509,
"choice_prob_norm,none": 0.2048963995235781,
"choice_prob_norm_stderr,none": 0.000922567238262749,
"choice_logprob_norm,none": -1.599995325956514,
"choice_logprob_norm_stderr,none": 0.0051925802377438095
},
"hellaswag_0shot": {
"acc,none": 0.38986257717586137,
"acc_stderr,none": 0.004867221634461271,
"acc_norm,none": 0.4887472615016929,
"acc_norm_stderr,none": 0.004988517597998611,
"bpb,none": 0.8415242444682122,
"bpb_stderr,none": 0.0020012268461091645,
"logprob,none": -76.97146024226288,
"logprob_stderr,none": 0.3834460822769651,
"choice_logprob,none": -16.843240086968393,
"choice_logprob_stderr,none": 0.2292036319493721,
"choice_prob_norm,none": 0.27690864474623345,
"choice_prob_norm_stderr,none": 0.0004217564384003691,
"choice_logprob_norm,none": -1.2958802116211106,
"choice_logprob_norm_stderr,none": 0.0015479785034826032
},
"hellaswag_5shot": {
"acc,none": 0.38418641704839673,
"acc_stderr,none": 0.004854082479916916,
"acc_norm,none": 0.4896434973112926,
"acc_norm_stderr,none": 0.004988710917169329,
"bpb,none": 0.827420054400728,
"bpb_stderr,none": 0.0019730830528583445,
"logprob,none": -75.89870920900186,
"logprob_stderr,none": 0.38215850087060366,
"choice_logprob,none": -17.215097853552177,
"choice_logprob_stderr,none": 0.2322699233354321,
"choice_prob_norm,none": 0.2764504682104974,
"choice_prob_norm_stderr,none": 0.000419227999480607,
"choice_logprob_norm,none": -1.29740576523825,
"choice_logprob_norm_stderr,none": 0.0015386710312781204
},
"openbookqa_0shot": {
"acc,none": 0.226,
"acc_stderr,none": 0.018722956449139936,
"acc_norm,none": 0.328,
"acc_norm_stderr,none": 0.021017027165175495,
"bpb,none": 1.8180603255364978,
"bpb_stderr,none": 0.04992289763998061,
"logprob,none": -18.71832051229477,
"logprob_stderr,none": 0.4984652624372133,
"choice_logprob,none": -5.756449158667543,
"choice_logprob_stderr,none": 0.29307223445528263,
"choice_prob_norm,none": 0.2771861143401492,
"choice_prob_norm_stderr,none": 0.005959448128268381,
"choice_logprob_norm,none": -1.4238737513088515,
"choice_logprob_norm_stderr,none": 0.029916595311500378
},
"piqa_5shot": {
"acc,none": 0.7143634385201306,
"acc_stderr,none": 0.010539303948661927,
"acc_norm,none": 0.7132752992383025,
"acc_norm_stderr,none": 0.010551314503108075,
"bpb,none": 1.045407995402416,
"bpb_stderr,none": 0.010281176695787196,
"logprob,none": -61.57799939076711,
"logprob_stderr,none": 1.2877505958045585,
"choice_logprob,none": -2.5762836726033336,
"choice_logprob_stderr,none": 0.18798789868527002,
"choice_prob_norm,none": 0.5173234535007274,
"choice_prob_norm_stderr,none": 0.0013264659761474242,
"choice_logprob_norm,none": -0.665587866533371,
"choice_logprob_norm_stderr,none": 0.0027402003861128465
},
"winogrande_5shot": {
"acc,none": 0.5185477505919495,
"acc_stderr,none": 0.014042813708888378,
"bpb,none": 0.4340690453637434,
"bpb_stderr,none": 0.02297466714977533,
"logprob,none": -19.649428625603594,
"logprob_stderr,none": 0.27863341577197265,
"choice_logprob,none": -0.7738055543223351,
"choice_logprob_stderr,none": 0.015506418418150121,
"choice_prob_norm,none": 0.5012963998338761,
"choice_prob_norm_stderr,none": 0.0007472258662358123,
"choice_logprob_norm,none": -0.6923158523760211,
"choice_logprob_norm_stderr,none": 0.0019636471033472047
},
"wsc273_0shot": {
"acc,none": 0.6410256410256411,
"acc_stderr,none": 0.029086064518366282,
"bpb,none": 0.6369651079416323,
"bpb_stderr,none": 0.021725909969731252,
"logprob,none": -26.41959673144442,
"logprob_stderr,none": 0.47508686226521085,
"choice_logprob,none": -0.6856140437555511,
"choice_logprob_stderr,none": 0.0407542274946971,
"choice_prob_norm,none": 0.503661892304479,
"choice_prob_norm_stderr,none": 0.0012730978135580076,
"choice_logprob_norm,none": -0.6867133915338746,
"choice_logprob_norm_stderr,none": 0.002521126622067467
},
"medmcqa_5shot": {
"acc,none": 0.2696629213483146,
"acc_stderr,none": 0.006862467146147035,
"acc_norm,none": 0.2696629213483146,
"acc_norm_stderr,none": 0.006862467146147035,
"logprob,none": -1.41153096153837,
"logprob_stderr,none": 0.004057120176996225
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.27061917284245757,
"acc_stderr,none": 0.00687012182714279,
"acc_norm,none": 0.3067176667463543,
"acc_norm_stderr,none": 0.007130704645763216,
"bpb,none": 0.15086929656791995,
"bpb_stderr,none": 0.0019006540999348866,
"logprob,none": -1.8186738202368389,
"logprob_stderr,none": 0.013975437849597278,
"choice_logprob,none": -1.5058455539114988,
"choice_logprob_stderr,none": 0.011836430715752234,
"choice_prob_norm,none": 0.2525030822261794,
"choice_prob_norm_stderr,none": 0.0002748409391057536,
"choice_logprob_norm,none": -1.3790022896972258,
"choice_logprob_norm_stderr,none": 0.0011577589398460758
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.23011015911872704,
"acc_stderr,none": 0.014734557959807763,
"logprob,none": -19.10854571860744,
"logprob_stderr,none": 0.4024300847185795
},
"logprob_gsm8k_5shot": {
"nll,none": 132.44639959761915,
"nll_stderr,none": 1.7152639585544431,
"bpb,none": 0.6816285006144224,
"bpb_stderr,none": 0.006052614179169423
},
"logprob_humaneval_10shot": {
"nll,none": 50.32563555240631,
"nll_stderr,none": 2.740384937137503,
"bpb,none": 0.4865457768105658,
"bpb_stderr,none": 0.020536436529556782
}
}
},
{
"mix": "v2",
"hidden_dim": 1280,
"budget": 2.83e+19,
"tasks": {
"mmlu_sl_verb_0shot": {
"choice_logprob,none": -1.6355644163636416,
"choice_logprob_stderr,none": 0.00993168138907273,
"acc_norm,none": 0.2942600769121208,
"acc_norm_stderr,none": 0.003832776095212889,
"acc,none": 0.2827944737216921,
"acc_stderr,none": 0.0037907716737867903,
"choice_logprob_norm,none": -1.382032849360097,
"choice_logprob_norm_stderr,none": 0.0008785335683598963,
"bpb,none": 0.37224887518444993,
"bpb_stderr,none": 0.00199571516907844,
"logprob,none": -5.386200733835919,
"logprob_stderr,none": 0.01652340004633214,
"choice_prob_norm,none": 0.2526551930696583,
"choice_prob_norm_stderr,none": 0.0002125452947926354
},
"mmlu_sl_verb_5shot": {
"acc_norm,none": 0.277097279589802,
"acc_norm_stderr,none": 0.0037643368162159296,
"choice_logprob,none": -1.6686232811107222,
"choice_logprob_stderr,none": 0.010236917091765866,
"choice_logprob_norm,none": -1.3828063134991904,
"choice_logprob_norm_stderr,none": 0.0006243860345584463,
"acc,none": 0.2653468166927788,
"acc_stderr,none": 0.0037118178455470127,
"logprob,none": -3.2203891673529896,
"logprob_stderr,none": 0.01716090349129126,
"bpb,none": 0.20412977360285625,
"bpb_stderr,none": 0.0013672943970149927,
"choice_prob_norm,none": 0.2516315059406078,
"choice_prob_norm_stderr,none": 0.0001511494982749355
},
"arc_challenge_5shot": {
"acc,none": 0.3037542662116041,
"acc_stderr,none": 0.01343890918477876,
"acc_norm,none": 0.34982935153583616,
"acc_norm_stderr,none": 0.013936809212158294,
"bpb,none": 1.0051468106645478,
"bpb_stderr,none": 0.023198784622296246,
"logprob,none": -17.086511841609617,
"logprob_stderr,none": 0.31772037414415716,
"choice_logprob,none": -4.346364128180887,
"choice_logprob_stderr,none": 0.14955380981290733,
"choice_prob_norm,none": 0.26689375012133293,
"choice_prob_norm_stderr,none": 0.0023332473491658383,
"choice_logprob_norm,none": -1.3760793440216341,
"choice_logprob_norm_stderr,none": 0.011147063107963154
},
"arc_easy_5shot": {
"acc,none": 0.6515151515151515,
"acc_stderr,none": 0.009777377947106538,
"acc_norm,none": 0.6468855218855218,
"acc_norm_stderr,none": 0.009807078935467613,
"bpb,none": 0.7627812814045225,
"bpb_stderr,none": 0.010633835928133899,
"logprob,none": -10.932685228133643,
"logprob_stderr,none": 0.1862389592816211,
"choice_logprob,none": -1.6075675662596918,
"choice_logprob_stderr,none": 0.06866105725822967,
"choice_prob_norm,none": 0.3365922264060604,
"choice_prob_norm_stderr,none": 0.002248177192882947,
"choice_logprob_norm,none": -1.1387806607328435,
"choice_logprob_norm_stderr,none": 0.006619520224730074
},
"boolq_10shot": {
"acc,none": 0.44036697247706424,
"acc_stderr,none": 0.0086826356676869,
"bpb,none": 0.5165486355515898,
"bpb_stderr,none": 0.004568869184677957,
"logprob,none": -0.9770105327670363,
"logprob_stderr,none": 0.010064845379313079,
"choice_logprob,none": -0.8866122233998849,
"choice_logprob_stderr,none": 0.01003765511377256,
"choice_prob_norm,none": 0.49596476745427315,
"choice_prob_norm_stderr,none": 0.002130429975858172,
"choice_logprob_norm,none": -0.7332640379794884,
"choice_logprob_norm_stderr,none": 0.00451633810323634
},
"boolq_sl_verb_10shot": {
"acc,none": 0.5819571865443425,
"acc_stderr,none": 0.008626774352070744,
"acc_norm,none": 0.5954128440366973,
"acc_norm_stderr,none": 0.008584355308932685,
"bpb,none": 0.5318714914967736,
"bpb_stderr,none": 0.009560120218176012,
"logprob,none": -0.8367789745695365,
"logprob_stderr,none": 0.012954331347065601,
"choice_logprob,none": -0.7965980586116753,
"choice_logprob_stderr,none": 0.012971533544866524,
"choice_prob_norm,none": 0.5463688682187838,
"choice_prob_norm_stderr,none": 0.0038899028314131247,
"choice_logprob_norm,none": -0.7207930588823217,
"choice_logprob_norm_stderr,none": 0.009386272318322439
},
"copa_0shot": {
"acc,none": 0.71,
"acc_stderr,none": 0.045604802157206824,
"bpb,none": 1.6588899547828249,
"bpb_stderr,none": 0.04996699798522133,
"logprob,none": -31.084959506988525,
"logprob_stderr,none": 0.5828067888657876,
"choice_logprob,none": -1.000873777244134,
"choice_logprob_stderr,none": 0.20769504742752298,
"choice_prob_norm,none": 0.5229587520965494,
"choice_prob_norm_stderr,none": 0.009465694784086385,
"choice_logprob_norm,none": -0.6668312799032938,
"choice_logprob_norm_stderr,none": 0.02017937387226091
},
"csqa_5shot": {
"acc,none": 0.22932022932022933,
"acc_stderr,none": 0.012035891058050904,
"bpb,none": 2.722085484127594,
"bpb_stderr,none": 0.04119660416594591,
"logprob,none": -1.886805878566192,
"logprob_stderr,none": 0.028555310026269497,
"choice_logprob,none": -1.8781857661860797,
"choice_logprob_stderr,none": 0.02856460110638303,
"choice_prob_norm,none": 0.2141878789545394,
"choice_prob_norm_stderr,none": 0.0052183509464165685,
"choice_logprob_norm,none": -2.1209128340609134,
"choice_logprob_norm_stderr,none": 0.04174733558816356
},
"csqa_sl_verb_5shot": {
"acc,none": 0.2416052416052416,
"acc_stderr,none": 0.012255214642330772,
"acc_norm,none": 0.2596232596232596,
"acc_norm_stderr,none": 0.012552154236305978,
"bpb,none": 0.29461692125090677,
"bpb_stderr,none": 0.005511764022747519,
"logprob,none": -2.3587569456721407,
"logprob_stderr,none": 0.0390204882966398,
"choice_logprob,none": -2.051425451423569,
"choice_logprob_stderr,none": 0.038252338995030034,
"choice_prob_norm,none": 0.20556295095644472,
"choice_prob_norm_stderr,none": 0.0009840677417696346,
"choice_logprob_norm,none": -1.597807553182379,
"choice_logprob_norm_stderr,none": 0.005283126369552082
},
"hellaswag_0shot": {
"acc,none": 0.4246166102370046,
"acc_stderr,none": 0.004932745013072715,
"acc_norm,none": 0.5495917147978491,
"acc_norm_stderr,none": 0.004965177633049898,
"bpb,none": 0.842218931242513,
"bpb_stderr,none": 0.0020964828004861073,
"logprob,none": -76.82175086138615,
"logprob_stderr,none": 0.38384521535442956,
"choice_logprob,none": -15.645702940141794,
"choice_logprob_stderr,none": 0.2233565615576406,
"choice_prob_norm,none": 0.2841604287383886,
"choice_prob_norm_stderr,none": 0.00044807381921523374,
"choice_logprob_norm,none": -1.2709554769581326,
"choice_logprob_norm_stderr,none": 0.0016109907123332992
},
"hellaswag_5shot": {
"acc,none": 0.40310695080661224,
"acc_stderr,none": 0.004895194143892684,
"acc_norm,none": 0.5241983668591914,
"acc_norm_stderr,none": 0.0049839343432504565,
"bpb,none": 0.8201113158799554,
"bpb_stderr,none": 0.002012125040576876,
"logprob,none": -75.05595897241693,
"logprob_stderr,none": 0.3782021324643839,
"choice_logprob,none": -16.314701318311375,
"choice_logprob_stderr,none": 0.22663824711571423,
"choice_prob_norm,none": 0.28006321862407313,
"choice_prob_norm_stderr,none": 0.0004279107876750804,
"choice_logprob_norm,none": -1.2846411563723752,
"choice_logprob_norm_stderr,none": 0.001554748796237787
},
"openbookqa_0shot": {
"acc,none": 0.242,
"acc_stderr,none": 0.019173085678337157,
"acc_norm,none": 0.334,
"acc_norm_stderr,none": 0.021113492347743727,
"bpb,none": 1.8373831035676307,
"bpb_stderr,none": 0.05059676581196407,
"logprob,none": -18.901991720438005,
"logprob_stderr,none": 0.5034766353234749,
"choice_logprob,none": -5.856424473935738,
"choice_logprob_stderr,none": 0.2982831669792747,
"choice_prob_norm,none": 0.28294638983188414,
"choice_prob_norm_stderr,none": 0.006367518381123621,
"choice_logprob_norm,none": -1.4132429012816998,
"choice_logprob_norm_stderr,none": 0.02956249651515355
},
"piqa_5shot": {
"acc,none": 0.7247007616974973,
"acc_stderr,none": 0.01042142927736953,
"acc_norm,none": 0.7181719260065288,
"acc_norm_stderr,none": 0.01049667523125815,
"bpb,none": 1.0331715040761036,
"bpb_stderr,none": 0.010686860780086791,
"logprob,none": -60.96679821263459,
"logprob_stderr,none": 1.2924989329529273,
"choice_logprob,none": -2.4510160295291232,
"choice_logprob_stderr,none": 0.18160293905688882,
"choice_prob_norm,none": 0.5200233189363311,
"choice_prob_norm_stderr,none": 0.0013944298015675302,
"choice_logprob_norm,none": -0.6608648059728867,
"choice_logprob_norm_stderr,none": 0.0028196221716540477
},
"winogrande_5shot": {
"acc,none": 0.5461720599842147,
"acc_stderr,none": 0.013992441563707063,
"bpb,none": 0.43572332810692266,
"bpb_stderr,none": 0.02351526655579996,
"logprob,none": -19.668826701985445,
"logprob_stderr,none": 0.2815252237442725,
"choice_logprob,none": -0.753119375795119,
"choice_logprob_stderr,none": 0.015587721758187565,
"choice_prob_norm,none": 0.5017349921317078,
"choice_prob_norm_stderr,none": 0.0007873641529697937,
"choice_logprob_norm,none": -0.6918008109489324,
"choice_logprob_norm_stderr,none": 0.002256961442328746
},
"wsc273_0shot": {
"acc,none": 0.6373626373626373,
"acc_stderr,none": 0.029150440533497746,
"bpb,none": 0.6744923062517574,
"bpb_stderr,none": 0.021573641072027108,
"logprob,none": -28.171755874549948,
"logprob_stderr,none": 0.4880426292925857,
"choice_logprob,none": -0.7520041384529634,
"choice_logprob_stderr,none": 0.05351695138599668,
"choice_prob_norm,none": 0.5044346634265771,
"choice_prob_norm_stderr,none": 0.001298596757931072,
"choice_logprob_norm,none": -0.685193426286039,
"choice_logprob_norm_stderr,none": 0.0025251037658414323
},
"medmcqa_5shot": {
"acc,none": 0.3162801816877839,
"acc_stderr,none": 0.007190896863029239,
"acc_norm,none": 0.3162801816877839,
"acc_norm_stderr,none": 0.007190896863029239,
"logprob,none": -1.4593657617103875,
"logprob_stderr,none": 0.010492940476909497
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.3055223523786756,
"acc_stderr,none": 0.0071229290172453805,
"acc_norm,none": 0.3174754960554626,
"acc_norm_stderr,none": 0.0071981719187398634,
"bpb,none": 0.14504181755898918,
"bpb_stderr,none": 0.002069838836516618,
"logprob,none": -1.7652322429733474,
"logprob_stderr,none": 0.018427749438524245,
"choice_logprob,none": -1.5695841908610249,
"choice_logprob_stderr,none": 0.01624033496449477,
"choice_prob_norm,none": 0.25405428691108856,
"choice_prob_norm_stderr,none": 0.00032918705979028377,
"choice_logprob_norm,none": -1.374203763504559,
"choice_logprob_norm_stderr,none": 0.001471903951849426
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.2178702570379437,
"acc_stderr,none": 0.014450846714123895,
"logprob,none": -19.17639766095464,
"logprob_stderr,none": 0.40257679326714874
},
"logprob_gsm8k_5shot": {
"nll,none": 124.43013503065247,
"nll_stderr,none": 1.7457547832049456,
"bpb,none": 0.631483501899567,
"bpb_stderr,none": 0.005981006908265368
},
"logprob_humaneval_10shot": {
"nll,none": 57.852137891257684,
"nll_stderr,none": 3.13853121848385,
"bpb,none": 0.5336717673608972,
"bpb_stderr,none": 0.019712068441408603
}
}
},
{
"mix": "v3",
"hidden_dim": 1280,
"budget": 2.83e+19,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc,none": 0.2805868109955847,
"acc_stderr,none": 0.0037835552116241037,
"acc_norm,none": 0.2879219484403931,
"acc_norm_stderr,none": 0.0038100683058163503,
"bpb,none": 0.29564193249916315,
"bpb_stderr,none": 0.00163539261025707,
"logprob,none": -4.586842207251352,
"logprob_stderr,none": 0.015925438089625925,
"choice_logprob,none": -1.6562007130156302,
"choice_logprob_stderr,none": 0.009092790810744425,
"choice_prob_norm,none": 0.2524032845490927,
"choice_prob_norm_stderr,none": 0.00021544351320955767,
"choice_logprob_norm,none": -1.3830507908053524,
"choice_logprob_norm_stderr,none": 0.0008968018751013425,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.27439111237715424,
"acc_stderr,none": 0.0037534949331315937,
"acc_norm,none": 0.2864976499074206,
"acc_norm_stderr,none": 0.00380537568791088,
"bpb,none": 0.18563286499798612,
"bpb_stderr,none": 0.0012545152052620288,
"logprob,none": -2.8578264987061446,
"logprob_stderr,none": 0.014808275213136313,
"choice_logprob,none": -1.5779558935911742,
"choice_logprob_stderr,none": 0.008331239974153823,
"choice_prob_norm,none": 0.25169926597413794,
"choice_prob_norm_stderr,none": 0.00014606058989178342,
"choice_logprob_norm,none": -1.382456117239998,
"choice_logprob_norm_stderr,none": 0.0005969432966090565,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"arc_challenge_5shot": {
"acc,none": 0.2781569965870307,
"acc_stderr,none": 0.0130944699195388,
"acc_norm,none": 0.31569965870307165,
"acc_norm_stderr,none": 0.013582571095815291,
"bpb,none": 1.004577123505464,
"bpb_stderr,none": 0.021011051134894577,
"logprob,none": -17.43483729696111,
"logprob_stderr,none": 0.3249676602454631,
"choice_logprob,none": -4.554611755793552,
"choice_logprob_stderr,none": 0.15300655584730743,
"choice_prob_norm,none": 0.2619648526693254,
"choice_prob_norm_stderr,none": 0.0021679851263837527,
"choice_logprob_norm,none": -1.385142044173578,
"choice_logprob_norm_stderr,none": 0.009742104467355102
},
"arc_easy_5shot": {
"acc,none": 0.6077441077441077,
"acc_stderr,none": 0.010018744689650043,
"acc_norm,none": 0.6165824915824916,
"acc_norm_stderr,none": 0.009976995068264717,
"bpb,none": 0.7901136817738874,
"bpb_stderr,none": 0.010450610179742914,
"logprob,none": -11.305382818660952,
"logprob_stderr,none": 0.1897638487309862,
"choice_logprob,none": -1.7970027483644972,
"choice_logprob_stderr,none": 0.06922270693962838,
"choice_prob_norm,none": 0.3219051432437752,
"choice_prob_norm_stderr,none": 0.0020288902873620536,
"choice_logprob_norm,none": -1.1792201909120759,
"choice_logprob_norm_stderr,none": 0.006375784803674426
},
"boolq_10shot": {
"acc,none": 0.5305810397553516,
"acc_stderr,none": 0.008728682900189716,
"bpb,none": 0.4231851610730653,
"bpb_stderr,none": 0.0037642867645096014,
"logprob,none": -0.7694294305022703,
"logprob_stderr,none": 0.00735201818622056,
"choice_logprob,none": -0.7451270770784845,
"choice_logprob_stderr,none": 0.00726881733581359,
"choice_prob_norm,none": 0.5188823602153386,
"choice_prob_norm_stderr,none": 0.0019058689414028012,
"choice_logprob_norm,none": -0.6791780780330144,
"choice_logprob_norm_stderr,none": 0.0038249251654363606
},
"boolq_sl_verb_10shot": {
"acc,none": 0.5571865443425077,
"acc_stderr,none": 0.008687668766930827,
"acc_norm,none": 0.6003058103975535,
"acc_norm_stderr,none": 0.008567275456584977,
"bpb,none": 0.48365323953772843,
"bpb_stderr,none": 0.004755316023137264,
"logprob,none": -0.816987519417334,
"logprob_stderr,none": 0.00618412365524644,
"choice_logprob,none": -0.6970726227422511,
"choice_logprob_stderr,none": 0.006084712635604526,
"choice_prob_norm,none": 0.5286375143943968,
"choice_prob_norm_stderr,none": 0.0022608907687295464,
"choice_logprob_norm,none": -0.6711646122538006,
"choice_logprob_norm_stderr,none": 0.00472307616647191
},
"copa_0shot": {
"acc,none": 0.71,
"acc_stderr,none": 0.045604802157206824,
"bpb,none": 1.5776713479546853,
"bpb_stderr,none": 0.04742787119329083,
"logprob,none": -29.552455520629884,
"logprob_stderr,none": 0.5787075614098893,
"choice_logprob,none": -0.9883281715260241,
"choice_logprob_stderr,none": 0.17475016771875287,
"choice_prob_norm,none": 0.5138977264414717,
"choice_prob_norm_stderr,none": 0.008287843399348597,
"choice_logprob_norm,none": -0.6809176362984224,
"choice_logprob_norm_stderr,none": 0.01835793383033253
},
"csqa_5shot": {
"acc,none": 0.21621621621621623,
"acc_stderr,none": 0.011785889175486662,
"bpb,none": 3.0773994144024783,
"bpb_stderr,none": 0.05960158184136445,
"logprob,none": -2.1330907275499036,
"logprob_stderr,none": 0.04131266841025459,
"choice_logprob,none": -2.1295597913030426,
"choice_logprob_stderr,none": 0.041315162119509,
"choice_prob_norm,none": 0.207259443443934,
"choice_prob_norm_stderr,none": 0.0054614654643658665,
"choice_logprob_norm,none": -2.518117658410582,
"choice_logprob_norm_stderr,none": 0.060043035622258066
},
"csqa_sl_verb_5shot": {
"acc,none": 0.2194922194922195,
"acc_stderr,none": 0.011849997754533981,
"acc_norm,none": 0.22522522522522523,
"acc_norm_stderr,none": 0.01195959122428624,
"bpb,none": 0.278426458562545,
"bpb_stderr,none": 0.00561864110394927,
"logprob,none": -2.2275865034281868,
"logprob_stderr,none": 0.03969882911879949,
"choice_logprob,none": -2.111006653128628,
"choice_logprob_stderr,none": 0.0393815589292252,
"choice_prob_norm,none": 0.20387185409223316,
"choice_prob_norm_stderr,none": 0.0009731985933530395,
"choice_logprob_norm,none": -1.6065435407863369,
"choice_logprob_norm_stderr,none": 0.005409053841909447
},
"hellaswag_0shot": {
"acc,none": 0.3752240589523999,
"acc_stderr,none": 0.004831911860478678,
"acc_norm,none": 0.4751045608444533,
"acc_norm_stderr,none": 0.004983592410934169,
"bpb,none": 0.855193833332515,
"bpb_stderr,none": 0.002054792160447089,
"logprob,none": -78.06590635993523,
"logprob_stderr,none": 0.38839573001040845,
"choice_logprob,none": -17.538550240006114,
"choice_logprob_stderr,none": 0.23479525747783578,
"choice_prob_norm,none": 0.27543585186615815,
"choice_prob_norm_stderr,none": 0.00042570129200358506,
"choice_logprob_norm,none": -1.301569766118517,
"choice_logprob_norm_stderr,none": 0.001571768699400334
},
"hellaswag_5shot": {
"acc,none": 0.37273451503684524,
"acc_stderr,none": 0.004825441080261189,
"acc_norm,none": 0.47161919936267677,
"acc_norm_stderr,none": 0.004981736689518747,
"bpb,none": 0.8430849958045082,
"bpb_stderr,none": 0.0020405100556970075,
"logprob,none": -77.16160491272122,
"logprob_stderr,none": 0.38786765623999814,
"choice_logprob,none": -17.961448764992756,
"choice_logprob_stderr,none": 0.23845571414769917,
"choice_prob_norm,none": 0.275012911847633,
"choice_prob_norm_stderr,none": 0.00042651187996399265,
"choice_logprob_norm,none": -1.3031529086866211,
"choice_logprob_norm_stderr,none": 0.0015738790612415988
},
"openbookqa_0shot": {
"acc,none": 0.234,
"acc_stderr,none": 0.01895274156489368,
"acc_norm,none": 0.316,
"acc_norm_stderr,none": 0.02081235951585586,
"bpb,none": 1.8224381060467416,
"bpb_stderr,none": 0.047799940063301856,
"logprob,none": -18.878421821594237,
"logprob_stderr,none": 0.5024028120213828,
"choice_logprob,none": -5.799801592879904,
"choice_logprob_stderr,none": 0.2982324566604542,
"choice_prob_norm,none": 0.2798389158047688,
"choice_prob_norm_stderr,none": 0.005934231013412345,
"choice_logprob_norm,none": -1.4071986250743183,
"choice_logprob_norm_stderr,none": 0.027636792490084664
},
"piqa_5shot": {
"acc,none": 0.7100108813928183,
"acc_stderr,none": 0.010586899128169326,
"acc_norm,none": 0.7121871599564744,
"acc_norm_stderr,none": 0.01056325038305919,
"bpb,none": 1.0632483954888707,
"bpb_stderr,none": 0.011094055239906829,
"logprob,none": -62.06737054937942,
"logprob_stderr,none": 1.2907558118508673,
"choice_logprob,none": -2.6415280056968267,
"choice_logprob_stderr,none": 0.19373254353608907,
"choice_prob_norm,none": 0.5172964759363821,
"choice_prob_norm_stderr,none": 0.0014099241438236664,
"choice_logprob_norm,none": -0.6666790420392044,
"choice_logprob_norm_stderr,none": 0.002980988229909506
},
"winogrande_5shot": {
"acc,none": 0.5224940805051302,
"acc_stderr,none": 0.014038257824059885,
"bpb,none": 0.43958825417755576,
"bpb_stderr,none": 0.023636284026221113,
"logprob,none": -19.90092382303071,
"logprob_stderr,none": 0.2852125839905403,
"choice_logprob,none": -0.7962947750599071,
"choice_logprob_stderr,none": 0.01823518540912786,
"choice_prob_norm,none": 0.5011304078017509,
"choice_prob_norm_stderr,none": 0.0007359960478038117,
"choice_logprob_norm,none": -0.6932266355873123,
"choice_logprob_norm_stderr,none": 0.002570114917974759
},
"wsc273_0shot": {
"acc,none": 0.5641025641025641,
"acc_stderr,none": 0.030066767691175837,
"bpb,none": 0.612532433855645,
"bpb_stderr,none": 0.021271710060884862,
"logprob,none": -25.572265995291126,
"logprob_stderr,none": 0.5036890338925973,
"choice_logprob,none": -0.7985367154694579,
"choice_logprob_stderr,none": 0.053060844835365384,
"choice_prob_norm,none": 0.5027888183948848,
"choice_prob_norm_stderr,none": 0.001374804067229151,
"choice_logprob_norm,none": -0.6885960337330129,
"choice_logprob_norm_stderr,none": 0.0027288547464764477
},
"medmcqa_5shot": {
"acc,none": 0.284962945254602,
"acc_stderr,none": 0.006980177321467588,
"acc_norm,none": 0.284962945254602,
"acc_norm_stderr,none": 0.006980177321467588,
"logprob,none": -1.4640987695560141,
"logprob_stderr,none": 0.007654463975283136
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.2713363614630648,
"acc_stderr,none": 0.006875836376429595,
"acc_norm,none": 0.28544107100167343,
"acc_norm_stderr,none": 0.006983694646127615,
"bpb,none": 0.1420662059671355,
"bpb_stderr,none": 0.0016415787355263745,
"logprob,none": -1.723408536928965,
"logprob_stderr,none": 0.012804282714664252,
"choice_logprob,none": -1.515039402430691,
"choice_logprob_stderr,none": 0.011579147478589224,
"choice_prob_norm,none": 0.25206391786261856,
"choice_prob_norm_stderr,none": 0.00026129809605062555,
"choice_logprob_norm,none": -1.3804023670330987,
"choice_logprob_norm_stderr,none": 0.0010727030954833818
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.2350061199510404,
"acc_stderr,none": 0.014843061507731608,
"logprob,none": -18.407885265612983,
"logprob_stderr,none": 0.41773859135898134
},
"logprob_gsm8k_5shot": {
"nll,none": 124.84876262066128,
"nll_stderr,none": 1.7222330277692024,
"bpb,none": 0.6359938881622156,
"bpb_stderr,none": 0.00580424435057278
},
"logprob_humaneval_10shot": {
"nll,none": 42.3871434447242,
"nll_stderr,none": 2.496412503533855,
"bpb,none": 0.39398483856328514,
"bpb_stderr,none": 0.01722858528466455
}
}
},
{
"mix": "v4",
"hidden_dim": 1280,
"budget": 2.83e+19,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc,none": 0.27581541091012673,
"acc_stderr,none": 0.0037655102250954736,
"acc_norm,none": 0.28578550064093433,
"acc_norm_stderr,none": 0.003801300172960957,
"bpb,none": 0.3761284356347318,
"bpb_stderr,none": 0.0020014477738698813,
"logprob,none": -5.662279944632107,
"logprob_stderr,none": 0.016687573808107634,
"choice_logprob,none": -1.673632175862143,
"choice_logprob_stderr,none": 0.009820667712085052,
"choice_prob_norm,none": 0.25268171755591723,
"choice_prob_norm_stderr,none": 0.00023939088147203121,
"choice_logprob_norm,none": -1.3833894893827103,
"choice_logprob_norm_stderr,none": 0.0010189355236283042,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.27887765275601767,
"acc_stderr,none": 0.0037784107520474164,
"acc_norm,none": 0.290414470873095,
"acc_norm_stderr,none": 0.0038235243498804905,
"bpb,none": 0.20180181204116973,
"bpb_stderr,none": 0.0014229922195093773,
"logprob,none": -3.1823982094482552,
"logprob_stderr,none": 0.01701034616913897,
"choice_logprob,none": -1.6487133829622105,
"choice_logprob_stderr,none": 0.009845379108928897,
"choice_prob_norm,none": 0.25144322116629586,
"choice_prob_norm_stderr,none": 0.0001526810883746589,
"choice_logprob_norm,none": -1.383706412618714,
"choice_logprob_norm_stderr,none": 0.0006480126080205595,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"arc_challenge_5shot": {
"acc,none": 0.2721843003412969,
"acc_stderr,none": 0.0130066004064237,
"acc_norm,none": 0.30802047781569963,
"acc_norm_stderr,none": 0.01349142951729204,
"bpb,none": 1.004044559728517,
"bpb_stderr,none": 0.02048653979427857,
"logprob,none": -17.376599727281125,
"logprob_stderr,none": 0.3198441947941251,
"choice_logprob,none": -4.402544551034342,
"choice_logprob_stderr,none": 0.14604037734305347,
"choice_prob_norm,none": 0.26219375712543186,
"choice_prob_norm_stderr,none": 0.002171320809011532,
"choice_logprob_norm,none": -1.3848545061941255,
"choice_logprob_norm_stderr,none": 0.00985341268112613
},
"arc_easy_5shot": {
"acc,none": 0.6123737373737373,
"acc_stderr,none": 0.009997307914447612,
"acc_norm,none": 0.6039562289562289,
"acc_norm_stderr,none": 0.010035580962097949,
"bpb,none": 0.8010086370262028,
"bpb_stderr,none": 0.010519090088396273,
"logprob,none": -11.337275293263723,
"logprob_stderr,none": 0.18745096671919237,
"choice_logprob,none": -1.7834794454579106,
"choice_logprob_stderr,none": 0.06781658640079403,
"choice_prob_norm,none": 0.3207981167012948,
"choice_prob_norm_stderr,none": 0.0019661397554196205,
"choice_logprob_norm,none": -1.180810501076774,
"choice_logprob_norm_stderr,none": 0.006252532908298729
},
"boolq_10shot": {
"acc,none": 0.5828746177370031,
"acc_stderr,none": 0.008624092785001305,
"bpb,none": 0.424077273007807,
"bpb_stderr,none": 0.004402659753550807,
"logprob,none": -0.7095859006092818,
"logprob_stderr,none": 0.0056372946348402595,
"choice_logprob,none": -0.6812509971851064,
"choice_logprob_stderr,none": 0.005612186414219027,
"choice_prob_norm,none": 0.5297755603930668,
"choice_prob_norm_stderr,none": 0.002130059118157897,
"choice_logprob_norm,none": -0.6649111950908801,
"choice_logprob_norm_stderr,none": 0.004407980311407431
},
"boolq_sl_verb_10shot": {
"acc,none": 0.618960244648318,
"acc_stderr,none": 0.00849393752443933,
"acc_norm,none": 0.6214067278287462,
"acc_norm_stderr,none": 0.008483341718024479,
"bpb,none": 0.6787550750990903,
"bpb_stderr,none": 0.013136290003665905,
"logprob,none": -1.0040385069650248,
"logprob_stderr,none": 0.01751576350015502,
"choice_logprob,none": -0.8903443609749093,
"choice_logprob_stderr,none": 0.017546302326646203,
"choice_prob_norm,none": 0.5675065356066568,
"choice_prob_norm_stderr,none": 0.005100287590387704,
"choice_logprob_norm,none": -0.7794454336526722,
"choice_logprob_norm_stderr,none": 0.01299160917527106
},
"copa_0shot": {
"acc,none": 0.71,
"acc_stderr,none": 0.045604802157206824,
"bpb,none": 1.544881163587024,
"bpb_stderr,none": 0.04224672310779111,
"logprob,none": -29.091637153625488,
"logprob_stderr,none": 0.5256394129910011,
"choice_logprob,none": -1.0821095002832515,
"choice_logprob_stderr,none": 0.19938560807250813,
"choice_prob_norm,none": 0.5166427832287669,
"choice_prob_norm_stderr,none": 0.007955090348616845,
"choice_logprob_norm,none": -0.6735219157148393,
"choice_logprob_norm_stderr,none": 0.01681896816881189
},
"csqa_5shot": {
"acc,none": 0.2031122031122031,
"acc_stderr,none": 0.0115182547936341,
"bpb,none": 3.4662226834005994,
"bpb_stderr,none": 0.07760683991934131,
"logprob,none": -2.402602480192052,
"logprob_stderr,none": 0.053792962282258425,
"choice_logprob,none": -2.399173130363177,
"choice_logprob_stderr,none": 0.05379191867983419,
"choice_prob_norm,none": 0.20228046783106549,
"choice_prob_norm_stderr,none": 0.005927960143637226,
"choice_logprob_norm,none": -2.945002693886426,
"choice_logprob_norm_stderr,none": 0.07836870414941635
},
"csqa_sl_verb_5shot": {
"acc,none": 0.21457821457821458,
"acc_stderr,none": 0.01175342309421685,
"acc_norm,none": 0.21785421785421785,
"acc_norm_stderr,none": 0.011818079981132528,
"bpb,none": 0.2971528895582945,
"bpb_stderr,none": 0.00657122035739363,
"logprob,none": -2.36769612986567,
"logprob_stderr,none": 0.046303055084943257,
"choice_logprob,none": -2.222552686903668,
"choice_logprob_stderr,none": 0.04618156469383482,
"choice_prob_norm,none": 0.2033634953423176,
"choice_prob_norm_stderr,none": 0.0010928332934984917,
"choice_logprob_norm,none": -1.614389554809131,
"choice_logprob_norm_stderr,none": 0.00632740787966796
},
"hellaswag_0shot": {
"acc,none": 0.383788090021908,
"acc_stderr,none": 0.0048531342715477715,
"acc_norm,none": 0.4758016331408086,
"acc_norm_stderr,none": 0.004983934343250453,
"bpb,none": 0.8481104875212312,
"bpb_stderr,none": 0.002012050778454794,
"logprob,none": -77.5695962406824,
"logprob_stderr,none": 0.3867355755639212,
"choice_logprob,none": -17.13094384233126,
"choice_logprob_stderr,none": 0.2322080139429845,
"choice_prob_norm,none": 0.2762306795727128,
"choice_prob_norm_stderr,none": 0.0004226701999579038,
"choice_logprob_norm,none": -1.2984380262627977,
"choice_logprob_norm_stderr,none": 0.0015549430400205245
},
"hellaswag_5shot": {
"acc,none": 0.38109938259310894,
"acc_stderr,none": 0.004846643735666544,
"acc_norm,none": 0.47191794463254333,
"acc_norm_stderr,none": 0.0049819052938781415,
"bpb,none": 0.8341103490332094,
"bpb_stderr,none": 0.0019988853798998825,
"logprob,none": -76.39748609609704,
"logprob_stderr,none": 0.3843855373550899,
"choice_logprob,none": -17.535902908856652,
"choice_logprob_stderr,none": 0.23551377069804363,
"choice_prob_norm,none": 0.2757732593282485,
"choice_prob_norm_stderr,none": 0.0004213091077415382,
"choice_logprob_norm,none": -1.3000195912908532,
"choice_logprob_norm_stderr,none": 0.0015490835224453204
},
"openbookqa_0shot": {
"acc,none": 0.236,
"acc_stderr,none": 0.019008699622084718,
"acc_norm,none": 0.33,
"acc_norm_stderr,none": 0.021049612166134813,
"bpb,none": 1.8203886772407774,
"bpb_stderr,none": 0.0495479665668239,
"logprob,none": -18.808675647974013,
"logprob_stderr,none": 0.5015900523926243,
"choice_logprob,none": -5.87718670291878,
"choice_logprob_stderr,none": 0.29807818376863565,
"choice_prob_norm,none": 0.27503256875877746,
"choice_prob_norm_stderr,none": 0.005859820333892211,
"choice_logprob_norm,none": -1.4274577534386603,
"choice_logprob_norm_stderr,none": 0.029418619584831728
},
"piqa_5shot": {
"acc,none": 0.7170837867247007,
"acc_stderr,none": 0.010508949177489678,
"acc_norm,none": 0.7181719260065288,
"acc_norm_stderr,none": 0.010496675231258154,
"bpb,none": 1.0360682750860744,
"bpb_stderr,none": 0.010399419087489509,
"logprob,none": -60.83130082316186,
"logprob_stderr,none": 1.2666396586204536,
"choice_logprob,none": -2.597817156961528,
"choice_logprob_stderr,none": 0.19001766378429708,
"choice_prob_norm,none": 0.5182779144202206,
"choice_prob_norm_stderr,none": 0.0013533389349256308,
"choice_logprob_norm,none": -0.6638477081057357,
"choice_logprob_norm_stderr,none": 0.0027423091484904973
},
"winogrande_5shot": {
"acc,none": 0.5351223362273086,
"acc_stderr,none": 0.014017773120881578,
"bpb,none": 0.4334391964017789,
"bpb_stderr,none": 0.022729344808219854,
"logprob,none": -19.64402215275693,
"logprob_stderr,none": 0.2811034515456502,
"choice_logprob,none": -0.7569728289871221,
"choice_logprob_stderr,none": 0.016668264538358627,
"choice_prob_norm,none": 0.5016246375491271,
"choice_prob_norm_stderr,none": 0.0008329726427253178,
"choice_logprob_norm,none": -0.6921774194909759,
"choice_logprob_norm_stderr,none": 0.0023311184855997463
},
"wsc273_0shot": {
"acc,none": 0.63003663003663,
"acc_stderr,none": 0.02927371304052677,
"bpb,none": 0.6220051946131483,
"bpb_stderr,none": 0.020972923053141378,
"logprob,none": -25.870619232401307,
"logprob_stderr,none": 0.4859263527795115,
"choice_logprob,none": -0.7395315600151394,
"choice_logprob_stderr,none": 0.04398772506101539,
"choice_prob_norm,none": 0.5030254920705404,
"choice_prob_norm_stderr,none": 0.001242651266911796,
"choice_logprob_norm,none": -0.6879269969814308,
"choice_logprob_norm_stderr,none": 0.0024365735253503887
},
"medmcqa_5shot": {
"acc,none": 0.30217547214917523,
"acc_stderr,none": 0.007100856001176909,
"acc_norm,none": 0.30217547214917523,
"acc_norm_stderr,none": 0.007100856001176909,
"logprob,none": -1.442878668113898,
"logprob_stderr,none": 0.008342765938311467
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.2971551518049247,
"acc_stderr,none": 0.0070669065873055804,
"acc_norm,none": 0.30958642122878316,
"acc_norm_stderr,none": 0.007149136674702513,
"bpb,none": 0.14020111563374607,
"bpb_stderr,none": 0.0019311804256365126,
"logprob,none": -1.6622335477869048,
"logprob_stderr,none": 0.012776276450538825,
"choice_logprob,none": -1.4643904381104342,
"choice_logprob_stderr,none": 0.010785426134688378,
"choice_prob_norm,none": 0.2525786770609617,
"choice_prob_norm_stderr,none": 0.0002552044637741952,
"choice_logprob_norm,none": -1.37853956157313,
"choice_logprob_norm_stderr,none": 0.0011673702634496894
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.23378212974296206,
"acc_stderr,none": 0.01481619599193159,
"logprob,none": -19.429164113146292,
"logprob_stderr,none": 0.41290169240196367
},
"logprob_gsm8k_5shot": {
"nll,none": 110.98014808406063,
"nll_stderr,none": 1.5568822800964812,
"bpb,none": 0.5618956999100354,
"bpb_stderr,none": 0.005125187842721609
},
"logprob_humaneval_10shot": {
"nll,none": 44.287433080556916,
"nll_stderr,none": 2.574493607908735,
"bpb,none": 0.4140016039194337,
"bpb_stderr,none": 0.017964659850082786
}
}
},
{
"mix": "v0",
"hidden_dim": 1536,
"budget": 9e+19,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc,none": 0.2503916820965674,
"acc_stderr,none": 0.0036534832564727587,
"acc_norm,none": 0.27916251246261214,
"acc_norm_stderr,none": 0.0037770013669132336,
"bpb,none": 0.35552157414979424,
"bpb_stderr,none": 0.0019131290063905577,
"logprob,none": -5.065637073363192,
"logprob_stderr,none": 0.012907325924577924,
"choice_logprob,none": -1.5819518990523416,
"choice_logprob_stderr,none": 0.007394711008538637,
"choice_prob_norm,none": 0.25167494076569574,
"choice_prob_norm_stderr,none": 0.0001931048603284136,
"choice_logprob_norm,none": -1.3847537145620747,
"choice_logprob_norm_stderr,none": 0.0007972310007693355,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.27125765560461473,
"acc_stderr,none": 0.003744348166570681,
"acc_norm,none": 0.28656886483406924,
"acc_norm_stderr,none": 0.003808597127162219,
"bpb,none": 0.18156040586581168,
"bpb_stderr,none": 0.0011994962628792578,
"logprob,none": -2.790621268407345,
"logprob_stderr,none": 0.01383950194817158,
"choice_logprob,none": -1.5617558721966827,
"choice_logprob_stderr,none": 0.008074175431290678,
"choice_prob_norm,none": 0.25143517100434937,
"choice_prob_norm_stderr,none": 0.00013388532833850876,
"choice_logprob_norm,none": -1.3829688743489568,
"choice_logprob_norm_stderr,none": 0.0005421892698406115,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"arc_challenge_5shot": {
"acc,none": 0.3302047781569966,
"acc_stderr,none": 0.013743085603760422,
"acc_norm,none": 0.35238907849829354,
"acc_norm_stderr,none": 0.013960142600598677,
"bpb,none": 0.9414160161263657,
"bpb_stderr,none": 0.019954836359592047,
"logprob,none": -16.183541593494677,
"logprob_stderr,none": 0.2986216045469544,
"choice_logprob,none": -3.8834576554259708,
"choice_logprob_stderr,none": 0.1379796322208355,
"choice_prob_norm,none": 0.26792039945837454,
"choice_prob_norm_stderr,none": 0.002158966256971253,
"choice_logprob_norm,none": -1.3601627412764932,
"choice_logprob_norm_stderr,none": 0.009449722136061042
},
"arc_easy_5shot": {
"acc,none": 0.6742424242424242,
"acc_stderr,none": 0.009616642976885968,
"acc_norm,none": 0.6662457912457912,
"acc_norm_stderr,none": 0.00967606568357548,
"bpb,none": 0.7220906914137978,
"bpb_stderr,none": 0.00973173910514286,
"logprob,none": -10.344662363857331,
"logprob_stderr,none": 0.17419791693308334,
"choice_logprob,none": -1.4053845306771011,
"choice_logprob_stderr,none": 0.059466594757234946,
"choice_prob_norm,none": 0.3358377762667521,
"choice_prob_norm_stderr,none": 0.0020885713089263333,
"choice_logprob_norm,none": -1.1341845532619204,
"choice_logprob_norm_stderr,none": 0.006103898794992183
},
"boolq_10shot": {
"acc,none": 0.5431192660550459,
"acc_stderr,none": 0.008712475433089477,
"bpb,none": 0.3995581140433765,
"bpb_stderr,none": 0.002460409164664351,
"logprob,none": -0.7095964209566787,
"logprob_stderr,none": 0.0041886957939902564,
"choice_logprob,none": -0.6905970206206402,
"choice_logprob_stderr,none": 0.004179601526381547,
"choice_prob_norm,none": 0.5199159676607907,
"choice_prob_norm_stderr,none": 0.001298365236934181,
"choice_logprob_norm,none": -0.6644838231351827,
"choice_logprob_norm_stderr,none": 0.0025417972585884737
},
"boolq_sl_verb_10shot": {
"acc,none": 0.6168195718654435,
"acc_stderr,none": 0.008503021391450791,
"acc_norm,none": 0.6211009174311927,
"acc_norm_stderr,none": 0.00848467871856502,
"bpb,none": 0.4644675507724053,
"bpb_stderr,none": 0.0064341497329966505,
"logprob,none": -0.732731931431031,
"logprob_stderr,none": 0.007998332680439648,
"choice_logprob,none": -0.692841816111664,
"choice_logprob_stderr,none": 0.00798951759349642,
"choice_prob_norm,none": 0.5410706603221777,
"choice_prob_norm_stderr,none": 0.002985798104826507,
"choice_logprob_norm,none": -0.6744856753522923,
"choice_logprob_norm_stderr,none": 0.0063912819169031155
},
"copa_0shot": {
"acc,none": 0.69,
"acc_stderr,none": 0.04648231987117316,
"bpb,none": 1.473402516662691,
"bpb_stderr,none": 0.04351580290541288,
"logprob,none": -27.692846031188964,
"logprob_stderr,none": 0.5557166606714958,
"choice_logprob,none": -0.8469954823714905,
"choice_logprob_stderr,none": 0.1681309923120363,
"choice_prob_norm,none": 0.5292895416366712,
"choice_prob_norm_stderr,none": 0.008465964698636304,
"choice_logprob_norm,none": -0.6502013843114527,
"choice_logprob_norm_stderr,none": 0.017311023118854105
},
"csqa_5shot": {
"acc,none": 0.22194922194922195,
"acc_stderr,none": 0.011897367280936749,
"bpb,none": 2.6573150036844377,
"bpb_stderr,none": 0.039550358678890216,
"logprob,none": -1.8419104026635098,
"logprob_stderr,none": 0.027414219608407304,
"choice_logprob,none": -1.8370197668091335,
"choice_logprob_stderr,none": 0.027418207937237038,
"choice_prob_norm,none": 0.21507221565660545,
"choice_prob_norm_stderr,none": 0.004564710560332225,
"choice_logprob_norm,none": -2.0376804575438836,
"choice_logprob_norm_stderr,none": 0.03981184025115802
},
"csqa_sl_verb_5shot": {
"acc,none": 0.23177723177723178,
"acc_stderr,none": 0.01208089355230227,
"acc_norm,none": 0.23914823914823916,
"acc_norm_stderr,none": 0.012212475442884533,
"bpb,none": 0.25099320755867205,
"bpb_stderr,none": 0.004031633326397867,
"logprob,none": -1.9946075831886982,
"logprob_stderr,none": 0.02428004908649879,
"choice_logprob,none": -1.779406383085779,
"choice_logprob_stderr,none": 0.023417537336332426,
"choice_prob_norm,none": 0.20371179160009323,
"choice_prob_norm_stderr,none": 0.000687244002046894,
"choice_logprob_norm,none": -1.598885612265858,
"choice_logprob_norm_stderr,none": 0.0037213188706360627
},
"hellaswag_0shot": {
"acc,none": 0.4358693487353117,
"acc_stderr,none": 0.004948567856373863,
"acc_norm,none": 0.5707030472017527,
"acc_norm_stderr,none": 0.004939642460172563,
"bpb,none": 0.8055160072851943,
"bpb_stderr,none": 0.001959624425503467,
"logprob,none": -73.47371736168267,
"logprob_stderr,none": 0.36482269582399635,
"choice_logprob,none": -14.373321577720866,
"choice_logprob_stderr,none": 0.20916515197729738,
"choice_prob_norm,none": 0.28455403877118096,
"choice_prob_norm_stderr,none": 0.00042521342658046455,
"choice_logprob_norm,none": -1.2682300246515799,
"choice_logprob_norm_stderr,none": 0.0015210307259313257
},
"hellaswag_5shot": {
"acc,none": 0.4340768771161123,
"acc_stderr,none": 0.0049462215121452826,
"acc_norm,none": 0.5705038836885082,
"acc_norm_stderr,none": 0.004939925958728895,
"bpb,none": 0.7875317382572161,
"bpb_stderr,none": 0.0019305361634130123,
"logprob,none": -72.04622087006643,
"logprob_stderr,none": 0.36205872501721265,
"choice_logprob,none": -14.635099365145285,
"choice_logprob_stderr,none": 0.21157611102802906,
"choice_prob_norm,none": 0.28439536617380506,
"choice_prob_norm_stderr,none": 0.0004251818877565747,
"choice_logprob_norm,none": -1.2687362959471866,
"choice_logprob_norm_stderr,none": 0.001515667428361757
},
"openbookqa_0shot": {
"acc,none": 0.256,
"acc_stderr,none": 0.01953692357474761,
"acc_norm,none": 0.356,
"acc_norm_stderr,none": 0.02143471235607264,
"bpb,none": 1.7514395936329286,
"bpb_stderr,none": 0.04590346199306301,
"logprob,none": -18.116089826107025,
"logprob_stderr,none": 0.47628978811352274,
"choice_logprob,none": -5.50242292258655,
"choice_logprob_stderr,none": 0.2787961676611458,
"choice_prob_norm,none": 0.28378143761842295,
"choice_prob_norm_stderr,none": 0.0059663707202530095,
"choice_logprob_norm,none": -1.390278992117216,
"choice_logprob_norm_stderr,none": 0.027074530398652805
},
"piqa_5shot": {
"acc,none": 0.7442872687704026,
"acc_stderr,none": 0.010178690109459862,
"acc_norm,none": 0.7377584330794341,
"acc_norm_stderr,none": 0.010262502565172449,
"bpb,none": 0.9986341152067002,
"bpb_stderr,none": 0.010487867574866843,
"logprob,none": -58.39074820636795,
"logprob_stderr,none": 1.2146705689000088,
"choice_logprob,none": -2.3070429811822026,
"choice_logprob_stderr,none": 0.17286932173721653,
"choice_prob_norm,none": 0.5212220951674228,
"choice_prob_norm_stderr,none": 0.0013421756048841715,
"choice_logprob_norm,none": -0.6580206669507951,
"choice_logprob_norm_stderr,none": 0.0027121864908891693
},
"winogrande_5shot": {
"acc,none": 0.5603788476716653,
"acc_stderr,none": 0.013949649776015698,
"bpb,none": 0.4188933766106223,
"bpb_stderr,none": 0.022580345252037705,
"logprob,none": -18.928303497648653,
"logprob_stderr,none": 0.2737846820388816,
"choice_logprob,none": -0.7431147138587829,
"choice_logprob_stderr,none": 0.016910068400034776,
"choice_prob_norm,none": 0.5020179593806919,
"choice_prob_norm_stderr,none": 0.0007811650353038019,
"choice_logprob_norm,none": -0.6910696833248146,
"choice_logprob_norm_stderr,none": 0.0021031923529397506
},
"wsc273_0shot": {
"acc,none": 0.663003663003663,
"acc_stderr,none": 0.028660654384243294,
"bpb,none": 0.6238243103766034,
"bpb_stderr,none": 0.02104329844697861,
"logprob,none": -26.042019389924548,
"logprob_stderr,none": 0.47439874711014063,
"choice_logprob,none": -0.6398779205929981,
"choice_logprob_stderr,none": 0.043379094756256516,
"choice_prob_norm,none": 0.5049551743412123,
"choice_prob_norm_stderr,none": 0.0013298180281791753,
"choice_logprob_norm,none": -0.6842169909217483,
"choice_logprob_norm_stderr,none": 0.00261393859833492
},
"medmcqa_5shot": {
"acc,none": 0.29141764284006694,
"acc_stderr,none": 0.007026856322397198,
"acc_norm,none": 0.29141764284006694,
"acc_norm_stderr,none": 0.007026856322397198,
"logprob,none": -1.3837027403118791,
"logprob_stderr,none": 0.004640465793133948
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.2739660530719579,
"acc_stderr,none": 0.006896596635623945,
"acc_norm,none": 0.30408797513746116,
"acc_norm_stderr,none": 0.007113523647158317,
"bpb,none": 0.13553781024009642,
"bpb_stderr,none": 0.0015734770390694308,
"logprob,none": -1.6200002332623427,
"logprob_stderr,none": 0.010590969800974608,
"choice_logprob,none": -1.4465072580196185,
"choice_logprob_stderr,none": 0.009186848812944771,
"choice_prob_norm,none": 0.25210393918690377,
"choice_prob_norm_stderr,none": 0.00022420517419145707,
"choice_logprob_norm,none": -1.3796419726384546,
"choice_logprob_norm_stderr,none": 0.0009219691173436295
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.2178702570379437,
"acc_stderr,none": 0.014450846714123895,
"logprob,none": -16.946733659600685,
"logprob_stderr,none": 0.37880723885054923
},
"logprob_gsm8k_5shot": {
"nll,none": 106.78752208822509,
"nll_stderr,none": 1.468171699400727,
"bpb,none": 0.5419649635618328,
"bpb_stderr,none": 0.004803235526642886
},
"logprob_humaneval_10shot": {
"nll,none": 43.058409789713416,
"nll_stderr,none": 2.5276094541783687,
"bpb,none": 0.4099039537826882,
"bpb_stderr,none": 0.01781821917069972
}
}
},
{
"mix": "v2",
"hidden_dim": 1536,
"budget": 9e+19,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc,none": 0.2833641931348811,
"acc_stderr,none": 0.003793525260773458,
"acc_norm,none": 0.2995299814841191,
"acc_norm_stderr,none": 0.0038569415918105715,
"bpb,none": 0.3652159920429904,
"bpb_stderr,none": 0.0019549232625298827,
"logprob,none": -5.212395275117396,
"logprob_stderr,none": 0.014327558705991703,
"choice_logprob,none": -1.5584006500944099,
"choice_logprob_stderr,none": 0.007587638778556977,
"choice_prob_norm,none": 0.25309797338211804,
"choice_prob_norm_stderr,none": 0.00021247705862183423,
"choice_logprob_norm,none": -1.3801880227174377,
"choice_logprob_norm_stderr,none": 0.0008690833228587661,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.29539951573849876,
"acc_stderr,none": 0.003819873395705863,
"acc_norm,none": 0.3074348383421165,
"acc_norm_stderr,none": 0.003869313824989712,
"bpb,none": 0.17754738725422795,
"bpb_stderr,none": 0.001192601312340027,
"logprob,none": -2.6514670316189117,
"logprob_stderr,none": 0.0132553270705789,
"choice_logprob,none": -1.5186375646840715,
"choice_logprob_stderr,none": 0.007962096535581713,
"choice_prob_norm,none": 0.2529079101185746,
"choice_prob_norm_stderr,none": 0.00014230007662830306,
"choice_logprob_norm,none": -1.377431731399958,
"choice_logprob_norm_stderr,none": 0.0005748698312148346,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"arc_challenge_5shot": {
"acc,none": 0.37627986348122866,
"acc_stderr,none": 0.014157022555407172,
"acc_norm,none": 0.3984641638225256,
"acc_norm_stderr,none": 0.014306946052735565,
"bpb,none": 0.9117224950345897,
"bpb_stderr,none": 0.02058570982727825,
"logprob,none": -15.715859739548517,
"logprob_stderr,none": 0.30072817934094254,
"choice_logprob,none": -3.6533481887218437,
"choice_logprob_stderr,none": 0.13895640550282407,
"choice_prob_norm,none": 0.2779257414622447,
"choice_prob_norm_stderr,none": 0.0023598128882322067,
"choice_logprob_norm,none": -1.3303668950456593,
"choice_logprob_norm_stderr,none": 0.010589553731885004
},
"arc_easy_5shot": {
"acc,none": 0.6999158249158249,
"acc_stderr,none": 0.009404000558513351,
"acc_norm,none": 0.7007575757575758,
"acc_norm_stderr,none": 0.009396447162309822,
"bpb,none": 0.679267650346058,
"bpb_stderr,none": 0.010041405206793617,
"logprob,none": -9.946306985247919,
"logprob_stderr,none": 0.17607263616243687,
"choice_logprob,none": -1.3562313881029344,
"choice_logprob_stderr,none": 0.060583134820112,
"choice_prob_norm,none": 0.35567155622634117,
"choice_prob_norm_stderr,none": 0.002475510479674059,
"choice_logprob_norm,none": -1.0871060184096497,
"choice_logprob_norm_stderr,none": 0.006789025142729967
},
"boolq_10shot": {
"acc,none": 0.6198776758409786,
"acc_stderr,none": 0.008489990918989207,
"bpb,none": 0.42246278290564976,
"bpb_stderr,none": 0.005627377329609596,
"logprob,none": -0.6899870655223135,
"logprob_stderr,none": 0.007479563628999024,
"choice_logprob,none": -0.6578724003071241,
"choice_logprob_stderr,none": 0.007440020669474746,
"choice_prob_norm,none": 0.5500499661526049,
"choice_prob_norm_stderr,none": 0.0027414575071941782,
"choice_logprob_norm,none": -0.6454941559968439,
"choice_logprob_norm_stderr,none": 0.00569403632356082
},
"boolq_sl_verb_10shot": {
"acc,none": 0.6103975535168196,
"acc_stderr,none": 0.008529228894936292,
"acc_norm,none": 0.618348623853211,
"acc_norm_stderr,none": 0.00849655074117826,
"bpb,none": 0.5402393387293413,
"bpb_stderr,none": 0.00944492455451324,
"logprob,none": -0.8257174519953013,
"logprob_stderr,none": 0.01240018574869942,
"choice_logprob,none": -0.7641612885400936,
"choice_logprob_stderr,none": 0.012424413318765988,
"choice_prob_norm,none": 0.5546675618062102,
"choice_prob_norm_stderr,none": 0.004018200206431961,
"choice_logprob_norm,none": -0.7081703345145451,
"choice_logprob_norm_stderr,none": 0.009347391902965673
},
"copa_0shot": {
"acc,none": 0.76,
"acc_stderr,none": 0.04292346959909283,
"bpb,none": 1.5886466466443578,
"bpb_stderr,none": 0.04513550330410471,
"logprob,none": -29.909457206726074,
"logprob_stderr,none": 0.562216134659488,
"choice_logprob,none": -0.7410448574769116,
"choice_logprob_stderr,none": 0.1652254579682627,
"choice_prob_norm,none": 0.5313054727825939,
"choice_prob_norm_stderr,none": 0.008917301908166142,
"choice_logprob_norm,none": -0.6478710857505101,
"choice_logprob_norm_stderr,none": 0.01820457056298214
},
"csqa_5shot": {
"acc,none": 0.2571662571662572,
"acc_stderr,none": 0.012513329723602744,
"bpb,none": 3.61170673010068,
"bpb_stderr,none": 0.09115943514540976,
"logprob,none": -2.5034443369786654,
"logprob_stderr,none": 0.06318690545247797,
"choice_logprob,none": -2.5008061738874,
"choice_logprob_stderr,none": 0.06319150639160746,
"choice_prob_norm,none": 0.24951125498615806,
"choice_prob_norm_stderr,none": 0.008165344544357807,
"choice_logprob_norm,none": -3.1682373926593677,
"choice_logprob_norm_stderr,none": 0.09176244758442266
},
"csqa_sl_verb_5shot": {
"acc,none": 0.2809172809172809,
"acc_stderr,none": 0.012867635159174046,
"acc_norm,none": 0.2751842751842752,
"acc_norm_stderr,none": 0.012786323696375949,
"bpb,none": 0.3391542211540572,
"bpb_stderr,none": 0.008716444229196115,
"logprob,none": -2.7009197721756646,
"logprob_stderr,none": 0.06272524909701989,
"choice_logprob,none": -2.5017582374199936,
"choice_logprob_stderr,none": 0.06202889514708539,
"choice_prob_norm,none": 0.21156096565798563,
"choice_prob_norm_stderr,none": 0.0015500718173795656,
"choice_logprob_norm,none": -1.5925035827705205,
"choice_logprob_norm_stderr,none": 0.008536529612626369
},
"hellaswag_0shot": {
"acc,none": 0.47460665206134234,
"acc_stderr,none": 0.004983342213776263,
"acc_norm,none": 0.6235809599681338,
"acc_norm_stderr,none": 0.004834969412883654,
"bpb,none": 0.8082126244191451,
"bpb_stderr,none": 0.002052694634543442,
"logprob,none": -73.64892684143156,
"logprob_stderr,none": 0.36813965885884153,
"choice_logprob,none": -13.202259825805944,
"choice_logprob_stderr,none": 0.2037577386294536,
"choice_prob_norm,none": 0.2928902213610382,
"choice_prob_norm_stderr,none": 0.00045372247972980584,
"choice_logprob_norm,none": -1.2402134001613863,
"choice_logprob_norm_stderr,none": 0.001578571806293506
},
"hellaswag_5shot": {
"acc,none": 0.45289782911770565,
"acc_stderr,none": 0.004967591267557401,
"acc_norm,none": 0.6041625174268074,
"acc_norm_stderr,none": 0.004880303863138483,
"bpb,none": 0.7808793083265002,
"bpb_stderr,none": 0.001959424563973173,
"logprob,none": -71.30152131100381,
"logprob_stderr,none": 0.35882751446169764,
"choice_logprob,none": -13.817169149802801,
"choice_logprob_stderr,none": 0.20639744750506261,
"choice_prob_norm,none": 0.28819234125658705,
"choice_prob_norm_stderr,none": 0.0004274974259184617,
"choice_logprob_norm,none": -1.2553293865354775,
"choice_logprob_norm_stderr,none": 0.0015072929470410464
},
"openbookqa_0shot": {
"acc,none": 0.28,
"acc_stderr,none": 0.020099950647503233,
"acc_norm,none": 0.376,
"acc_norm_stderr,none": 0.021683827539286115,
"bpb,none": 1.7713766886331395,
"bpb_stderr,none": 0.04749189587622596,
"logprob,none": -18.332382786035538,
"logprob_stderr,none": 0.4923023644170012,
"choice_logprob,none": -5.514236012659305,
"choice_logprob_stderr,none": 0.2901789635151622,
"choice_prob_norm,none": 0.29362202446085794,
"choice_prob_norm_stderr,none": 0.006406220502987297,
"choice_logprob_norm,none": -1.36896579388406,
"choice_logprob_norm_stderr,none": 0.028663472646257432
},
"piqa_5shot": {
"acc,none": 0.750272034820457,
"acc_stderr,none": 0.010099232969867497,
"acc_norm,none": 0.7584330794341676,
"acc_norm_stderr,none": 0.009986718001804477,
"bpb,none": 0.9947934118053557,
"bpb_stderr,none": 0.010950628645614828,
"logprob,none": -58.05032417946723,
"logprob_stderr,none": 1.2177224619074434,
"choice_logprob,none": -2.2846200577070364,
"choice_logprob_stderr,none": 0.17254081279876907,
"choice_prob_norm,none": 0.5236971605836466,
"choice_prob_norm_stderr,none": 0.001409745284657901,
"choice_logprob_norm,none": -0.653834274516731,
"choice_logprob_norm_stderr,none": 0.002818757424899055
},
"winogrande_5shot": {
"acc,none": 0.5603788476716653,
"acc_stderr,none": 0.013949649776015701,
"bpb,none": 0.42337717662405155,
"bpb_stderr,none": 0.023282161540202174,
"logprob,none": -19.086542711732136,
"logprob_stderr,none": 0.27651431898622375,
"choice_logprob,none": -0.7498901000889382,
"choice_logprob_stderr,none": 0.018670393953621845,
"choice_prob_norm,none": 0.5021590887201153,
"choice_prob_norm_stderr,none": 0.0007537685697302094,
"choice_logprob_norm,none": -0.6908339471445006,
"choice_logprob_norm_stderr,none": 0.0021854993231866676
},
"wsc273_0shot": {
"acc,none": 0.63003663003663,
"acc_stderr,none": 0.029273713040526766,
"bpb,none": 0.6458781534417585,
"bpb_stderr,none": 0.02115840809808031,
"logprob,none": -26.894661665836097,
"logprob_stderr,none": 0.47443732608851724,
"choice_logprob,none": -0.711798298682546,
"choice_logprob_stderr,none": 0.05622774135992073,
"choice_prob_norm,none": 0.5070496306067538,
"choice_prob_norm_stderr,none": 0.0015178808977874385,
"choice_logprob_norm,none": -0.6803297384186003,
"choice_logprob_norm_stderr,none": 0.002934668813995698
},
"medmcqa_5shot": {
"acc,none": 0.3000239062873536,
"acc_stderr,none": 0.007086430306346813,
"acc_norm,none": 0.3000239062873536,
"acc_norm_stderr,none": 0.007086430306346813,
"logprob,none": -1.384638548894787,
"logprob_stderr,none": 0.006011263189773365
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.29213483146067415,
"acc_stderr,none": 0.007031936297958185,
"acc_norm,none": 0.31651924456131963,
"acc_norm_stderr,none": 0.007192356263004332,
"bpb,none": 0.1348471597122054,
"bpb_stderr,none": 0.001871467610285096,
"logprob,none": -1.6010850405784962,
"logprob_stderr,none": 0.01228112024153422,
"choice_logprob,none": -1.4481968658234194,
"choice_logprob_stderr,none": 0.010446909746323068,
"choice_prob_norm,none": 0.25284289852310765,
"choice_prob_norm_stderr,none": 0.0002564181546171402,
"choice_logprob_norm,none": -1.3775063314377334,
"choice_logprob_norm_stderr,none": 0.0012172335913715972
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.23255813953488372,
"acc_stderr,none": 0.014789157531080508,
"logprob,none": -16.15946305921641,
"logprob_stderr,none": 0.38174767602465554
},
"logprob_gsm8k_5shot": {
"nll,none": 106.9984015117006,
"nll_stderr,none": 1.5243443231933331,
"bpb,none": 0.544555718461919,
"bpb_stderr,none": 0.005408454171740506
},
"logprob_humaneval_10shot": {
"nll,none": 48.29808946062879,
"nll_stderr,none": 2.5801950339103974,
"bpb,none": 0.45749667858902093,
"bpb_stderr,none": 0.018462211336157767
}
}
},
{
"mix": "v3",
"hidden_dim": 1536,
"budget": 9e+19,
"tasks": {
"mmlu_sl_verb_0shot": {
"acc,none": 0.2993875516308218,
"acc_stderr,none": 0.003845267502892936,
"acc_norm,none": 0.29867540236433554,
"acc_norm_stderr,none": 0.0038417770785854908,
"bpb,none": 0.3437568050019886,
"bpb_stderr,none": 0.0019239556694492393,
"logprob,none": -5.079754080753175,
"logprob_stderr,none": 0.015738878001771744,
"choice_logprob,none": -1.5769890073434985,
"choice_logprob_stderr,none": 0.00820013727407004,
"choice_prob_norm,none": 0.25343685280982375,
"choice_prob_norm_stderr,none": 0.00022739980059084734,
"choice_logprob_norm,none": -1.379710960697182,
"choice_logprob_norm_stderr,none": 0.0009369500180734398,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"mmlu_sl_verb_5shot": {
"acc,none": 0.3127047429141148,
"acc_stderr,none": 0.003884351503361081,
"acc_norm,none": 0.30793334282865686,
"acc_norm_stderr,none": 0.0038776332597401067,
"bpb,none": 0.19614308476728629,
"bpb_stderr,none": 0.0014122119380714784,
"logprob,none": -3.0694671304109518,
"logprob_stderr,none": 0.01727239105380594,
"choice_logprob,none": -1.5472708010614544,
"choice_logprob_stderr,none": 0.00913556432507866,
"choice_prob_norm,none": 0.25314930728707763,
"choice_prob_norm_stderr,none": 0.00016511282196633405,
"choice_logprob_norm,none": -1.3772906231407986,
"choice_logprob_norm_stderr,none": 0.0006642202918479287,
"sample_count": {
"acc,none": 14042,
"acc_norm,none": 14042,
"bpb,none": 14042,
"logprob,none": 14042,
"choice_logprob,none": 14042,
"choice_prob_norm,none": 14042,
"choice_logprob_norm,none": 14042
}
},
"arc_challenge_5shot": {
"acc,none": 0.30802047781569963,
"acc_stderr,none": 0.01349142951729204,
"acc_norm,none": 0.3626279863481229,
"acc_norm_stderr,none": 0.014049106564955012,
"bpb,none": 0.946247010821327,
"bpb_stderr,none": 0.01978146472612539,
"logprob,none": -16.43383033019284,
"logprob_stderr,none": 0.309529240670054,
"choice_logprob,none": -4.169585053458868,
"choice_logprob_stderr,none": 0.14671042509484286,
"choice_prob_norm,none": 0.27267684429575634,
"choice_prob_norm_stderr,none": 0.0025010185384983573,
"choice_logprob_norm,none": -1.3506892796004153,
"choice_logprob_norm_stderr,none": 0.010161115752093695
},
"arc_easy_5shot": {
"acc,none": 0.67003367003367,
"acc_stderr,none": 0.009648311574241036,
"acc_norm,none": 0.67003367003367,
"acc_norm_stderr,none": 0.009648311574241036,
"bpb,none": 0.717838677508279,
"bpb_stderr,none": 0.009992334745802786,
"logprob,none": -10.44071673156587,
"logprob_stderr,none": 0.18172674659824423,
"choice_logprob,none": -1.50517965371248,
"choice_logprob_stderr,none": 0.0640209698061502,
"choice_prob_norm,none": 0.34321682834671347,
"choice_prob_norm_stderr,none": 0.002286219045431404,
"choice_logprob_norm,none": -1.118483567434795,
"choice_logprob_norm_stderr,none": 0.006495514696262302
},
"boolq_10shot": {
"acc,none": 0.518348623853211,
"acc_stderr,none": 0.008739164562341823,
"bpb,none": 0.44150930609847827,
"bpb_stderr,none": 0.005224087739791807,
"logprob,none": -0.8430835138265875,
"logprob_stderr,none": 0.011048484094461665,
"choice_logprob,none": -0.8278325532809531,
"choice_logprob_stderr,none": 0.011052511470261588,
"choice_prob_norm,none": 0.5199383966989631,
"choice_prob_norm_stderr,none": 0.002497028130377343,
"choice_logprob_norm,none": -0.6961303581208853,
"choice_logprob_norm_stderr,none": 0.005262126102052664
},
"boolq_sl_verb_10shot": {
"acc,none": 0.4798165137614679,
"acc_stderr,none": 0.008737927070893478,
"acc_norm,none": 0.5100917431192661,
"acc_norm_stderr,none": 0.008743273544801916,
"bpb,none": 0.4764182924937028,
"bpb_stderr,none": 0.005724492191122488,
"logprob,none": -0.8689801140070326,
"logprob_stderr,none": 0.010654305907655457,
"choice_logprob,none": -0.852790323191026,
"choice_logprob_stderr,none": 0.010659904235460005,
"choice_prob_norm,none": 0.507302111679821,
"choice_prob_norm_stderr,none": 0.002567599725396225,
"choice_logprob_norm,none": -0.7262019204822747,
"choice_logprob_norm_stderr,none": 0.005665271723349297
},
"copa_0shot": {
"acc,none": 0.7,
"acc_stderr,none": 0.046056618647183814,
"bpb,none": 1.487274144839745,
"bpb_stderr,none": 0.042327999980145345,
"logprob,none": -27.925178775787355,
"logprob_stderr,none": 0.5065800122681858,
"choice_logprob,none": -0.8381555373550902,
"choice_logprob_stderr,none": 0.14467992619144224,
"choice_prob_norm,none": 0.5252378959948037,
"choice_prob_norm_stderr,none": 0.00835550268030079,
"choice_logprob_norm,none": -0.6581485206599267,
"choice_logprob_norm_stderr,none": 0.017611476146166566
},
"csqa_5shot": {
"acc,none": 0.2244062244062244,
"acc_stderr,none": 0.01194413467602355,
"bpb,none": 3.1894932947425274,
"bpb_stderr,none": 0.0635788560784972,
"logprob,none": -2.210788284665634,
"logprob_stderr,none": 0.04406950483403684,
"choice_logprob,none": -2.208177722265666,
"choice_logprob_stderr,none": 0.04407384337459683,
"choice_prob_norm,none": 0.2198874834708879,
"choice_prob_norm_stderr,none": 0.006782628095303889,
"choice_logprob_norm,none": -2.682700028774271,
"choice_logprob_norm_stderr,none": 0.0641729357233482
},
"csqa_sl_verb_5shot": {
"acc,none": 0.2334152334152334,
"acc_stderr,none": 0.012110575321206388,
"acc_norm,none": 0.2375102375102375,
"acc_norm_stderr,none": 0.012183673723473449,
"bpb,none": 0.29635859947390786,
"bpb_stderr,none": 0.005987390627844368,
"logprob,none": -2.3901897513358734,
"logprob_stderr,none": 0.04406606414920322,
"choice_logprob,none": -2.1752721980458336,
"choice_logprob_stderr,none": 0.043255338329785384,
"choice_prob_norm,none": 0.20679160700423543,
"choice_prob_norm_stderr,none": 0.0010893273430492957,
"choice_logprob_norm,none": -1.5948191889787853,
"choice_logprob_norm_stderr,none": 0.005737327652962774
},
"hellaswag_0shot": {
"acc,none": 0.42202748456482775,
"acc_stderr,none": 0.004928735103635842,
"acc_norm,none": 0.5519816769567815,
"acc_norm_stderr,none": 0.00496274242684986,
"bpb,none": 0.8146418259907425,
"bpb_stderr,none": 0.0019910515944172053,
"logprob,none": -74.23395537010015,
"logprob_stderr,none": 0.3688047710238557,
"choice_logprob,none": -14.920520618331876,
"choice_logprob_stderr,none": 0.21343610616943745,
"choice_prob_norm,none": 0.28304819548066895,
"choice_prob_norm_stderr,none": 0.0004249866493142731,
"choice_logprob_norm,none": -1.2735844619255967,
"choice_logprob_norm_stderr,none": 0.0015222545817839856
},
"hellaswag_5shot": {
"acc,none": 0.41943835889265085,
"acc_stderr,none": 0.004924586362301659,
"acc_norm,none": 0.5467038438558056,
"acc_norm_stderr,none": 0.004967965810200004,
"bpb,none": 0.7998635043931804,
"bpb_stderr,none": 0.001971432034337865,
"logprob,none": -73.13535002844431,
"logprob_stderr,none": 0.3676983080956583,
"choice_logprob,none": -15.315985043546844,
"choice_logprob_stderr,none": 0.21689330486310834,
"choice_prob_norm,none": 0.2826641401574518,
"choice_prob_norm_stderr,none": 0.00042543304587702426,
"choice_logprob_norm,none": -1.274983501434549,
"choice_logprob_norm_stderr,none": 0.0015250347859301313
},
"openbookqa_0shot": {
"acc,none": 0.232,
"acc_stderr,none": 0.018896193591952052,
"acc_norm,none": 0.352,
"acc_norm_stderr,none": 0.021380042385946034,
"bpb,none": 1.7721463489399987,
"bpb_stderr,none": 0.04602009846494316,
"logprob,none": -18.295756459236145,
"logprob_stderr,none": 0.4800185473603445,
"choice_logprob,none": -5.530525388563438,
"choice_logprob_stderr,none": 0.28186391598720884,
"choice_prob_norm,none": 0.283422989537816,
"choice_prob_norm_stderr,none": 0.006072762610892424,
"choice_logprob_norm,none": -1.3951215332813311,
"choice_logprob_norm_stderr,none": 0.0273854109203058
},
"piqa_5shot": {
"acc,none": 0.7431991294885746,
"acc_stderr,none": 0.010192864802278052,
"acc_norm,none": 0.7448313384113167,
"acc_norm_stderr,none": 0.010171571592521824,
"bpb,none": 1.0059009799366614,
"bpb_stderr,none": 0.010861208835924993,
"logprob,none": -58.56565026956751,
"logprob_stderr,none": 1.2168260391906867,
"choice_logprob,none": -2.3953581215067867,
"choice_logprob_stderr,none": 0.1780463218410201,
"choice_prob_norm,none": 0.5223932399769874,
"choice_prob_norm_stderr,none": 0.0014301252890507393,
"choice_logprob_norm,none": -0.6567117051919433,
"choice_logprob_norm_stderr,none": 0.002918677846516806
},
"winogrande_5shot": {
"acc,none": 0.5485398579321231,
"acc_stderr,none": 0.013986110301017764,
"bpb,none": 0.42715998091726126,
"bpb_stderr,none": 0.022341584833798164,
"logprob,none": -19.36323658518652,
"logprob_stderr,none": 0.2777843058337961,
"choice_logprob,none": -0.7872808600225875,
"choice_logprob_stderr,none": 0.01916575086954024,
"choice_prob_norm,none": 0.5017534878040611,
"choice_prob_norm_stderr,none": 0.0008288172134091742,
"choice_logprob_norm,none": -0.691864582681053,
"choice_logprob_norm_stderr,none": 0.002278305676407902
},
"wsc273_0shot": {
"acc,none": 0.6410256410256411,
"acc_stderr,none": 0.029086064518366282,
"bpb,none": 0.6020577170225541,
"bpb_stderr,none": 0.020261099294745278,
"logprob,none": -25.124764009273097,
"logprob_stderr,none": 0.49789066870628557,
"choice_logprob,none": -0.7944657533511067,
"choice_logprob_stderr,none": 0.0634044680841067,
"choice_prob_norm,none": 0.5046261727330773,
"choice_prob_norm_stderr,none": 0.001277739101185203,
"choice_logprob_norm,none": -0.6848069715512327,
"choice_logprob_norm_stderr,none": 0.0025299360506142346
},
"medmcqa_5shot": {
"acc,none": 0.301458283528568,
"acc_stderr,none": 0.007096068027380258,
"acc_norm,none": 0.301458283528568,
"acc_norm_stderr,none": 0.007096068027380258,
"logprob,none": -1.4224252123122947,
"logprob_stderr,none": 0.008231404069115371
},
"medmcqa_sl_verb_5shot": {
"acc,none": 0.2976332775519962,
"acc_stderr,none": 0.007070183613490213,
"acc_norm,none": 0.3169973703083911,
"acc_norm_stderr,none": 0.0071952684770417,
"bpb,none": 0.13300602038879603,
"bpb_stderr,none": 0.0016396134861531057,
"logprob,none": -1.5505911962120493,
"logprob_stderr,none": 0.010335937240549408,
"choice_logprob,none": -1.4340295424366702,
"choice_logprob_stderr,none": 0.009568631676604841,
"choice_prob_norm,none": 0.25357433651291594,
"choice_prob_norm_stderr,none": 0.00027225266681251624,
"choice_logprob_norm,none": -1.374563104529693,
"choice_logprob_norm_stderr,none": 0.0010959101001788325
},
"truthfulqa_mc1_0shot": {
"acc,none": 0.23623011015911874,
"acc_stderr,none": 0.0148697550158711,
"logprob,none": -17.643773400214485,
"logprob_stderr,none": 0.4078807293990941
},
"logprob_gsm8k_5shot": {
"nll,none": 99.61485948049273,
"nll_stderr,none": 1.489248502672281,
"bpb,none": 0.4985796881272956,
"bpb_stderr,none": 0.0047488863679661765
},
"logprob_humaneval_10shot": {
"nll,none": 37.64992275470641,
"nll_stderr,none": 2.1628372263642115,
"bpb,none": 0.35576757453147967,
"bpb_stderr,none": 0.015815894855466917
}
}
}
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment