Skip to content

Instantly share code, notes, and snippets.

@randomradio
Created March 21, 2026 10:31
Show Gist options
  • Select an option

  • Save randomradio/12a1f1dd497a575e7384fead287c0f72 to your computer and use it in GitHub Desktop.

Select an option

Save randomradio/12a1f1dd497a575e7384fead287c0f72 to your computer and use it in GitHub Desktop.
LongMemEval_S investigation artifacts
{
"dataset_id": "longmemeval-oracle-rustbench",
"version": "2025-09-cleaned",
"scenario_count": 470,
"overall_score": 96.35,
"overall_grade": "S",
"by_difficulty": {
"L1": 96.92,
"L3": 86.66,
"L2": 94.75
},
"by_tag": {
"multi-session": 98.18,
"single-session-preference": 100.0,
"oracle": 96.35,
"single-session-assistant": 98.91,
"single-session-user": 100.0,
"knowledge-update": 94.98,
"temporal-reasoning": 91.55
},
"by_domain": {
"longmem": 96.35
},
"by_source_family": {
"longmemeval": {
"label": "LongMemEval",
"scenario_count": 470,
"score": 96.35,
"grade": "S"
}
},
"by_longmemeval_category": {
"temporal-reasoning": {
"label": "Temporal Reasoning",
"scenario_count": 127,
"score": 91.55,
"grade": "S"
},
"knowledge-update": {
"label": "Knowledge Update",
"scenario_count": 72,
"score": 94.98,
"grade": "S"
},
"single-session-assistant": {
"label": "Single-Session Assistant",
"scenario_count": 56,
"score": 98.91,
"grade": "S"
},
"single-session-preference": {
"label": "Single-Session Preference",
"scenario_count": 30,
"score": 100.0,
"grade": "S"
},
"single-session-user": {
"label": "Single-Session User",
"scenario_count": 64,
"score": 100.0,
"grade": "S"
},
"multi-session": {
"label": "Multi-Session",
"scenario_count": 121,
"score": 98.18,
"grade": "S"
}
},
"by_beam_ability": {},
"results": [
{
"scenario_id": "gpt4_2655b836",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 92.78,
"grade": "S",
"mqs_precision": 66.67,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_2487a7cb",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_76048e76",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_2312f94c",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0bb5a684",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "08f4fc43",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "2c63a862",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_385a5000",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "2a1811e2",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "bbf86515",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_5dcc0aab",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 82.67,
"grade": "A",
"mqs_precision": 20.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_0b2f1d21",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "f0853d11",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "gpt4_6ed717ea",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_70e84552",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a3838d2b",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_93159ced",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_2d58bcd6",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_65aabe59",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "982b5123",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b9cfe692",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 92.78,
"grade": "S",
"mqs_precision": 66.67,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_4edbafa2",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c8090214",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_483dd43c",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e4e14d04",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c9f37c46",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_2c50253f",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "dcfa8644",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_b4a80587",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_9a159967",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "cc6d1ec1",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_8c8961ae",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_d9af6064",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_7de946e7",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "d01c6aa8",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "993da5e2",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a3045048",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_d31cdae3",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_cd90e484",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_88806d6e",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_4cd9eba1",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_93f6379c",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 92.78,
"grade": "S",
"mqs_precision": 66.67,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b29f3365",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_2f56ae70",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6613b389",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 92.78,
"grade": "S",
"mqs_precision": 66.67,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_78cf46a3",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_0a05b494",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_1a1dc16d",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_2f584639",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_213fd887",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_5438fa52",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_c27434e8",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_fe651585",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8c18457d",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0a995998",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6d550036",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_59c863d7",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b5ef892d",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 92.78,
"grade": "S",
"mqs_precision": 66.67,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e831120c",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "3a704032",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 92.78,
"grade": "S",
"mqs_precision": 66.67,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_d84a3211",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "aae3761f",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_f2262a51",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "dd2973ad",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c4a1ceb8",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_a56e767c",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6cb6f249",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "46a3abf7",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "36b9f61e",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "28dc39ac",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_2f8be40d",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "2e6d26dc",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_15e38248",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "88432d0a",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "80ec1f4f",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "d23cf73b",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_7fce9456",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "d682f1a2",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "7024f17c",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 92.78,
"grade": "S",
"mqs_precision": 66.67,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_5501fe77",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_2ba83207",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "2318644b",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "2ce6a0f2",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_d12ceb0e",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "00ca467f",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 92.78,
"grade": "S",
"mqs_precision": 66.67,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b3c15d39",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_31ff4165",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "eeda8a6d",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "2788b940",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "60bf93ed",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "9d25d4e0",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "129d1232",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "60472f9c",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 92.78,
"grade": "S",
"mqs_precision": 66.67,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_194be4b3",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 95.67,
"grade": "S",
"mqs_precision": 80.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a9f6b44c",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "d851d5ba",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "5a7937c8",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_ab202e7f",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_e05b82a6",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_731e37d7",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 94.58,
"grade": "S",
"mqs_precision": 75.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "edced276",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "10d9b85a",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e3038f8c",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "2b8f3739",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1a8a66a6",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 94.58,
"grade": "S",
"mqs_precision": 75.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c2ac3c61",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "bf659f65",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_372c3eed",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_2f91af09",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "81507db6",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 91.33,
"grade": "S",
"mqs_precision": 60.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6a1eabeb",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6aeb4375",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "830ce83f",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "852ce960",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "945e3d21",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "d7c942c3",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "71315a70",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "89941a93",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "ce6d2d27",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "9ea5eabc",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "07741c44",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a1eacc2a",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "184da446",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "031748ae",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "4d6b87c8",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0f05491a",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "08e075c7",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "f9e8c073",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "41698283",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "2698e78f",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "b6019101",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "45dc21b6",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "5a4f22c0",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6071bd76",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "e493bb7c",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "618f13b2",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "72e3ee87",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c4ea545c",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "01493427",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6a27ffc2",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "2133c1b5",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "18bc8abd",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "db467c8c",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "7a87bd0c",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e61a7584",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1cea1afa",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "ed4ddc30",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8fb83627",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b01defab",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "22d2cb42",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 89.17,
"grade": "A",
"mqs_precision": 50.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0e4e4c46",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "4b24c848",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "7e974930",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "603deb26",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "59524333",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "5831f84d",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "eace081b",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 89.17,
"grade": "A",
"mqs_precision": 50.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "affe2881",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "50635ada",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "e66b632c",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0ddfec37",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "f685340e",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "cc5ded98",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "dfde3500",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "69fee5aa",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "7401057b",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "cf22b7bf",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a2f3aa27",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c7dc5443",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "06db6396",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "3ba21379",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "9bbe84a2",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "10e09553",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "dad224aa",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "ba61f0b9",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "42ec0761",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "5c40ec5b",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c6853660",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "26bdc477",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "0977f2af",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "89941a94",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "07741c45",
"title": "knowledge-update",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"knowledge-update"
],
"source_family": "longmemeval",
"question_type": "knowledge_update",
"official_category": "knowledge-update",
"official_category_label": "Knowledge Update",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8a2466db",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "06878be2",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "75832dbd",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0edc2aef",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "35a27287",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "32260d93",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "195a1a1b",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "afdc33df",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "caf03d32",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "54026fce",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "06f04340",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6b7dfb22",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1a1907b4",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "09d032c9",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "38146c39",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "d24813b1",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "57f827a0",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "95228167",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "505af2f5",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "75f70248",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "d6233ab6",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1da05512",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "fca70973",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b6025781",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a89d7624",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b0479f84",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1d4e3b97",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "07b6f563",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1c0ddc50",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0a34ad58",
"title": "single-session-preference",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-preference"
],
"source_family": "longmemeval",
"question_type": "single_session_preference",
"official_category": "single-session-preference",
"official_category_label": "Single-Session Preference",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "7161e7e2",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c4f10528",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "89527b6b",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e9327a54",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "4c36ccef",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6ae235be",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "7e00a6cb",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1903aded",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "ceb54acb",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "f523d9fe",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0e5e2d1a",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "fea54f57",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "cc539528",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "dc439ea3",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "18dcd5a5",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "488d3006",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "58470ed2",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8cf51dda",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1d4da289",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8464fc84",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8aef76bc",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "71a3fd6b",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "2bf43736",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "70b3e69b",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8752c811",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "3249768e",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1b9b7252",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1568498a",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6222b6eb",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e8a79c70",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "d596882b",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e3fc4d6e",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "51b23612",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "3e321797",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e982271f",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "352ab8bd",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "fca762bc",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "7a8d0b71",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a40e080f",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8b9d4367",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "5809eb10",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "41275add",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "4388e9dd",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "4baee567",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "561fabcd",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b759caee",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "ac031881",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "28bcfaac",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "16c90bf4",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c8f1aeed",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "eaca4986",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c7cf7dfd",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e48988bc",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1de5cff2",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "65240037",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "778164c6",
"title": "single-session-assistant",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-assistant"
],
"source_family": "longmemeval",
"question_type": "single_session_assistant",
"official_category": "single-session-assistant",
"official_category_label": "Single-Session Assistant",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e47becba",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "118b2229",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "51a45a95",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "58bf7951",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1e043500",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c5e8278d",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6ade9755",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6f9b354f",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "58ef2f1c",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "f8c5f88b",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "5d3d2817",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "7527f7e2",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c960da58",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "3b6f954b",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "726462e0",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "94f70d80",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "66f24dbb",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "ad7109d1",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "af8d2e46",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "dccbc061",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c8c3f81d",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8ebdbe50",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6b168ec8",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "75499fd8",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "21436231",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "95bcc1c8",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0862e8bf",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "853b0a1d",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a06e4cfe",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "37d43f65",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b86304ba",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "d52b4f67",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "25e5aa4f",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "caf9ead2",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8550ddae",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "60d45044",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "3f1e9474",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "86b68151",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "577d4d32",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "ec81a493",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "15745da0",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e01b8e2f",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "bc8a6e93",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "ccb36322",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "001be529",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b320f3f8",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "19b5f2b3",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "4fd1909e",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "545bd2b5",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8a137a7f",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "76d63226",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "86f00804",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8e9d538c",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "311778f1",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c19f7a0b",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "4100d0a0",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "29f2956b",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1faac195",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "faba32e5",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "f4f1d8a4",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c14c00dd",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "36580ce8",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "3d86fd0a",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a82c026e",
"title": "single-session-user",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"single-session-user"
],
"source_family": "longmemeval",
"question_type": "single_session_user",
"official_category": "single-session-user",
"official_category_label": "Single-Session User",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_59149c77",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_f49edff3",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "71017276",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b46e15ed",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_fa19884c",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0bc8ad92",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "af082822",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_4929293a",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_b5700ca9",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "9a707b81",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "gpt4_1d4ab0c9",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "gpt4_e072b769",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0db4c65d",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_1d80365e",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_7f6b06db",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 75.28,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 66.67,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "gpt4_6dc9b45b",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_8279ba02",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_18c2b244",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 68.06,
"grade": "C",
"mqs_precision": 100.0,
"mqs_recall": 33.33,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "gpt4_a1b77f9c",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "gpt4_1916e0ea",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_7a0daae1",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_468eb063",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_7abb270c",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_1e4a8aeb",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_4fc4f797",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "4dfccbf7",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_61e13b3c",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_45189cb4",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "2ebe6c90",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_e061b84f",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "370a8ff4",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_d6585ce8",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 65.17,
"grade": "C",
"mqs_precision": 100.0,
"mqs_recall": 20.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "gpt4_4ef30696",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_ec93e27f",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6e984301",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8077ef71",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_f420262c",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_8e165409",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_74aed68e",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "bcbe585f",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_21adecb5",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 71.67,
"grade": "B",
"mqs_precision": 100.0,
"mqs_recall": 50.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "5e1b23de",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_98f46fc6",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_af6db32f",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "eac54adc",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "gpt4_7ddcf75f",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_a2d1d1f6",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_85da3956",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_b0863698",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_68e94287",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_e414231e",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_7ca326fa",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_7bc6cf22",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "2ebe6c92",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "gpt4_e061b84g",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 85.56,
"grade": "A",
"mqs_precision": 33.33,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "71017277",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "b46e15ee",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 83.75,
"grade": "A",
"mqs_precision": 25.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_d6585ce9",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_1e4a8aec",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 89.17,
"grade": "A",
"mqs_precision": 50.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_f420262d",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L3",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "gpt4_59149c78",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 89.17,
"grade": "A",
"mqs_precision": 50.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_e414231f",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 89.17,
"grade": "A",
"mqs_precision": 50.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_4929293b",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 89.17,
"grade": "A",
"mqs_precision": 50.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_468eb064",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_fa19884d",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 89.17,
"grade": "A",
"mqs_precision": 50.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "9a707b82",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "eac54add",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "4dfccbf8",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 89.17,
"grade": "A",
"mqs_precision": 50.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0bc8ad93",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L2",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 85.56,
"grade": "A",
"mqs_precision": 33.33,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6e984302",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 89.17,
"grade": "A",
"mqs_precision": 50.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_8279ba03",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_b5700ca0",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "gpt4_68e94288",
"title": "temporal-reasoning",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"temporal-reasoning"
],
"source_family": "longmemeval",
"question_type": "temporal_reasoning",
"official_category": "temporal-reasoning",
"official_category_label": "Temporal Reasoning",
"total_score": 39.17,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "d3ab962e",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "2311e44b",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "cc06de0d",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a11281a2",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "4f54b7c9",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "85fa3a3f",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "9aaed6a3",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1f2b8d4f",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e6041065",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "51c32626",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "d905b33f",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "7405e8b1",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "f35224e0",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6456829e",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a4996e51",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "3c1045c8",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "60036106",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "681a1674",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e25c3b8d",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "4adc0475",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "4bc144e2",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "ef66a6e5",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "5025383b",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a1cc6108",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "9ee3ecd6",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "3fdac837",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "91b15a6e",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "27016adc",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "720133ac",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "77eafa52",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8979f9ec",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0100672e",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a96c20ee",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "92a0aa75",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "3fe836c9",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1c549ce4",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "6c49646a",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "1192316e",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "0ea62687",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "67e0d0f2",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "bb7c3b45",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "ba358f49",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "61f8c8f8",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "60159905",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 0.0,
"grade": "D",
"mqs_precision": 0.0,
"mqs_recall": 0.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 0.0,
"aus_assertion_pass": 0.0
},
{
"scenario_id": "ef9cf60a",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "73d42213",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "bc149d6b",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "099778bb",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "09ba9854",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "d6062bb9",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "157a136e",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "c18a7dc8",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a3332713",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "55241a1f",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a08a253f",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "f0e564bc",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "078150f1",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8cf4d046",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "a346bb18",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "37f165cf",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "8e91e7d9",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "87f22b4a",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "e56a43b9",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "efc3f7c2",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
},
{
"scenario_id": "21d02d0d",
"title": "multi-session",
"domain": "longmem",
"difficulty": "L1",
"horizon": "oracle",
"tags": [
"oracle",
"multi-session"
],
"source_family": "longmemeval",
"question_type": "multi_session",
"official_category": "multi-session",
"official_category_label": "Multi-Session",
"total_score": 100.0,
"grade": "S",
"mqs_precision": 100.0,
"mqs_recall": 100.0,
"mqs_noise_rejection": 100.0,
"aus_step_success": 100.0,
"aus_assertion_pass": 100.0
}
]
}

Summary

We ran Memoria against LongMemEval with the aligned official datasets and saw a clear split:

  • LongMemEval oracle is strong
  • LongMemEval_S degrades sharply in long-session settings

The current failure mode is not a one-off benchmark bug. It appears to be a structural retrieval issue in long histories:

  • graph retrieval never contributes
  • fallback hybrid retrieval operates on whole-session memory blobs
  • broad semantic matches outrank the actual evidence sessions
  • exact lexical cases can still pass, but long-session fact lookup is generally weak

Confirmed runs

1. Official aligned oracle run

Preserved report:

  • benchmarks/results/longmemeval-oracle-20260321b/full.report.json

Result:

  • dataset: longmemeval-oracle-rustbench
  • scenarios: 470
  • overall: 96.35 (S)

Category breakdown:

  • Temporal Reasoning: 91.55 over 127
  • Knowledge Update: 94.98 over 72
  • Multi-Session: 98.18 over 121
  • Single-Session Assistant: 98.91 over 56
  • Single-Session Preference: 100.0 over 30
  • Single-Session User: 100.0 over 64

2. LongMemEval_S full run

We started a full LongMemEval_S run on the aligned 470-scenario dataset, but stopped it because throughput was too slow for iteration:

  • rough throughput observed: ~75-90s / scenario
  • rough ETA: ~10-12h for full run

3. LongMemEval_S stratified sample run

To get faster signal, we generated a deterministic stratified subset:

  • dataset: benchmarks/datasets/longmemeval_s_sample_60.json
  • dataset_id: longmemeval-s-rustbench-sample-60
  • scenarios: 60

Category mix:

  • multi-session: 16
  • temporal-reasoning: 16
  • knowledge-update: 9
  • single-session-user: 8
  • single-session-assistant: 7
  • single-session-preference: 4

This sample run was also stopped early, at about 10 / 60 scenarios, but the live output was already strongly negative:

Observed scenario outputs:

  • 06db6396: 0.0 (D)
  • 0a34ad58: 39.2 (D)
  • 0bb5a684: 60.8 (C)
  • 0db4c65d: 100.0 (S)
  • 0f05491a: 39.2 (D)
  • 1c549ce4: 39.2 (D)
  • 1d4e3b97: 39.2 (D)
  • 35a27287: 39.2 (D)
  • 37f165cf: 39.2 (D)
  • 3f1e9474: 39.2 (D)

This is enough to conclude LongMemEval_S is failing systematically early, not just on one corner case.

Key findings

1. Graph retrieval is effectively inactive on these long-session runs

The retrieval service tries graph first, then hybrid/vector, then fulltext.

However, in the stopped LongMemEval_S sample DB snapshot:

  • memory_graph_nodes = 0
  • memory_graph_edges = 0
  • mem_entities = 24124
  • mem_memories = 470

So entity extraction is happening, but graph memory nodes are not populated, which means graph retrieval never contributes.

Observed from retrieval explain:

  • graph_hit = false
  • graph_candidates = 0

2. Hybrid retrieval is ranking whole-session blobs, not the right evidence

Each seeded memory is a full session transcript, and LongMemEval_S scenarios often contain 42-52 session memories each.

That means retrieval is embedding and ranking coarse session-sized blobs rather than narrower facts/events.

In failed cases, top results were often thematically related but clearly wrong.

Example: 37f165cf

  • Query: What was the page count of the two novels I finished in January and March?
  • Expected: 2 book-related evidence sessions
  • Returned top results:
    • meal planning / pasta
    • immigration / Canada resettlement
    • Chicago travel

Explain:

  • path: hybrid
  • graph_hit=false
  • vector_hit=true

Example: 0a34ad58

  • Query: I’m a bit anxious about getting around Tokyo. Do you have any helpful tips?
  • Expected: Tokyo transit sessions tied to Suica usage
  • Returned top results:
    • showerhead repair
    • salad recipe
    • one generic Tokyo restaurant session

Again:

  • path: hybrid
  • graph_hit=false

3. Success cases rely on strong lexical anchors

Example: 0db4c65d

  • Query mentions rare titles:
    • The Seven Husbands of Evelyn Hugo
    • The Silent Patient
  • This case succeeded with:
    • path: fulltext
    • graph_hit=false
    • vector_hit=false

So some long-session cases still pass, but mainly when exact lexical matching is strong enough to compensate.

4. Temporal reasoning is not being rescued by actual temporal scoring

In the failed hybrid explain traces:

  • temporal_score = 0.0

So temporal or multi-event questions in long sessions are not being saved by a real temporal-aware retrieval layer. They mostly succeed or fail based on coarse vector/fulltext retrieval over long session blobs.

Likely root cause

Memoria currently stores and retrieves long histories at too coarse a granularity for LongMemEval_S.

What seems to be happening:

  • capture stores session-sized memories
  • graph path is not materially available in these runs
  • hybrid retrieval ranks semantically broad sessions
  • the benchmark needs precise evidence sessions, often for a specific fact inside a long conversation

This is why:

  • oracle does well
  • S fails early and broadly

oracle reduces the search burden to evidence sessions only. S requires retrieval to survive long-history dilution, and the current design does not.

Proposed next directions

  1. Add a benchmark-friendly long-history capture/indexing layer
     • do not only store full session blobs
     • also store narrower fact/event units derived from each session
  2. Make graph retrieval real for this path
     • entity extraction alone is not enough
     • ensure graph node materialization/backfill participates during long-session benchmark runs
  3. Improve retrieval granularity before tuning ranking
     • the main issue looks like coarse candidate generation, not just bad score weights
  4. Add a repeatable LongMemEval_S sample benchmark workflow
     • keep deterministic subset sampling for fast iteration
     • use full S only for milestone validation

Artifacts

Aligned datasets:

  • benchmarks/datasets/longmemeval_oracle.rustbench.json
  • benchmarks/datasets/longmemeval_s.rustbench.json
  • benchmarks/datasets/longmemeval_m.rustbench.json

Deterministic sample:

  • benchmarks/datasets/longmemeval_s_sample_60.json

Preserved oracle report:

  • benchmarks/results/longmemeval-oracle-20260321b/full.report.json
{
"dataset_id": "longmemeval-s-rustbench-sample-60",
"version": "2025-09-cleaned",
"scenario_count": 60,
"category_counts": {
"knowledge-update": 9,
"single-session-preference": 4,
"temporal-reasoning": 16,
"multi-session": 16,
"single-session-user": 8,
"single-session-assistant": 7
},
"scenario_ids": [
"06db6396",
"0a34ad58",
"0bb5a684",
"0db4c65d",
"0f05491a",
"1c549ce4",
"1d4e3b97",
"35a27287",
"37f165cf",
"3f1e9474",
"41275add",
"41698283",
"4adc0475",
"55241a1f",
"5831f84d",
"5a4f22c0",
"6b168ec8",
"7024f17c",
"71315a70",
"7161e7e2",
"8077ef71",
"80ec1f4f",
"8c18457d",
"92a0aa75",
"95bcc1c8",
"9bbe84a2",
"a2f3aa27",
"a346bb18",
"a96c20ee",
"aae3761f",
"ba61f0b9",
"bf659f65",
"c5e8278d",
"c8c3f81d",
"c8f1aeed",
"caf03d32",
"d3ab962e",
"d682f1a2",
"d851d5ba",
"dc439ea3",
"dcfa8644",
"e01b8e2f",
"e3fc4d6e",
"e4e14d04",
"e56a43b9",
"e8a79c70",
"e982271f",
"eac54add",
"f4f1d8a4",
"f8c5f88b",
"gpt4_0b2f1d21",
"gpt4_18c2b244",
"gpt4_1e4a8aeb",
"gpt4_2f56ae70",
"gpt4_2f91af09",
"gpt4_4929293a",
"gpt4_4cd9eba1",
"gpt4_4edbafa2",
"gpt4_76048e76",
"gpt4_b0863698"
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment