Skip to content

Instantly share code, notes, and snippets.

@sandlbn
Created November 6, 2025 14:58
Show Gist options
  • Select an option

  • Save sandlbn/6d73d51746b2b6d51d84bc4164600bd2 to your computer and use it in GitHub Desktop.

Select an option

Save sandlbn/6d73d51746b2b6d51d84bc4164600bd2 to your computer and use it in GitHub Desktop.
dataset_download.py
#!/usr/bin/env python3
# pip install datasets pandas
import argparse, json, re
from pathlib import Path
from datasets import load_dataset
import pandas as pd
# Command-line interface: where the dump goes and whether to filter rows.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-o",
    "--out-dir",
    default="aice_dump",
    help="Output directory",
)
parser.add_argument(
    "--only-correct",
    action="store_true",
    help="Export only rows with Correct==True",
)
args = parser.parse_args()

# Root of the export tree; created eagerly (including missing parents) so
# later code can write beneath it without checking for its existence.
OUT = Path(args.out_dir)
OUT.mkdir(parents=True, exist_ok=True)
# Every run of characters outside this whitelist is collapsed to a single "_".
SAFE_RE = re.compile(r"[^A-Za-z0-9._-]+")


def safe(x, fallback="item"):
    """Return a name derived from ``x`` that is usable as one path component.

    ``x`` is converted with ``str()`` (``None`` is replaced by ``fallback``
    first), runs of characters other than ASCII letters, digits, ``.``, ``_``
    and ``-`` become a single underscore, leading/trailing underscores are
    removed, and the result is capped at 120 characters.

    Args:
        x: Any value; typically a dataset field such as a task id or name.
        fallback: Returned unchanged when nothing usable is left.

    Returns:
        The sanitized string, or ``fallback`` if the result would be empty
        or consist solely of dots.
    """
    s = str(x if x is not None else fallback)
    s = SAFE_RE.sub("_", s).strip("_")[:120]
    # "." and ".." pass the whitelist untouched but are directory references,
    # not names: joined into a path they would point at the current or parent
    # directory instead of a new sub-folder. Treat dots-only results (and the
    # empty string) as unusable.
    if not s.strip("."):
        return fallback
    return s
def write_text(path: Path, content):
    """Write ``content`` to ``path`` as UTF-8 unless it is missing or blank.

    Nothing is written (and no file is created) when ``content`` is ``None``,
    is a float NaN, or stringifies to only whitespace.

    Args:
        path: Destination file; its parent directory must already exist.
        content: Value to store; converted with ``str()`` before writing.
    """
    # NaN is the only float that is not equal to itself. pandas can use NaN
    # as its missing-value marker, and str(nan) == "nan" would otherwise be
    # written out as if it were real file content.
    if content is None or (isinstance(content, float) and content != content):
        return
    s = str(content)
    if s.strip():
        path.write_text(s, encoding="utf-8")
ds = load_dataset("SakanaAI/AI-CUDA-Engineer-Archive")

# Only name and performance-related fields
CSV_COLS = [
    "Op_Name",
    "Level_ID",
    "Task_ID",
    "Kernel_Name",
    "CUDA_Runtime",
    "PyTorch_Native_Runtime",
    "PyTorch_Compile_Runtime",
    "CUDA_Speedup_Native",
    "CUDA_Speedup_Compile",
    "Correct",
    "Max_Diff",
    "Error",
]

all_rows = []
for split in ("level_1", "level_2", "level_3"):
    # Tolerate splits that are absent from the loaded dataset.
    if split not in ds:
        continue
    df = ds[split].to_pandas()
    if args.only_correct and "Correct" in df.columns:
        # Element-wise comparison on a Series, so `==` is intentional here.
        df = df[df["Correct"] == True]  # noqa: E712

    for i, row in df.iterrows():
        # Hand the placeholders to safe() as its fallback so they are used
        # both when the column is absent and when the value is None/empty.
        # Using row.get's default only covered the "column absent" case, so
        # rows with an empty Kernel_Name all landed in the same directory.
        task_id = safe(row.get("Task_ID"), fallback="task")
        kname = safe(row.get("Kernel_Name"), fallback=f"kernel_{i}")
        base = OUT / split / f"Task_{task_id}" / kname
        base.mkdir(parents=True, exist_ok=True)

        # Save code and auxiliary files (optional)
        write_text(base / "cuda_kernel.cu", row.get("CUDA_Code"))
        write_text(base / "pytorch_module.py", row.get("PyTorch_Code_Module"))
        write_text(base / "pytorch_functional.py", row.get("PyTorch_Code_Functional"))

        # Collect just performance numbers
        entry = {c: row.get(c) for c in CSV_COLS if c in row}
        # Fall back to the split name when Level_ID is missing. NaN is truthy,
        # so a plain `or` would let it through; check it explicitly.
        level_id = row.get("Level_ID")
        entry["Level_ID"] = split if pd.isna(level_id) or not level_id else level_id
        all_rows.append(entry)

# One summary row per exported kernel, with a fixed column order.
csv_path = OUT / "performance_summary.csv"
pd.DataFrame(all_rows, columns=CSV_COLS).to_csv(csv_path, index=False)
print(f"\nExport complete.\nCSV: {csv_path}\nFiles under {OUT}/level_*/Task_*/<Kernel_Name>/")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment