scottt · May 13, 2025 19:15 · jammm · May 6, 2025
diff --git a/aotriton-build.py b/aotriton-build.py
 #!/usr/bin/env python

 import os
 import sys
 import subprocess
 import time
 from pathlib import Path

 # --- Configuration ---

 # GPU architecture to build for.
 AOTRITON_TARGET_ARCH = 'gfx1151'
 # Python version requested from `uv venv` for the build virtual environment.
 PYTHON_VER = '3.13'
 # Forwarded to CMake as -DAOTRITON_NOIMAGE_MODE.
 AOTRITON_NOIMAGE_MODE = False

 if sys.platform == 'win32':
    os.environ['AOTRITON_SOURCE_DIR'] = '/work/aotriton'
    os.environ['AOTRITON_BUILD_DIR'] = '/aotriton-output/build'
    # NOTE: Python version of Triton wheel must match Python version used to build AOTriton
 else:
    os.environ['AOTRITON_SOURCE_DIR'] = os.path.expanduser('~/work/aotriton-0.9-gfx1151-windows')
    os.environ['AOTRITON_BUILD_DIR'] = '/aotriton-windows-for-merge-output/build'

 # The source tree must already exist (strict=True raises otherwise);
 # the build tree is created on demand.
 source_dir = Path(os.environ['AOTRITON_SOURCE_DIR']).resolve(strict=True)
 build_dir = Path(os.environ['AOTRITON_BUILD_DIR']).resolve()
 build_dir.mkdir(parents=True, exist_ok=True)

 venv_dir = build_dir / 'venv'

 # --- Setup ---
 caches_dir = Path("/caches")
 pip_cache_dir = caches_dir / "pip"

 if sys.platform == 'win32':
    os.environ['PKG_CONFIG'] = 'C:/Strawberry/perl/bin/pkg-config.bat'
    os.environ['PKG_CONFIG_PATH'] = '/xz-output/lib/pkgconfig'

 print("--- Configuration ---")
 print(f"Source Directory: {source_dir}")
 print(f"Build Directory: {build_dir}")
 print(f"Cache Directory: {caches_dir}")
 # Log the pkg-config variables this script actually sets (PKG_CONFIG and
 # PKG_CONFIG_PATH) rather than PKG_CONFIG_DIR, which nothing here defines.
 for env_name in ('LLVM_SYSPATH', 'LLVM_INCLUDE_DIRS', 'LLVM_LIBRARY_DIR',
                  'PKG_CONFIG', 'PKG_CONFIG_PATH'):
    print(f"{env_name}: {os.environ.get(env_name)}")
 print(f"Script Arguments: {sys.argv[1:]}")
 print("---------------------")

 def should_change_caches_dir():
    """Return True when running on Windows, where caches are redirected."""
    on_windows = sys.platform == 'win32'
    return on_windows

 # Decide once whether the pip cache is redirected; when it is not, the
 # environment is left untouched and only reported.
 redirect_caches = should_change_caches_dir()
 if redirect_caches:
    print("Ensuring directories exist...")
    pip_cache_dir.mkdir(parents=True, exist_ok=True)

 print("Setting environment variables...")
 if redirect_caches:
    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())

 print("PIP_CACHE_DIR = {}".format(os.environ.get('PIP_CACHE_DIR')))

 # --- Helper function to run commands ---
 def run_command(cmd_list, cwd=None):
    """Execute ``cmd_list`` as a child process and abort the script on failure.

    The command line is echoed before it starts and the elapsed time is
    reported once it succeeds.

    Args:
        cmd_list: Program and arguments, one list element per argument.
        cwd: Optional working directory for the child process.

    Returns:
        The ``subprocess.CompletedProcess`` of the successful run.

    Exits via ``sys.exit`` with the child's return code when it is non-zero,
    and with status 1 for any other failure to run the command.
    """
    rendered = ' '.join(str(part) for part in cmd_list)
    print(f"\n--- Executing: {rendered} ---", flush=True)
    began = time.monotonic()
    try:
        # An argument list with the default shell=False avoids shell quoting;
        # check=True turns a non-zero exit status into CalledProcessError.
        completed = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as failure:
        print(f"ERROR: Command failed with exit code {failure.returncode}", file=sys.stderr)
        sys.exit(failure.returncode)
    except Exception as unexpected:
        print(f"ERROR: An unexpected error occurred: {unexpected}", file=sys.stderr)
        sys.exit(1)
    else:
        elapsed = time.monotonic() - began
        print(f"--- Command finished successfully in {elapsed:.2f} seconds ---", flush=True)
        return completed

 # --- Build Steps ---

 # Create the virtual environment that CMake is pointed at via -DVENV_DIR.
 cmd = [
    "uv",
    "venv",
    "--python", PYTHON_VER,
    str(venv_dir),
 ]
 run_command(cmd)

 build_python_bindings = True

 if not AOTRITON_NOIMAGE_MODE:
    # https://github.com/astral-sh/uv/issues/8721
    cmd = [
        "uv",
        "pip",
        "install", "torch",
        "--python", str(venv_dir),
    ]
    run_command(cmd)

 # When building aotriton 0.9 the GPU is passed as -DTARGET_GPUS instead of
 # -DAOTRITON_TARGET_ARCH, so the architecture is mapped to a TARGET_GPUS value.
 use_aotriton_target_arch = True
 build_aotriton_09 = True
 if build_aotriton_09:
    if AOTRITON_TARGET_ARCH == 'gfx1151':
        use_aotriton_target_arch = False
        TARGET_GPUS = 'Navi3.5'
    else:
        # Report the architecture that has no mapping; the message previously
        # referenced an undefined name and died with NameError instead.
        raise TypeError(f"Don't know GPU mapping for aotriton 0.9 for {AOTRITON_TARGET_ARCH}")

 # Computed outside the f-string so the script also parses on Python < 3.12,
 # which rejects reusing the enclosing quote character inside an f-string.
 noimage_flag = 'ON' if AOTRITON_NOIMAGE_MODE else 'OFF'

 cmd = [
    "cmake",
    #"--trace",
    "-GNinja",
    f"-DVENV_DIR={venv_dir}",
    f"-DCMAKE_INSTALL_PREFIX={(build_dir / 'install_dir').resolve()}",
    "-DCMAKE_BUILD_TYPE=Release",
    "-DAOTRITON_GPU_BUILD_TIMEOUT=0",
    # AOTRITON_NO_PYTHON must be OFF and AOTRITON_NAME_SUFFIX must be set to run the unit tests
    "-DAOTRITON_NO_PYTHON=OFF",
    "-DHIP_PLATFORM=amd",
    f"-DAOTRITON_NOIMAGE_MODE={noimage_flag}",
    "-S", str(source_dir.resolve()),
    "-B", str(build_dir.resolve()),
    #"--debug-find",
 ]

 if use_aotriton_target_arch: # aotriton-0.10
    cmd.append(f"-DAOTRITON_TARGET_ARCH={AOTRITON_TARGET_ARCH}")
 else: # aotriton-0.9
    cmd.append(f"-DTARGET_GPUS={TARGET_GPUS}")

 if sys.platform == 'win32':
    cmd.append('-Ddlfcn-win32_DIR=/dlfcn-output/share/dlfcn-win32')

 cmd.extend(sys.argv[1:]) # Add extra arguments from script call
 run_command(cmd)

 cmd = [
    "ninja", "install"
 ]

 # Leave one core free. The flag is appended to `cmd` itself (the previous
 # code extended an undefined list) and is given to ninja directly, without
 # the "--" separator that only applies to `cmake --build`.
 cpu_count = os.cpu_count()
 if cpu_count and cpu_count > 1:
    jobs = cpu_count - 1
    cmd.append(f"-j{jobs}")
    print(f"Using parallel build flag: -j{jobs}")

 run_command(cmd, cwd=build_dir)

 print("\nBuild script completed successfully.")
diff --git a/aotriton-nokernel-build.py b/aotriton-nokernel-build.py
 #!/usr/bin/env python

 import os
 import sys
 import subprocess
 import time
 from pathlib import Path

 # --- Configuration ---

 AOTRITON_TARGET_ARCH = 'gfx1151'
 # We need to install `torch` in the virtual env so we need to create the venv and specify
 # the python version used
 PYTHON_VER = '3.13'
 AOTRITON_NOIMAGE_MODE = True

 fork_name = 'aotriton-0.9-gfx1151-windows'
 if sys.platform == 'win32':
    os.environ['AOTRITON_SOURCE_DIR'] = f'/w/{fork_name}'
    os.environ['AOTRITON_BUILD_DIR'] = f'/o/{fork_name}/build'
    # NOTE: Python version of Triton wheel must match Python version used to build AOTriton
 else:
    os.environ['AOTRITON_SOURCE_DIR'] = os.path.expanduser('~/work/aotriton')
    os.environ['AOTRITON_BUILD_DIR'] = '/aotriton-output/build'

 # Workaround cl.exe amd_hip_vector_types.h
 # NOTE(review): this is applied on every platform although cl.exe is the
 # Windows compiler - confirm clang-cl is intended off Windows.
 os.environ['CC'] = 'clang-cl'
 os.environ['CXX'] = 'clang-cl'

 triton_wheel_path = None
 if not AOTRITON_NOIMAGE_MODE:
    # NOTE(review): the same win_amd64 wheel name is used on both platforms - confirm.
    wheel_name = 'triton-3.3.0+gitf8727c94-cp313-cp313-win_amd64.whl'
    if sys.platform == 'win32':
        triton_wheel_path = Path('/work/triton-lshqqytiger/python/dist') / wheel_name
    else:
        # pathlib does not expand '~' by itself; without expanduser() the
        # strict resolve below could never find the file.
        triton_wheel_path = (Path('~/work/triton-lshqqytiger/python/dist') / wheel_name).expanduser()
    # Fail early when the wheel is missing and keep the resolved path.
    triton_wheel_path = triton_wheel_path.resolve(strict=True)

 # The source tree must already exist (strict=True raises otherwise);
 # the build tree is created on demand.
 source_dir = Path(os.environ['AOTRITON_SOURCE_DIR']).resolve(strict=True)
 build_dir = Path(os.environ['AOTRITON_BUILD_DIR']).resolve()
 build_dir.mkdir(parents=True, exist_ok=True)

 venv_dir = build_dir / 'venv'

 # Optional Python3_* hints for CMake; None means "do not pass the hint".
 if sys.platform == 'win32':
    Python3_EXECUTABLE = None
    Python3_INCLUDE_DIR = None
    Python3_LIBRARY = None
 else:
    # Python3_EXECUTABLE = str(venv_dir / "bin" / "python")
    Python3_EXECUTABLE = "/usr/bin/python"
    Python3_INCLUDE_DIR = "/usr/include/python3.13"
    Python3_LIBRARY = "/usr/lib64/libpython3.13.so"

 CCACHE_EXECUTABLE = "ccache"

 # --- Setup ---
 caches_dir = Path("/caches")
 ccache_dir = caches_dir / "ccache"
 pip_cache_dir = caches_dir / "pip"

 if sys.platform == 'win32':
    os.environ['PKG_CONFIG'] = 'C:/Strawberry/perl/bin/pkg-config.bat'
    os.environ['PKG_CONFIG_PATH'] = '/xz-output/lib/pkgconfig'

 print("--- Configuration ---")
 print(f"Source Directory: {source_dir}")
 print(f"Build Directory: {build_dir}")
 print(f"Cache Directory: {caches_dir}")
 # Log the pkg-config variables this script actually sets (PKG_CONFIG and
 # PKG_CONFIG_PATH) rather than PKG_CONFIG_DIR, which nothing here defines.
 for env_name in ('LLVM_SYSPATH', 'LLVM_INCLUDE_DIRS', 'LLVM_LIBRARY_DIR',
                  'PKG_CONFIG', 'PKG_CONFIG_PATH'):
    print(f"{env_name}: {os.environ.get(env_name)}")
 print(f"Script Arguments: {sys.argv[1:]}")
 print("---------------------")

 def should_change_caches_dir():
    """Tell whether cache directories should be redirected (Windows only)."""
    return 'win32' == sys.platform

 # Redirect the ccache and pip caches only when the platform check asks for it.
 relocate_caches = should_change_caches_dir()
 if relocate_caches:
    print("Ensuring directories exist...")
    for cache_path in (ccache_dir, pip_cache_dir):
        cache_path.mkdir(parents=True, exist_ok=True)

 print("Setting environment variables...")
 if relocate_caches:
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())
    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())

 # ccache is switched off for this build; the launcher variables are removed
 # so a value inherited from the calling environment cannot re-enable it.
 use_ccache = False
 launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
 if use_ccache:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for launcher_var in launcher_vars:
        os.environ[launcher_var] = CCACHE_EXECUTABLE
 else:
    print("Skipping ccache configuration.")
    for launcher_var in launcher_vars:
        os.environ.pop(launcher_var, None)

 # Report the effective values (None when a variable is unset).
 for shown_var in ('CCACHE_DIR', 'PIP_CACHE_DIR') + launcher_vars:
    print(f"{shown_var} = {os.environ.get(shown_var)}")

 # --- Helper function to run commands ---
 def run_command(cmd_list, cwd=None):
    """Run an external command, echoing it first and timing it.

    cmd_list: program followed by its arguments (items are stringified for
              the echo only).
    cwd: directory to run the command in, or None for the current one.
    Returns the subprocess.CompletedProcess on success. On any failure the
    script terminates: with the command's exit code if it ran and failed,
    otherwise with exit code 1.
    """
    def _abort(message, status):
        # Report on stderr and terminate the whole script with `status`.
        print(message, file=sys.stderr)
        sys.exit(status)

    print("\n--- Executing: " + ' '.join(map(str, cmd_list)) + " ---", flush=True)
    t0 = time.monotonic()
    try:
        # shell=False (the default) with an argument list; check=True makes a
        # non-zero exit status raise CalledProcessError.
        result = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        _abort(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", 1)
    except subprocess.CalledProcessError as err:
        _abort(f"ERROR: Command failed with exit code {err.returncode}", err.returncode)
    except Exception as err:
        _abort(f"ERROR: An unexpected error occurred: {err}", 1)

    t1 = time.monotonic()
    print(f"--- Command finished successfully in {t1 - t0:.2f} seconds ---", flush=True)
    return result

 # --- Build Steps ---

 # Create the virtual environment that CMake is pointed at via -DVENV_DIR.
 cmd = [
    "uv",
    "venv",
    "--python", PYTHON_VER,
    str(venv_dir),
 ]
 run_command(cmd)

 BUILD_PYTHON_BINDINGS = True

 if not AOTRITON_NOIMAGE_MODE:
    # https://github.com/astral-sh/uv/issues/8721
    cmd = [
        "uv",
        "pip",
        "install", "torch",
        "--python", str(venv_dir),
    ]
    run_command(cmd)

 # When building aotriton 0.9 the GPU is passed as -DTARGET_GPUS instead of
 # -DAOTRITON_TARGET_ARCH, so the architecture is mapped to a TARGET_GPUS value.
 use_aotriton_target_arch = True
 build_aotriton_09 = True
 if build_aotriton_09:
    if AOTRITON_TARGET_ARCH == 'gfx1151':
        use_aotriton_target_arch = False
        TARGET_GPUS = 'Navi3.5'
    else:
        # Report the architecture that has no mapping; the message previously
        # referenced an undefined name and died with NameError instead.
        raise TypeError(f"Don't know GPU mapping for aotriton 0.9 for {AOTRITON_TARGET_ARCH}")

 # AOTRITON_NO_PYTHON is a negative switch: it has to be OFF when the Python
 # bindings are wanted. The flag values are computed outside the f-strings so
 # the script also parses on Python < 3.12.
 no_python_flag = 'OFF' if BUILD_PYTHON_BINDINGS else 'ON'
 noimage_flag = 'ON' if AOTRITON_NOIMAGE_MODE else 'OFF'

 cmd = [
    "cmake",
    #"--trace",
    "-GNinja",
    f"-DVENV_DIR={venv_dir}",
    f"-DCMAKE_INSTALL_PREFIX={(build_dir / 'install_dir').resolve()}",
    "-DCMAKE_BUILD_TYPE=Release",
    "-DAOTRITON_GPU_BUILD_TIMEOUT=0",
    # AOTRITON_NO_PYTHON must be OFF and AOTRITON_NAME_SUFFIX must be set to run the unit tests
    f"-DAOTRITON_NO_PYTHON={no_python_flag}",
    "-DHIP_PLATFORM=amd",
    f"-DAOTRITON_NOIMAGE_MODE={noimage_flag}",
    "-S", str(source_dir.resolve()),
    "-B", str(build_dir.resolve()),
    #"--debug-find",
 ]

 if use_aotriton_target_arch:
    cmd.append(f"-DAOTRITON_TARGET_ARCH={AOTRITON_TARGET_ARCH}")
 else: # aotriton-0.9
    cmd.append(f"-DTARGET_GPUS={TARGET_GPUS}")

 if BUILD_PYTHON_BINDINGS:
    # Forward each Python3_* hint only when it is configured. These must be
    # f-strings, otherwise CMake receives the literal "{Python3_...}" text.
    if Python3_EXECUTABLE:
        cmd.append(f"-DPython3_EXECUTABLE={Python3_EXECUTABLE}")
    if Python3_INCLUDE_DIR:
        cmd.append(f"-DPython3_INCLUDE_DIR={Python3_INCLUDE_DIR}")
    if Python3_LIBRARY:
        cmd.append(f"-DPython3_LIBRARY={Python3_LIBRARY}")

 if triton_wheel_path is not None:
    cmd.append(f"-DINSTALL_TRITON_FROM_WHEEL={triton_wheel_path.resolve()}")

 if sys.platform == 'win32':
    cmd.append('-Ddlfcn-win32_DIR=/dlfcn-output/share/dlfcn-win32')

 cmd.extend(sys.argv[1:]) # Add extra arguments from script call
 run_command(cmd)

 if sys.platform == 'win32':
    # Set HIP_PATH to dir containing `bin/ld-lld.exe` for triton\backends\amd\compiler.py
    #   lld = Path(os.path.join( os.environ['HIP_PATH'] , 'bin', 'ld.lld.exe' ))
    os.environ['HIP_PATH'] = '/o/r-st/build/dist/rocm/lib/llvm'

 cmd = [
    "ninja", "install"
 ]
 # Kernel-image builds on Windows are run with a single ninja job.
 if sys.platform == 'win32' and (not AOTRITON_NOIMAGE_MODE):
    cmd.extend(['-j', '1'])
 run_command(cmd, cwd=build_dir)

 print("\nBuild script completed successfully.")
diff --git a/dlfcn-build.py b/dlfcn-build.py
 #!/usr/bin/env python

 import os
 import sys
 import subprocess
 import time
 from pathlib import Path

 # https://github.com/dlfcn-win32/dlfcn-win32

 # --- Configuration ---

 # NOTE(review): BUILD_DIR is exported to the environment, but the build tree
 # actually used below is `source_dir / 'build'` - confirm which is intended.
 os.environ['SOURCE_DIR'] = '/work/dlfcn-win32'
 os.environ['BUILD_DIR'] = '/dlfcn-output/build'

 output_dir = Path('/dlfcn-output')
 # strict=True: the source checkout has to exist already.
 source_dir = Path(os.environ['SOURCE_DIR']).resolve(strict=True)
 build_dir = source_dir.joinpath('build')

 if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)

 CCACHE_EXECUTABLE = "ccache"

 # --- Setup ---
 caches_dir = Path("/caches")
 ccache_dir = caches_dir.joinpath("ccache")

 print("--- Configuration ---")
 for label, shown in (
    ("Source Directory", source_dir),
    ("Build Directory", build_dir),
    ("Output Directory", output_dir),
    ("Cache Directory", caches_dir),
    ("Script Arguments", sys.argv[1:]),
 ):
    print(f"{label}: {shown}")
 print("---------------------")

 def should_change_caches_dir():
    """Return whether the cache location should be overridden (True on Windows)."""
    if sys.platform == 'win32':
        return True
    return False

 # Point ccache at the shared cache directory when the platform check asks for it.
 override_cache_dir = should_change_caches_dir()
 if override_cache_dir:
    print("Ensuring directories exist...")
    ccache_dir.mkdir(parents=True, exist_ok=True)

 print("Setting environment variables...")
 if override_cache_dir:
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())

 # Both compiler launchers are set (or cleared) together.
 use_ccache = True
 launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
 if not use_ccache:
    print("Skipping ccache configuration.")
    for launcher_var in launcher_vars:
        os.environ.pop(launcher_var, None)
 else:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for launcher_var in launcher_vars:
        os.environ[launcher_var] = CCACHE_EXECUTABLE

 # Report the effective values (None when a variable is unset).
 for shown_var in ('CCACHE_DIR',) + launcher_vars:
    print(f"{shown_var} = {os.environ.get(shown_var)}")

 # --- Helper function to run commands ---
 def run_command(cmd_list, cwd=None):
    """Print, run and time one external command; exit the script if it fails.

    :param cmd_list: argument vector, program first.
    :param cwd: optional working directory for the command.
    :return: the ``subprocess.CompletedProcess`` of the successful run.

    A non-zero exit status ends the script with that same status; a missing
    executable or any other exception ends it with status 1.
    """
    command_text = ' '.join(map(str, cmd_list))
    print(f"\n--- Executing: {command_text} ---", flush=True)

    clock_start = time.monotonic()
    try:
        # The argument list is passed straight through (no shell involved);
        # check=True raises CalledProcessError on a non-zero exit status.
        outcome = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as exc:
        exit_status = exc.returncode
        print(f"ERROR: Command failed with exit code {exit_status}", file=sys.stderr)
        sys.exit(exit_status)
    except Exception as exc:
        print(f"ERROR: An unexpected error occurred: {exc}", file=sys.stderr)
        sys.exit(1)
    clock_stop = time.monotonic()

    seconds = format(clock_stop - clock_start, '.2f')
    print(f"--- Command finished successfully in {seconds} seconds ---", flush=True)
    return outcome

 # --- Build Steps ---

 # Resolve the directories once; the three CMake invocations share them.
 resolved_prefix = str(output_dir.resolve())
 resolved_source = str(source_dir.resolve())
 resolved_build = str(build_dir.resolve())

 # 1. Configure (extra script arguments are forwarded to this step only).
 cmd = [
    "cmake",
    # "--trace",
    "-GNinja",
    "-DBUILD_SHARED_LIBS=ON",
    "-DCMAKE_BUILD_TYPE=Release",
    f"-DCMAKE_INSTALL_PREFIX={resolved_prefix}",
    "-S", resolved_source,
    "-B", resolved_build,
 ] + sys.argv[1:]
 run_command(cmd)

 # 2. Build, then 3. install, both operating on the configured build tree.
 for cmake_mode in ("--build", "--install"):
    cmd = ["cmake", cmake_mode, resolved_build]
    run_command(cmd)

 print("\nBuild script completed successfully.")
diff --git a/llvm-build.py b/llvm-build.py
 #!/usr/bin/env python

 import os
 import sys
 import subprocess
 import time
 from pathlib import Path

 # --- Configuration ---
 # Publish the fixed locations through the environment, then read them back
 # as absolute paths.
 os.environ.update(
    LLVM_OUTPUT_DIR='/llvm-output',
    LLVM_SOURCE_DIR='/work/llvm-for-triton',
 )
 output_dir = Path(os.environ['LLVM_OUTPUT_DIR']).resolve()
 source_dir = Path(os.environ['LLVM_SOURCE_DIR']).resolve()

 CCACHE_EXECUTABLE = "ccache"

 # --- Setup ---

 build_dir = output_dir.joinpath("build")
 # The caches directory is a sibling of the output directory.
 caches_dir = output_dir.joinpath("..", "caches")
 ccache_dir = caches_dir.joinpath("ccache")

 print("--- Configuration ---")
 for label, shown in (
    ("Source Directory", source_dir),
    ("Output Base Directory", output_dir),
    ("Build Directory", build_dir),
    ("Cache Directory", caches_dir),
    ("Script Arguments", sys.argv[1:]),
 ):
    print(f"{label}: {shown}")
 print("---------------------")

 print("Ensuring directories exist...")
 for needed_dir in (ccache_dir, build_dir):  # build dir is created early as well
    needed_dir.mkdir(parents=True, exist_ok=True)

 print("Setting environment variables...")
 os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())

 # Compiler launchers are only set when ccache is wanted; otherwise any
 # inherited values are removed.
 use_ccache = True # Set to False to disable ccache
 launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
 if use_ccache:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for launcher_var in launcher_vars:
        os.environ[launcher_var] = CCACHE_EXECUTABLE
 else:
    print("Skipping ccache configuration.")
    for launcher_var in launcher_vars:
        os.environ.pop(launcher_var, None)

 # Report the effective values (None when a variable is unset).
 for shown_var in ('CCACHE_DIR',) + launcher_vars:
    print(f"{shown_var} = {os.environ.get(shown_var)}")

 # --- Helper function to run commands ---
 def run_command(cmd_list, cwd=None):
    """Run ``cmd_list`` in ``cwd``, logging the command and how long it took.

    Returns the ``subprocess.CompletedProcess`` when the command exits with
    status 0. Otherwise the script is terminated through ``sys.exit``: with
    the command's own status if it ran and failed, or with 1 if it could not
    be run at all.
    """
    print("\n--- Executing: {} ---".format(' '.join(map(str, cmd_list))), flush=True)
    started_at = time.monotonic()
    try:
        # No shell is involved (shell=False is the default); check=True maps
        # a non-zero exit status to CalledProcessError.
        proc = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print("ERROR: Command not found: {}. Is it installed and in PATH?".format(cmd_list[0]), file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as called:
        print("ERROR: Command failed with exit code {}".format(called.returncode), file=sys.stderr)
        sys.exit(called.returncode)
    except Exception as other:
        print("ERROR: An unexpected error occurred: {}".format(other), file=sys.stderr)
        sys.exit(1)

    duration = time.monotonic() - started_at
    print("--- Command finished successfully in {:.2f} seconds ---".format(duration), flush=True)
    return proc


 # --- Build Steps ---

 # https://github.com/triton-lang/triton?tab=readme-ov-file#building-with-a-custom-llvm
 # 1. CMake Configure Step
 # Every option is its own list element. Adjacent string literals without a
 # separating comma are concatenated by Python, which previously fused the two
 # LLVM_FORCE_VC_* options and "-S" into a single bogus argument.
 configure_cmd = [
    "cmake",
    "-GNinja",
    "-DCMAKE_BUILD_TYPE=Release",
    "-DLLVM_ENABLE_ASSERTIONS=ON",
    "-DLLVM_ENABLE_PROJECTS=mlir;llvm;lld",
    "-DLLVM_TARGETS_TO_BUILD=host;NVPTX;AMDGPU",
    "-DLLVM_FORCE_VC_REPOSITORY=llvm-for-triton",
    "-DLLVM_FORCE_VC_REVISION=rev-for-triton",
    "-S", str((source_dir / "llvm").resolve()),
    "-B", str(build_dir.resolve()),
 ]
 configure_cmd.extend(sys.argv[1:]) # Add extra arguments from script call
 run_command(configure_cmd)

 # 2. CMake Build Step
 cmake_build_cmd = [
    "cmake",
    "--build", str(build_dir.resolve())
 ]
 # Add parallel build flag common on Windows (optional)
 # Get number of processors, leave one free
 cpu_count = os.cpu_count()
 if cpu_count and cpu_count > 1:
    jobs = cpu_count - 1
    cmake_build_cmd.extend(["--", f"-j{jobs}"]) # Pass '-jN' to underlying Ninja
    print(f"Using parallel build flag: -j{jobs}")

 run_command(cmake_build_cmd)

 print("\nBuild script completed successfully.")
diff --git a/lzma-build.py b/lzma-build.py
 #!/usr/bin/env python

 # Build liblzma for aotriton

 import os
 import sys
 import subprocess
 import time
 from pathlib import Path

 # https://github.com/tukaani-project/xz

 # --- Configuration ---

 # NOTE(review): XZ_BUILD_DIR is exported to the environment, but the build
 # tree actually used below is `source_dir / 'build'` - confirm which is intended.
 os.environ['XZ_SOURCE_DIR'] = '/work/xz'
 os.environ['XZ_BUILD_DIR'] = '/aotriton-output/build'

 output_dir = Path('/xz-output')
 # strict=True: the xz checkout has to exist already.
 source_dir = Path(os.environ['XZ_SOURCE_DIR']).resolve(strict=True)
 build_dir = source_dir.joinpath('build')

 if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)

 CCACHE_EXECUTABLE = "ccache"

 # --- Setup ---
 caches_dir = Path("/caches")
 ccache_dir = caches_dir.joinpath("ccache")

 summary_lines = [
    "--- Configuration ---",
    f"Source Directory: {source_dir}",
    f"Build Directory: {build_dir}",
    f"Output Directory: {output_dir}",
    f"Cache Directory: {caches_dir}",
    f"Script Arguments: {sys.argv[1:]}",
    "---------------------",
 ]
 for summary_line in summary_lines:
    print(summary_line)

 def should_change_caches_dir():
    """Return True on Windows, the only platform where the cache dir is overridden."""
    return sys.platform in ('win32',)

 # Create and export the ccache directory only when the platform check asks for it.
 move_ccache_dir = should_change_caches_dir()
 if move_ccache_dir:
    print("Ensuring directories exist...")
    ccache_dir.mkdir(parents=True, exist_ok=True)

 print("Setting environment variables...")
 if move_ccache_dir:
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())

 use_ccache = True
 if use_ccache:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    # Same launcher for the C and the C++ compiler.
    os.environ.update(
        CMAKE_C_COMPILER_LAUNCHER=CCACHE_EXECUTABLE,
        CMAKE_CXX_COMPILER_LAUNCHER=CCACHE_EXECUTABLE,
    )
 else:
    print("Skipping ccache configuration.")
    # Drop launchers inherited from the calling environment.
    for stale_var in ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER'):
        os.environ.pop(stale_var, None)

 # Report the effective values (None when a variable is unset).
 for shown_var in ('CCACHE_DIR', 'CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER'):
    print("{} = {}".format(shown_var, os.environ.get(shown_var)))

 # --- Helper function to run commands ---
 def run_command(cmd_list, cwd=None):
    """Echo a command, run it to completion and report its duration.

    Parameters:
        cmd_list -- the program and its arguments as a list.
        cwd      -- working directory for the command (None keeps the current one).

    Returns the ``subprocess.CompletedProcess`` on success. Failures never
    return: the script exits with the command's exit code, or with 1 when
    the command is missing or another exception occurs.
    """
    shown_command = ' '.join(str(token) for token in cmd_list)
    print(f"\n--- Executing: {shown_command} ---", flush=True)
    tick = time.monotonic()

    try:
        # check=True raises CalledProcessError on a non-zero exit status;
        # the list form is executed without a shell.
        finished = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        missing = cmd_list[0]
        print(f"ERROR: Command not found: {missing}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as bad_exit:
        print(f"ERROR: Command failed with exit code {bad_exit.returncode}", file=sys.stderr)
        sys.exit(bad_exit.returncode)
    except Exception as problem:
        print(f"ERROR: An unexpected error occurred: {problem}", file=sys.stderr)
        sys.exit(1)
    else:
        tock = time.monotonic()
        print(f"--- Command finished successfully in {tock - tick:.2f} seconds ---", flush=True)
        return finished

 # --- Build Steps ---

 # Absolute build-tree path shared by the configure, build and install steps.
 build_tree = str(build_dir.resolve())

 # Configure: Ninja generator, release build, shared liblzma, XZ_NLS switched
 # off, installing into output_dir. Extra script arguments are appended.
 configure_options = [
    # "--trace",
    "-GNinja",
    "-DXZ_NLS=OFF",
    "-DBUILD_SHARED_LIBS=ON",
    "-DCMAKE_BUILD_TYPE=Release",
    f"-DCMAKE_INSTALL_PREFIX={output_dir.resolve()}",
    "-S", str(source_dir.resolve()),
    "-B", build_tree,
 ]
 cmd = ["cmake", *configure_options, *sys.argv[1:]]
 run_command(cmd)

 # Build.
 cmd = ["cmake", "--build", build_tree]
 run_command(cmd)

 # Install.
 cmd = ["cmake", "--install", build_tree]
 run_command(cmd)

 print("\nBuild script completed successfully.")
diff --git a/pytorch-build.py b/pytorch-build.py
 #!/usr/bin/env python

 import os
 import sys
 import subprocess
 import time
 from pathlib import Path

 # --- Configuration ---

 AMDGPU_TARGETS = 'gfx1151'
 # We need to install `torch` in the virtual env so we need to create the venv and specify
 # the python version used
 PYTHON_VER = '3.13'

 fork_name = 'pytorch-st'
 # Both platforms currently use the same values for these two switches.
 USE_CMAKE = True
 use_ccache = False
 if sys.platform == 'win32':
    # Sources and build output live in the same checkout directory.
    source_dir = Path(f'/w/{fork_name}')
    out_dir = Path(f'/w/{fork_name}')
    os.environ['AOTRITON_INSTALLED_PREFIX'] = '/o/aotriton-0.9-gfx1151-windows/build/install_dir'
    os.environ['CMAKE_PREFIX_PATH'] = '/o/r-st-gfx1151/build/dist/rocm'
    # NOTE: Python version of Triton wheel must match Python version used to build AOTriton if Triton is used
    os.environ['CC'] = 'clang-cl'
    os.environ['CXX'] = 'clang-cl'
 else:
    checkout = os.path.expanduser(f'~/w/{fork_name}')
    source_dir = Path(checkout)
    out_dir = Path(checkout)
    os.environ['AOTRITON_INSTALLED_PREFIX'] = os.path.expanduser('~/aotriton-windows-for-merge-output/build/install_dir')
    os.environ['CMAKE_PREFIX_PATH'] = os.path.expanduser('~/therock-output-gfx1151/build/dist/rocm')
 build_dir = out_dir / "build"
 venv_dir = out_dir / '.venv'

 # Build switches exported through the environment.
 os.environ.update({
    'USE_KINETO': 'OFF',
    'PYTORCH_ROCM_ARCH': AMDGPU_TARGETS,
    'USE_ROCM': 'ON',
    'BUILD_TEST': '0',
    'USE_FLASH_ATTENTION': 'ON',
    'USE_MEM_EFF_ATTENTION': 'ON',
    'DISTUTILS_USE_SDK': '1',
 })

 # strict=True: the PyTorch checkout has to exist already.
 source_dir = source_dir.resolve(strict=True)
 build_dir = build_dir.resolve()
 if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)

 CCACHE_EXECUTABLE = "ccache"

 # --- Setup ---
 caches_dir = Path("/caches")
 ccache_dir = caches_dir / "ccache"
 pip_cache_dir = caches_dir / "pip"

 print("--- Configuration ---")
 print("Source Directory: {}".format(source_dir))
 print("Cache Directory: {}".format(caches_dir))
 print("Script Arguments: {}".format(sys.argv[1:]))
 print("---------------------")

 def should_change_caches_dir():
    """Report whether the cache directories are redirected; that is the case on Windows only."""
    return not (sys.platform != 'win32')

 # Redirect the ccache and pip caches only when the platform check asks for it.
 caches_redirected = should_change_caches_dir()
 if caches_redirected:
    print("Ensuring directories exist...")
    for cache_path in (ccache_dir, pip_cache_dir):
        cache_path.mkdir(parents=True, exist_ok=True)

 print("Setting environment variables...")
 if caches_redirected:
    for cache_var, cache_path in (('CCACHE_DIR', ccache_dir), ('PIP_CACHE_DIR', pip_cache_dir)):
        os.environ[cache_var] = str(cache_path.resolve())

 # The compiler launchers follow the `use_ccache` switch from the configuration.
 launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
 if use_ccache:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for launcher_var in launcher_vars:
        os.environ[launcher_var] = CCACHE_EXECUTABLE
 else:
    print("Skipping ccache configuration.")
    for launcher_var in launcher_vars:
        os.environ.pop(launcher_var, None)

 # Report the effective values (None when a variable is unset).
 for shown_var in ('CCACHE_DIR', 'PIP_CACHE_DIR') + launcher_vars:
    print(f"{shown_var} = {os.environ.get(shown_var)}")

 # --- Helper function to run commands ---
 def run_command(cmd_list, cwd=None):
    """Run one build command with logging, timing and fail-fast error handling.

    ``cmd_list`` is the argument vector (its items may be any objects that
    ``subprocess.run`` accepts; they are stringified only for the log line).
    ``cwd`` optionally selects the working directory.

    Returns the ``subprocess.CompletedProcess`` on success. On failure the
    script exits: with the command's exit code when it ran and failed, with
    1 when it was not found or another exception occurred.
    """
    pieces = [str(item) for item in cmd_list]
    print(f"\n--- Executing: {' '.join(pieces)} ---", flush=True)
    t_begin = time.monotonic()
    try:
        # The list form with the default shell=False is executed directly;
        # check=True raises CalledProcessError on a non-zero exit status.
        process = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        not_found_msg = f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?"
        print(not_found_msg, file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as cpe:
        failed_msg = f"ERROR: Command failed with exit code {cpe.returncode}"
        print(failed_msg, file=sys.stderr)
        sys.exit(cpe.returncode)
    except Exception as exc:
        unexpected_msg = f"ERROR: An unexpected error occurred: {exc}"
        print(unexpected_msg, file=sys.stderr)
        sys.exit(1)

    took = time.monotonic() - t_begin
    print(f"--- Command finished successfully in {took:.2f} seconds ---", flush=True)
    return process

 # --- Build Steps ---

 # Create the virtual environment and install the requirements into it.
 cmd = [
    "uv",
    "venv",
    "--python", PYTHON_VER,
    str(venv_dir),
 ]
 run_command(cmd)

 requirements_file = source_dir / "requirements.txt"
 cmd = [
    "uv",
    "pip",
    "install", "-r", str(requirements_file),
    "--python", str(venv_dir),
 ]
 run_command(cmd)

 # Where the venv keeps its interpreter depends on the platform.
 if sys.platform == 'win32':
    venv_bin_dir = venv_dir / 'Scripts'
    venv_python_bin = venv_bin_dir / 'python.exe'
 else:
    venv_bin_dir = venv_dir / 'bin'
    venv_python_bin = venv_bin_dir / 'python'

 if USE_CMAKE:
    # Let setup.py generate the CMake build tree first ...
    cmd = [
        venv_python_bin,
        "setup.py",
        "build", "--cmake-only",
    ]
    run_command(cmd, cwd=source_dir)

    # ... then run CMake on it with this script's options and any extra
    # arguments given on the command line.
    cmake_options = [
        #"--trace",
        "-GNinja",
        "-DCMAKE_BUILD_TYPE=Release",
        f"-DPython_EXECUTABLE={venv_python_bin}",
        f"-DPYTORCH_ROCM_ARCH={AMDGPU_TARGETS}",
        "-DUSE_ROCM=ON",
        "-DUSE_KINETO=OFF",
        "-DUSE_FLASH_ATTENTION=ON",
        "-S", str(source_dir.resolve()),
        "-B", str(build_dir.resolve()),
    ]
    cmd = ["cmake"] + cmake_options + sys.argv[1:]
    run_command(cmd)
    # sys.exit(17)

 # Build the wheel with the venv's interpreter from inside the source tree.
 cmd = [
    venv_python_bin,
    "setup.py",
    "bdist_wheel",
 ]
 run_command(cmd, cwd=source_dir)

 print("\nBuild script completed successfully.")
diff --git a/sdpa-bench.py b/sdpa-bench.py
 #!/usr/bin/env python

 # https://github.com/ROCm/TheRock/discussions/244#discussioncomment-12926010

 import torch
 import time
 from torch.nn.functional import scaled_dot_product_attention
 from torch.nn.attention import SDPBackend, sdpa_kernel

 # Stop right away when PyTorch reports no usable GPU.
 gpu_available = torch.cuda.is_available()
 if not gpu_available:
    raise SystemExit("CUDA GPU is not available. Please run on a CUDA-enabled device.")

 # Module-wide device handle used by the benchmark functions.
 device = torch.device("cuda")
 torch.cuda.init()  # Initialize CUDA context (optional, helps measure baseline)

 # Helper function for measuring one run
 def measure_op(op_func, warmup=3, total_runs=10):
    """
    Run ``op_func`` repeatedly and average its metrics over the runs after warm-up.

    op_func: a callable taking no arguments that runs the operation (including
             memory measurement) and returns (time_ms, peak_mem_MB, gflops_s).
    warmup: number of leading warm-up runs whose results are discarded.
    total_runs: total runs to do, including warmup; must be larger than warmup.
    Returns: average_time_ms, average_peak_mem_MB, average_gflops_s over the runs after warm-up.
    Raises: ValueError if no run would remain after discarding the warm-up runs.
    """
    # Without at least one measured run the averages below would divide by
    # zero, so reject that configuration up front with a clear message.
    measured_runs = total_runs - max(warmup, 0)
    if measured_runs <= 0:
        raise ValueError(
            f"total_runs ({total_runs}) must be greater than warmup ({warmup}) "
            "so that at least one run is measured"
        )

    times = []
    mems = []
    flops = []

    for run_idx in range(total_runs):
        # Reset peak memory stats at the start of each run so every run
        # reports its own peak.
        torch.cuda.reset_peak_memory_stats(device)

        t_ms, peak_mb, gf_s = op_func()

        if run_idx >= warmup:
            times.append(t_ms)
            mems.append(peak_mb)
            flops.append(gf_s)

    count = len(times)
    return sum(times) / count, sum(mems) / count, sum(flops) / count


 # 1) Define the Scaled Dot-Product Attention test
 def run_sdpa():
    """Run one timed scaled-dot-product-attention call with the flash backend.

    Allocates fresh FP16 q/k/v tensors on the module-level ``device``, times a
    single ``scaled_dot_product_attention`` call between two device
    synchronizations and derives the throughput from a fixed FLOP count.

    Returns:
        (time_ms, peak_mem_mb, gflops_s) for this run.
    """
    # Configuration: q has shape (B, heads, L, E); k and v have (B, heads, S, E).
    B, heads = 1, 8
    L = 8192
    E = 64
    S = L

    # Create random Q, K, V in half precision
    # We place them inside the function so each run re-allocates
    # new memory (to measure peak memory usage properly).
    q = torch.randn(B, heads, L, E, device=device, dtype=torch.float16)
    k = torch.randn(B, heads, S, E, device=device, dtype=torch.float16)
    v = torch.randn(B, heads, S, E, device=device, dtype=torch.float16)

    # Start timing. perf_counter() is the high-resolution monotonic clock
    # meant for measuring short durations; time.time() is a wall clock that
    # can be coarse or jump.
    torch.cuda.synchronize()
    start_time = time.perf_counter()

    # Run scaled dot-product attention (Flash Attention backend)
    with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
        out = scaled_dot_product_attention(q, k, v)

    # Synchronize & end timing
    torch.cuda.synchronize()
    elapsed_s = time.perf_counter() - start_time

    # Measure time
    time_ms = elapsed_s * 1000.0

    # Peak memory usage (MB)
    peak_mem_bytes = torch.cuda.max_memory_allocated(device)
    peak_mem_mb = peak_mem_bytes / (1024**2)

    # Compute FLOPs for scaled dot-product attention:
    # Q*K^T -> 2 * B * heads * L * S * E
    # Attn*V -> 2 * B * heads * L * S * E
    # Total = 4 * B * heads * L * S * E
    flops = 4.0 * B * heads * L * S * E
    # Convert to GFLOPs/s
    flops_s = flops / elapsed_s
    gflops_s = flops_s / 1e9

    return time_ms, peak_mem_mb, gflops_s

 # Run the measurements
 print("Benchmarking Scaled Dot-Product Attention (Flash) in FP16 ...")
 # 3 warm-up runs are discarded; the averages cover the remaining 7 runs.
 warmup = 3
 sdpa_time, sdpa_mem, sdpa_gflops = measure_op(run_sdpa, warmup=warmup, total_runs=warmup+7)
 print(f"Average time: {sdpa_time:.2f} ms")
 print(f"Average peak memory: {sdpa_mem:.2f} MB")
 print(f"Average throughput: {sdpa_gflops:.2f} GFLOP/s\n")
diff --git a/sdpa-test.py b/sdpa-test.py
 # https://rocm.blogs.amd.com/artificial-intelligence/flash-attention/README.html

 import numpy as np
 import torch
 import torch.nn.functional as F
 from torch.nn.attention import SDPBackend, sdpa_kernel

 class NaiveSdpaForDevice():
    """Eager-mode reference implementation of scaled dot-product attention.

    The instance only stores the device on which the causal mask is created.
    """

    def __init__(self, device):
        self.device = device

    def scaled_dot_product_attention(self, query, key, value, attn_mask=None, is_causal=False, dropout_p=0.0, scale=None):
        """
        Computes the scaled dot product attention between query, key, and value tensors in PyTorch eager mode.

        Args:
            query (torch.Tensor): The query tensor of shape (batch_size, n_heads, q_len, hidden_dim).
            key (torch.Tensor): The key tensor of shape (batch_size, n_heads, kv_len, hidden_dim).
            value (torch.Tensor): The value tensor of shape (batch_size, n_heads, kv_len, hidden_dim).
            attn_mask (torch.Tensor, optional): Mask broadcastable to (batch_size, n_heads, q_len, kv_len).
                A boolean mask keeps the positions that are True; any other dtype is added to the
                attention scores. Ignored when is_causal is True. Defaults to None.
            is_causal (bool, optional): Whether to apply a causal attention mask. Defaults to False.
            dropout_p (float, optional): The dropout probability. Dropout is applied with train=False,
                so it never changes the result. Defaults to 0.
            scale (float, optional): The scale factor for the dot product. Defaults to None,
                meaning 1 / sqrt(hidden_dim).

        Returns:
            torch.Tensor: The output tensor of shape (batch_size, n_heads, q_len, hidden_dim).
        """

        # Calculate the scale factor
        scale_factor = 1 / np.sqrt(query.size(-1)) if scale is None else scale
        attn_weight = (query @ key.transpose(-2, -1) * scale_factor)

        if is_causal:
            # Lower-triangular (q_len, kv_len) mask, broadcast over batch and heads.
            # Using the key length for the last dimension keeps this valid when the
            # query and key sequence lengths differ.
            attn_mask = torch.ones(query.size(-2), key.size(-2), dtype=torch.bool,
                                   device=self.device).tril(diagonal=0)

        if attn_mask is not None:
            if attn_mask.dtype == torch.bool:
                # attn_weight is a fresh tensor, so the in-place fill is safe.
                attn_weight = attn_weight.masked_fill_(~attn_mask, -torch.inf)
            else:
                # Non-boolean masks are additive biases on the scores.
                attn_weight = attn_weight + attn_mask

        # Compute the scaled dot product attention
        attn_weight = torch.softmax(attn_weight, dim=-1)
        attn_weight = torch.dropout(attn_weight, dropout_p, train=False)

        return attn_weight @ value

 def test(n_times):
    """
    Compare the Flash Attention backend against the naive reference on random inputs.

    n_times: approximate total number of comparisons; it is split evenly across
             the two dtypes and all head dimensions (integer division).
    Raises: RuntimeError on the first result outside the per-dtype tolerance.
    """
    batch_size = 1
    seq_len = 64
    num_heads = 32
    device = torch.device("cuda")
    embed_dims = [16, 32, 48, 64, 80, 96, 128, 160, 192, 224, 256]

    # The reference implementation holds no per-call state, so build it once.
    naive = NaiveSdpaForDevice(device)

    times = n_times // 2 // len(embed_dims)
    # epsilon is used as both rtol and atol; bfloat16 gets the looser bound.
    for (dtype, epsilon) in [(torch.float16, 1e-03),
                             (torch.bfloat16, 1e-02)]:
        for embed_dim in embed_dims:
            for i in range(times):
                query = torch.rand(batch_size, num_heads, seq_len, embed_dim, device=device, dtype=dtype)
                key = torch.rand(batch_size, num_heads, seq_len, embed_dim, device=device, dtype=dtype)
                value = torch.rand(batch_size, num_heads, seq_len, embed_dim, device=device, dtype=dtype)
                want = naive.scaled_dot_product_attention(query, key, value, is_causal=True)
                with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
                    got = F.scaled_dot_product_attention(query, key, value, is_causal=True)
                if not torch.allclose(want, got, rtol=epsilon, atol=epsilon):
                    raise RuntimeError(f'dtype: {dtype}, test_iteration: {i}')

 # n_times=1000 gives 45 iterations for each of the 2 dtypes x 11 head dimensions.
 test(1000)
diff --git a/setup.ps1 b/setup.ps1
 # LLVM for triton-lshqqytiger
 git clone git@github.com:llvm/llvm-project.git llvm-for-triton
 cd llvm-for-triton
 # Use llvm version specified in https://github.com/lshqqytiger/triton/blob/main/cmake/llvm-hash.txt
 git checkout 1cec5fffd8fddd9d85b516f876093b0e3f0eec5f
 cd ..

 # use_triton_fork = 'triton-lshqqytiger'
 git clone git@github.com:lshqqytiger/triton.git triton-lshqqytiger
 cd triton-lshqqytiger
 uv venv --python=3.13
 # On Windows the virtual environment's scripts live in ".venv\Scripts"
 .venv\Scripts\Activate.ps1
 uv pip install -r python\requirements.txt
 python llvm-build.py
 #
 # NOTE: triton's setup.py wants to create symlinks which requires setting "Dev mode" to true on Windows 11
 #       when using from a non-admin user account
 #
 # Edit `triton-build-fork.py` and set `use_triton_fork=triton-lshqqytiger`
 python triton-build-fork.py

 # use_triton_fork = 'triton-windows'
 git clone git@github.com:woct0rdho/triton-windows.git
 cd triton-windows
 git checkout v3.3.x-windows
 uv venv --python=3.13
 # On Windows the virtual environment's scripts live in ".venv\Scripts"
 .venv\Scripts\Activate.ps1
 uv pip install -r python\requirements.txt
 #
 # NOTE: triton's setup.py wants to create symlinks which requires setting "Dev mode" to true on Windows 11
 #       when using from a non-admin user account
 #
 # Edit `triton-build-fork.py` and set `use_triton_fork=triton-windows`
 python triton-build-fork.py

 # To build aotriton, for AMD GPUs, use triton-lshqqytiger from above
 python lzma-build.py
 git clone git@github.com:scottt/aotriton.git
 cd aotriton
 git checkout windows
 cd ..
 python aotriton-build.py
diff --git a/triton-build-fork.py b/triton-build-fork.py
 #!/usr/bin/env python

 import os
 import sys
 import subprocess
 import time
 from pathlib import Path

 # use_triton_fork = 'triton-lshqqytiger'
 use_triton_fork = 'triton-windows'

 def run_command(cmd_list, cwd=None):
    """Echo a command, run it to completion and report how long it took.

    cmd_list: program followed by its arguments (items are str()-ed for display).
    cwd: working directory for the child process; None keeps the current one.
    Returns the subprocess.CompletedProcess on success. On failure the script
    exits: with the child's exit code if the command returned non-zero,
    otherwise with 1.
    """
    printable = ' '.join(str(arg) for arg in cmd_list)
    print(f"\n--- Executing: {printable} ---", flush=True)
    started = time.monotonic()
    try:
        completed = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as err:
        print(f"ERROR: Command failed with exit code {err.returncode}", file=sys.stderr)
        sys.exit(err.returncode)
    except Exception as err:
        print(f"ERROR: An unexpected error occurred: {err}", file=sys.stderr)
        sys.exit(1)

    elapsed = time.monotonic() - started
    print(f"--- Command finished successfully in {elapsed:.2f} seconds ---", flush=True)
    return completed

 def build():
    """Export the paths for the selected Triton fork, then run triton-build.py.

    The sibling script triton-build.py (looked up next to this script) is
    started with the current interpreter; it inherits the environment
    variables set here.
    """
    env = os.environ
    script_dir = Path(os.path.dirname(sys.argv[0]))

    if sys.platform != 'win32':
        env['JSON_SYSPATH'] = os.path.expanduser('~/work/nlohmann-json-3.11.3')
        env['TRITON_SOURCE_DIR'] = os.path.expanduser('~/work/triton')
    else:
        # nlohmann-json from https://github.com/nlohmann/json/releases/download/v3.11.3/include.zip
        env['JSON_SYSPATH'] = os.path.expanduser('/work/nlohmann-json-3.11.3')
        # Both triton forks are built with "MSVC v143 x64"
        # That's the default toolchain in Visual Studio 2022 Community Edition as of 2025-05
        env['TRITON_BUILD_WITH_CLANG_LLD'] = 'OFF'
        if use_triton_fork == 'triton-windows':
            triton_src = '/work/triton-windows'
            # LLVM from https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-a66376b0-windows-x64.tar.gz
            llvm_root = '/caches/.triton/llvm/llvm-a66376b0-windows-x64'
        else:
            triton_src = '/work/triton-lshqqytiger'
            # LLVM revision 1cec5fffd8fddd9d85b516f876093b0e3f0eec5f built from source
            # This uses the llvm "build" and implicitly the "source" directory contents, instead of an "install" tree
            llvm_root = '/llvm-output/build'
        env['TRITON_SOURCE_DIR'] = os.path.expanduser(triton_src)
        env['LLVM_SYSPATH'] = os.path.expanduser(llvm_root)

    run_command([sys.executable, script_dir / 'triton-build.py'])

 build()
diff --git a/triton-build.py b/triton-build.py
 #!/usr/bin/env python

 import os
 import sys
 import subprocess
 import time
 from pathlib import Path

 # --- Configuration ---
 # TRITON_SOURCE_DIR and JSON_SYSPATH must be set by the caller; LLVM_SYSPATH is optional.
 source_dir = Path(os.environ['TRITON_SOURCE_DIR']).resolve()

 llvm_syspath = os.environ.get('LLVM_SYSPATH')
 if llvm_syspath is not None:
    llvm_syspath = Path(llvm_syspath).resolve()
 json_syspath = Path(os.environ['JSON_SYSPATH']).resolve()

 CCACHE_EXECUTABLE = "ccache"

 # --- Setup ---
 caches_dir = Path("/caches")
 ccache_dir = caches_dir / "ccache"
 pip_cache_dir = caches_dir / "pip"

 # Change default from "$HOME/.triton" to "$caches_dir/.triton"
 # Triton always adds the ".triton" part
 triton_home_dir = caches_dir

 # Disable downloading of dependencies
 # pybind11 is installed via pip and requirements.txt

 cuda_dep_var_names = [
    'TRITON_PTXAS_PATH',
    'TRITON_CUOBJDUMP_PATH',
    'TRITON_NVDISASM_PATH',
    'TRITON_CUDACRT_PATH',
    'TRITON_CUDART_PATH',
    'TRITON_CUPTI_INCLUDE_PATH',
    'TRITON_CUPTI_LIB_PATH',
 ]
 # value is just a placeholder
 os.environ.update(dict.fromkeys(cuda_dep_var_names, '/bin/false'))

 # When an LLVM tree was given, export its include/ and lib/ directories.
 if llvm_syspath is not None:
    llvm_include_dir = llvm_syspath / "include"
    llvm_library_dir = llvm_syspath / "lib"
    os.environ['LLVM_INCLUDE_DIRS'] = str(llvm_include_dir.resolve())
    os.environ['LLVM_LIBRARY_DIR'] = str(llvm_library_dir.resolve())

 print("--- Configuration ---")
 print(f"Source Directory: {source_dir}")
 print(f"Cache Directory: {caches_dir}")
 for shown_var in ('LLVM_SYSPATH', 'LLVM_INCLUDE_DIRS', 'LLVM_LIBRARY_DIR'):
    print(f"{shown_var}: {os.environ.get(shown_var)}")
 print(f"Script Arguments: {sys.argv[1:]}")
 print("---------------------")

 def should_change_caches_dir():
    """Return True when running on Windows (sys.platform == 'win32'), where the cache directories are relocated."""
    running_on_windows = (sys.platform == 'win32')
    return running_on_windows

 # Create the relocated cache directories before any tool needs them.
 if should_change_caches_dir():
    print(f"Ensuring directories exist...")
    ccache_dir.mkdir(parents=True, exist_ok=True)
    pip_cache_dir.mkdir(parents=True, exist_ok=True)

 print("Setting environment variables...")
 if should_change_caches_dir():
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())
    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())
    # Change default from "$HOME/.triton" to "$TRITON_HOME/.triton" the ".triton" part is always added
    os.environ['TRITON_HOME'] = str(triton_home_dir.resolve())
 # TRITON_PARALLEL_LINK_JOBS
 # TRITON_BUILD_WITH_CCACHE: we control this manually with CMAKE_{C,CXX}_COMPILER_LAUNCHER

 # TRITRON_BUILD_WITH_O1 doesn't work on triton-v3.3.x
 # NOTE(review): the name is spelled "TRITRON"; Triton's setup.py presumably reads
 # TRITON_BUILD_WITH_O1, in which case this assignment has no effect. Confirm
 # before renaming, since the note above says the option does not work on v3.3.x.
 os.environ['TRITRON_BUILD_WITH_O1'] = 'ON'
 os.environ['TRITON_BUILD_PROTON'] = 'OFF'
 # os.environ['TRITON_BUILD_WITH_CLANG_LLD'] = 'OFF'
 # Triton C++ unit tests
 os.environ['TRITON_BUILD_UT'] = 'OFF'
 # TRITON_BUILD_BINARY is only defined in the 'triton-windows' fork, which requires it
 os.environ['TRITON_BUILD_BINARY'] = 'OFF'

 # Check if ccache is desired/available before setting launchers
 # (the flag is a hard-coded switch; no availability probe is done here)
 use_ccache = True # Set to False to disable ccache
 if use_ccache:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    os.environ['CMAKE_C_COMPILER_LAUNCHER'] = CCACHE_EXECUTABLE
    os.environ['CMAKE_CXX_COMPILER_LAUNCHER'] = CCACHE_EXECUTABLE
 else:
    print("Skipping ccache configuration.")
    # Ensure they are unset if they existed before
    os.environ.pop('CMAKE_C_COMPILER_LAUNCHER', None)
    os.environ.pop('CMAKE_CXX_COMPILER_LAUNCHER', None)

 # Echo the effective values (None means the variable is not set).
 print(f"CCACHE_DIR = {os.environ.get('CCACHE_DIR')}")
 print(f"PIP_CACHE_DIR = {os.environ.get('PIP_CACHE_DIR')}")
 print(f"CMAKE_C_COMPILER_LAUNCHER = {os.environ.get('CMAKE_C_COMPILER_LAUNCHER')}")
 print(f"CMAKE_CXX_COMPILER_LAUNCHER = {os.environ.get('CMAKE_CXX_COMPILER_LAUNCHER')}")

 # --- Helper function to run commands ---
 def run_command(cmd_list, cwd=None):
    """Print a command, execute it, and print the elapsed time on success.

    cmd_list: argument vector passed to subprocess.run without a shell.
    cwd: optional working directory for the child process.
    Returns the CompletedProcess. Any failure ends the script through
    sys.exit(): a non-zero child exit code is propagated, everything else
    exits with 1.
    """
    shown = ' '.join(map(str, cmd_list))
    print(f"\n--- Executing: {shown} ---", flush=True)
    t0 = time.monotonic()
    try:
        # No shell is involved (shell=False is the default), and check=True
        # turns a non-zero exit code into CalledProcessError (like set -e).
        result = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as failure:
        print(f"ERROR: Command failed with exit code {failure.returncode}", file=sys.stderr)
        sys.exit(failure.returncode)
    except Exception as failure:
        print(f"ERROR: An unexpected error occurred: {failure}", file=sys.stderr)
        sys.exit(1)

    t1 = time.monotonic()
    print(f"--- Command finished successfully in {t1 - t0:.2f} seconds ---", flush=True)
    return result


 # --- Build Steps ---

 # Locate the directory that holds Triton's setup.py: the repository root if
 # it is there, otherwise the "python" sub-directory.
 setup_py_dir = source_dir
 # triton (future) v3.4+ has "triton/setup.py"
 if not (setup_py_dir / "setup.py").exists():
    setup_py_dir = source_dir / "python"

 # Make the build fail here if 'setup.py' can't be found. An explicit check is
 # used because `assert` statements are skipped when Python runs with -O.
 if not (setup_py_dir / "setup.py").exists():
    print(f"ERROR: setup.py not found in {source_dir} or {source_dir / 'python'}", file=sys.stderr)
    sys.exit(1)

 # --no-build-isolation: https://github.com/triton-lang/triton?tab=readme-ov-file#tips-for-building
 cmd = [
    "uv", "pip", "install", "-vv", "--no-build-isolation", str(setup_py_dir.resolve()),
 ]
 cmd.extend(sys.argv[1:]) # Add extra arguments from script call
 run_command(cmd)

 print("\nBuild script completed successfully.")
	#!/usr/bin/env python

	import os
	import sys
	import subprocess
	import time
	from pathlib import Path

	# --- Configuration ---

	AOTRITON_TARGET_ARCH = 'gfx1151'
	PYTHON_VER = '3.13'
	AOTRITON_NOIMAGE_MODE = False

	# Source and build locations are hard-coded per platform and exported
	# through the environment before being read back below.
	if sys.platform == 'win32':
	    os.environ['AOTRITON_SOURCE_DIR'] = '/work/aotriton'
	    os.environ['AOTRITON_BUILD_DIR'] = '/aotriton-output/build'
	    # NOTE: Python version of Triton wheel must match Python version used to build AOTriton
	else:
	    os.environ['AOTRITON_SOURCE_DIR'] = os.path.expanduser('~/work/aotriton-0.9-gfx1151-windows')
	    os.environ['AOTRITON_BUILD_DIR'] = '/aotriton-windows-for-merge-output/build'

	# strict=True makes a missing source checkout fail right here.
	source_dir = Path(os.environ['AOTRITON_SOURCE_DIR']).resolve(strict=True)
	build_dir = Path(os.environ['AOTRITON_BUILD_DIR']).resolve()
	if not build_dir.exists():
	    build_dir.mkdir(parents=True, exist_ok=True)

	venv_dir = build_dir / 'venv'

	# --- Setup ---
	caches_dir = Path("/caches")
	pip_cache_dir = caches_dir / "pip"

	if sys.platform == 'win32':
	    os.environ['PKG_CONFIG'] = 'C:/Strawberry/perl/bin/pkg-config.bat'
	    os.environ['PKG_CONFIG_PATH'] = '/xz-output/lib/pkgconfig'

	print(f"--- Configuration ---")
	print(f"Source Directory: {source_dir}")
	print(f"Cache Directory: {caches_dir}")
	print(f"LLVM_SYSPATH: {os.environ.get('LLVM_SYSPATH')}")
	print(f"LLVM_INCLUDE_DIRS: {os.environ.get('LLVM_INCLUDE_DIRS')}")
	print(f"LLVM_LIBRARY_DIR: {os.environ.get('LLVM_LIBRARY_DIR')}")
	# Report the pkg-config variables this script actually sets
	# (PKG_CONFIG_DIR is never set here, so printing it always showed None).
	print(f"PKG_CONFIG: {os.environ.get('PKG_CONFIG')}")
	print(f"PKG_CONFIG_PATH: {os.environ.get('PKG_CONFIG_PATH')}")
	print(f"Script Arguments: {sys.argv[1:]}")
	print(f"---------------------")

	def should_change_caches_dir():
	    """Return True on Windows (sys.platform == 'win32'), where the pip cache is relocated."""
	    is_windows = (sys.platform == 'win32')
	    return is_windows

	# Create the relocated pip cache directory first ...
	if should_change_caches_dir():
	    print("Ensuring directories exist...")
	    pip_cache_dir.mkdir(parents=True, exist_ok=True)

	# ... then point pip at it through the environment.
	print("Setting environment variables...")
	if should_change_caches_dir():
	    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())

	print(f"PIP_CACHE_DIR = {os.environ.get('PIP_CACHE_DIR')}")

	# --- Helper function to run commands ---
	def run_command(cmd_list, cwd=None):
	    """Echo a command, run it, and print how long it took.

	    cmd_list: program and arguments, handed to subprocess.run without a shell.
	    cwd: optional working directory for the child process.
	    Returns the CompletedProcess on success; on failure the script exits
	    (child's exit code for a non-zero status, 1 otherwise).
	    """
	    command_text = ' '.join(str(part) for part in cmd_list)
	    print(f"\n--- Executing: {command_text} ---", flush=True)
	    begin = time.monotonic()
	    try:
	        # check=True raises CalledProcessError on non-zero exit code (like set -e)
	        finished = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
	    except FileNotFoundError:
	        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
	        sys.exit(1)
	    except subprocess.CalledProcessError as exc:
	        print(f"ERROR: Command failed with exit code {exc.returncode}", file=sys.stderr)
	        sys.exit(exc.returncode)
	    except Exception as exc:
	        print(f"ERROR: An unexpected error occurred: {exc}", file=sys.stderr)
	        sys.exit(1)

	    duration = time.monotonic() - begin
	    print(f"--- Command finished successfully in {duration:.2f} seconds ---", flush=True)
	    return finished

	# --- Build Steps ---

	# 1) Create the virtual environment that CMake is pointed at via VENV_DIR.
	cmd = [
	    "uv",
	    "venv",
	    "--python", PYTHON_VER,
	    str(venv_dir),
	]
	run_command(cmd)

	# Not read anywhere in this part of the script; kept as a module-level setting.
	build_python_bindings = True

	# 2) torch is installed into the venv only when images are built.
	if not AOTRITON_NOIMAGE_MODE:
	    # https://github.com/astral-sh/uv/issues/8721
	    cmd = [
	        "uv",
	        "pip",
	        "install", "torch",
	        "--python", str(venv_dir),
	    ]
	    run_command(cmd)

	# 3) aotriton 0.9 selects GPUs with TARGET_GPUS; 0.10 uses AOTRITON_TARGET_ARCH.
	use_aotriton_target_arch = True
	build_aotriton_09 = True
	if build_aotriton_09:
	    if AOTRITON_TARGET_ARCH == 'gfx1151':
	        use_aotriton_target_arch = False
	        TARGET_GPUS = 'Navi3.5'
	    else:
	        # Report the configured arch (the old message referenced an undefined name).
	        raise TypeError(f"Don't know GPU mapping for aotriton 0.9 for {AOTRITON_TARGET_ARCH}")

	# 4) Configure with CMake.
	cmd = [
	    "cmake",
	    #"--trace",
	    "-GNinja",
	    f"-DVENV_DIR={str(venv_dir)}",
	    f"-DCMAKE_INSTALL_PREFIX={str((build_dir / 'install_dir').resolve())}",
	    "-DCMAKE_BUILD_TYPE=Release",
	    "-DAOTRITON_GPU_BUILD_TIMEOUT=0",
	    # AOTRITON_NO_PYTHON must be OFF and AOTRITON_NAME_SUFFIX must be set to run the unit tests
	    "-DAOTRITON_NO_PYTHON=OFF",
	    "-DHIP_PLATFORM=amd",
	    # Different outer quotes keep this f-string valid before Python 3.12.
	    f"-DAOTRITON_NOIMAGE_MODE={'ON' if AOTRITON_NOIMAGE_MODE else 'OFF'}",
	    "-S", str(source_dir.resolve()),
	    "-B", str(build_dir.resolve()),
	    #"--debug-find",
	]

	if use_aotriton_target_arch: # aotriton-0.10
	    cmd.append(f"-DAOTRITON_TARGET_ARCH={AOTRITON_TARGET_ARCH}")
	else: # aotriton-0.9
	    cmd.append(f"-DTARGET_GPUS={TARGET_GPUS}")

	if sys.platform == 'win32':
	    cmd.append('-Ddlfcn-win32_DIR=/dlfcn-output/share/dlfcn-win32')

	cmd.extend(sys.argv[1:]) # Add extra arguments from script call
	run_command(cmd)

	# 5) Build and install with ninja, leaving one CPU free when there is more than one.
	cmd = ["ninja"]
	cpu_count = os.cpu_count()
	if cpu_count and cpu_count > 1:
	    jobs = cpu_count - 1
	    # ninja takes -jN directly; a "--" separator is only for "cmake --build".
	    cmd.append(f"-j{jobs}")
	    print(f"Using parallel build flag: -j{jobs}")
	cmd.append("install")

	run_command(cmd, cwd=build_dir)

	print("\nBuild script completed successfully.")
	#!/usr/bin/env python

	# Build liblzma for aotriton

	import os
	import sys
	import subprocess
	import time
	from pathlib import Path

	# https://github.com/tukaani-project/xz

	# --- Configuration ---

	output_dir = Path('/xz-output')
	os.environ['XZ_SOURCE_DIR'] = '/work/xz'
	# strict=True: stop immediately if the xz checkout is missing.
	source_dir = Path(os.environ['XZ_SOURCE_DIR']).resolve(strict=True)
	build_dir = source_dir / 'build'
	# NOTE(review): XZ_BUILD_DIR is not read back in this script and its value
	# differs from build_dir; presumably left over from another script - confirm.
	os.environ['XZ_BUILD_DIR'] = '/aotriton-output/build'

	if not build_dir.exists():
	    build_dir.mkdir(parents=True, exist_ok=True)

	CCACHE_EXECUTABLE = "ccache"

	# --- Setup ---
	caches_dir = Path("/caches")
	ccache_dir = caches_dir / "ccache"

	print("--- Configuration ---")
	print(f"Source Directory: {source_dir}")
	print(f"Build Directory: {build_dir}")
	print(f"Output Directory: {output_dir}")
	print(f"Cache Directory: {caches_dir}")
	print(f"Script Arguments: {sys.argv[1:]}")
	print("---------------------")

	def should_change_caches_dir():
	    """Return True on Windows (sys.platform == 'win32'), where the ccache directory is relocated."""
	    on_win32 = (sys.platform == 'win32')
	    return on_win32

	# Create the relocated ccache directory first ...
	if should_change_caches_dir():
	    print("Ensuring directories exist...")
	    ccache_dir.mkdir(parents=True, exist_ok=True)

	# ... then export its location.
	print("Setting environment variables...")
	if should_change_caches_dir():
	    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())

	# Compiler launchers are passed to CMake through the environment.
	use_ccache = True
	if not use_ccache:
	    print("Skipping ccache configuration.")
	    os.environ.pop('CMAKE_C_COMPILER_LAUNCHER', None)
	    os.environ.pop('CMAKE_CXX_COMPILER_LAUNCHER', None)
	else:
	    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
	    os.environ['CMAKE_C_COMPILER_LAUNCHER'] = CCACHE_EXECUTABLE
	    os.environ['CMAKE_CXX_COMPILER_LAUNCHER'] = CCACHE_EXECUTABLE

	print(f"CCACHE_DIR = {os.environ.get('CCACHE_DIR')}")
	print(f"CMAKE_C_COMPILER_LAUNCHER = {os.environ.get('CMAKE_C_COMPILER_LAUNCHER')}")
	print(f"CMAKE_CXX_COMPILER_LAUNCHER = {os.environ.get('CMAKE_CXX_COMPILER_LAUNCHER')}")

	# --- Helper function to run commands ---
	def run_command(cmd_list, cwd=None):
	    """Announce a command, run it, and report its duration.

	    cmd_list: program and arguments (no shell is used).
	    cwd: optional working directory for the child process.
	    Returns the CompletedProcess on success. On failure the script exits:
	    with the child's exit code when it is non-zero, otherwise with 1.
	    """
	    display = ' '.join(map(str, cmd_list))
	    print(f"\n--- Executing: {display} ---", flush=True)
	    start = time.monotonic()
	    try:
	        # check=True raises CalledProcessError on non-zero exit code (like set -e)
	        outcome = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
	    except FileNotFoundError:
	        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
	        sys.exit(1)
	    except subprocess.CalledProcessError as problem:
	        print(f"ERROR: Command failed with exit code {problem.returncode}", file=sys.stderr)
	        sys.exit(problem.returncode)
	    except Exception as problem:
	        print(f"ERROR: An unexpected error occurred: {problem}", file=sys.stderr)
	        sys.exit(1)

	    took = time.monotonic() - start
	    print(f"--- Command finished successfully in {took:.2f} seconds ---", flush=True)
	    return outcome

	# --- Build Steps ---

	# Configure: shared liblzma, Release build, no NLS, installed under output_dir.
	cmd = [
	    "cmake",
	    # "--trace",
	    "-GNinja",
	    "-DXZ_NLS=OFF",
	    "-DBUILD_SHARED_LIBS=ON",
	    "-DCMAKE_BUILD_TYPE=Release",
	    f"-DCMAKE_INSTALL_PREFIX={str(output_dir.resolve())}",
	    "-S", str(source_dir.resolve()),
	    "-B", str(build_dir.resolve()),
	] + sys.argv[1:]  # Add extra arguments from script call
	run_command(cmd)

	# Build, then install, from the same build directory.
	for cmake_mode in ("--build", "--install"):
	    cmd = ["cmake", cmake_mode, str(build_dir.resolve())]
	    run_command(cmd)

	print("\nBuild script completed successfully.")
	#!/usr/bin/env python

	# https://github.com/ROCm/TheRock/discussions/244#discussioncomment-12926010

	import torch
	import time
	from torch.nn.functional import scaled_dot_product_attention
	from torch.nn.attention import SDPBackend, sdpa_kernel

	# Check for GPU: without one the benchmark cannot run, so exit with a message.
	if not torch.cuda.is_available():
	    raise SystemExit("CUDA GPU is not available. Please run on a CUDA-enabled device.")

	# Device handle shared by the functions below.
	device = torch.device("cuda")
	torch.cuda.init()  # Initialize CUDA context (optional, helps measure baseline)

	# Helper function for measuring one run
	def measure_op(op_func, warmup=3, total_runs=10):
	    """
	    Call op_func repeatedly and average its results over the non-warm-up runs.

	    op_func: a callable that runs the operation (including memory measurement)
	             and returns (time_ms, peak_mem_MB, gflops_s).
	    warmup: number of warm-up runs to discard.
	    total_runs: total runs to do, including warmup; must exceed warmup.
	    Returns: average_time_ms, average_peak_mem_MB, average_gflops_s over the runs after warm-up.
	    Raises: ValueError if no run would be left to average.
	    """
	    # Without at least one measured run the averages below would divide by zero.
	    if total_runs <= max(warmup, 0):
	        raise ValueError(
	            f"total_runs ({total_runs}) must be greater than warmup ({warmup}) "
	            "so that at least one run is measured"
	        )

	    times = []
	    mems = []
	    flops = []

	    for run_idx in range(total_runs):
	        # Reset peak memory stats at the start of each run so that the peak
	        # reported by op_func covers this run only.
	        torch.cuda.reset_peak_memory_stats(device)

	        t_ms, peak_mb, gf_s = op_func()

	        # Warm-up runs are executed but not recorded.
	        if run_idx >= warmup:
	            times.append(t_ms)
	            mems.append(peak_mb)
	            flops.append(gf_s)

	    measured = len(times)
	    avg_time_ms = sum(times) / measured
	    avg_mem_mb = sum(mems) / measured
	    avg_gf_s = sum(flops) / measured
	    return avg_time_ms, avg_mem_mb, avg_gf_s


	# 1) Define the Scaled Dot-Product Attention test
	def run_sdpa():
	    """
	    Run one timed scaled dot-product attention call with the Flash Attention backend.

	    Fresh FP16 q/k/v tensors are allocated on `device` for every call.
	    Returns: (time_ms, peak_mem_mb, gflops_s) for this single run.
	    """
	    # Configuration
	    B, heads = 1, 8
	    L = 8192
	    E = 64
	    S = L

	    # Create random Q, K, V in half precision
	    # We place them inside the function so each run re-allocates
	    # new memory (to measure peak memory usage properly).
	    q = torch.randn(B, heads, L, E, device=device, dtype=torch.float16)
	    k = torch.randn(B, heads, S, E, device=device, dtype=torch.float16)
	    v = torch.randn(B, heads, S, E, device=device, dtype=torch.float16)

	    # Start timing. perf_counter() is monotonic and high resolution, which
	    # makes it the right clock for short intervals (time.time() is wall-clock
	    # time and may jump).
	    torch.cuda.synchronize()
	    start_time = time.perf_counter()

	    # Run scaled dot-product attention (Flash Attention backend).
	    # The result is not inspected; only timing and memory are measured.
	    with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
	        out = scaled_dot_product_attention(q, k, v)

	    # Synchronize & end timing, so the interval covers the queued GPU work
	    torch.cuda.synchronize()
	    elapsed_s = time.perf_counter() - start_time

	    # Measure time
	    time_ms = elapsed_s * 1000.0

	    # Peak memory usage (MB)
	    peak_mem_bytes = torch.cuda.max_memory_allocated(device)
	    peak_mem_mb = peak_mem_bytes / (1024**2)

	    # Compute FLOPs for scaled dot-product attention:
	    # Q*K^T -> 2 * B * heads * L * S * E
	    # Attn*V -> 2 * B * heads * L * S * E
	    # Total = 4 * B * heads * L * S * E
	    flops = 4.0 * B * heads * L * S * E
	    # Convert to GFLOPs/s
	    flops_s = flops / elapsed_s
	    gflops_s = flops_s / 1e9

	    return time_ms, peak_mem_mb, gflops_s

	# Run the measurements: 3 warm-up runs are discarded, 7 runs are averaged.
	print("Benchmarking Scaled Dot-Product Attention (Flash) in FP16 ...")
	warmup = 3
	sdpa_time, sdpa_mem, sdpa_gflops = measure_op(
	    run_sdpa,
	    warmup=warmup,
	    total_runs=warmup + 7,
	)
	print(f"Average time: {sdpa_time:.2f} ms")
	print(f"Average peak memory: {sdpa_mem:.2f} MB")
	print(f"Average throughput: {sdpa_gflops:.2f} GFLOP/s\n")
	# https://rocm.blogs.amd.com/artificial-intelligence/flash-attention/README.html

	import numpy as np
	import torch
	import torch.nn.functional as F
	from torch.nn.attention import SDPBackend, sdpa_kernel

	class NaiveSdpaForDevice():
	    """Eager-mode reference implementation of scaled dot-product attention.

	    The instance only stores the device on which the causal mask is created.
	    """

	    def __init__(self, device):
	        self.device = device

	    def scaled_dot_product_attention(self, query, key, value, attn_mask=None, is_causal=False, dropout_p=0.0, scale=None):
	        """
	        Computes the scaled dot product attention between query, key, and value tensors in PyTorch eager mode.

	        Args:
	            query (torch.Tensor): The query tensor of shape (batch_size, n_heads, q_len, hidden_dim).
	            key (torch.Tensor): The key tensor of shape (batch_size, n_heads, kv_len, hidden_dim).
	            value (torch.Tensor): The value tensor of shape (batch_size, n_heads, kv_len, hidden_dim).
	            attn_mask (torch.Tensor, optional): Mask broadcastable to (batch_size, n_heads, q_len, kv_len).
	                A boolean mask keeps the positions that are True; any other dtype is added to the
	                attention scores. Ignored when is_causal is True. Defaults to None.
	            is_causal (bool, optional): Whether to apply a causal attention mask. Defaults to False.
	            dropout_p (float, optional): The dropout probability. Dropout is applied with train=False,
	                so it never changes the result. Defaults to 0.
	            scale (float, optional): The scale factor for the dot product. Defaults to None,
	                meaning 1 / sqrt(hidden_dim).

	        Returns:
	            torch.Tensor: The output tensor of shape (batch_size, n_heads, q_len, hidden_dim).
	        """

	        # Calculate the scale factor
	        scale_factor = 1 / np.sqrt(query.size(-1)) if scale is None else scale
	        attn_weight = (query @ key.transpose(-2, -1) * scale_factor)

	        if is_causal:
	            # Lower-triangular (q_len, kv_len) mask, broadcast over batch and heads.
	            # Using the key length for the last dimension keeps this valid when the
	            # query and key sequence lengths differ.
	            attn_mask = torch.ones(query.size(-2), key.size(-2), dtype=torch.bool,
	                                   device=self.device).tril(diagonal=0)

	        if attn_mask is not None:
	            if attn_mask.dtype == torch.bool:
	                # attn_weight is a fresh tensor, so the in-place fill is safe.
	                attn_weight = attn_weight.masked_fill_(~attn_mask, -torch.inf)
	            else:
	                # Non-boolean masks are additive biases on the scores.
	                attn_weight = attn_weight + attn_mask

	        # Compute the scaled dot product attention
	        attn_weight = torch.softmax(attn_weight, dim=-1)
	        attn_weight = torch.dropout(attn_weight, dropout_p, train=False)

	        return attn_weight @ value

	def test(n_times):
	    """
	    Compare the Flash Attention backend against the naive reference on random inputs.

	    n_times: approximate total number of comparisons; it is split evenly across
	             the two dtypes and all head dimensions (integer division).
	    Raises: RuntimeError on the first result outside the per-dtype tolerance.
	    """
	    batch_size = 1
	    seq_len = 64
	    num_heads = 32
	    device = torch.device("cuda")
	    embed_dims = [16, 32, 48, 64, 80, 96, 128, 160, 192, 224, 256]

	    # The reference implementation holds no per-call state, so build it once.
	    naive = NaiveSdpaForDevice(device)

	    times = n_times // 2 // len(embed_dims)
	    # epsilon is used as both rtol and atol; bfloat16 gets the looser bound.
	    for (dtype, epsilon) in [(torch.float16, 1e-03),
	                             (torch.bfloat16, 1e-02)]:
	        for embed_dim in embed_dims:
	            for i in range(times):
	                query = torch.rand(batch_size, num_heads, seq_len, embed_dim, device=device, dtype=dtype)
	                key = torch.rand(batch_size, num_heads, seq_len, embed_dim, device=device, dtype=dtype)
	                value = torch.rand(batch_size, num_heads, seq_len, embed_dim, device=device, dtype=dtype)
	                want = naive.scaled_dot_product_attention(query, key, value, is_causal=True)
	                with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
	                    got = F.scaled_dot_product_attention(query, key, value, is_causal=True)
	                if not torch.allclose(want, got, rtol=epsilon, atol=epsilon):
	                    raise RuntimeError(f'dtype: {dtype}, test_iteration: {i}')

	# n_times=1000 gives 45 iterations for each of the 2 dtypes x 11 head dimensions.
	test(1000)
	# LLVM for triton-lshqqytiger
	git clone git@github.com:llvm/llvm-project.git llvm-for-triton
	cd llvm-for-triton
	# Use llvm version specified in https://github.com/lshqqytiger/triton/blob/main/cmake/llvm-hash.txt
	git checkout 1cec5fffd8fddd9d85b516f876093b0e3f0eec5f
	cd ..

	# use_triton_fork = 'triton-lshqqytiger'
	git clone git@github.com:lshqqytiger/triton.git triton-lshqqytiger
	cd triton-lshqqytiger
	uv venv --python=3.13
	# On Windows the virtual environment's scripts live in ".venv\Scripts"
	.venv\Scripts\Activate.ps1
	uv pip install -r python\requirements.txt
	python llvm-build.py
	#
	# NOTE: triton's setup.py wants to create symlinks which requires setting "Dev mode" to true on Windows 11
	# when using from a non-admin user account
	#
	# Edit `triton-build-fork.py` and set `use_triton_fork=triton-lshqqytiger`
	python triton-build-fork.py

	# use_triton_fork = 'triton-windows'
	git clone git@github.com:woct0rdho/triton-windows.git
	cd triton-windows
	git checkout v3.3.x-windows
	uv venv --python=3.13
	# On Windows the virtual environment's scripts live in ".venv\Scripts"
	.venv\Scripts\Activate.ps1
	uv pip install -r python\requirements.txt
	#
	# NOTE: triton's setup.py wants to create symlinks which requires setting "Dev mode" to true on Windows 11
	# when using from a non-admin user account
	#
	# Edit `triton-build-fork.py` and set `use_triton_fork=triton-windows`
	python triton-build-fork.py

	# To build aotriton, for AMD GPUs, use triton-lshqqytiger from above
	python lzma-build.py
	git clone git@github.com:scottt/aotriton.git
	cd aotriton
	git checkout windows
	cd ..
	python aotriton-build.py