Skip to content

Instantly share code, notes, and snippets.

@scottt
Last active May 13, 2025 19:15
Show Gist options
  • Save scottt/b85f07db488b3e37944a9f675370ed3c to your computer and use it in GitHub Desktop.
Save scottt/b85f07db488b3e37944a9f675370ed3c to your computer and use it in GitHub Desktop.
Triton on Windows
#!/usr/bin/env python
import os
import sys
import subprocess
import time
from pathlib import Path
# --- Configuration ---
AOTRITON_TARGET_ARCH = 'gfx1151'
PYTHON_VER = '3.13'
AOTRITON_NOIMAGE_MODE = False

# The source/build locations are published through the environment and then
# read back, so the rest of the script has a single code path per platform.
if sys.platform == 'win32':
    os.environ['AOTRITON_SOURCE_DIR'] = '/work/aotriton'
    os.environ['AOTRITON_BUILD_DIR'] = '/aotriton-output/build'
    # NOTE: Python version of Triton wheel must match Python version used to build AOTriton
else:
    os.environ['AOTRITON_SOURCE_DIR'] = os.path.expanduser('~/work/aotriton-0.9-gfx1151-windows')
    os.environ['AOTRITON_BUILD_DIR'] = '/aotriton-windows-for-merge-output/build'

# strict=True makes a missing source checkout fail here rather than inside CMake.
source_dir = Path(os.environ['AOTRITON_SOURCE_DIR']).resolve(strict=True)
build_dir = Path(os.environ['AOTRITON_BUILD_DIR']).resolve()
if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)
venv_dir = build_dir / 'venv'

# --- Setup ---
caches_dir = Path("/caches")
pip_cache_dir = caches_dir / "pip"
if sys.platform == 'win32':
    os.environ['PKG_CONFIG'] = 'C:/Strawberry/perl/bin/pkg-config.bat'
    os.environ['PKG_CONFIG_PATH'] = '/xz-output/lib/pkgconfig'

print("--- Configuration ---")
print(f"Source Directory: {source_dir}")
print(f"Cache Directory: {caches_dir}")
print(f"LLVM_SYSPATH: {os.environ.get('LLVM_SYSPATH')}")
print(f"LLVM_INCLUDE_DIRS: {os.environ.get('LLVM_INCLUDE_DIRS')}")
print(f"LLVM_LIBRARY_DIR: {os.environ.get('LLVM_LIBRARY_DIR')}")
# Report the pkg-config variables this script actually sets; the previously
# printed PKG_CONFIG_DIR is never assigned anywhere in this script.
print(f"PKG_CONFIG: {os.environ.get('PKG_CONFIG')}")
print(f"PKG_CONFIG_PATH: {os.environ.get('PKG_CONFIG_PATH')}")
print(f"Script Arguments: {sys.argv[1:]}")
print("---------------------")


def should_change_caches_dir():
    """Return True when the pip cache should be redirected under `caches_dir`.

    Only Windows (``sys.platform == 'win32'``) relocates the cache.
    """
    return sys.platform == 'win32'


if should_change_caches_dir():
    print("Ensuring directories exist...")
    pip_cache_dir.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
if should_change_caches_dir():
    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())
print(f"PIP_CACHE_DIR = {os.environ.get('PIP_CACHE_DIR')}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Echo a command, execute it, and report how long it took.

    Args:
        cmd_list: Program and arguments, passed to ``subprocess.run`` without a shell.
        cwd: Optional working directory for the child process.

    Returns:
        The ``subprocess.CompletedProcess`` of the successful run.

    Any failure terminates the script through ``sys.exit``: the child's own
    exit code when it returned non-zero, otherwise 1.
    """
    rendered = ' '.join(str(part) for part in cmd_list)
    print(f"\n--- Executing: {rendered} ---", flush=True)
    began = time.monotonic()
    try:
        # No shell is involved; check=True turns a non-zero exit status into
        # CalledProcessError (the equivalent of `set -e`).
        completed = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as failure:
        print(f"ERROR: Command failed with exit code {failure.returncode}", file=sys.stderr)
        sys.exit(failure.returncode)
    except Exception as failure:
        print(f"ERROR: An unexpected error occurred: {failure}", file=sys.stderr)
        sys.exit(1)
    elapsed = time.monotonic() - began
    print(f"--- Command finished successfully in {elapsed:.2f} seconds ---", flush=True)
    return completed
# --- Build Steps ---
# Create the virtual environment that CMake is pointed at through -DVENV_DIR.
cmd = [
    "uv",
    "venv",
    "--python", PYTHON_VER,
    str(venv_dir),
]
run_command(cmd)

build_python_bindings = True
if not AOTRITON_NOIMAGE_MODE:
    # https://github.com/astral-sh/uv/issues/8721
    cmd = [
        "uv",
        "pip",
        "install", "torch",
        "--python", str(venv_dir),
    ]
    run_command(cmd)

use_aotriton_target_arch = True
build_aotriton_09 = True
if build_aotriton_09:
    if AOTRITON_TARGET_ARCH == 'gfx1151':
        use_aotriton_target_arch = False
        TARGET_GPUS = 'Navi3.5'
    else:
        # Report the architecture that has no mapping (the original referenced
        # an undefined name here, which raised NameError instead).
        raise TypeError(f"Don't know GPU mapping for aotriton 0.9 for {AOTRITON_TARGET_ARCH}")

# Computed outside the f-string so the script does not depend on the
# Python 3.12+ rule that allows reusing the enclosing quote character.
noimage_flag = 'ON' if AOTRITON_NOIMAGE_MODE else 'OFF'
cmd = [
    "cmake",
    #"--trace",
    "-GNinja",
    f"-DVENV_DIR={venv_dir}",
    f"-DCMAKE_INSTALL_PREFIX={(build_dir / 'install_dir').resolve()}",
    "-DCMAKE_BUILD_TYPE=Release",
    "-DAOTRITON_GPU_BUILD_TIMEOUT=0",
    # AOTRITON_NO_PYTHON must be OFF and AOTRITON_NAME_SUFFIX must be set to run the unit tests
    "-DAOTRITON_NO_PYTHON=OFF",
    "-DHIP_PLATFORM=amd",
    f"-DAOTRITON_NOIMAGE_MODE={noimage_flag}",
    "-S", str(source_dir.resolve()),
    "-B", str(build_dir.resolve()),
    #"--debug-find",
]
if use_aotriton_target_arch:  # aotriton-0.10
    cmd.append(f"-DAOTRITON_TARGET_ARCH={AOTRITON_TARGET_ARCH}")
else:  # aotriton-0.9
    cmd.append(f"-DTARGET_GPUS={TARGET_GPUS}")
if sys.platform == 'win32':
    cmd.append('-Ddlfcn-win32_DIR=/dlfcn-output/share/dlfcn-win32')
cmd.extend(sys.argv[1:])  # Add extra arguments from script call
run_command(cmd)

# Ninja is invoked directly, so the job count is a plain `-jN` option placed
# before the target (no `--` separator, which only applies to `cmake --build`).
cmd = ["ninja"]
cpu_count = os.cpu_count()
if cpu_count and cpu_count > 1:
    jobs = max(1, cpu_count - 1)  # leave one core free
    cmd.append(f"-j{jobs}")
    print(f"Using parallel build flag: -j{jobs}")
cmd.append("install")
run_command(cmd, cwd=build_dir)

print("\nBuild script completed successfully.")
#!/usr/bin/env python
import os
import sys
import subprocess
import time
from pathlib import Path
# --- Configuration ---
AOTRITON_TARGET_ARCH = 'gfx1151'
# We need to install `torch` in the virtual env so we need to create the venv and specify
# the python version used
PYTHON_VER = '3.13'
AOTRITON_NOIMAGE_MODE = True
fork_name = 'aotriton-0.9-gfx1151-windows'
if sys.platform == 'win32':
    os.environ['AOTRITON_SOURCE_DIR'] = f'/w/{fork_name}'
    os.environ['AOTRITON_BUILD_DIR'] = f'/o/{fork_name}/build'
    # NOTE: Python version of Triton wheel must match Python version used to build AOTriton
else:
    os.environ['AOTRITON_SOURCE_DIR'] = os.path.expanduser('~/work/aotriton')
    os.environ['AOTRITON_BUILD_DIR'] = '/aotriton-output/build'

# Workaround cl.exe amd_hip_vector_types.h
# NOTE(review): assumed to apply on every platform - confirm it was not meant
# to be Windows-only.
os.environ['CC'] = 'clang-cl'
os.environ['CXX'] = 'clang-cl'

# A prebuilt Triton wheel is only needed when GPU images are actually built.
triton_wheel_path = None
if not AOTRITON_NOIMAGE_MODE:
    if sys.platform == 'win32':
        triton_wheel_path = Path('/work/triton-lshqqytiger/python/dist/triton-3.3.0+gitf8727c94-cp313-cp313-win_amd64.whl')
    else:
        # pathlib does not expand "~" on its own; expand it explicitly.
        triton_wheel_path = Path('~/work/triton-lshqqytiger/python/dist/triton-3.3.0+gitf8727c94-cp313-cp313-win_amd64.whl').expanduser()
    # Keep the resolved path; strict=True fails early if the wheel is missing.
    triton_wheel_path = triton_wheel_path.resolve(strict=True)

source_dir = Path(os.environ['AOTRITON_SOURCE_DIR']).resolve(strict=True)
build_dir = Path(os.environ['AOTRITON_BUILD_DIR']).resolve()
if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)
venv_dir = build_dir / 'venv'

# Optional hints for CMake's Python discovery; None means "let CMake search".
if sys.platform == 'win32':
    Python3_EXECUTABLE = None
    Python3_INCLUDE_DIR = None
    Python3_LIBRARY = None
else:
    # Python3_EXECUTABLE = str(venv_dir / "bin" / "python")
    Python3_EXECUTABLE = "/usr/bin/python"
    Python3_INCLUDE_DIR = "/usr/include/python3.13"
    Python3_LIBRARY = "/usr/lib64/libpython3.13.so"
CCACHE_EXECUTABLE = "ccache"

# --- Setup ---
caches_dir = Path("/caches")
ccache_dir = caches_dir / "ccache"
pip_cache_dir = caches_dir / "pip"
if sys.platform == 'win32':
    os.environ['PKG_CONFIG'] = 'C:/Strawberry/perl/bin/pkg-config.bat'
    os.environ['PKG_CONFIG_PATH'] = '/xz-output/lib/pkgconfig'

print("--- Configuration ---")
print(f"Source Directory: {source_dir}")
print(f"Cache Directory: {caches_dir}")
print(f"LLVM_SYSPATH: {os.environ.get('LLVM_SYSPATH')}")
print(f"LLVM_INCLUDE_DIRS: {os.environ.get('LLVM_INCLUDE_DIRS')}")
print(f"LLVM_LIBRARY_DIR: {os.environ.get('LLVM_LIBRARY_DIR')}")
# Report the pkg-config variables this script actually sets; the previously
# printed PKG_CONFIG_DIR is never assigned anywhere in this script.
print(f"PKG_CONFIG: {os.environ.get('PKG_CONFIG')}")
print(f"PKG_CONFIG_PATH: {os.environ.get('PKG_CONFIG_PATH')}")
print(f"Script Arguments: {sys.argv[1:]}")
print("---------------------")


def should_change_caches_dir():
    """Return True when the ccache/pip caches should live under `caches_dir`.

    Only Windows (``sys.platform == 'win32'``) relocates the caches.
    """
    return sys.platform == 'win32'


if should_change_caches_dir():
    print("Ensuring directories exist...")
    ccache_dir.mkdir(parents=True, exist_ok=True)
    pip_cache_dir.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
if should_change_caches_dir():
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())
    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())

use_ccache = False
if use_ccache:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    os.environ['CMAKE_C_COMPILER_LAUNCHER'] = CCACHE_EXECUTABLE
    os.environ['CMAKE_CXX_COMPILER_LAUNCHER'] = CCACHE_EXECUTABLE
else:
    print("Skipping ccache configuration.")
    # Drop launchers inherited from the calling environment.
    os.environ.pop('CMAKE_C_COMPILER_LAUNCHER', None)
    os.environ.pop('CMAKE_CXX_COMPILER_LAUNCHER', None)

print(f"CCACHE_DIR = {os.environ.get('CCACHE_DIR')}")
print(f"PIP_CACHE_DIR = {os.environ.get('PIP_CACHE_DIR')}")
print(f"CMAKE_C_COMPILER_LAUNCHER = {os.environ.get('CMAKE_C_COMPILER_LAUNCHER')}")
print(f"CMAKE_CXX_COMPILER_LAUNCHER = {os.environ.get('CMAKE_CXX_COMPILER_LAUNCHER')}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Run one build command with logging, timing, and fail-fast error handling.

    Args:
        cmd_list: Argument vector (program first); executed without a shell.
        cwd: Directory to run the command in, or None for the current one.

    Returns:
        ``subprocess.CompletedProcess`` for a command that exited with status 0.

    The script exits via ``sys.exit`` on failure: with the command's exit code
    if it ran and failed, with 1 if it could not be started or another error
    occurred.
    """
    print(f"\n--- Executing: {' '.join(str(arg) for arg in cmd_list)} ---", flush=True)
    t0 = time.monotonic()
    try:
        # check=True raises CalledProcessError for a non-zero exit status.
        result = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as err:
        print(f"ERROR: Command failed with exit code {err.returncode}", file=sys.stderr)
        sys.exit(err.returncode)
    except Exception as err:
        print(f"ERROR: An unexpected error occurred: {err}", file=sys.stderr)
        sys.exit(1)
    else:
        duration = time.monotonic() - t0
        print(f"--- Command finished successfully in {duration:.2f} seconds ---", flush=True)
        return result
# --- Build Steps ---
# Create the virtual environment that CMake is pointed at through -DVENV_DIR.
cmd = [
    "uv",
    "venv",
    "--python", PYTHON_VER,
    str(venv_dir),
]
run_command(cmd)

BUILD_PYTHON_BINDINGS = True
if not AOTRITON_NOIMAGE_MODE:
    # https://github.com/astral-sh/uv/issues/8721
    cmd = [
        "uv",
        "pip",
        "install", "torch",
        "--python", str(venv_dir),
    ]
    run_command(cmd)

use_aotriton_target_arch = True
build_aotriton_09 = True
if build_aotriton_09:
    if AOTRITON_TARGET_ARCH == 'gfx1151':
        use_aotriton_target_arch = False
        TARGET_GPUS = 'Navi3.5'
    else:
        # Report the architecture that has no mapping (the original referenced
        # an undefined name here, which raised NameError instead).
        raise TypeError(f"Don't know GPU mapping for aotriton 0.9 for {AOTRITON_TARGET_ARCH}")

# AOTRITON_NO_PYTHON is a negative switch: building the bindings means OFF.
# (The original emitted ON when BUILD_PYTHON_BINDINGS was True.)
no_python_flag = "OFF" if BUILD_PYTHON_BINDINGS else "ON"
# Computed outside the f-strings so the script does not depend on the
# Python 3.12+ rule that allows reusing the enclosing quote character.
noimage_flag = "ON" if AOTRITON_NOIMAGE_MODE else "OFF"
cmd = [
    "cmake",
    #"--trace",
    "-GNinja",
    f"-DVENV_DIR={venv_dir}",
    f"-DCMAKE_INSTALL_PREFIX={(build_dir / 'install_dir').resolve()}",
    "-DCMAKE_BUILD_TYPE=Release",
    "-DAOTRITON_GPU_BUILD_TIMEOUT=0",
    # AOTRITON_NO_PYTHON must be OFF and AOTRITON_NAME_SUFFIX must be set to run the unit tests
    f"-DAOTRITON_NO_PYTHON={no_python_flag}",
    "-DHIP_PLATFORM=amd",
    f"-DAOTRITON_NOIMAGE_MODE={noimage_flag}",
    "-S", str(source_dir.resolve()),
    "-B", str(build_dir.resolve()),
    #"--debug-find",
]
if use_aotriton_target_arch:
    cmd.append(f"-DAOTRITON_TARGET_ARCH={AOTRITON_TARGET_ARCH}")
else:  # aotriton-0.9
    cmd.append(f"-DTARGET_GPUS={TARGET_GPUS}")

if BUILD_PYTHON_BINDINGS:
    # Forward only the hints that are set; a None hint leaves discovery to CMake.
    # The values are interpolated (the original passed the literal text
    # "{Python3_INCLUDE_DIR}" / "{Python3_LIBRARY}" because the f prefix was missing).
    python_hints = (
        ("Python3_EXECUTABLE", Python3_EXECUTABLE),
        ("Python3_INCLUDE_DIR", Python3_INCLUDE_DIR),
        ("Python3_LIBRARY", Python3_LIBRARY),
    )
    for hint_name, hint_value in python_hints:
        if hint_value:
            cmd.append(f"-D{hint_name}={hint_value}")

if triton_wheel_path is not None:
    cmd.append(f"-DINSTALL_TRITON_FROM_WHEEL={triton_wheel_path.resolve()}")
if sys.platform == 'win32':
    cmd.append('-Ddlfcn-win32_DIR=/dlfcn-output/share/dlfcn-win32')
cmd.extend(sys.argv[1:])  # Add extra arguments from script call
run_command(cmd)

if sys.platform == 'win32':
    # Set HIP_PATH to dir containing `bin/ld-lld.exe` for triton\backends\amd\compiler.py
    # lld = Path(os.path.join( os.environ['HIP_PATH'] , 'bin', 'ld.lld.exe' ))
    os.environ['HIP_PATH'] = '/o/r-st/build/dist/rocm/lib/llvm'

# Options go before the target, which is the conventional ninja ordering.
cmd = ["ninja"]
if sys.platform == 'win32' and (not AOTRITON_NOIMAGE_MODE):
    # Serialise the build on Windows when GPU images are generated.
    cmd.extend(['-j', '1'])
cmd.append("install")
run_command(cmd, cwd=build_dir)

print("\nBuild script completed successfully.")
#!/usr/bin/env python
import os
import sys
import subprocess
import time
from pathlib import Path
# https://github.com/dlfcn-win32/dlfcn-win32
# --- Configuration ---
output_dir = Path('/dlfcn-output')
os.environ['SOURCE_DIR'] = '/work/dlfcn-win32'
# strict=True: stop right here if the checkout does not exist.
source_dir = Path(os.environ['SOURCE_DIR']).resolve(strict=True)
# The build tree lives inside the checkout; BUILD_DIR is only exported and is
# not read back anywhere in this script.
build_dir = source_dir / 'build'
os.environ['BUILD_DIR'] = '/dlfcn-output/build'
if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)
CCACHE_EXECUTABLE = "ccache"

# --- Setup ---
caches_dir = Path("/caches")
ccache_dir = caches_dir / "ccache"

print("--- Configuration ---")
for _label, _value in (
    ("Source Directory", source_dir),
    ("Build Directory", build_dir),
    ("Output Directory", output_dir),
    ("Cache Directory", caches_dir),
    ("Script Arguments", sys.argv[1:]),
):
    print(f"{_label}: {_value}")
print("---------------------")


def should_change_caches_dir():
    """Tell whether ccache should be pointed at `ccache_dir` (Windows only)."""
    return sys.platform == 'win32'


if should_change_caches_dir():
    print("Ensuring directories exist...")
    ccache_dir.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
if should_change_caches_dir():
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())

use_ccache = True
_launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
if not use_ccache:
    print("Skipping ccache configuration.")
    for _var in _launcher_vars:
        os.environ.pop(_var, None)
else:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for _var in _launcher_vars:
        os.environ[_var] = CCACHE_EXECUTABLE

# Echo the effective values so the build log records them.
for _var in ('CCACHE_DIR',) + _launcher_vars:
    print(f"{_var} = {os.environ.get(_var)}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Execute `cmd_list`, logging the command line and its wall-clock duration.

    Args:
        cmd_list: Sequence whose first item is the program; run without a shell.
        cwd: Optional working directory.

    Returns:
        The ``subprocess.CompletedProcess`` when the command exits with 0.

    On any error the process is terminated with ``sys.exit``: the child's exit
    code for a failed command, 1 otherwise.
    """
    command_line = ' '.join(map(str, cmd_list))
    print(f"\n--- Executing: {command_line} ---", flush=True)
    started = time.monotonic()

    exit_code = None
    try:
        # shell=False (the default); check=True makes a non-zero status raise.
        finished = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        exit_code = 1
    except subprocess.CalledProcessError as exc:
        print(f"ERROR: Command failed with exit code {exc.returncode}", file=sys.stderr)
        exit_code = exc.returncode
    except Exception as exc:
        print(f"ERROR: An unexpected error occurred: {exc}", file=sys.stderr)
        exit_code = 1
    if exit_code is not None:
        sys.exit(exit_code)

    print(f"--- Command finished successfully in {time.monotonic() - started:.2f} seconds ---", flush=True)
    return finished
# --- Build Steps ---
# 1. Configure: shared, Release build installed under `output_dir`.
cmd = [
    "cmake",
    # "--trace",
    "-GNinja",
    "-DBUILD_SHARED_LIBS=ON",
    "-DCMAKE_BUILD_TYPE=Release",
    f"-DCMAKE_INSTALL_PREFIX={output_dir.resolve()}",
    "-S", str(source_dir.resolve()),
    "-B", str(build_dir.resolve()),
]
cmd += sys.argv[1:]  # Add extra arguments from script call
run_command(cmd)

# 2. Build, then 3. install; both operate on the same build tree.
for _cmake_mode in ("--build", "--install"):
    cmd = ["cmake", _cmake_mode, str(build_dir.resolve())]
    run_command(cmd)

print("\nBuild script completed successfully.")
#!/usr/bin/env python
import os
import sys
import subprocess
import time
from pathlib import Path
# --- Configuration ---
os.environ['LLVM_OUTPUT_DIR'] = '/llvm-output'
os.environ['LLVM_SOURCE_DIR'] = '/work/llvm-for-triton'
output_dir = Path(os.environ['LLVM_OUTPUT_DIR']).resolve()
source_dir = Path(os.environ['LLVM_SOURCE_DIR']).resolve()
CCACHE_EXECUTABLE = "ccache"

# --- Setup ---
build_dir = output_dir / "build"
# The cache directory is a sibling of the output directory.
caches_dir = output_dir / ".." / "caches"
ccache_dir = caches_dir / "ccache"

print("--- Configuration ---")
for _label, _value in (
    ("Source Directory", source_dir),
    ("Output Base Directory", output_dir),
    ("Build Directory", build_dir),
    ("Cache Directory", caches_dir),
    ("Script Arguments", sys.argv[1:]),
):
    print(f"{_label}: {_value}")
print("---------------------")

print("Ensuring directories exist...")
for _needed_dir in (ccache_dir, build_dir):  # build dir is created early as well
    _needed_dir.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())

# Flip to False to build without ccache.
use_ccache = True
_launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
if not use_ccache:
    print("Skipping ccache configuration.")
    # Remove launchers that may have been inherited from the environment.
    for _var in _launcher_vars:
        os.environ.pop(_var, None)
else:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for _var in _launcher_vars:
        os.environ[_var] = CCACHE_EXECUTABLE

for _var in ('CCACHE_DIR',) + _launcher_vars:
    print(f"{_var} = {os.environ.get(_var)}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Print, run, and time a single external command.

    Args:
        cmd_list: Program followed by its arguments (no shell is used).
        cwd: Working directory for the command; None keeps the current one.

    Returns:
        The ``subprocess.CompletedProcess`` of a run that exited with status 0.

    Failures end the script through ``sys.exit`` (the command's own exit code
    when it failed, 1 when it was not found or another error was raised).
    """
    pretty = ' '.join(str(token) for token in cmd_list)
    print(f"\n--- Executing: {pretty} ---", flush=True)
    clock_start = time.monotonic()
    try:
        # A non-zero exit status raises CalledProcessError because of check=True.
        outcome = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as problem:
        print(f"ERROR: Command failed with exit code {problem.returncode}", file=sys.stderr)
        sys.exit(problem.returncode)
    except Exception as problem:
        print(f"ERROR: An unexpected error occurred: {problem}", file=sys.stderr)
        sys.exit(1)
    seconds = time.monotonic() - clock_start
    print(f"--- Command finished successfully in {seconds:.2f} seconds ---", flush=True)
    return outcome
# --- Build Steps ---
# https://github.com/triton-lang/triton?tab=readme-ov-file#building-with-a-custom-llvm
# 1. CMake Configure Step
configure_cmd = [
    "cmake",
    "-GNinja",
    "-DCMAKE_BUILD_TYPE=Release",
    "-DLLVM_ENABLE_ASSERTIONS=ON",
    "-DLLVM_ENABLE_PROJECTS=mlir;llvm;lld",
    "-DLLVM_TARGETS_TO_BUILD=host;NVPTX;AMDGPU",
    # Each option must be its own list element. Without the trailing commas
    # Python concatenated these two strings and "-S" into a single argument.
    "-DLLVM_FORCE_VC_REPOSITORY=llvm-for-triton",
    "-DLLVM_FORCE_VC_REVISION=rev-for-triton",
    "-S", str((source_dir / "llvm").resolve()),
    "-B", str(build_dir.resolve()),
]
configure_cmd.extend(sys.argv[1:])  # Add extra arguments from script call
run_command(configure_cmd)

# 2. CMake Build Step
cmake_build_cmd = [
    "cmake",
    "--build", str(build_dir.resolve()),
]
# Add parallel build flag common on Windows (optional)
# Get number of processors, leave one free
cpu_count = os.cpu_count()
if cpu_count and cpu_count > 1:
    jobs = max(1, cpu_count - 1)
    # Everything after "--" is handed to the underlying Ninja invocation.
    cmake_build_cmd.extend(["--", f"-j{jobs}"])
    print(f"Using parallel build flag: -j{jobs}")
run_command(cmake_build_cmd)

print("\nBuild script completed successfully.")
#!/usr/bin/env python
# Build liblzma for aotriton
import os
import sys
import subprocess
import time
from pathlib import Path
# https://github.com/tukaani-project/xz
# --- Configuration ---
output_dir = Path('/xz-output')
os.environ['XZ_SOURCE_DIR'] = '/work/xz'
# strict=True: stop right here if the checkout does not exist.
source_dir = Path(os.environ['XZ_SOURCE_DIR']).resolve(strict=True)
# The build tree lives inside the checkout; XZ_BUILD_DIR is only exported and
# is not read back anywhere in this script.
build_dir = source_dir / 'build'
os.environ['XZ_BUILD_DIR'] = '/aotriton-output/build'
if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)
CCACHE_EXECUTABLE = "ccache"

# --- Setup ---
caches_dir = Path("/caches")
ccache_dir = caches_dir / "ccache"

print("--- Configuration ---")
for _label, _value in (
    ("Source Directory", source_dir),
    ("Build Directory", build_dir),
    ("Output Directory", output_dir),
    ("Cache Directory", caches_dir),
    ("Script Arguments", sys.argv[1:]),
):
    print(f"{_label}: {_value}")
print("---------------------")


def should_change_caches_dir():
    """Tell whether ccache should be pointed at `ccache_dir` (Windows only)."""
    return sys.platform == 'win32'


if should_change_caches_dir():
    print("Ensuring directories exist...")
    ccache_dir.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
if should_change_caches_dir():
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())

use_ccache = True
_launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
if not use_ccache:
    print("Skipping ccache configuration.")
    for _var in _launcher_vars:
        os.environ.pop(_var, None)
else:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for _var in _launcher_vars:
        os.environ[_var] = CCACHE_EXECUTABLE

# Echo the effective values so the build log records them.
for _var in ('CCACHE_DIR',) + _launcher_vars:
    print(f"{_var} = {os.environ.get(_var)}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Run `cmd_list` as a child process, announcing it first and timing it.

    Args:
        cmd_list: The command as a list (program, then arguments); no shell.
        cwd: Optional directory in which to run the command.

    Returns:
        ``subprocess.CompletedProcess`` for a successful (status 0) run.

    The script is terminated with ``sys.exit`` when the command cannot be
    found (1), exits non-zero (that exit code), or anything else goes wrong (1).
    """
    print(f"\n--- Executing: {' '.join([str(item) for item in cmd_list])} ---", flush=True)
    before = time.monotonic()
    try:
        # Runs without a shell; check=True raises on a non-zero exit status.
        proc = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as error:
        print(f"ERROR: Command failed with exit code {error.returncode}", file=sys.stderr)
        sys.exit(error.returncode)
    except Exception as error:
        print(f"ERROR: An unexpected error occurred: {error}", file=sys.stderr)
        sys.exit(1)
    else:
        after = time.monotonic()
        print(f"--- Command finished successfully in {after - before:.2f} seconds ---", flush=True)
        return proc
# --- Build Steps ---
# 1. Configure: shared Release build with XZ_NLS switched off, installed
#    under `output_dir`.
cmd = [
    "cmake",
    # "--trace",
    "-GNinja",
    "-DXZ_NLS=OFF",
    "-DBUILD_SHARED_LIBS=ON",
    "-DCMAKE_BUILD_TYPE=Release",
    f"-DCMAKE_INSTALL_PREFIX={output_dir.resolve()}",
    "-S", str(source_dir.resolve()),
    "-B", str(build_dir.resolve()),
]
cmd += sys.argv[1:]  # Add extra arguments from script call
run_command(cmd)

# 2. Build, then 3. install; both operate on the same build tree.
for _cmake_mode in ("--build", "--install"):
    cmd = ["cmake", _cmake_mode, str(build_dir.resolve())]
    run_command(cmd)

print("\nBuild script completed successfully.")
#!/usr/bin/env python
import os
import sys
import subprocess
import time
from pathlib import Path
# --- Configuration ---
AMDGPU_TARGETS = 'gfx1151'
# The virtual environment is created by this script, so the interpreter
# version it should use is pinned here.
PYTHON_VER = '3.13'
fork_name = 'pytorch-st'

_on_windows = sys.platform == 'win32'
if _on_windows:
    source_dir = Path(f'/w/{fork_name}')
    out_dir = Path(f'/w/{fork_name}')
    _dependency_prefixes = {
        'AOTRITON_INSTALLED_PREFIX': '/o/aotriton-0.9-gfx1151-windows/build/install_dir',
        'CMAKE_PREFIX_PATH': '/o/r-st-gfx1151/build/dist/rocm',
    }
    # NOTE: Python version of Triton wheel must match Python version used to build AOTriton if Triton is used
else:
    source_dir = Path(os.path.expanduser(f'~/w/{fork_name}'))
    out_dir = Path(os.path.expanduser(f'~/w/{fork_name}'))
    _dependency_prefixes = {
        'AOTRITON_INSTALLED_PREFIX': os.path.expanduser('~/aotriton-windows-for-merge-output/build/install_dir'),
        'CMAKE_PREFIX_PATH': os.path.expanduser('~/therock-output-gfx1151/build/dist/rocm'),
    }
os.environ.update(_dependency_prefixes)

# Both platforms currently use the same settings for these two switches.
USE_CMAKE = True
use_ccache = False
if _on_windows:
    os.environ['CC'] = 'clang-cl'
    os.environ['CXX'] = 'clang-cl'

build_dir = out_dir / "build"
venv_dir = out_dir / '.venv'

# Build options handed to the PyTorch build through the environment.
os.environ.update({
    'USE_KINETO': 'OFF',
    'PYTORCH_ROCM_ARCH': AMDGPU_TARGETS,
    'USE_ROCM': 'ON',
    'BUILD_TEST': '0',
    'USE_FLASH_ATTENTION': 'ON',
    'USE_MEM_EFF_ATTENTION': 'ON',
    'DISTUTILS_USE_SDK': '1',
})

# strict=True: a missing source checkout is an immediate error.
source_dir = source_dir.resolve(strict=True)
build_dir = build_dir.resolve()
if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)
CCACHE_EXECUTABLE = "ccache"

# --- Setup ---
caches_dir = Path("/caches")
ccache_dir = caches_dir / "ccache"
pip_cache_dir = caches_dir / "pip"

print("--- Configuration ---")
for _label, _value in (
    ("Source Directory", source_dir),
    ("Cache Directory", caches_dir),
    ("Script Arguments", sys.argv[1:]),
):
    print(f"{_label}: {_value}")
print("---------------------")


def should_change_caches_dir():
    """Tell whether the ccache/pip caches should be moved under `caches_dir` (Windows only)."""
    return sys.platform == 'win32'


if should_change_caches_dir():
    print("Ensuring directories exist...")
    for _cache_dir in (ccache_dir, pip_cache_dir):
        _cache_dir.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
if should_change_caches_dir():
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())
    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())

_launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
if not use_ccache:
    print("Skipping ccache configuration.")
    for _var in _launcher_vars:
        os.environ.pop(_var, None)
else:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for _var in _launcher_vars:
        os.environ[_var] = CCACHE_EXECUTABLE

# Echo the effective values so the build log records them.
for _var in ('CCACHE_DIR', 'PIP_CACHE_DIR') + _launcher_vars:
    print(f"{_var} = {os.environ.get(_var)}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Launch a command, echoing it beforehand and timing it afterwards.

    Args:
        cmd_list: Program and arguments; items are stringified only for the
            log line and are passed to ``subprocess.run`` unchanged.
        cwd: Optional working directory for the command.

    Returns:
        The ``subprocess.CompletedProcess`` of the successful invocation.

    Exits the script via ``sys.exit`` on failure, propagating the command's
    exit code when it ran and failed, and using 1 for every other error.
    """
    echoed = ' '.join(map(str, cmd_list))
    print(f"\n--- Executing: {echoed} ---", flush=True)
    tick = time.monotonic()
    try:
        # No shell; a non-zero exit status becomes CalledProcessError.
        done = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as cpe:
        print(f"ERROR: Command failed with exit code {cpe.returncode}", file=sys.stderr)
        sys.exit(cpe.returncode)
    except Exception as unexpected:
        print(f"ERROR: An unexpected error occurred: {unexpected}", file=sys.stderr)
        sys.exit(1)
    tock = time.monotonic()
    print(f"--- Command finished successfully in {tock - tick:.2f} seconds ---", flush=True)
    return done
# --- Build Steps ---
# Create the virtual environment and install the requirements listed in the
# source tree into it.
cmd = ["uv", "venv", "--python", PYTHON_VER, str(venv_dir)]
run_command(cmd)
cmd = [
    "uv", "pip", "install",
    "-r", str(source_dir / "requirements.txt"),
    "--python", str(venv_dir),
]
run_command(cmd)

# The interpreter sits in a different sub-directory of the venv on Windows.
if sys.platform == 'win32':
    venv_bin_dir = venv_dir / 'Scripts'
    venv_python_bin = venv_bin_dir / 'python.exe'
else:
    venv_bin_dir = venv_dir / 'bin'
    venv_python_bin = venv_bin_dir / 'python'

if USE_CMAKE:
    # Run setup.py up to the CMake stage only, then re-run CMake directly with
    # the options below plus any extra arguments given to this script.
    cmd = [venv_python_bin, "setup.py", "build", "--cmake-only"]
    run_command(cmd, cwd=source_dir)
    cmd = [
        "cmake",
        #"--trace",
        "-GNinja",
        "-DCMAKE_BUILD_TYPE=Release",
        f"-DPython_EXECUTABLE={venv_python_bin}",
        f"-DPYTORCH_ROCM_ARCH={AMDGPU_TARGETS}",
        "-DUSE_ROCM=ON",
        "-DUSE_KINETO=OFF",
        "-DUSE_FLASH_ATTENTION=ON",
        "-S", str(source_dir.resolve()),
        "-B", str(build_dir.resolve()),
    ]
    cmd += sys.argv[1:]  # Add extra arguments from script call
    run_command(cmd)
    # sys.exit(17)

# Finally build the wheel from the source tree.
cmd = [venv_python_bin, "setup.py", "bdist_wheel"]
run_command(cmd, cwd=source_dir)

print("\nBuild script completed successfully.")
#!/usr/bin/env python
# https://github.com/ROCm/TheRock/discussions/244#discussioncomment-12926010
import torch
import time
from torch.nn.functional import scaled_dot_product_attention
from torch.nn.attention import SDPBackend, sdpa_kernel
# The benchmark needs a GPU visible through torch.cuda; bail out otherwise.
_cuda_ready = torch.cuda.is_available()
if not _cuda_ready:
    raise SystemExit("CUDA GPU is not available. Please run on a CUDA-enabled device.")
# Initialise the CUDA context up front (optional, helps measure baseline).
torch.cuda.init()
device = torch.device("cuda")
# Helper function for measuring one run
def measure_op(op_func, warmup=3, total_runs=10):
    """Run `op_func` repeatedly and average its metrics after the warm-up runs.

    Args:
        op_func: Callable taking no arguments that performs one measured run
            and returns ``(time_ms, peak_mem_MB, gflops_s)``.
        warmup: Number of initial runs whose results are discarded.
        total_runs: Total number of runs, including the warm-up runs.

    Returns:
        ``(average_time_ms, average_peak_mem_MB, average_gflops_s)`` over the
        runs that follow the warm-up.

    Raises:
        ValueError: If no run would remain after the warm-up. Previously this
            surfaced as a ZeroDivisionError after all runs had executed.
    """
    measured_runs = total_runs - max(warmup, 0)
    if measured_runs <= 0:
        raise ValueError(
            f"total_runs ({total_runs}) must exceed warmup ({warmup}) "
            "so that at least one run is measured"
        )

    times = []
    mems = []
    flops = []
    for run_idx in range(total_runs):
        # Reset peak memory stats at the start of each run so every run
        # reports its own peak.
        torch.cuda.reset_peak_memory_stats(device)
        t_ms, peak_mb, gf_s = op_func()
        if run_idx >= warmup:
            times.append(t_ms)
            mems.append(peak_mb)
            flops.append(gf_s)

    avg_time_ms = sum(times) / len(times)
    avg_mem_mb = sum(mems) / len(mems)
    avg_gf_s = sum(flops) / len(flops)
    return avg_time_ms, avg_mem_mb, avg_gf_s
# 1) Define the Scaled Dot-Product Attention test
def run_sdpa():
    """Run one timed scaled dot-product attention call on the GPU.

    Returns:
        ``(time_ms, peak_mem_mb, gflops_s)`` for this single run.
    """
    # Configuration
    B, heads = 1, 8
    L = 8192
    E = 64
    S = L

    # Create random Q, K, V in half precision.
    # They are allocated inside the function so each run re-allocates
    # new memory (to measure peak memory usage properly).
    q = torch.randn(B, heads, L, E, device=device, dtype=torch.float16)
    k = torch.randn(B, heads, S, E, device=device, dtype=torch.float16)
    v = torch.randn(B, heads, S, E, device=device, dtype=torch.float16)

    # Make sure the allocations above have completed before the clock starts.
    torch.cuda.synchronize()
    # perf_counter() is the monotonic, highest-resolution clock; time.time()
    # can be coarse and may jump when the system clock is adjusted.
    start_time = time.perf_counter()

    # Run scaled dot-product attention (Flash Attention backend)
    with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
        out = scaled_dot_product_attention(q, k, v)

    # Wait for the kernel to finish before stopping the clock.
    torch.cuda.synchronize()
    elapsed_s = time.perf_counter() - start_time

    time_ms = elapsed_s * 1000.0

    # Peak memory usage (MB)
    peak_mem_bytes = torch.cuda.max_memory_allocated(device)
    peak_mem_mb = peak_mem_bytes / (1024**2)

    # Compute FLOPs for scaled dot-product attention:
    #   Q*K^T   -> 2 * B * heads * L * S * E
    #   Attn*V  -> 2 * B * heads * L * S * E
    #   Total   =  4 * B * heads * L * S * E
    flops = 4.0 * B * heads * L * S * E

    # Convert to GFLOPs/s
    flops_s = flops / elapsed_s
    gflops_s = flops_s / 1e9
    return time_ms, peak_mem_mb, gflops_s
# Benchmark driver: discard `warmup` runs, then average the following 7.
print("Benchmarking Scaled Dot-Product Attention (Flash) in FP16 ...")
warmup = 3
sdpa_time, sdpa_mem, sdpa_gflops = measure_op(run_sdpa, warmup=warmup, total_runs=warmup + 7)
for _report_line in (
    f"Average time: {sdpa_time:.2f} ms",
    f"Average peak memory: {sdpa_mem:.2f} MB",
    f"Average throughput: {sdpa_gflops:.2f} GFLOP/s\n",
):
    print(_report_line)
# https://rocm.blogs.amd.com/artificial-intelligence/flash-attention/README.html
import numpy as np
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel
class NaiveSdpaForDevice():
    """Eager-mode reference implementation of scaled dot-product attention.

    The instance only stores the device on which the causal mask is created.
    """

    def __init__(self, device):
        self.device = device

    def scaled_dot_product_attention(self, query, key, value, attn_mask=None, is_causal=False, dropout_p=0.0, scale=None):
        """
        Computes the scaled dot product attention between query, key, and value tensors in PyTorch eager mode.

        Args:
            query (torch.Tensor): The query tensor of shape (batch_size, n_heads, q_len, hidden_dim).
            key (torch.Tensor): The key tensor of shape (batch_size, n_heads, kv_len, hidden_dim).
            value (torch.Tensor): The value tensor of shape (batch_size, n_heads, kv_len, hidden_dim).
            attn_mask (torch.Tensor, optional): Mask broadcastable to (batch_size, n_heads, q_len, kv_len).
                A boolean mask marks the positions that may be attended to; any other dtype is
                added to the attention scores. Ignored when `is_causal` is True. Defaults to None.
            is_causal (bool, optional): Whether to apply a causal (lower-triangular) attention mask. Defaults to False.
            dropout_p (float, optional): The dropout probability. Dropout is applied with train=False,
                so it does not alter the result. Defaults to 0.
            scale (float, optional): The scale factor for the dot product. Defaults to
                1 / sqrt(hidden_dim) when None.

        Returns:
            torch.Tensor: The output tensor of shape (batch_size, n_heads, q_len, hidden_dim).
        """
        # Calculate the scale factor
        scale_factor = 1 / np.sqrt(query.size(-1)) if scale is None else scale
        attn_weight = (query @ key.transpose(-2, -1)) * scale_factor

        if is_causal:
            # The mask must be (q_len, kv_len) to line up with the score matrix;
            # a q_len x q_len mask only works when both lengths are equal.
            q_len, kv_len = query.size(-2), key.size(-2)
            attn_mask = torch.ones(q_len, kv_len, dtype=torch.bool,
                                   device=self.device).tril(diagonal=0)

        if attn_mask is not None:
            if attn_mask.dtype == torch.bool:
                # True = keep; everything else is pushed to -inf before softmax.
                attn_weight = attn_weight.masked_fill(~attn_mask, -torch.inf)
            else:
                # Non-boolean masks are additive biases on the scores.
                attn_weight = attn_weight + attn_mask

        # Compute the scaled dot product attention
        attn_weight = torch.softmax(attn_weight, dim=-1)
        attn_weight = torch.dropout(attn_weight, dropout_p, train=False)
        return attn_weight @ value
def test(n_times):
    """Compare the flash-attention SDPA backend against the naive eager implementation.

    Random causal-attention inputs are generated on the "cuda" device for each
    combination of dtype (float16, bfloat16) and head dimension in embed_dims.

    Args:
        n_times: Total iteration budget; it is divided by the number of dtypes (2)
            and by the number of head dimensions to get the repetitions per combination.

    Raises:
        RuntimeError: If the two results differ by more than the per-dtype tolerance.
    """
    batch_size = 1
    seq_len = 64
    num_heads = 32
    device = torch.device("cuda")
    embed_dims = [16, 32, 48, 64, 80, 96, 128, 160, 192, 224, 256]
    times = n_times // 2 // len(embed_dims)
    # The reference implementation only stores the device, so one instance serves every iteration.
    naive = NaiveSdpaForDevice(device)
    for (dtype, epsilon) in [(torch.float16, 1e-03),
                             (torch.bfloat16, 1e-02)]:
        for embed_dim in embed_dims:
            for i in range(times):
                query = torch.rand(batch_size, num_heads, seq_len, embed_dim, device=device, dtype=dtype)
                key = torch.rand(batch_size, num_heads, seq_len, embed_dim, device=device, dtype=dtype)
                value = torch.rand(batch_size, num_heads, seq_len, embed_dim, device=device, dtype=dtype)
                want = naive.scaled_dot_product_attention(query, key, value, is_causal=True)
                # Restrict SDPA to the flash-attention backend for this call.
                with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
                    got = F.scaled_dot_product_attention(query, key, value, is_causal=True)
                if not torch.allclose(want, got, rtol=epsilon, atol=epsilon):
                    raise RuntimeError(f'dtype: {dtype}, test_iteration: {i}')
# Run the comparison with a total budget of 1000 iterations.
test(1000)
# LLVM for triton-lshqqytiger
git clone git@github.com:llvm/llvm-project.git llvm-for-triton
cd llvm-for-triton
# Use llvm version specified in https://github.com/lshqqytiger/triton/blob/main/cmake/llvm-hash.txt
git checkout 1cec5fffd8fddd9d85b516f876093b0e3f0eec5f
cd ..
# use_triton_fork = 'triton-lshqqytiger'
git clone git@github.com:lshqqytiger/triton.git triton-lshqqytiger
cd triton-lshqqytiger
uv venv --python=3.13
.venv\Scripts\activate
uv pip install -r python\requirements.txt
python llvm-build.py
#
# NOTE: triton's setup.py creates symlinks. On Windows 11 this requires enabling "Developer Mode"
# when building from a non-admin user account.
#
# Edit `triton-build-fork.py` and set `use_triton_fork=triton-lshqqytiger`
python triton-build-fork.py
# use_triton_fork = 'triton-windows'
git clone git@github.com:woct0rdho/triton-windows.git
cd triton-windows
git checkout v3.3.x-windows
uv venv --python=3.13
.venv\Scripts\activate
uv pip install -r python\requirements.txt
#
# NOTE: triton's setup.py creates symlinks. On Windows 11 this requires enabling "Developer Mode"
# when building from a non-admin user account.
#
# Edit `triton-build-fork.py` and set `use_triton_fork=triton-windows`
python triton-build-fork.py
# To build aotriton, for AMD GPUs, use triton-lshqqytiger from above
python lzma-build.py
git clone git@github.com:scottt/aotriton.git
cd aotriton
git checkout windows
cd ..
python aotriton-build.py
#!/usr/bin/env python
import os
import sys
import subprocess
import time
from pathlib import Path
# Selects which Triton fork build() configures on Windows: 'triton-windows' picks the
# triton-windows source tree and its LLVM location; any other value falls through to
# the triton-lshqqytiger settings.
# use_triton_fork = 'triton-lshqqytiger'
use_triton_fork = 'triton-windows'
def run_command(cmd_list, cwd=None):
    """Echo a command, execute it, and report how long it took.

    Args:
        cmd_list: Program followed by its arguments; items are passed to
            subprocess.run as-is and converted with str() for the echo.
        cwd: Optional working directory for the child process.

    Returns:
        The subprocess.CompletedProcess of the finished command.

    Any failure terminates the interpreter through sys.exit(): with the child's
    exit code when it returned non-zero, with 1 otherwise.
    """
    rendered = ' '.join(str(part) for part in cmd_list)
    print(f"\n--- Executing: {rendered} ---", flush=True)
    started = time.monotonic()
    try:
        completed = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as failure:
        print(f"ERROR: Command failed with exit code {failure.returncode}", file=sys.stderr)
        sys.exit(failure.returncode)
    except Exception as unexpected:
        print(f"ERROR: An unexpected error occurred: {unexpected}", file=sys.stderr)
        sys.exit(1)
    else:
        elapsed = time.monotonic() - started
        print(f"--- Command finished successfully in {elapsed:.2f} seconds ---", flush=True)
        return completed
def build():
    """Export the paths that triton-build.py reads and run it with the current interpreter.

    The settings are collected in a dict first and written to os.environ in one
    step; triton-build.py is looked up next to this script (directory of sys.argv[0]).
    """
    script_dir = Path(os.path.dirname(sys.argv[0]))
    settings = {}
    if sys.platform != 'win32':
        settings['JSON_SYSPATH'] = os.path.expanduser('~/work/nlohmann-json-3.11.3')
        settings['TRITON_SOURCE_DIR'] = os.path.expanduser('~/work/triton')
    else:
        # nlohmann-json from https://github.com/nlohmann/json/releases/download/v3.11.3/include.zip
        settings['JSON_SYSPATH'] = '/work/nlohmann-json-3.11.3'
        # Both triton forks are built with "MSVC v143 x64"
        # That's the default toolchain in Visual Studio 2022 Community Edition as of 2025-05
        settings['TRITON_BUILD_WITH_CLANG_LLD'] = 'OFF'
        if use_triton_fork != 'triton-windows':
            settings['TRITON_SOURCE_DIR'] = '/work/triton-lshqqytiger'
            # LLVM revision 1cec5fffd8fddd9d85b516f876093b0e3f0eec5f built from source
            # This uses the llvm "build" and implicitly the "source" directory contents, instead of an "install" tree
            settings['LLVM_SYSPATH'] = '/llvm-output/build'
        else:
            settings['TRITON_SOURCE_DIR'] = '/work/triton-windows'
            # LLVM from https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-a66376b0-windows-x64.tar.gz
            settings['LLVM_SYSPATH'] = '/caches/.triton/llvm/llvm-a66376b0-windows-x64'
    os.environ.update(settings)
    run_command([sys.executable, script_dir / 'triton-build.py'])
# Script entry point: configure the environment and launch triton-build.py.
build()
#!/usr/bin/env python
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path
# --- Configuration ---
# TRITON_SOURCE_DIR and JSON_SYSPATH are mandatory: a missing variable raises KeyError here.
source_dir = Path(os.environ['TRITON_SOURCE_DIR']).resolve()
# LLVM_SYSPATH is optional; llvm_syspath stays None when it is not set.
llvm_syspath = os.environ.get('LLVM_SYSPATH')
llvm_syspath = None if llvm_syspath is None else Path(llvm_syspath).resolve()
json_syspath = Path(os.environ['JSON_SYSPATH']).resolve()
CCACHE_EXECUTABLE = "ccache"

# --- Setup ---
caches_dir = Path("/caches")
ccache_dir = caches_dir.joinpath("ccache")
pip_cache_dir = caches_dir.joinpath("pip")
# Change default from "$HOME/.triton" to "$caches_dir/.triton"
# Triton always adds the ".triton" part
triton_home_dir = caches_dir
# The include/lib locations are only defined when an LLVM tree was supplied.
if llvm_syspath is not None:
    llvm_include_dir, llvm_library_dir = llvm_syspath / "include", llvm_syspath / "lib"
# Disable downloading of dependencies
# pybind11 is installed via pip and requirements.txt
cuda_dep_var_names = [
    'TRITON_PTXAS_PATH',
    'TRITON_CUOBJDUMP_PATH',
    'TRITON_NVDISASM_PATH',
    'TRITON_CUDACRT_PATH',
    'TRITON_CUDART_PATH',
    'TRITON_CUPTI_INCLUDE_PATH',
    'TRITON_CUPTI_LIB_PATH',
]
# Every variable gets the same value, which is just a placeholder.
os.environ.update(dict.fromkeys(cuda_dep_var_names, '/bin/false'))
# Export the LLVM include/lib directories only when an LLVM tree was supplied.
if llvm_syspath is not None:
    os.environ.update({
        'LLVM_INCLUDE_DIRS': str(llvm_include_dir.resolve()),
        'LLVM_LIBRARY_DIR': str(llvm_library_dir.resolve()),
    })
# Echo the effective configuration before anything is built.
print("--- Configuration ---")
print(f"Source Directory: {source_dir}")
print(f"Cache Directory: {caches_dir}")
# The LLVM settings are reported straight from the environment; unset ones print as None.
for env_name in ('LLVM_SYSPATH', 'LLVM_INCLUDE_DIRS', 'LLVM_LIBRARY_DIR'):
    print(f"{env_name}: {os.environ.get(env_name)}")
print(f"Script Arguments: {sys.argv[1:]}")
print("---------------------")
def should_change_caches_dir():
    """Return True when sys.platform is 'win32', False on every other platform."""
    on_windows = (sys.platform == 'win32')
    return on_windows
# When the caches are relocated (see should_change_caches_dir), create the target directories first.
if should_change_caches_dir():
    print("Ensuring directories exist...")
    ccache_dir.mkdir(parents=True, exist_ok=True)
    pip_cache_dir.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
if should_change_caches_dir():
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())
    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())
    # Change default from "$HOME/.triton" to "$TRITON_HOME/.triton"; the ".triton" part is always added
    os.environ['TRITON_HOME'] = str(triton_home_dir.resolve())

# TRITON_PARALLEL_LINK_JOBS
# TRITON_BUILD_WITH_CCACHE: we control this manually with CMAKE_{C,CXX}_COMPILER_LAUNCHER
# TRITRON_BUILD_WITH_O1 doesn't work on triton-v3.3.x
# NOTE(review): the name below is spelled "TRITRON", unlike every other TRITON_* variable
# set here, so it presumably has no effect on the build - confirm before correcting it.
os.environ['TRITRON_BUILD_WITH_O1'] = 'ON'
os.environ['TRITON_BUILD_PROTON'] = 'OFF'
# os.environ['TRITON_BUILD_WITH_CLANG_LLD'] = 'OFF'
# Triton C++ unit tests
os.environ['TRITON_BUILD_UT'] = 'OFF'
# TRITON_BUILD_BINARY is only defined in the 'triton-windows' fork, which requires it
os.environ['TRITON_BUILD_BINARY'] = 'OFF'

# Check if ccache is desired/available before setting launchers
want_ccache = True  # Set to False to disable ccache
# Only use ccache when the executable can actually be found on PATH; pointing the
# CMake launchers at a missing program would make the build fail.
use_ccache = want_ccache and shutil.which(CCACHE_EXECUTABLE) is not None
if use_ccache:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    os.environ['CMAKE_C_COMPILER_LAUNCHER'] = CCACHE_EXECUTABLE
    os.environ['CMAKE_CXX_COMPILER_LAUNCHER'] = CCACHE_EXECUTABLE
else:
    print("Skipping ccache configuration.")
    # Ensure they are unset if they existed before
    os.environ.pop('CMAKE_C_COMPILER_LAUNCHER', None)
    os.environ.pop('CMAKE_CXX_COMPILER_LAUNCHER', None)

# Report the resulting cache/launcher settings; unset variables print as None.
print(f"CCACHE_DIR = {os.environ.get('CCACHE_DIR')}")
print(f"PIP_CACHE_DIR = {os.environ.get('PIP_CACHE_DIR')}")
print(f"CMAKE_C_COMPILER_LAUNCHER = {os.environ.get('CMAKE_C_COMPILER_LAUNCHER')}")
print(f"CMAKE_CXX_COMPILER_LAUNCHER = {os.environ.get('CMAKE_CXX_COMPILER_LAUNCHER')}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Print, run, and time an external command, exiting the script if it fails.

    Args:
        cmd_list: Sequence holding the program and its arguments.
        cwd: Optional directory the command is started in.

    Returns:
        The subprocess.CompletedProcess produced by subprocess.run.

    On failure the error is written to stderr and sys.exit() is called: with the
    command's own exit code for a non-zero exit, with 1 in every other case.
    """
    def abort(message, exit_code):
        # Shared failure path: report on stderr, then leave the interpreter.
        print(message, file=sys.stderr)
        sys.exit(exit_code)

    print(f"\n--- Executing: {' '.join(str(token) for token in cmd_list)} ---", flush=True)
    t0 = time.monotonic()
    try:
        # shell stays at its default (False) so arguments are passed through untouched;
        # check=True turns a non-zero exit code into CalledProcessError (like `set -e`).
        result = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        abort(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", 1)
    except subprocess.CalledProcessError as err:
        abort(f"ERROR: Command failed with exit code {err.returncode}", err.returncode)
    except Exception as err:
        abort(f"ERROR: An unexpected error occurred: {err}", 1)
    t1 = time.monotonic()
    print(f"--- Command finished successfully in {t1 - t0:.2f} seconds ---", flush=True)
    return result
# --- Build Steps ---
# Locate the directory that holds setup.py: the source root is tried first, then "python/".
setup_py_dir = source_dir
# triton (future) v3.4+ has "triton/setup.py"
if not (setup_py_dir / "setup.py").exists():
    setup_py_dir = source_dir / "python"
# Make the build fail here if 'setup.py' can't be found.
# An explicit check is used instead of `assert`, which is skipped under `python -O`.
if not (setup_py_dir / "setup.py").exists():
    print(f"ERROR: setup.py not found in {source_dir} or {source_dir / 'python'}", file=sys.stderr)
    sys.exit(1)

# --no-build-isolation: https://github.com/triton-lang/triton?tab=readme-ov-file#tips-for-building
cmd = [
    "uv", "pip", "install", "-vv", "--no-build-isolation", str(setup_py_dir.resolve()),
]
cmd.extend(sys.argv[1:])  # Add extra arguments from script call
run_command(cmd)
print("\nBuild script completed successfully.")
@jammm
Copy link

jammm commented May 6, 2025

lzma_build.py - this built fine for me on Windows after modifying the paths. I also don't use ccache (yet) so I set that flag to false.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment