Last active
May 13, 2025 19:15
-
-
Save scottt/b85f07db488b3e37944a9f675370ed3c to your computer and use it in GitHub Desktop.
Triton on Windows
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import sys | |
import subprocess | |
import time | |
from pathlib import Path | |
# --- Configuration ---
# GPU architecture the AOTriton kernels are built for.
AOTRITON_TARGET_ARCH = 'gfx1151'
# Python version passed to `uv venv --python` when the build venv is created.
PYTHON_VER = '3.13'
# Forwarded to CMake as -DAOTRITON_NOIMAGE_MODE=ON/OFF.
AOTRITON_NOIMAGE_MODE = False

# The locations are stored in os.environ (inherited by the commands run
# later) and read back below.
if sys.platform == 'win32':
    os.environ['AOTRITON_SOURCE_DIR'] = '/work/aotriton'
    os.environ['AOTRITON_BUILD_DIR'] = '/aotriton-output/build'
    # NOTE: Python version of Triton wheel must match Python version used to build AOTriton
else:
    os.environ['AOTRITON_SOURCE_DIR'] = os.path.expanduser('~/work/aotriton-0.9-gfx1151-windows')
    os.environ['AOTRITON_BUILD_DIR'] = '/aotriton-windows-for-merge-output/build'

# strict=True makes a missing source checkout fail right here.
source_dir = Path(os.environ['AOTRITON_SOURCE_DIR']).resolve(strict=True)
build_dir = Path(os.environ['AOTRITON_BUILD_DIR']).resolve()
if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)
venv_dir = build_dir / 'venv'

# --- Setup ---
caches_dir = Path("/caches")
pip_cache_dir = caches_dir / "pip"

if sys.platform == 'win32':
    os.environ['PKG_CONFIG'] = 'C:/Strawberry/perl/bin/pkg-config.bat'
    os.environ['PKG_CONFIG_PATH'] = '/xz-output/lib/pkgconfig'

print("--- Configuration ---")
print(f"Source Directory: {source_dir}")
print(f"Cache Directory: {caches_dir}")
print(f"LLVM_SYSPATH: {os.environ.get('LLVM_SYSPATH')}")
print(f"LLVM_INCLUDE_DIRS: {os.environ.get('LLVM_INCLUDE_DIRS')}")
print(f"LLVM_LIBRARY_DIR: {os.environ.get('LLVM_LIBRARY_DIR')}")
# Report the pkg-config variables this script actually sets.
print(f"PKG_CONFIG: {os.environ.get('PKG_CONFIG')}")
print(f"PKG_CONFIG_PATH: {os.environ.get('PKG_CONFIG_PATH')}")
print(f"Script Arguments: {sys.argv[1:]}")
print("---------------------")
def should_change_caches_dir():
    """Return True on Windows, where the pip cache directory is redirected."""
    return sys.platform == 'win32'


# The predicate depends only on the platform, so evaluate it once.
redirect_caches = should_change_caches_dir()

if redirect_caches:
    print("Ensuring directories exist...")
    pip_cache_dir.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
if redirect_caches:
    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())

# Shows None when the variable is not set.
print(f"PIP_CACHE_DIR = {os.environ.get('PIP_CACHE_DIR')}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Run a command, echo it, time it, and abort the script on failure.

    Args:
        cmd_list: Program and arguments, handed to subprocess.run without a
            shell. Items are converted with str() only for the echo line.
        cwd: Optional working directory for the child process.

    Returns:
        The subprocess.CompletedProcess of the successful run.

    Exits:
        sys.exit(1) when the program or working directory is missing or an
        unexpected error occurs; sys.exit(<child exit code>) when the command
        returns a non-zero status.
    """
    print(f"\n--- Executing: {' '.join(map(str, cmd_list))} ---", flush=True)
    start_time = time.monotonic()
    try:
        # Use shell=False (default) for better security and argument handling
        # check=True raises CalledProcessError on non-zero exit code (like set -e)
        process = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        # A missing working directory raises the same exception as a missing
        # executable; tell them apart so the message names the real problem.
        if cwd is not None and not os.path.isdir(cwd):
            print(f"ERROR: Working directory not found: {cwd}", file=sys.stderr)
        else:
            print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"ERROR: Command failed with exit code {e.returncode}", file=sys.stderr)
        sys.exit(e.returncode)
    except Exception as e:
        print(f"ERROR: An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)
    end_time = time.monotonic()
    print(f"--- Command finished successfully in {end_time - start_time:.2f} seconds ---", flush=True)
    return process
# --- Build Steps ---
# Create the virtual environment with the requested Python version.
run_command(["uv", "venv", "--python", PYTHON_VER, str(venv_dir)])

build_python_bindings = True

# torch is installed into the venv only when images are being built.
if not AOTRITON_NOIMAGE_MODE:
    # https://github.com/astral-sh/uv/issues/8721
    run_command(["uv", "pip", "install", "torch", "--python", str(venv_dir)])
# Choose the CMake option that carries the GPU target: aotriton-0.10 takes
# -DAOTRITON_TARGET_ARCH=<arch>, aotriton-0.9 takes -DTARGET_GPUS=<name>.
use_aotriton_target_arch = True
build_aotriton_09 = True
if build_aotriton_09:
    if AOTRITON_TARGET_ARCH == 'gfx1151':
        use_aotriton_target_arch = False
        TARGET_GPUS = 'Navi3.5'
    else:
        # Report the architecture that has no mapping (the configured constant).
        raise TypeError(f"Don't know GPU mapping for aotriton 0.9 for {AOTRITON_TARGET_ARCH}")

# Computed outside the f-string so the script does not depend on the
# same-quote nesting that only Python 3.12+ accepts.
noimage_flag = 'ON' if AOTRITON_NOIMAGE_MODE else 'OFF'

cmd = [
    "cmake",
    #"--trace",
    "-GNinja",
    f"-DVENV_DIR={venv_dir}",
    f"-DCMAKE_INSTALL_PREFIX={(build_dir / 'install_dir').resolve()}",
    "-DCMAKE_BUILD_TYPE=Release",
    "-DAOTRITON_GPU_BUILD_TIMEOUT=0",
    # AOTRITON_NO_PYTHON must be OFF and AOTRITON_NAME_SUFFIX must be set to run the unit tests
    "-DAOTRITON_NO_PYTHON=OFF",
    "-DHIP_PLATFORM=amd",
    f"-DAOTRITON_NOIMAGE_MODE={noimage_flag}",
    "-S", str(source_dir.resolve()),
    "-B", str(build_dir.resolve()),
    #"--debug-find",
]
if use_aotriton_target_arch:  # aotriton-0.10
    cmd.append(f"-DAOTRITON_TARGET_ARCH={AOTRITON_TARGET_ARCH}")
else:  # aotriton-0.9
    cmd.append(f"-DTARGET_GPUS={TARGET_GPUS}")
if sys.platform == 'win32':
    cmd.append('-Ddlfcn-win32_DIR=/dlfcn-output/share/dlfcn-win32')
cmd.extend(sys.argv[1:])  # Add extra arguments from script call
run_command(cmd)
# Build and install with Ninja from inside the build directory.
cmd = ["ninja", "install"]
# Leave one core free when more than one is available.
cpu_count = os.cpu_count()
if cpu_count and cpu_count > 1:
    jobs = max(1, cpu_count - 1)
    # ninja is invoked directly here, so -jN is passed as a plain option;
    # the "--" separator is only needed when going through `cmake --build`.
    cmd.append(f"-j{jobs}")
    print(f"Using parallel build flag: -j{jobs}")
run_command(cmd, cwd=build_dir)
print("\nBuild script completed successfully.")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import sys | |
import subprocess | |
import time | |
from pathlib import Path | |
# --- Configuration ---
# GPU architecture the AOTriton kernels are built for.
AOTRITON_TARGET_ARCH = 'gfx1151'
# We need to install `torch` in the virtual env so we need to create the venv and specify
# the python version used
PYTHON_VER = '3.13'
# Forwarded to CMake as -DAOTRITON_NOIMAGE_MODE=ON/OFF.
AOTRITON_NOIMAGE_MODE = True
fork_name = 'aotriton-0.9-gfx1151-windows'

# The locations are stored in os.environ (inherited by the commands run
# later) and read back below.
if sys.platform == 'win32':
    os.environ['AOTRITON_SOURCE_DIR'] = f'/w/{fork_name}'
    os.environ['AOTRITON_BUILD_DIR'] = f'/o/{fork_name}/build'
    # NOTE: Python version of Triton wheel must match Python version used to build AOTriton
else:
    os.environ['AOTRITON_SOURCE_DIR'] = os.path.expanduser('~/work/aotriton')
    os.environ['AOTRITON_BUILD_DIR'] = '/aotriton-output/build'

# Workaround cl.exe amd_hip_vector_types.h
# NOTE(review): applied on every platform here; confirm whether this was
# meant to be Windows-only.
os.environ['CC'] = 'clang-cl'
os.environ['CXX'] = 'clang-cl'

# A prebuilt Triton wheel is only needed when images are built.
triton_wheel_path = None
if not AOTRITON_NOIMAGE_MODE:
    wheel_name = 'triton-3.3.0+gitf8727c94-cp313-cp313-win_amd64.whl'
    if sys.platform == 'win32':
        triton_wheel_path = Path('/work/triton-lshqqytiger/python/dist') / wheel_name
    else:
        # Path() does not expand "~" on its own; expanduser() is required.
        triton_wheel_path = (Path('~/work/triton-lshqqytiger/python/dist') / wheel_name).expanduser()
    # strict=True raises FileNotFoundError when the wheel is missing.
    triton_wheel_path = triton_wheel_path.resolve(strict=True)

# strict=True makes a missing source checkout fail right here.
source_dir = Path(os.environ['AOTRITON_SOURCE_DIR']).resolve(strict=True)
build_dir = Path(os.environ['AOTRITON_BUILD_DIR']).resolve()
if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)
venv_dir = build_dir / 'venv'

# Explicit Python hints for CMake; None means no hint is passed.
if sys.platform == 'win32':
    Python3_EXECUTABLE = None
    Python3_INCLUDE_DIR = None
    Python3_LIBRARY = None
else:
    # Python3_EXECUTABLE = str(venv_dir / "bin" / "python")
    Python3_EXECUTABLE = "/usr/bin/python"
    Python3_INCLUDE_DIR = "/usr/include/python3.13"
    Python3_LIBRARY = "/usr/lib64/libpython3.13.so"
CCACHE_EXECUTABLE = "ccache"

# --- Setup ---
caches_dir = Path("/caches")
ccache_dir = caches_dir / "ccache"
pip_cache_dir = caches_dir / "pip"

if sys.platform == 'win32':
    os.environ['PKG_CONFIG'] = 'C:/Strawberry/perl/bin/pkg-config.bat'
    os.environ['PKG_CONFIG_PATH'] = '/xz-output/lib/pkgconfig'

print("--- Configuration ---")
print(f"Source Directory: {source_dir}")
print(f"Cache Directory: {caches_dir}")
print(f"LLVM_SYSPATH: {os.environ.get('LLVM_SYSPATH')}")
print(f"LLVM_INCLUDE_DIRS: {os.environ.get('LLVM_INCLUDE_DIRS')}")
print(f"LLVM_LIBRARY_DIR: {os.environ.get('LLVM_LIBRARY_DIR')}")
# Report the pkg-config variables this script actually sets.
print(f"PKG_CONFIG: {os.environ.get('PKG_CONFIG')}")
print(f"PKG_CONFIG_PATH: {os.environ.get('PKG_CONFIG_PATH')}")
print(f"Script Arguments: {sys.argv[1:]}")
print("---------------------")
def should_change_caches_dir():
    """Return True on Windows, where the cache directories are redirected."""
    return sys.platform == 'win32'


# The predicate depends only on the platform, so evaluate it once.
redirect_caches = should_change_caches_dir()

if redirect_caches:
    print("Ensuring directories exist...")
    for cache_path in (ccache_dir, pip_cache_dir):
        cache_path.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
if redirect_caches:
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())
    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())

use_ccache = False
launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
if not use_ccache:
    print("Skipping ccache configuration.")
    # Drop launchers that may have been set in the calling environment.
    for launcher_var in launcher_vars:
        os.environ.pop(launcher_var, None)
else:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for launcher_var in launcher_vars:
        os.environ[launcher_var] = CCACHE_EXECUTABLE

# Echo the final values; unset variables show as None.
for shown_var in ('CCACHE_DIR', 'PIP_CACHE_DIR') + launcher_vars:
    print(f"{shown_var} = {os.environ.get(shown_var)}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Run a command, echo it, time it, and abort the script on failure.

    Args:
        cmd_list: Program and arguments, handed to subprocess.run without a
            shell. Items are converted with str() only for the echo line.
        cwd: Optional working directory for the child process.

    Returns:
        The subprocess.CompletedProcess of the successful run.

    Exits:
        sys.exit(1) when the program or working directory is missing or an
        unexpected error occurs; sys.exit(<child exit code>) when the command
        returns a non-zero status.
    """
    print(f"\n--- Executing: {' '.join(map(str, cmd_list))} ---", flush=True)
    start_time = time.monotonic()
    try:
        # Use shell=False (default) for better security and argument handling
        # check=True raises CalledProcessError on non-zero exit code (like set -e)
        process = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        # A missing working directory raises the same exception as a missing
        # executable; tell them apart so the message names the real problem.
        if cwd is not None and not os.path.isdir(cwd):
            print(f"ERROR: Working directory not found: {cwd}", file=sys.stderr)
        else:
            print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"ERROR: Command failed with exit code {e.returncode}", file=sys.stderr)
        sys.exit(e.returncode)
    except Exception as e:
        print(f"ERROR: An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)
    end_time = time.monotonic()
    print(f"--- Command finished successfully in {end_time - start_time:.2f} seconds ---", flush=True)
    return process
# --- Build Steps ---
# Create the virtual environment with the requested Python version.
run_command(["uv", "venv", "--python", PYTHON_VER, str(venv_dir)])

BUILD_PYTHON_BINDINGS = True

# torch is installed into the venv only when images are being built.
if not AOTRITON_NOIMAGE_MODE:
    # https://github.com/astral-sh/uv/issues/8721
    run_command(["uv", "pip", "install", "torch", "--python", str(venv_dir)])
# Choose the CMake option that carries the GPU target: aotriton-0.9 takes
# -DTARGET_GPUS=<name>, otherwise -DAOTRITON_TARGET_ARCH=<arch> is used.
use_aotriton_target_arch = True
build_aotriton_09 = True
if build_aotriton_09:
    if AOTRITON_TARGET_ARCH == 'gfx1151':
        use_aotriton_target_arch = False
        TARGET_GPUS = 'Navi3.5'
    else:
        # Report the architecture that has no mapping (the configured constant).
        raise TypeError(f"Don't know GPU mapping for aotriton 0.9 for {AOTRITON_TARGET_ARCH}")

# AOTRITON_NO_PYTHON is a negative switch: the bindings are built when it is
# OFF, so it must be OFF when BUILD_PYTHON_BINDINGS is requested.
no_python_flag = "OFF" if BUILD_PYTHON_BINDINGS else "ON"
# Computed outside the f-string so the script does not depend on the
# same-quote nesting that only Python 3.12+ accepts.
noimage_flag = "ON" if AOTRITON_NOIMAGE_MODE else "OFF"

cmd = [
    "cmake",
    #"--trace",
    "-GNinja",
    f"-DVENV_DIR={venv_dir}",
    f"-DCMAKE_INSTALL_PREFIX={(build_dir / 'install_dir').resolve()}",
    "-DCMAKE_BUILD_TYPE=Release",
    "-DAOTRITON_GPU_BUILD_TIMEOUT=0",
    # AOTRITON_NO_PYTHON must be OFF and AOTRITON_NAME_SUFFIX must be set to run the unit tests
    f"-DAOTRITON_NO_PYTHON={no_python_flag}",
    "-DHIP_PLATFORM=amd",
    f"-DAOTRITON_NOIMAGE_MODE={noimage_flag}",
    "-S", str(source_dir.resolve()),
    "-B", str(build_dir.resolve()),
    #"--debug-find",
]
if use_aotriton_target_arch:
    cmd.append(f"-DAOTRITON_TARGET_ARCH={AOTRITON_TARGET_ARCH}")
else:  # aotriton-0.9
    cmd.append(f"-DTARGET_GPUS={TARGET_GPUS}")

if BUILD_PYTHON_BINDINGS:
    # Pass only the hints that are configured; each is None on Windows.
    if Python3_EXECUTABLE:
        cmd.append(f"-DPython3_EXECUTABLE={Python3_EXECUTABLE}")
    if Python3_INCLUDE_DIR:
        cmd.append(f"-DPython3_INCLUDE_DIR={Python3_INCLUDE_DIR}")
    if Python3_LIBRARY:
        cmd.append(f"-DPython3_LIBRARY={Python3_LIBRARY}")

if triton_wheel_path is not None:
    cmd.append(f"-DINSTALL_TRITON_FROM_WHEEL={triton_wheel_path.resolve()}")
if sys.platform == 'win32':
    cmd.append('-Ddlfcn-win32_DIR=/dlfcn-output/share/dlfcn-win32')
cmd.extend(sys.argv[1:])  # Add extra arguments from script call
run_command(cmd)
on_windows = sys.platform == 'win32'
if on_windows:
    # Set HIP_PATH to dir containing `bin/ld-lld.exe` for triton\backends\amd\compiler.py
    # lld = Path(os.path.join( os.environ['HIP_PATH'] , 'bin', 'ld.lld.exe' ))
    os.environ['HIP_PATH'] = '/o/r-st/build/dist/rocm/lib/llvm'

# Image builds on Windows are run with a single Ninja job.
single_job = on_windows and not AOTRITON_NOIMAGE_MODE
install_cmd = ["ninja", "install"] + (['-j', '1'] if single_job else [])
run_command(install_cmd, cwd=build_dir)
print("\nBuild script completed successfully.")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import sys | |
import subprocess | |
import time | |
from pathlib import Path | |
# https://github.com/dlfcn-win32/dlfcn-win32
# --- Configuration ---
output_dir = Path('/dlfcn-output')
os.environ['SOURCE_DIR'] = '/work/dlfcn-win32'
# NOTE(review): BUILD_DIR is exported but not read in this script; the build
# tree used below lives under the source directory. Confirm which is intended.
os.environ['BUILD_DIR'] = '/dlfcn-output/build'

# strict=True makes a missing source checkout fail right here.
source_dir = Path(os.environ['SOURCE_DIR']).resolve(strict=True)
build_dir = source_dir / 'build'
if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)
CCACHE_EXECUTABLE = "ccache"

# --- Setup ---
caches_dir = Path("/caches")
ccache_dir = caches_dir / "ccache"

print("--- Configuration ---")
for label, value in (
    ("Source Directory", source_dir),
    ("Build Directory", build_dir),
    ("Output Directory", output_dir),
    ("Cache Directory", caches_dir),
    ("Script Arguments", sys.argv[1:]),
):
    print(f"{label}: {value}")
print("---------------------")
def should_change_caches_dir():
    """Return True on Windows, where the ccache directory is redirected."""
    return sys.platform == 'win32'


# The predicate depends only on the platform, so evaluate it once.
redirect_caches = should_change_caches_dir()

if redirect_caches:
    print("Ensuring directories exist...")
    ccache_dir.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
if redirect_caches:
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())

use_ccache = True
launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
if not use_ccache:
    print("Skipping ccache configuration.")
    # Drop launchers that may have been set in the calling environment.
    for launcher_var in launcher_vars:
        os.environ.pop(launcher_var, None)
else:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for launcher_var in launcher_vars:
        os.environ[launcher_var] = CCACHE_EXECUTABLE

# Echo the final values; unset variables show as None.
for shown_var in ('CCACHE_DIR',) + launcher_vars:
    print(f"{shown_var} = {os.environ.get(shown_var)}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Run a command, echo it, time it, and abort the script on failure.

    Args:
        cmd_list: Program and arguments, handed to subprocess.run without a
            shell. Items are converted with str() only for the echo line.
        cwd: Optional working directory for the child process.

    Returns:
        The subprocess.CompletedProcess of the successful run.

    Exits:
        sys.exit(1) when the program or working directory is missing or an
        unexpected error occurs; sys.exit(<child exit code>) when the command
        returns a non-zero status.
    """
    print(f"\n--- Executing: {' '.join(map(str, cmd_list))} ---", flush=True)
    start_time = time.monotonic()
    try:
        # Use shell=False (default) for better security and argument handling
        # check=True raises CalledProcessError on non-zero exit code (like set -e)
        process = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        # A missing working directory raises the same exception as a missing
        # executable; tell them apart so the message names the real problem.
        if cwd is not None and not os.path.isdir(cwd):
            print(f"ERROR: Working directory not found: {cwd}", file=sys.stderr)
        else:
            print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"ERROR: Command failed with exit code {e.returncode}", file=sys.stderr)
        sys.exit(e.returncode)
    except Exception as e:
        print(f"ERROR: An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)
    end_time = time.monotonic()
    print(f"--- Command finished successfully in {end_time - start_time:.2f} seconds ---", flush=True)
    return process
# --- Build Steps ---
build_path = str(build_dir.resolve())

# Configure; extra command-line arguments of this script go to CMake.
configure_cmd = [
    "cmake",
    # "--trace",
    "-GNinja",
    "-DBUILD_SHARED_LIBS=ON",
    "-DCMAKE_BUILD_TYPE=Release",
    f"-DCMAKE_INSTALL_PREFIX={output_dir.resolve()}",
    "-S", str(source_dir.resolve()),
    "-B", build_path,
] + sys.argv[1:]
run_command(configure_cmd)

# Build, then install into the configured prefix.
for cmake_mode in ("--build", "--install"):
    run_command(["cmake", cmake_mode, build_path])

print("\nBuild script completed successfully.")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import sys | |
import subprocess | |
import time | |
from pathlib import Path | |
# --- Configuration ---
os.environ['LLVM_OUTPUT_DIR'] = '/llvm-output'
os.environ['LLVM_SOURCE_DIR'] = '/work/llvm-for-triton'
output_dir = Path(os.environ['LLVM_OUTPUT_DIR']).resolve()
source_dir = Path(os.environ['LLVM_SOURCE_DIR']).resolve()
CCACHE_EXECUTABLE = "ccache"

# --- Setup ---
build_dir = output_dir / "build"
# The caches directory is a sibling of the output directory.
caches_dir = output_dir / ".." / "caches"
ccache_dir = caches_dir / "ccache"

print("--- Configuration ---")
for label, value in (
    ("Source Directory", source_dir),
    ("Output Base Directory", output_dir),
    ("Build Directory", build_dir),
    ("Cache Directory", caches_dir),
    ("Script Arguments", sys.argv[1:]),
):
    print(f"{label}: {value}")
print("---------------------")

print("Ensuring directories exist...")
for needed_dir in (ccache_dir, build_dir):  # build dir is created early as well
    needed_dir.mkdir(parents=True, exist_ok=True)

#
print("Setting environment variables...")
os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())

# Check if ccache is desired/available before setting launchers
use_ccache = True  # Set to False to disable ccache
launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
if not use_ccache:
    print("Skipping ccache configuration.")
    # Ensure they are unset if they existed before
    for launcher_var in launcher_vars:
        os.environ.pop(launcher_var, None)
else:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for launcher_var in launcher_vars:
        os.environ[launcher_var] = CCACHE_EXECUTABLE

# Echo the final values; unset variables show as None.
for shown_var in ('CCACHE_DIR',) + launcher_vars:
    print(f"{shown_var} = {os.environ.get(shown_var)}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Run a command, echo it, time it, and abort the script on failure.

    Args:
        cmd_list: Program and arguments, handed to subprocess.run without a
            shell. Items are converted with str() only for the echo line.
        cwd: Optional working directory for the child process.

    Returns:
        The subprocess.CompletedProcess of the successful run.

    Exits:
        sys.exit(1) when the program or working directory is missing or an
        unexpected error occurs; sys.exit(<child exit code>) when the command
        returns a non-zero status.
    """
    print(f"\n--- Executing: {' '.join(map(str, cmd_list))} ---", flush=True)
    start_time = time.monotonic()
    try:
        # Use shell=False (default) for better security and argument handling
        # check=True raises CalledProcessError on non-zero exit code (like set -e)
        process = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        # A missing working directory raises the same exception as a missing
        # executable; tell them apart so the message names the real problem.
        if cwd is not None and not os.path.isdir(cwd):
            print(f"ERROR: Working directory not found: {cwd}", file=sys.stderr)
        else:
            print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"ERROR: Command failed with exit code {e.returncode}", file=sys.stderr)
        sys.exit(e.returncode)
    except Exception as e:
        print(f"ERROR: An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)
    end_time = time.monotonic()
    print(f"--- Command finished successfully in {end_time - start_time:.2f} seconds ---", flush=True)
    return process
# --- Build Steps ---
# https://github.com/triton-lang/triton?tab=readme-ov-file#building-with-a-custom-llvm
# 1. CMake Configure Step
configure_cmd = [
    "cmake",
    "-GNinja",
    "-DCMAKE_BUILD_TYPE=Release",
    "-DLLVM_ENABLE_ASSERTIONS=ON",
    "-DLLVM_ENABLE_PROJECTS=mlir;llvm;lld",
    "-DLLVM_TARGETS_TO_BUILD=host;NVPTX;AMDGPU",
    # Each option must be its own list element: without the trailing commas
    # Python concatenates adjacent string literals into a single argument.
    "-DLLVM_FORCE_VC_REPOSITORY=llvm-for-triton",
    "-DLLVM_FORCE_VC_REVISION=rev-for-triton",
    "-S", str((source_dir / "llvm").resolve()),
    "-B", str(build_dir.resolve()),
]
configure_cmd.extend(sys.argv[1:])  # Add extra arguments from script call
run_command(configure_cmd)
# 2. CMake Build Step
build_invocation = ["cmake", "--build", str(build_dir.resolve())]

# Add parallel build flag common on Windows (optional)
# Get number of processors, leave one free
n_cpus = os.cpu_count()
if n_cpus and n_cpus > 1:
    jobs_flag = f"-j{max(1, n_cpus - 1)}"
    # Everything after "--" is passed to the underlying Ninja.
    build_invocation += ["--", jobs_flag]
    print(f"Using parallel build flag: {jobs_flag}")

run_command(build_invocation)
print("\nBuild script completed successfully.")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Build liblzma for aotriton
import os | |
import sys | |
import subprocess | |
import time | |
from pathlib import Path | |
# https://github.com/tukaani-project/xz
# --- Configuration ---
output_dir = Path('/xz-output')
os.environ['XZ_SOURCE_DIR'] = '/work/xz'
# NOTE(review): XZ_BUILD_DIR is exported but not read in this script; the
# build tree used below lives under the source directory. Confirm the value.
os.environ['XZ_BUILD_DIR'] = '/aotriton-output/build'

# strict=True makes a missing source checkout fail right here.
source_dir = Path(os.environ['XZ_SOURCE_DIR']).resolve(strict=True)
build_dir = source_dir / 'build'
if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)
CCACHE_EXECUTABLE = "ccache"

# --- Setup ---
caches_dir = Path("/caches")
ccache_dir = caches_dir / "ccache"

print("--- Configuration ---")
for label, value in (
    ("Source Directory", source_dir),
    ("Build Directory", build_dir),
    ("Output Directory", output_dir),
    ("Cache Directory", caches_dir),
    ("Script Arguments", sys.argv[1:]),
):
    print(f"{label}: {value}")
print("---------------------")
def should_change_caches_dir():
    """Return True on Windows, where the ccache directory is redirected."""
    return sys.platform == 'win32'


# The predicate depends only on the platform, so evaluate it once.
redirect_caches = should_change_caches_dir()

if redirect_caches:
    print("Ensuring directories exist...")
    ccache_dir.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
if redirect_caches:
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())

use_ccache = True
launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
if not use_ccache:
    print("Skipping ccache configuration.")
    # Drop launchers that may have been set in the calling environment.
    for launcher_var in launcher_vars:
        os.environ.pop(launcher_var, None)
else:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for launcher_var in launcher_vars:
        os.environ[launcher_var] = CCACHE_EXECUTABLE

# Echo the final values; unset variables show as None.
for shown_var in ('CCACHE_DIR',) + launcher_vars:
    print(f"{shown_var} = {os.environ.get(shown_var)}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Run a command, echo it, time it, and abort the script on failure.

    Args:
        cmd_list: Program and arguments, handed to subprocess.run without a
            shell. Items are converted with str() only for the echo line.
        cwd: Optional working directory for the child process.

    Returns:
        The subprocess.CompletedProcess of the successful run.

    Exits:
        sys.exit(1) when the program or working directory is missing or an
        unexpected error occurs; sys.exit(<child exit code>) when the command
        returns a non-zero status.
    """
    print(f"\n--- Executing: {' '.join(map(str, cmd_list))} ---", flush=True)
    start_time = time.monotonic()
    try:
        # Use shell=False (default) for better security and argument handling
        # check=True raises CalledProcessError on non-zero exit code (like set -e)
        process = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        # A missing working directory raises the same exception as a missing
        # executable; tell them apart so the message names the real problem.
        if cwd is not None and not os.path.isdir(cwd):
            print(f"ERROR: Working directory not found: {cwd}", file=sys.stderr)
        else:
            print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"ERROR: Command failed with exit code {e.returncode}", file=sys.stderr)
        sys.exit(e.returncode)
    except Exception as e:
        print(f"ERROR: An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)
    end_time = time.monotonic()
    print(f"--- Command finished successfully in {end_time - start_time:.2f} seconds ---", flush=True)
    return process
# --- Build Steps ---
build_path = str(build_dir.resolve())

# Configure; extra command-line arguments of this script go to CMake.
configure_cmd = [
    "cmake",
    # "--trace",
    "-GNinja",
    "-DXZ_NLS=OFF",
    "-DBUILD_SHARED_LIBS=ON",
    "-DCMAKE_BUILD_TYPE=Release",
    f"-DCMAKE_INSTALL_PREFIX={output_dir.resolve()}",
    "-S", str(source_dir.resolve()),
    "-B", build_path,
] + sys.argv[1:]
run_command(configure_cmd)

# Build, then install into the configured prefix.
for cmake_mode in ("--build", "--install"):
    run_command(["cmake", cmake_mode, build_path])

print("\nBuild script completed successfully.")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import sys | |
import subprocess | |
import time | |
from pathlib import Path | |
# --- Configuration ---
AMDGPU_TARGETS = 'gfx1151'
# We need to install `torch` in the virtual env so we need to create the venv and specify
# the python version used
PYTHON_VER = '3.13'
fork_name = 'pytorch-st'

# Source and output share one checkout directory on both platforms.
if sys.platform == 'win32':
    source_dir = out_dir = Path(f'/w/{fork_name}')
    os.environ.update({
        'AOTRITON_INSTALLED_PREFIX': '/o/aotriton-0.9-gfx1151-windows/build/install_dir',
        'CMAKE_PREFIX_PATH': '/o/r-st-gfx1151/build/dist/rocm',
        'CC': 'clang-cl',
        'CXX': 'clang-cl',
    })
    # NOTE: Python version of Triton wheel must match Python version used to build AOTriton if Triton is used
    USE_CMAKE = True
    use_ccache = False
else:
    source_dir = out_dir = Path(os.path.expanduser(f'~/w/{fork_name}'))
    os.environ.update({
        'AOTRITON_INSTALLED_PREFIX': os.path.expanduser('~/aotriton-windows-for-merge-output/build/install_dir'),
        'CMAKE_PREFIX_PATH': os.path.expanduser('~/therock-output-gfx1151/build/dist/rocm'),
    })
    USE_CMAKE = True
    use_ccache = False

build_dir = out_dir / "build"
venv_dir = out_dir / '.venv'

# Build switches exported through the environment for the commands run later.
os.environ.update({
    'USE_KINETO': 'OFF',
    'PYTORCH_ROCM_ARCH': AMDGPU_TARGETS,
    'USE_ROCM': 'ON',
    'BUILD_TEST': '0',
    'USE_FLASH_ATTENTION': 'ON',
    'USE_MEM_EFF_ATTENTION': 'ON',
    'DISTUTILS_USE_SDK': '1',
})

# strict=True makes a missing source checkout fail right here.
source_dir = source_dir.resolve(strict=True)
build_dir = build_dir.resolve()
if not build_dir.exists():
    build_dir.mkdir(parents=True, exist_ok=True)
CCACHE_EXECUTABLE = "ccache"

# --- Setup ---
caches_dir = Path("/caches")
ccache_dir = caches_dir / "ccache"
pip_cache_dir = caches_dir / "pip"

print("--- Configuration ---")
for label, value in (
    ("Source Directory", source_dir),
    ("Cache Directory", caches_dir),
    ("Script Arguments", sys.argv[1:]),
):
    print(f"{label}: {value}")
print("---------------------")
def should_change_caches_dir():
    """Return True on Windows, where the cache directories are redirected."""
    return sys.platform == 'win32'


# The predicate depends only on the platform, so evaluate it once.
redirect_caches = should_change_caches_dir()

if redirect_caches:
    print("Ensuring directories exist...")
    for cache_path in (ccache_dir, pip_cache_dir):
        cache_path.mkdir(parents=True, exist_ok=True)

print("Setting environment variables...")
if redirect_caches:
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())
    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())

launcher_vars = ('CMAKE_C_COMPILER_LAUNCHER', 'CMAKE_CXX_COMPILER_LAUNCHER')
if not use_ccache:
    print("Skipping ccache configuration.")
    # Drop launchers that may have been set in the calling environment.
    for launcher_var in launcher_vars:
        os.environ.pop(launcher_var, None)
else:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    for launcher_var in launcher_vars:
        os.environ[launcher_var] = CCACHE_EXECUTABLE

# Echo the final values; unset variables show as None.
for shown_var in ('CCACHE_DIR', 'PIP_CACHE_DIR') + launcher_vars:
    print(f"{shown_var} = {os.environ.get(shown_var)}")
# --- Helper function to run commands ---
def run_command(cmd_list, cwd=None):
    """Run a command, echo it, time it, and abort the script on failure.

    Args:
        cmd_list: Program and arguments, handed to subprocess.run without a
            shell. Items are converted with str() only for the echo line.
        cwd: Optional working directory for the child process.

    Returns:
        The subprocess.CompletedProcess of the successful run.

    Exits:
        sys.exit(1) when the program or working directory is missing or an
        unexpected error occurs; sys.exit(<child exit code>) when the command
        returns a non-zero status.
    """
    print(f"\n--- Executing: {' '.join(map(str, cmd_list))} ---", flush=True)
    start_time = time.monotonic()
    try:
        # Use shell=False (default) for better security and argument handling
        # check=True raises CalledProcessError on non-zero exit code (like set -e)
        process = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        # A missing working directory raises the same exception as a missing
        # executable; tell them apart so the message names the real problem.
        if cwd is not None and not os.path.isdir(cwd):
            print(f"ERROR: Working directory not found: {cwd}", file=sys.stderr)
        else:
            print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"ERROR: Command failed with exit code {e.returncode}", file=sys.stderr)
        sys.exit(e.returncode)
    except Exception as e:
        print(f"ERROR: An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)
    end_time = time.monotonic()
    print(f"--- Command finished successfully in {end_time - start_time:.2f} seconds ---", flush=True)
    return process
# --- Build Steps ---
# Always on; presumably kept as a manual toggle for skipping venv setup.
setup_venv = True
if setup_venv:
    # Create the venv, then install PyTorch's build requirements into it.
    run_command(["uv", "venv", "--python", PYTHON_VER, str(venv_dir)])
    run_command([
        "uv", "pip", "install",
        "-r", str(source_dir / "requirements.txt"),
        "--python", str(venv_dir),
    ])

# Locate the venv interpreter; the layout differs between platforms.
if sys.platform == 'win32':
    venv_bin_dir = venv_dir / 'Scripts'
    venv_python_bin = venv_bin_dir / 'python.exe'
else:
    venv_bin_dir = venv_dir / 'bin'
    venv_python_bin = venv_bin_dir / 'python'

if USE_CMAKE:
    # Run setup.py's cmake-only build first, then configure the build tree
    # explicitly with the options below.
    run_command([venv_python_bin, "setup.py", "build", "--cmake-only"], cwd=source_dir)
    configure_cmd = [
        "cmake",
        #"--trace",
        "-GNinja",
        "-DCMAKE_BUILD_TYPE=Release",
        f"-DPython_EXECUTABLE={venv_python_bin}",
        f"-DPYTORCH_ROCM_ARCH={AMDGPU_TARGETS}",
        "-DUSE_ROCM=ON",
        "-DUSE_KINETO=OFF",
        "-DUSE_FLASH_ATTENTION=ON",
        "-S", str(source_dir.resolve()),
        "-B", str(build_dir.resolve()),
    ] + sys.argv[1:]  # extra arguments from the script call
    run_command(configure_cmd)

# sys.exit(17)
# Build the wheel with the venv interpreter from the source checkout.
run_command([venv_python_bin, "setup.py", "bdist_wheel"], cwd=source_dir)
print("\nBuild script completed successfully.")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# https://github.com/ROCm/TheRock/discussions/244#discussioncomment-12926010 | |
import torch | |
import time | |
from torch.nn.functional import scaled_dot_product_attention | |
from torch.nn.attention import SDPBackend, sdpa_kernel | |
# Check for GPU: abort with a message when torch reports no CUDA device.
if not torch.cuda.is_available():
    raise SystemExit("CUDA GPU is not available. Please run on a CUDA-enabled device.")
# Module-level device handle used by the benchmark functions below.
device = torch.device("cuda")
torch.cuda.init()  # Initialize CUDA context (optional, helps measure baseline)
# Helper function for measuring one run | |
def measure_op(op_func, warmup=3, total_runs=10):
    """Average the measurements returned by ``op_func`` over the post-warm-up runs.

    Args:
        op_func: a callable taking no arguments that runs the operation
            (including memory measurement) and returns
            ``(time_ms, peak_mem_MB, gflops_s)``.
        warmup: number of warm-up runs whose results are discarded.
        total_runs: total runs to do, including warmup.

    Returns:
        ``(average_time_ms, average_peak_mem_MB, average_gflops_s)`` over the
        runs after warm-up.

    Raises:
        ValueError: if no run would remain after discarding the warm-up runs
            (previously this surfaced as a ZeroDivisionError after all the
            runs had already been executed).
    """
    measured_runs = total_runs - max(warmup, 0)
    if measured_runs <= 0:
        raise ValueError(
            f"total_runs ({total_runs}) must be greater than warmup ({warmup})"
        )

    times = []
    mems = []
    flops = []
    for run_idx in range(total_runs):
        # Reset peak memory stats at the start of each run so every run
        # reports its own peak.
        torch.cuda.reset_peak_memory_stats(device)
        t_ms, peak_mb, gf_s = op_func()
        if run_idx < warmup:
            continue  # warm-up result, discarded
        times.append(t_ms)
        mems.append(peak_mb)
        flops.append(gf_s)

    avg_time_ms = sum(times) / measured_runs
    avg_mem_mb = sum(mems) / measured_runs
    avg_gf_s = sum(flops) / measured_runs
    return avg_time_ms, avg_mem_mb, avg_gf_s
# 1) Define the Scaled Dot-Product Attention test | |
def run_sdpa():
    """Run one timed flash-attention SDPA call in FP16 on the module-level ``device``.

    Returns:
        ``(time_ms, peak_mem_mb, gflops_s)`` for this single run.
    """
    # Configuration
    B, heads = 1, 8
    L = 8192
    E = 64
    S = L
    # Create random Q, K, V in half precision.
    # We place them inside the function so each run re-allocates
    # new memory (to measure peak memory usage properly).
    q = torch.randn(B, heads, L, E, device=device, dtype=torch.float16)
    k = torch.randn(B, heads, S, E, device=device, dtype=torch.float16)
    v = torch.randn(B, heads, S, E, device=device, dtype=torch.float16)
    # Start timing. perf_counter() is the clock intended for measuring short
    # intervals; time.time() is a wall clock that may be adjusted and can have
    # coarse resolution.
    torch.cuda.synchronize()
    start_time = time.perf_counter()
    # Run scaled dot-product attention (Flash Attention backend)
    with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
        out = scaled_dot_product_attention(q, k, v)
    # Synchronize & end timing
    torch.cuda.synchronize()
    elapsed_s = time.perf_counter() - start_time
    # Measure time
    time_ms = elapsed_s * 1000.0
    # Peak memory usage (MB)
    peak_mem_bytes = torch.cuda.max_memory_allocated(device)
    peak_mem_mb = peak_mem_bytes / (1024**2)
    # Compute FLOPs for scaled dot-product attention:
    #   Q*K^T  -> 2 * B * heads * L * S * E
    #   Attn*V -> 2 * B * heads * L * S * E
    #   Total  =  4 * B * heads * L * S * E
    flops = 4.0 * B * heads * L * S * E
    # Convert to GFLOPs/s
    gflops_s = flops / elapsed_s / 1e9
    return time_ms, peak_mem_mb, gflops_s
# Benchmark driver: measure run_sdpa and report the averaged results.
print("Benchmarking Scaled Dot-Product Attention (Flash) in FP16 ...")
warmup = 3
measured_runs = 7  # runs that count toward the averages, after warm-up
averages = measure_op(run_sdpa, warmup=warmup, total_runs=warmup + measured_runs)
sdpa_time, sdpa_mem, sdpa_gflops = averages
print(f"Average time: {sdpa_time:.2f} ms")
print(f"Average peak memory: {sdpa_mem:.2f} MB")
print(f"Average throughput: {sdpa_gflops:.2f} GFLOP/s\n")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://rocm.blogs.amd.com/artificial-intelligence/flash-attention/README.html | |
import numpy as np | |
import torch | |
import torch.nn.functional as F | |
from torch.nn.attention import SDPBackend, sdpa_kernel | |
class NaiveSdpaForDevice():
    """Eager-mode reference implementation of scaled dot-product attention.

    The causal mask is allocated on the device passed to the constructor.
    """

    def __init__(self, device):
        self.device = device

    def scaled_dot_product_attention(self, query, key, value, attn_mask=None, is_causal=False, dropout_p=0.0, scale=None):
        """
        Computes the scaled dot product attention between query, key, and value tensors in PyTorch eager mode.

        Args:
            query (torch.Tensor): The query tensor of shape (batch_size, n_heads, q_len, hidden_dim).
            key (torch.Tensor): The key tensor of shape (batch_size, n_heads, kv_len, hidden_dim).
            value (torch.Tensor): The value tensor of shape (batch_size, n_heads, kv_len, hidden_dim).
            attn_mask (torch.Tensor, optional): Mask broadcastable to (batch_size, n_heads, q_len, kv_len).
                A boolean mask keeps positions that are True; any other dtype is added to the
                attention scores. Ignored when ``is_causal`` is True. Defaults to None.
            is_causal (bool, optional): Whether to apply a causal (lower-triangular) attention mask. Defaults to False.
            dropout_p (float, optional): The dropout probability. Dropout is applied with
                ``train=False``, so it does not alter the weights. Defaults to 0.
            scale (float, optional): The scale factor for the dot product.
                Defaults to None, meaning 1 / sqrt(hidden_dim).

        Returns:
            torch.Tensor: The output tensor of shape (batch_size, n_heads, q_len, hidden_dim).
        """
        # Calculate the scale factor
        scale_factor = 1 / np.sqrt(query.size(-1)) if scale is None else scale
        attn_weight = (query @ key.transpose(-2, -1)) * scale_factor

        # Create the attention mask. The causal mask is (q_len, kv_len) and
        # broadcasts over batch and heads, so it also works when the key
        # length differs from the query length.
        if is_causal:
            attn_mask = torch.ones(query.size(-2), key.size(-2), dtype=torch.bool,
                                   device=self.device).tril(diagonal=0)

        if attn_mask is not None:
            if attn_mask.dtype == torch.bool:
                # Masked-out positions get -inf so softmax assigns them zero weight.
                attn_weight = attn_weight.masked_fill(~attn_mask, -torch.inf)
            else:
                # Non-boolean masks are additive biases on the scores.
                attn_weight = attn_weight + attn_mask

        # Compute the scaled dot product attention
        attn_weight = torch.softmax(attn_weight, dim=-1)
        attn_weight = torch.dropout(attn_weight, dropout_p, train=False)
        return attn_weight @ value
def test(n_times):
    """Compare the flash-attention SDPA backend against the naive reference.

    Runs roughly ``n_times`` causal-attention comparisons on the CUDA device,
    split evenly across two dtypes (float16, bfloat16) and a fixed list of
    embedding dimensions.

    Args:
        n_times: approximate total number of comparisons to run.

    Raises:
        RuntimeError: if the flash-attention result is not within tolerance of
            the naive result.
    """
    batch_size = 1
    seq_len = 64
    num_heads = 32
    device = torch.device("cuda")
    embed_dims = [16, 32, 48, 64, 80, 96, 128, 160, 192, 224, 256]
    # Iterations per (dtype, embed_dim) combination; "2" is the number of dtypes.
    times = n_times // 2 // len(embed_dims)
    # The reference implementation only stores the device, so build it once.
    naive = NaiveSdpaForDevice(device)
    shape_prefix = (batch_size, num_heads, seq_len)

    for (dtype, epsilon) in [(torch.float16, 1e-03),
                             (torch.bfloat16, 1e-02)]:
        for embed_dim in embed_dims:
            for i in range(times):
                query = torch.rand(*shape_prefix, embed_dim, device=device, dtype=dtype)
                key = torch.rand(*shape_prefix, embed_dim, device=device, dtype=dtype)
                value = torch.rand(*shape_prefix, embed_dim, device=device, dtype=dtype)

                want = naive.scaled_dot_product_attention(query, key, value, is_causal=True)
                with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
                    got = F.scaled_dot_product_attention(query, key, value, is_causal=True)

                if not torch.allclose(want, got, rtol=epsilon, atol=epsilon):
                    # Was misspelled "RuntimeErrror", which raised NameError instead.
                    raise RuntimeError(f'dtype: {dtype}, test_iteration: {i}')


if __name__ == "__main__":
    test(1000)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# LLVM for triton-lshqqytiger | |
git clone git@github.com:llvm/llvm-project.git llvm-for-triton
cd llvm-for-triton | |
# Use llvm version specified in https://github.com/lshqqytiger/triton/blob/main/cmake/llvm-hash.txt | |
git checkout 1cec5fffd8fddd9d85b516f876093b0e3f0eec5f | |
cd .. | |
# use_triton_fork = 'triton-lshqqytiger' | |
git clone git@github.com:lshqqytiger/triton.git triton-lshqqytiger
cd triton-lshqqytiger | |
uv venv --python=3.13 | |
.venv\Scripts\activate
uv pip install -r python\requirements.txt | |
python llvm-build.py | |
# | |
# NOTE: triton's setup.py wants to create symlinks which requires setting "Dev mode" to true on Windows 11 | |
# when using from a non-admin user account | |
# | |
# Edit `triton-build-fork.py` and set `use_triton_fork=triton-lshqqytiger` | |
python triton-build-fork.py | |
# use_triton_fork = 'triton-windows' | |
git clone git@github.com:woct0rdho/triton-windows.git
cd triton-windows | |
git checkout v3.3.x-windows | |
uv venv --python=3.13 | |
.venv\Scripts\activate
uv pip install -r python\requirements.txt | |
# | |
# NOTE: triton's setup.py wants to create symlinks which requires setting "Dev mode" to true on Windows 11 | |
# when using from a non-admin user account | |
# | |
# Edit `triton-build-fork.py` and set `use_triton_fork=triton-windows` | |
python triton-build-fork.py | |
# To build aotriton, for AMD GPUs, use triton-lshqqytiger from above | |
python lzma-build.py | |
git clone git@github.com:scottt/aotriton.git
cd aotriton | |
git checkout windows | |
cd .. | |
python aotriton-build.py |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import sys | |
import subprocess | |
import time | |
from pathlib import Path | |
# use_triton_fork = 'triton-lshqqytiger' | |
use_triton_fork = 'triton-windows' | |
def run_command(cmd_list, cwd=None):
    """Echo a command, run it, and report how long it took.

    Exits the interpreter via ``sys.exit`` when the command cannot be
    started or returns a non-zero status (using that status as the exit
    code); otherwise returns the ``subprocess.CompletedProcess``.
    """
    rendered = ' '.join(str(part) for part in cmd_list)
    print(f"\n--- Executing: {rendered} ---", flush=True)
    began = time.monotonic()
    try:
        completed = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as failure:
        print(f"ERROR: Command failed with exit code {failure.returncode}", file=sys.stderr)
        sys.exit(failure.returncode)
    except Exception as failure:
        print(f"ERROR: An unexpected error occurred: {failure}", file=sys.stderr)
        sys.exit(1)
    elapsed = time.monotonic() - began
    print(f"--- Command finished successfully in {elapsed:.2f} seconds ---", flush=True)
    return completed
def build():
    """Export the environment variables for the selected Triton fork, then run
    ``triton-build.py`` (looked up next to this script) with the current interpreter."""
    script_dir = Path(os.path.dirname(sys.argv[0]))
    env = os.environ

    if sys.platform != 'win32':
        env['JSON_SYSPATH'] = os.path.expanduser('~/work/nlohmann-json-3.11.3')
        env['TRITON_SOURCE_DIR'] = os.path.expanduser('~/work/triton')
    else:
        # nlohmann-json from https://github.com/nlohmann/json/releases/download/v3.11.3/include.zip
        env['JSON_SYSPATH'] = os.path.expanduser('/work/nlohmann-json-3.11.3')
        # Both triton forks are built with "MSVC v143 x64"
        # That's the default toolchain in Visual Studio 2022 Community Edition as of 2025-05
        env['TRITON_BUILD_WITH_CLANG_LLD'] = 'OFF'

        if use_triton_fork != 'triton-windows':
            env['TRITON_SOURCE_DIR'] = os.path.expanduser('/work/triton-lshqqytiger')
            # LLVM revision 1cec5fffd8fddd9d85b516f876093b0e3f0eec5f built from source
            # This uses the llvm "build" and implicitly the "source" directory contents, instead of an "install" tree
            env['LLVM_SYSPATH'] = os.path.expanduser('/llvm-output/build')
        else:
            env['TRITON_SOURCE_DIR'] = os.path.expanduser('/work/triton-windows')
            # LLVM from https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-a66376b0-windows-x64.tar.gz
            env['LLVM_SYSPATH'] = os.path.expanduser('/caches/.triton/llvm/llvm-a66376b0-windows-x64')

    # The child process inherits the environment prepared above.
    run_command([sys.executable, script_dir / 'triton-build.py'])


build()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import sys | |
import subprocess | |
import time | |
from pathlib import Path | |
# --- Configuration ---
# Inputs come from the environment: TRITON_SOURCE_DIR and JSON_SYSPATH are
# required (a missing variable raises KeyError); LLVM_SYSPATH is optional.
source_dir = Path(os.environ['TRITON_SOURCE_DIR']).resolve()
llvm_syspath = os.environ.get('LLVM_SYSPATH')
if llvm_syspath is not None:
    llvm_syspath = Path(os.environ['LLVM_SYSPATH']).resolve()
json_syspath = Path(os.environ['JSON_SYSPATH']).resolve()
CCACHE_EXECUTABLE = "ccache"
# --- Setup ---
caches_dir = Path("/caches")
ccache_dir = caches_dir / "ccache"
pip_cache_dir = caches_dir / "pip"
# Change default from "$HOME/.triton" to "$caches_dir/.triton"
# Triton always adds the ".triton" part
triton_home_dir = caches_dir
# The include/lib directories are only derived when an LLVM path was supplied.
if llvm_syspath is not None:
    llvm_include_dir = llvm_syspath / "include"
    llvm_library_dir = llvm_syspath / "lib"
# Disable downloading of dependencies
# pybind11 is installed via pip and requirements.txt
cuda_dep_var_names = [
    'TRITON_PTXAS_PATH',
    'TRITON_CUOBJDUMP_PATH',
    'TRITON_NVDISASM_PATH',
    'TRITON_CUDACRT_PATH',
    'TRITON_CUDART_PATH',
    'TRITON_CUPTI_INCLUDE_PATH',
    'TRITON_CUPTI_LIB_PATH',
]
for x in cuda_dep_var_names:
    # value is just a placeholder
    os.environ[x] = '/bin/false'
# Export the LLVM include/lib locations derived above.
if llvm_syspath is not None:
    os.environ['LLVM_INCLUDE_DIRS'] = str(llvm_include_dir.resolve())
    os.environ['LLVM_LIBRARY_DIR'] = str(llvm_library_dir.resolve())
# Echo the effective configuration so it is visible in the build log.
print(f"--- Configuration ---")
print(f"Source Directory: {source_dir}")
print(f"Cache Directory: {caches_dir}")
print(f"LLVM_SYSPATH: {os.environ.get('LLVM_SYSPATH')}")
print(f"LLVM_INCLUDE_DIRS: {os.environ.get('LLVM_INCLUDE_DIRS')}")
print(f"LLVM_LIBRARY_DIR: {os.environ.get('LLVM_LIBRARY_DIR')}")
print(f"Script Arguments: {sys.argv[1:]}")
print(f"---------------------")
def should_change_caches_dir():
    """Return True when running on Windows (``sys.platform == 'win32'``)."""
    on_windows = sys.platform == 'win32'
    return on_windows
# Create the cache directories up front when the cache location is redirected.
if should_change_caches_dir():
    print(f"Ensuring directories exist...")
    ccache_dir.mkdir(parents=True, exist_ok=True)
    pip_cache_dir.mkdir(parents=True, exist_ok=True)
print("Setting environment variables...")
# NOTE(review): indentation was lost in the copy under review; TRITON_HOME is
# assumed to be set inside this `if` together with the other cache variables --
# confirm against the original script.
if should_change_caches_dir():
    os.environ['CCACHE_DIR'] = str(ccache_dir.resolve())
    os.environ['PIP_CACHE_DIR'] = str(pip_cache_dir.resolve())
    # Change default from "$HOME/.triton" to "$TRITON_HOME/.triton" the ".triton" part is always added
    os.environ['TRITON_HOME'] = str(triton_home_dir.resolve())
# TRITON_PARALLEL_LINK_JOBS
# TRITON_BUILD_WITH_CCACHE: we control this manually with CMAKE_{C,CXX}_COMPILER_LAUNCHER
# TRITRON_BUILD_WITH_O1 doesn't work on triton-v3.3.x
# NOTE(review): the variable name below is spelled "TRITRON", not "TRITON".
# Presumably Triton reads TRITON_BUILD_WITH_O1, in which case this assignment has
# no effect -- confirm before correcting the spelling, given the remark above.
os.environ['TRITRON_BUILD_WITH_O1'] = 'ON'
os.environ['TRITON_BUILD_PROTON'] = 'OFF'
# os.environ['TRITON_BUILD_WITH_CLANG_LLD'] = 'OFF'
# Triton C++ unit tests
os.environ['TRITON_BUILD_UT'] = 'OFF'
# TRITON_BUILD_BINARY is only defined in the 'triton-windows' fork, which requires it
os.environ['TRITON_BUILD_BINARY'] = 'OFF'
# Check if ccache is desired/available before setting launchers
use_ccache = True  # Set to False to disable ccache
if use_ccache:
    print(f"Configuring CMake to use ccache ('{CCACHE_EXECUTABLE}')...")
    os.environ['CMAKE_C_COMPILER_LAUNCHER'] = CCACHE_EXECUTABLE
    os.environ['CMAKE_CXX_COMPILER_LAUNCHER'] = CCACHE_EXECUTABLE
else:
    print("Skipping ccache configuration.")
    # Ensure they are unset if they existed before
    os.environ.pop('CMAKE_C_COMPILER_LAUNCHER', None)
    os.environ.pop('CMAKE_CXX_COMPILER_LAUNCHER', None)
# Echo the resulting cache/launcher settings (None means the variable is unset).
print(f"CCACHE_DIR = {os.environ.get('CCACHE_DIR')}")
print(f"PIP_CACHE_DIR = {os.environ.get('PIP_CACHE_DIR')}")
print(f"CMAKE_C_COMPILER_LAUNCHER = {os.environ.get('CMAKE_C_COMPILER_LAUNCHER')}")
print(f"CMAKE_CXX_COMPILER_LAUNCHER = {os.environ.get('CMAKE_CXX_COMPILER_LAUNCHER')}")
# --- Helper function to run commands --- | |
def run_command(cmd_list, cwd=None):
    """Run ``cmd_list`` as a child process, logging the command line and its duration.

    On success the ``subprocess.CompletedProcess`` is returned. Any failure is
    reported on stderr and ends the script through ``sys.exit``: with the
    child's exit code when it returned non-zero, otherwise with 1.
    """
    command_line = ' '.join(map(str, cmd_list))
    print(f"\n--- Executing: {command_line} ---", flush=True)
    t0 = time.monotonic()
    try:
        # An argument list without a shell gives safer argument handling;
        # check=True turns a non-zero exit status into CalledProcessError
        # (comparable to `set -e`).
        result = subprocess.run(cmd_list, cwd=cwd, check=True, text=True)
    except FileNotFoundError:
        print(f"ERROR: Command not found: {cmd_list[0]}. Is it installed and in PATH?", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as exc:
        print(f"ERROR: Command failed with exit code {exc.returncode}", file=sys.stderr)
        sys.exit(exc.returncode)
    except Exception as exc:
        print(f"ERROR: An unexpected error occurred: {exc}", file=sys.stderr)
        sys.exit(1)
    else:
        t1 = time.monotonic()
        print(f"--- Command finished successfully in {t1 - t0:.2f} seconds ---", flush=True)
        return result
# --- Build Steps ---
# Locate the directory containing Triton's setup.py: first the source root,
# then the "python" subdirectory.
setup_py_dir = source_dir
# triton (future) v3.4+ has "triton/setup.py"
if not (setup_py_dir / "setup.py").exists():
    setup_py_dir = source_dir / "python"
# Make the build fail here if 'setup.py' can't be found. An explicit check is
# used instead of `assert`, which is skipped when Python runs with -O.
if not (setup_py_dir / "setup.py").exists():
    sys.exit(f"ERROR: setup.py not found in {source_dir} or {source_dir / 'python'}")
# --no-build-isolation: https://github.com/triton-lang/triton?tab=readme-ov-file#tips-for-building
cmd = [
    "uv", "pip", "install", "-vv", "--no-build-isolation", str(setup_py_dir.resolve()),
]
cmd.extend(sys.argv[1:])  # Add extra arguments from script call
run_command(cmd)
print("\nBuild script completed successfully.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
lzma_build.py
- this built fine for me on Windows after modifying the paths. I also don't use ccache (yet) so I set that flag to false.