This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import ctypes | |
| import torch | |
| import time | |
| def nvrtc_compile(source: str) -> str: | |
| from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer | |
| libnvrtc = CDLL('libnvrtc.so') | |
| def get_error_string() -> str: | |
| err_p = c_char_p() | |
| libnvrtc.nvrtcGetErrorString(result, byref(err_str)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Example showing how to use the no_header mode with a TensorBase CUDA extension. | |
| This example creates a CUDA extension that directly includes ATen/core/TensorBase.h | |
| instead of torch/extension.h, resulting in faster compilation with no_header=True | |
| """ | |
| from datetime import datetime | |
| import torch | |
| import torch.utils.cpp_extension |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import dis | |
| import timeit | |
def list_to_dict_1(l):
    """Map each element of ``l`` to its position in ``l``.

    The mapping is built with an explicit ``for`` loop over ``enumerate``.
    When an element occurs more than once, the index of its last
    occurrence is the one that is kept.
    """
    index_of = {}
    for position, item in enumerate(l):
        index_of[item] = position
    return index_of
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Fails with Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" on M1/M2 (using macOS 15.3.1) | |
| // Works on M4 (and maybe M3) | |
| let shader_source = """ | |
| template <typename T> | |
| float bessel_j0_forward(T x) { | |
| constexpr float PP[] = { | |
| +7.96936729297347051624e-04, | |
| +8.28352392107440799803e-02, | |
| +1.23953371646414299388e+00, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # How to reuse shared memory | |
| # Right now the MPS inductor produces the following code | |
| # #include <c10/metal/random.h> | |
| # #include <c10/metal/special_math.h> | |
| # #include <c10/metal/utils.h> | |
| # #include <c10/metal/reduction_utils.h> | |
| # kernel void generated_kernel( | |
| # device float* out_ptr0, | |
| # device float* out_ptr1, | |
| # constant float* in_ptr0, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| let shader_source = """ | |
| struct add_functor { | |
| template <typename T> | |
| inline T operator()(const T a, const T b) { | |
| return static_cast<T>(a + b); | |
| } | |
| }; | |
| namespace { | |
| struct sub_functor { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <cuda_runtime.h> | |
// Callable from both host and device code. Despite its name, the value
// depends on how the translation unit is being compiled: 3 when
// __CUDA_ARCH__ is defined (device compilation), 2 otherwise (host).
__host__ __device__ int return_two() {
#ifdef __CUDA_ARCH__
  const int value = 3;
#else
  const int value = 2;
#endif
  return value;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import torch._inductor.config | |
| torch.set_default_device("cuda") | |
| import os | |
| from triton import autotune, cdiv, Config, heuristics, jit # @manual | |
| import triton.language as tl | |
| @autotune( | |
| configs=[ | |
| Config({"BLOCK_M": 32, "BLOCK_N": 32}), |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from ctypes import cdll, c_char_p, c_uint32 | |
| libdyld = cdll.LoadLibrary("libSystem.dylib") | |
| libdyld._dyld_image_count.restype = c_uint32 | |
| libdyld._dyld_get_image_name.restype = c_char_p | |
| libdyld._dyld_get_image_name.argtypes = [c_uint32] | |
| before_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())} | |
| import torch | |
| after_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import Metal | |
| import MetalPerformanceShadersGraph | |
| func calculateExpMetal(device: MTLDevice, ibuf: MTLBuffer, obuf: MTLBuffer, nelem: Int, fastMathEnabled: Bool = false) { | |
| let shader_source = """ | |
| #include <metal_stdlib> | |
| using namespace metal; | |
| kernel void do_exp(constant float *input [[buffer(0)]], |