This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Example showing how to use the no_header mode with a TensorBase CUDA extension | |
This example creates a CUDA extension that directly includes ATen/core/TensorBase.h | |
instead of torch/extension.h, resulting in faster compilation with no_header=True | |
""" | |
from datetime import datetime | |
import torch | |
import torch.utils.cpp_extension |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import dis | |
import timeit | |
def list_to_dict_1(l):
    """Build a dict mapping every element of ``l`` to its position in ``l``.

    Elements are used as dictionary keys, so when an element occurs more
    than once the index of its last occurrence is the one that is kept.
    """
    index_of = {}
    for position, item in enumerate(l):
        # Plain assignment: a later duplicate overwrites the earlier index.
        index_of[item] = position
    return index_of
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Fails with Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" on M1/M2 (using macOS 15.3.1) | |
// Works on M4 (and maybe M3) | |
let shader_source = """ | |
template <typename T> | |
float bessel_j0_forward(T x) { | |
constexpr float PP[] = { | |
+7.96936729297347051624e-04, | |
+8.28352392107440799803e-02, | |
+1.23953371646414299388e+00, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# How to reuse shared memory | |
# Right now the MPS inductor produces the following code | |
# #include <c10/metal/random.h> | |
# #include <c10/metal/special_math.h> | |
# #include <c10/metal/utils.h> | |
# #include <c10/metal/reduction_utils.h> | |
# kernel void generated_kernel( | |
# device float* out_ptr0, | |
# device float* out_ptr1, | |
# constant float* in_ptr0, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
let shader_source = """ | |
struct add_functor { | |
template <typename T> | |
inline T operator()(const T a, const T b) { | |
return static_cast<T>(a + b); | |
} | |
}; | |
namespace { | |
struct sub_functor { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <cuda_runtime.h> | |
// Returns 3 when the body is compiled with __CUDA_ARCH__ defined and 2
// otherwise, so the result reveals which variant of this
// __host__ __device__ function was actually executed.
__host__ __device__ int return_two() {
#ifdef __CUDA_ARCH__
  // __CUDA_ARCH__ is defined for this compilation of the body.
  const int value = 3;
#else
  // __CUDA_ARCH__ is not defined for this compilation of the body.
  const int value = 2;
#endif
  return value;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch._inductor.config | |
torch.set_default_device("cuda") | |
import os | |
from triton import autotune, cdiv, Config, heuristics, jit # @manual | |
import triton.language as tl | |
@autotune( | |
configs=[ | |
Config({"BLOCK_M": 32, "BLOCK_N": 32}), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from ctypes import cdll, c_char_p, c_uint32 | |
libdyld = cdll.LoadLibrary("libSystem.dylib") | |
libdyld._dyld_image_count.restype = c_uint32 | |
libdyld._dyld_get_image_name.restype = c_char_p | |
libdyld._dyld_get_image_name.argtypes = [c_uint32] | |
before_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())} | |
import torch | |
after_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Metal | |
import MetalPerformanceShadersGraph | |
func calculateExpMetal(device: MTLDevice, ibuf: MTLBuffer, obuf: MTLBuffer, nelem: Int, fastMathEnabled: Bool = false) { | |
let shader_source = """ | |
#include <metal_stdlib> | |
using namespace metal; | |
kernel void do_exp(constant float *input [[buffer(0)]], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Metal | |
let shader_source = """ | |
#include <metal_stdlib> | |
using namespace metal; | |
kernel void nextafter_pred(device float *data [[buffer(0)]], | |
device bool *pred [[buffer(1)]], | |
uint thread_index [[thread_position_in_grid]]) { | |
data[thread_index] = nextafter(float(thread_index) - 8.0, 1e4); |
NewerOlder