| dtype | SOTA | 2.2.2+eager | 2.3.0+eager | 2.3.0+compile | trunk + compile |
|---|---|---|---|---|---|
| bfloat16 (M1) | 111 tokens/sec | 110 tokens/sec | 80 tokens/sec | ||
| float32 (M1) | 687 tokens/sec | 165 tokens/sec | 176 tokens/sec | ||
| float16 (M1) | 1106 tokens/sec | 50 tokens/sec | 187 tokens/sec | ||
| float16 (LinX86) | 40 tokens/sec | 43 tokens/sec | 173 tokens/sec | ||
| float32 (LinX86) | 38 tokens/sec | 40 tokens/sec | 179 tokens/sec | ||
| bfloat16 (LinX86) | 73 tokens/sec | 78 tokens/sec | 180 tokens/sec | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import dis | |
| import timeit | |
| def list_to_dict_1(l): | |
| rc = {} | |
| for idx, v in enumerate(l): | |
| rc[v] = idx | |
| return rc | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Fails with Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" on M1/M2 (using macOS 15.3.1) | |
| // Works on M4 (and maybe M3) | |
| let shader_source = """ | |
| template <typename T> | |
| float bessel_j0_forward(T x) { | |
| constexpr float PP[] = { | |
| +7.96936729297347051624e-04, | |
| +8.28352392107440799803e-02, | |
| +1.23953371646414299388e+00, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # How to reuse shared memory | |
| # Right now the MPS inductor produces the following code | |
| # #include <c10/metal/random.h> | |
| # #include <c10/metal/special_math.h> | |
| # #include <c10/metal/utils.h> | |
| # #include <c10/metal/reduction_utils.h> | |
| # kernel void generated_kernel( | |
| # device float* out_ptr0, | |
| # device float* out_ptr1, | |
| # constant float* in_ptr0, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| let shader_source = """ | |
| struct add_functor { | |
| template <typename T> | |
| inline T operator()(const T a, const T b) { | |
| return static_cast<T>(a + b); | |
| } | |
| }; | |
| namespace { | |
| struct sub_functor { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <cuda_runtime.h> | |
| __host__ __device__ int return_two() { | |
| #if defined(__CUDA_ARCH__) | |
| return 3; | |
| #else | |
| return 2; | |
| #endif | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import torch._inductor.config | |
| torch.set_default_device("cuda") | |
| import os | |
| from triton import autotune, cdiv, Config, heuristics, jit # @manual | |
| import triton.language as tl | |
| @autotune( | |
| configs=[ | |
| Config({"BLOCK_M": 32, "BLOCK_N": 32}), |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from ctypes import cdll, c_char_p, c_uint32 | |
| libdyld = cdll.LoadLibrary("libSystem.dylib") | |
| libdyld._dyld_image_count.restype = c_uint32 | |
| libdyld._dyld_get_image_name.restype = c_char_p | |
| libdyld._dyld_get_image_name.argtypes = [c_uint32] | |
| before_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())} | |
| import torch | |
| after_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import Metal | |
| import MetalPerformanceShadersGraph | |
| func calculateExpMetal(device: MTLDevice, ibuf: MTLBuffer, obuf: MTLBuffer, nelem: Int, fastMathEnabled: Bool = false) { | |
| let shader_source = """ | |
| #include <metal_stdlib> | |
| using namespace metal; | |
| kernel void do_exp(constant float *input [[buffer(0)]], |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import Metal | |
| let shader_source = """ | |
| #include <metal_stdlib> | |
| using namespace metal; | |
| kernel void nextafter_pred(device float *data [[buffer(0)]], | |
| device bool *pred [[buffer(1)]], | |
| uint thread_index [[thread_position_in_grid]]) { | |
| data[thread_index] = nextafter(float(thread_index) - 8.0, 1e4); |