Skip to content

Instantly share code, notes, and snippets.

"""
Example showing how to use the no_header mode with a TensorBase CUDA extension
This example creates a CUDA extension that directly includes ATen/core/TensorBase.h
instead of torch/extension.h, resulting in faster compilation with no_header=True
"""
from datetime import datetime
import torch
import torch.utils.cpp_extension
import dis
import timeit
def list_to_dict_1(l):
    """Build a dict mapping each element of ``l`` to its index in ``l``.

    Args:
        l: An iterable of hashable values.

    Returns:
        A dict ``{value: index}``. If a value occurs more than once, the
        index of its last occurrence is stored.
    """
    # NOTE(review): presumably the explicit loop is intentional (the file
    # imports dis/timeit, which suggests implementations are being compared)
    # -- confirm before rewriting this as a comprehension.
    rc = {}
    for idx, v in enumerate(l):
        rc[v] = idx
    return rc
// Fails with Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" on M1/M2 (using macOS 15.3.1)
// Works on M4 (and maybe M3)
let shader_source = """
template <typename T>
float bessel_j0_forward(T x) {
constexpr float PP[] = {
+7.96936729297347051624e-04,
+8.28352392107440799803e-02,
+1.23953371646414299388e+00,
# How to reuse shared memory
# Right now the MPS inductor produces the following code
# #include <c10/metal/random.h>
# #include <c10/metal/special_math.h>
# #include <c10/metal/utils.h>
# #include <c10/metal/reduction_utils.h>
# kernel void generated_kernel(
# device float* out_ptr0,
# device float* out_ptr1,
# constant float* in_ptr0,
let shader_source = """
struct add_functor {
template <typename T>
inline T operator()(const T a, const T b) {
return static_cast<T>(a + b);
}
};
namespace {
struct sub_functor {
#include <stdio.h>
#include <cuda_runtime.h>
// Returns a value that depends on which compilation pass produced the code:
// 3 when __CUDA_ARCH__ is defined (device pass), 2 otherwise (host pass).
// Note that, despite the name, the device path does not return 2.
__host__ __device__ int return_two() {
#ifdef __CUDA_ARCH__
  const int value = 3;  // device compilation pass
#else
  const int value = 2;  // host compilation pass
#endif
  return value;
}
import torch
import torch._inductor.config
torch.set_default_device("cuda")
import os
from triton import autotune, cdiv, Config, heuristics, jit # @manual
import triton.language as tl
@autotune(
configs=[
Config({"BLOCK_M": 32, "BLOCK_N": 32}),
@malfet
malfet / dyld.py
Created June 5, 2024 21:04
Print shared libraries loaded by PyTorch on MacOS
from ctypes import cdll, c_char_p, c_uint32
# Load libSystem.dylib to get access to the dyld image-enumeration functions
# used below.
libdyld = cdll.LoadLibrary("libSystem.dylib")
# Declare the C signatures explicitly; without this ctypes would treat the
# return values as C int instead of uint32 / char*.
libdyld._dyld_image_count.restype = c_uint32
libdyld._dyld_get_image_name.restype = c_char_p
libdyld._dyld_get_image_name.argtypes = [c_uint32]
# Snapshot the names of every image loaded in this process before torch is
# imported. NOTE: decode("ascii") raises UnicodeDecodeError for a name that
# contains non-ASCII bytes.
before_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())}
# The import sits between the two snapshots on purpose: statement order matters
# here, so it must not be hoisted to the top of the file.
import torch
# Second snapshot, taken after the import, using the same enumeration.
after_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())}
import Metal
import MetalPerformanceShadersGraph
func calculateExpMetal(device: MTLDevice, ibuf: MTLBuffer, obuf: MTLBuffer, nelem: Int, fastMathEnabled: Bool = false) {
let shader_source = """
#include <metal_stdlib>
using namespace metal;
kernel void do_exp(constant float *input [[buffer(0)]],
@malfet
malfet / subnormals_metal.swift
Created May 7, 2024 00:47
Check if `nextafter(0.0, 1.0)` is greater than zero on Metal device
import Metal
let shader_source = """
#include <metal_stdlib>
using namespace metal;
kernel void nextafter_pred(device float *data [[buffer(0)]],
device bool *pred [[buffer(1)]],
uint thread_index [[thread_position_in_grid]]) {
data[thread_index] = nextafter(float(thread_index) - 8.0, 1e4);