Nikita Shulga malfet

malfet / test_header_only.py

Created May 20, 2025 00:34

	import torch
	import torch.utils.cpp_extension as _ce
	import tempfile
	import os
	import subprocess

	src = """#include <c10/util/BFloat16.h>
	#include <iostream>
	int main() {
	std::cout << c10::BFloat16(3.14) << std::endl;

malfet / ctypes-nvrtc.py

Last active April 16, 2025 21:47

	import ctypes
	import torch
	import time

	def nvrtc_compile(source: str) -> str:
	from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer
	libnvrtc = CDLL('libnvrtc.so')
	def get_error_string() -> str:
	err_p = c_char_p()
	libnvrtc.nvrtcGetErrorString(result, byref(err_str))

malfet / tensor-base-example.py

Last active March 21, 2025 23:38

	"""
	Example showing how to use the no_header mode with a TensorBase CUDA extension

	This example creates a CUDA extension that directly includes ATen/core/TensorBase.h
	instead of torch/extension.h, resulting in faster compilation with no_header=True
	"""
	from datetime import datetime
	import torch
	import torch.utils.cpp_extension

malfet / listtodictdis.py

Last active March 17, 2025 22:58

malfet / metal-internal-compiler-error-m2.swift

Created March 14, 2025 05:11

	// Fails with Error Domain=AGXMetalG14X Code=3 "Compiler encountered an internal error" on M1/M2 (using macOS 15.3.1)
	// Works on M4 (and maybe M3)

	let shader_source = """
	template <typename T>
	float bessel_j0_forward(T x) {
	constexpr float PP[] = {
	+7.96936729297347051624e-04,
	+8.28352392107440799803e-02,
	+1.23953371646414299388e+00,

malfet / mpsinductor-minmax.py

Created March 12, 2025 16:39

	# How to reuse shared memory
	# Right now MPS inductor produces the following code:
	# #include <c10/metal/random.h>
	# #include <c10/metal/special_math.h>
	# #include <c10/metal/utils.h>
	# #include <c10/metal/reduction_utils.h>
	# kernel void generated_kernel(
	# device float* out_ptr0,
	# device float* out_ptr1,
	# constant float* in_ptr0,

malfet / anon_kernel_is_missing.swift

Last active February 12, 2025 18:16

	let shader_source = """
	struct add_functor {
	template <typename T>
	inline T operator()(const T a, const T b) {
	return static_cast<T>(a + b);
	}
	};

	namespace {
	struct sub_functor {

malfet / use_cuda_arch.cu

Last active November 8, 2024 20:32

malfet / triton-aoti.py

Last active September 21, 2024 21:59

	import torch
	import torch._inductor.config
	torch.set_default_device("cuda")
	import os
	from triton import autotune, cdiv, Config, heuristics, jit # @manual
	import triton.language as tl

	@autotune(
	configs=[
	Config({"BLOCK_M": 32, "BLOCK_N": 32}),

malfet / dyld.py

Created June 5, 2024 21:04

Prints the shared libraries loaded by PyTorch on macOS

	# Snapshot the image names reported by dyld before and after `import torch`,
	# so the two sets can be compared to see what importing torch pulled in.
	from ctypes import cdll, c_char_p, c_uint32

	libdyld = cdll.LoadLibrary("libSystem.dylib")
	# ctypes assumes an `int` return type unless told otherwise, so declare the
	# real signatures: the count is a uint32 and the name is a C string.
	libdyld._dyld_image_count.restype = c_uint32
	libdyld._dyld_get_image_name.restype = c_char_p
	libdyld._dyld_get_image_name.argtypes = [c_uint32]

	# Image names present before torch is imported (c_char_p yields bytes, hence the decode).
	before_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())}
	# This import is deliberately placed between the two snapshots; do not hoist it.
	import torch
	# Image names present after the import; the same query is repeated against the updated image list.
	after_torch = {libdyld._dyld_get_image_name(i).decode("ascii") for i in range(libdyld._dyld_image_count())}