#include <stdio.h>
#include <chrono>

// Empty kernel, handy as a baseline for launch-overhead measurements.
__global__ void noop() { }

int main(int argc, const char *argv[]) {
  cudaDeviceProp prop;
  auto rc = cudaGetDeviceProperties(&prop, 0);
  if (rc != cudaSuccess) {
    return 1;
  }
  printf("Running on %s sm%d.%d multiProcessorCount = %d maxBlocksPerMultiProcessor = %d maxThreadsPerBlock = %d\n",
         prop.name, prop.major, prop.minor, prop.multiProcessorCount, prop.maxBlocksPerMultiProcessor, prop.maxThreadsPerBlock);
  return 0;
}
a_cpp = """#include <iostream>
namespace foo::bar {
inline namespace baz {
int inc(int x) {
std::cout << "do inc from lib_a" << std::endl;
return x + 1;
}
} // inline namespace baz
void do_a(int x) {
import torch
import sys
import timeit

def add_repeat(x, y, repeat=10):
    # Clone x, then apply `repeat` in-place additions of y.
    rc = x.clone()
    for i in range(repeat):
        rc += y
    return rc
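
The snippet imports `timeit` but is cut off before using it; a minimal sketch of how `add_repeat` might be timed (shapes and iteration counts below are illustrative assumptions, not part of the original):

```
x = torch.rand(1024, 1024)
y = torch.rand(1024, 1024)
# Average wall-clock time per call, using the function defined above.
per_call = timeit.timeit(lambda: add_repeat(x, y), number=100) / 100
print(f"add_repeat: {per_call * 1e3:.3f} ms per call")
```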
#include <stdio.h>
__global__ void print() {
  printf("Hello World of CUDA threadIdx.x=%d\n", threadIdx.x);
}
__global__ void noop() { }
int main(int argc, const char *argv[]) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  // Truncated in the original; a plausible completion launches the kernel:
  print<<<1, 4>>>();
  cudaDeviceSynchronize();
  return 0;
}
import torch
import os

os.environ["MTL_CAPTURE_ENABLED"] = "1"

# A row of more than 2**31 int8 elements, to exercise 64-bit indexing on MPS.
a = torch.ones(2, (1 << 31) + 5, dtype=torch.int8, device='mps')
index_0 = torch.tensor([0, -1, 0, 1], device=a.device)
index_1 = torch.tensor([-2, -1, 0, 1], device=a.device)
values = torch.tensor([12, 13, 10, 11], dtype=a.dtype, device=a.device)
with torch.mps.profiler.metal_capture("index_put"):
    a.index_put_((index_0, index_1), values, accumulate=True)
b = a[1, -2].cpu()  # copy one element back to the host
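
The snippet stops here; one plausible follow-up (an assumption, not part of the original) is to cross-check the result against the same operation on CPU:

```
# Hypothetical verification step; note the CPU copy needs ~4.3 GB of RAM.
a_cpu = torch.ones(2, (1 << 31) + 5, dtype=torch.int8)
a_cpu.index_put_((index_0.cpu(), index_1.cpu()), values.cpu(), accumulate=True)
assert b.item() == a_cpu[1, -2].item(), f"MPS={b.item()} CPU={a_cpu[1, -2].item()}"
```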
#!/usr/bin/env python3
import sys
import subprocess
import urllib.request
import json

def get_latest_version(package_name: str) -> str:
    """Get latest version from PyPI"""
    api_url = f"https://pypi.org/pypi/{package_name}/json"
    # Truncated in the original; the standard PyPI JSON API lookup would be:
    with urllib.request.urlopen(api_url) as resp:
        return json.load(resp)["info"]["version"]
With cudnn-9.10.2.21
```
$ CUDNN_LOGINFO_DBG=3 RUN_SLOW=1 python3 -m pytest -v tests/models/vit/test_modeling_vit.py::ViTModelTest::test_batching_equivalence
========================================================================================== test session starts ===========================================================================================
platform linux -- Python 3.10.12, pytest-8.4.1, pluggy-1.6.0 -- /home/ubuntu/py3.10-nightly/bin/python3
cachedir: .pytest_cache
rootdir: /home/ubuntu/transformers
configfile: pyproject.toml
plugins: xdist-3.8.0, asyncio-1.1.0, rerunfailures-15.1, order-1.3.0, timeout-2.4.0, rich-0.2.0
asyncio: mode=strict, asyncio_default_fixture_loop_scope=function, asyncio_default_test_loop_scope=function
```
import torch
import torch.utils.cpp_extension as _ce
import tempfile
import os
import subprocess

src = """#include <c10/util/BFloat16.h>
#include <iostream>
int main() {
  std::cout << c10::BFloat16(3.14) << std::endl;
  return 0;
}
"""
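
The snippet is cut off before the build step; given the `tempfile`/`subprocess` imports, a plausible continuation (compiler choice and flags are assumptions) writes `src` to disk, compiles it against PyTorch's bundled headers, and runs it:

```
with tempfile.TemporaryDirectory() as tmpdir:
    cpp = os.path.join(tmpdir, "bf16.cpp")
    exe = os.path.join(tmpdir, "bf16")
    with open(cpp, "w") as f:
        f.write(src)
    # include_paths() lists the ATen/c10 header dirs; BFloat16.h is header-only.
    flags = [f"-I{p}" for p in _ce.include_paths()]
    subprocess.check_call(["g++", "-std=c++17", cpp, "-o", exe] + flags)
    print(subprocess.check_output([exe], text=True).strip())
```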
import ctypes
import torch
import time

def nvrtc_compile(source: str) -> str:
    from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer
    libnvrtc = CDLL('libnvrtc.so')

    def get_error_string(result) -> str:
        # nvrtcGetErrorString takes a status code and returns a const char*
        libnvrtc.nvrtcGetErrorString.restype = c_char_p
        return libnvrtc.nvrtcGetErrorString(result).decode('ascii')
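
    # The original gist stops above. Below is a hedged sketch of the standard
    # NVRTC flow that would complete this helper; the entry points are real,
    # but their use here is an illustrative reconstruction.
    prog = c_void_p()
    result = libnvrtc.nvrtcCreateProgram(byref(prog), source.encode(), b"kernel.cu", 0, None, None)
    if result != 0:
        raise RuntimeError(f"nvrtcCreateProgram failed: {get_error_string(result)}")
    result = libnvrtc.nvrtcCompileProgram(prog, 0, None)
    if result != 0:
        raise RuntimeError(f"nvrtcCompileProgram failed: {get_error_string(result)}")
    ptx_size = c_size_t()
    libnvrtc.nvrtcGetPTXSize(prog, byref(ptx_size))
    ptx = create_string_buffer(ptx_size.value)
    libnvrtc.nvrtcGetPTX(prog, ptx)
    return ptx.value.decode('ascii')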
"""
Example showing how to use the no_header mode with a TensorBase CUDA extension.
This example creates a CUDA extension that directly includes ATen/core/TensorBase.h
instead of torch/extension.h, resulting in faster compilation with no_header=True.
"""
from datetime import datetime
import torch
import torch.utils.cpp_extension
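
The file is truncated after the imports. A minimal sketch of what the body might look like, assuming `no_header=True` is a `load_inline` keyword that skips the implicit `torch/extension.h` include as the docstring describes (a C++ source is used for brevity, and the module/function names are hypothetical):

```
# Hypothetical source: pull in only the lightweight TensorBase header.
cpp_src = """
#include <ATen/core/TensorBase.h>
int64_t answer() { return 42; }
"""

start = datetime.now()
ext = torch.utils.cpp_extension.load_inline(
    name="tensorbase_ext",
    cpp_sources=cpp_src,
    functions=["answer"],
    no_header=True,  # assumed flag name, taken from the docstring above
)
print(f"compile time: {datetime.now() - start}")
print(ext.answer())  # expected: 42
```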