ita9naiwa’s gists

ita9naiwa / runner.sh

Created October 20, 2025 21:38

vllm_benchmark

	in_len=1024; out_len=1024; batch_size=128;
	model=meta-llama/Llama-3.1-8B-Instruct

	VLLM_USE_V1=1 VLLM_DISABLE_COMPILE_CACHE=1 TRITON_PRINT_AUTOTUNING=1 python3 vllm_benchmark.py --input-len $in_len --output-len $out_len --model $model --dtype float16 --batch-size $batch_size --num_iters_warmup 5 --num_iters 5

ita9naiwa / .cu

Created October 5, 2025 12:14

simple_mma.cu

	// simple.cu - Direct PTX Tensor Core GEMM using mma.m16n8k16
	// Demonstrates: cp.async, ldmatrix, mma instruction, 8-row interleaved output
	// Target: Ampere+ GPUs (sm_80+)

	#include <cuda_fp16.h>
	#include <cstdio>
	#include <cmath>

	// ============================================================================
	// CPU Reference Implementation

ita9naiwa / gist:644f9a025d08361b4d1c6412a67e9346

Created August 27, 2025 08:35

test_scaled.dot.py

ita9naiwa / gist:ac4cc770379504b573c460d3861675ab

Created August 25, 2025 20:15

sm_120.py

ita9naiwa / bench.py

Last active August 14, 2025 21:34

scaled dot bench

	import os
	os.environ["TORCH_COMPILE_DISABLE"] = "1"
	os.environ["TRITON_ALWAYS_COMPILE"] = "1"

	import argparse, torch, triton, triton.language as tl
	from triton.tools.mxfp import MXFP4Tensor
	import time

	def scaleDot_ref(A, B, sA_grouped, sB_grouped, GROUP_K: int):
	sA = 2 ** (sA_grouped.float() - 127.0)

ita9naiwa / benchmark.py

Created July 20, 2025 01:08

benchmark.py

	import os
	os.environ["TRITON_CACHE_DIR"] = "./cache"
	os.environ["TRITON_DUMP_DIR"] = "./cache"

	import torch, triton, triton.language as tl, os, statistics as stats

	# ---------------------------
	# Fused 2-dot kernel
	# Dot A: (M1,K1)x(K1,N1) -> C1
	# Dot B: (M2,K2)x(K2,N2) -> C2 (smaller)

ita9naiwa / after_pad_fold.mlir

Created February 16, 2025 07:51

folding `tensor.pad`

	// -----// IR Dump After Canonicalizer (canonicalize) //----- //
	func.func @test_fusion(%arg0: tensor<32x16x256x256xf32>, %arg1: tensor<32xf32>, %arg2: tensor<32x16xf32>, %arg3: tensor<32x16xf32>) -> tensor<512x258x258xf32> {
	%cst = arith.constant 1.000000e+00 : f32
	%cst_0 = arith.constant 0.000000e+00 : f32
	%0 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1, %arg2, %arg3 : tensor<32x16x256x256xf32>, tensor<32xf32>, tensor<32x16xf32>, tensor<32x16xf32>) outs(%arg0 : tensor<32x16x256x256xf32>) {
	^bb0(%in: f32, %in_1: f32, %in_2: f32, %in_3: f32, %out: f32):
	%1 = arith.addf %in_1, %cst : f32
	%2 = math.rsqrt %1 : f32
	%3 = arith.mulf %in, %2 : f32
	%4 = arith.mulf %3, %in_2 : f32

ita9naiwa / .vimrc

Last active February 6, 2025 01:49 — forked from simonista/.vimrc

A basic .vimrc file that will serve as a good template on which to build.

	" Don't try to be vi compatible
	set nocompatible

	" Helps force plugins to load correctly when it is turned back on below
	filetype off

	" TODO: Load plugins here (pathogen or vundle)

	" Turn on syntax highlighting
	syntax on

ita9naiwa / cifar.py

Created March 9, 2022 10:01

bonus-ce and MAE

	'''
	Train CIFAR10 with PyTorch.
	based on https://github.com/kuangliu/pytorch-cifar
	'''
	import torch
	import torch.nn as nn
	import torch.optim as optim
	import torch.nn.functional as F
	import torch.backends.cudnn as cudnn

ita9naiwa / gist:1999469f0ccbc9e4fef790fa51504b98

Last active December 12, 2021 15:16

alpha-beta-NDCG

Sorry, something went wrong. Reload?

Sorry, we cannot display this file.

Sorry, this file is invalid so it cannot be displayed.

Hyunsung Lee ita9naiwa