Hyunsung Lee (ita9naiwa)
import torch
import triton
import triton.language as tl


@triton.jit
def mxfp_matmul(
    a_ptr, b_ptr, output_ptr,
    a_scale, b_scale,
    M, N, K,
    stride_scale: tl.constexpr,
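The preview above cuts off mid-signature. As a rough sketch of the same pattern, not the gist's actual kernel (the scaled_matmul_kernel name, block-size parameters, and per-row/per-column scale layout below are all assumptions; real MXFP microscaling applies a shared scale per small block of elements rather than per row), a runnable Triton scaled matmul looks like:

import torch
import triton
import triton.language as tl


@triton.jit
def scaled_matmul_kernel(
    a_ptr, b_ptr, c_ptr,
    a_scale_ptr, b_scale_ptr,   # hypothetical: one fp32 scale per row of A / column of B
    M, N, K,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
):
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    rk = tl.arange(0, BLOCK_K)
    a_ptrs = a_ptr + rm[:, None] * stride_am + rk[None, :] * stride_ak
    b_ptrs = b_ptr + rk[:, None] * stride_bk + rn[None, :] * stride_bn
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        # Mask the K tail (and the M/N edges) so partial tiles read zeros.
        a_mask = (rm[:, None] < M) & (rk[None, :] + k < K)
        b_mask = (rk[:, None] + k < K) & (rn[None, :] < N)
        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
        acc += tl.dot(a, b)                 # fp16 inputs, fp32 accumulate
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    # Apply the dequantization scales to the accumulated tile.
    sa = tl.load(a_scale_ptr + rm, mask=rm < M, other=1.0)
    sb = tl.load(b_scale_ptr + rn, mask=rn < N, other=1.0)
    acc = acc * sa[:, None] * sb[None, :]
    c_ptrs = c_ptr + rm[:, None] * stride_cm + rn[None, :] * stride_cn
    tl.store(c_ptrs, acc, mask=(rm[:, None] < M) & (rn[None, :] < N))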
#!/usr/bin/env python3
import torch
import triton
import triton.language as tl


@triton.jit
def scaled_dot_kernel(
    # Pointers to matrices
    a_ptr, b_ptr, output_ptr,
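Hypothetical host-side glue and smoke test for a kernel shaped like these previews, reusing the scaled_matmul_kernel sketch above (block sizes are arbitrary choices, not the gist's):

def scaled_matmul(a, b, a_scale, b_scale):
    M, K = a.shape
    _, N = b.shape
    c = torch.empty((M, N), device=a.device, dtype=torch.float32)
    grid = (triton.cdiv(M, 64), triton.cdiv(N, 64))
    scaled_matmul_kernel[grid](
        a, b, c, a_scale, b_scale, M, N, K,
        a.stride(0), a.stride(1),
        b.stride(0), b.stride(1),
        c.stride(0), c.stride(1),
        BLOCK_M=64, BLOCK_N=64, BLOCK_K=32,
    )
    return c

# Smoke test against a plain PyTorch reference.
a = torch.randn(256, 128, device="cuda", dtype=torch.float16)
b = torch.randn(128, 192, device="cuda", dtype=torch.float16)
sa = torch.rand(256, device="cuda") + 0.5
sb = torch.rand(192, device="cuda") + 0.5
ref = (a.float() @ b.float()) * sa[:, None] * sb[None, :]
torch.testing.assert_close(scaled_matmul(a, b, sa, sb), ref, rtol=1e-2, atol=1e-2)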
@ita9naiwa
ita9naiwa / simple_mma.cu
Created October 5, 2025 12:14
// simple.cu - Direct PTX Tensor Core GEMM using mma.m16n8k16
// Demonstrates: cp.async, ldmatrix, mma instruction, 8-row interleaved output
// Target: Ampere+ GPUs (sm_80+)
#include <cuda_fp16.h>
#include <cstdio>
#include <cmath>
// ============================================================================
// CPU Reference Implementation
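The preview ends at the CPU-reference banner. As a speculative sketch of the two pieces it points at, not the gist's actual code (the shared-memory staging via cp.async and ldmatrix is omitted, and the fragment layouts are only summarized in comments), the reference GEMM and the bare mma.m16n8k16 wrapper could look like:

// Compile with: nvcc -arch=sm_80 sketch.cu
#include <cuda_fp16.h>
#include <cstdio>

// CPU reference: row-major C[M,N] = A[M,K] * B[K,N], fp16 inputs, fp32 accumulate.
void gemm_ref(const __half* A, const __half* B, float* C, int M, int N, int K) {
    for (int m = 0; m < M; ++m)
        for (int n = 0; n < N; ++n) {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k)
                acc += __half2float(A[m * K + k]) * __half2float(B[k * N + n]);
            C[m * N + n] = acc;
        }
}

// One warp-wide tensor-core tile: D = A*B + C for a 16x16 (A) by 16x8 (B) tile.
// Per thread: a = 4 .b32 regs (8 fp16), b = 2 .b32 regs (4 fp16), c = 4 f32 accumulators.
__device__ inline void mma_m16n8k16(float c[4], const unsigned a[4], const unsigned b[2]) {
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%0,%1,%2,%3};\n"
        : "+f"(c[0]), "+f"(c[1]), "+f"(c[2]), "+f"(c[3])
        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]));
}

// Tiny driver exercising only the CPU reference.
int main() {
    const int M = 2, N = 2, K = 2;
    __half A[M * K], B[K * N];
    float C[M * N];
    for (int i = 0; i < M * K; ++i) A[i] = __float2half(1.0f + i);
    for (int i = 0; i < K * N; ++i) B[i] = __float2half(0.5f * i);
    gemm_ref(A, B, C, M, N, K);
    printf("C = [%g %g; %g %g]\n", C[0], C[1], C[2], C[3]);
    return 0;
}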
@ita9naiwa
ita9naiwa / runner.sh
Created October 20, 2025 21:38
vllm_benchmark
# Llama-3.1-8B throughput run: 1024 in / 1024 out, batch 128, fp16.
in_len=1024; out_len=1024; batch_size=128;
model=meta-llama/Llama-3.1-8B-Instruct

# VLLM_USE_V1=1                - run on the vLLM V1 engine
# VLLM_DISABLE_COMPILE_CACHE=1 - recompile instead of reusing cached artifacts
# TRITON_PRINT_AUTOTUNING=1    - log the winning Triton autotune config per kernel
VLLM_USE_V1=1 VLLM_DISABLE_COMPILE_CACHE=1 TRITON_PRINT_AUTOTUNING=1 \
  python3 vllm_benchmark.py \
    --input-len $in_len --output-len $out_len \
    --model $model --dtype float16 --batch-size $batch_size \
    --num_iters_warmup 5 --num_iters 5
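A hypothetical variation, assuming vllm_benchmark.py accepts the same flags: sweep the batch size to watch how throughput and the autotuned configs change.

for batch_size in 1 8 32 128; do
  VLLM_USE_V1=1 VLLM_DISABLE_COMPILE_CACHE=1 TRITON_PRINT_AUTOTUNING=1 \
    python3 vllm_benchmark.py \
      --input-len $in_len --output-len $out_len \
      --model $model --dtype float16 --batch-size $batch_size \
      --num_iters_warmup 5 --num_iters 5
done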