Hyunsung Lee (ita9naiwa)
import torch
import triton
import triton.language as tl


@triton.jit
def mxfp_matmul(
    a_ptr, b_ptr, output_ptr,
    a_scale, b_scale,
    M, N, K,
    stride_scale: tl.constexpr,
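The preview above cuts off mid-signature. As a rough sketch of the same pattern, not the gist's actual kernel (the scaled_matmul_kernel name, block-size parameters, and per-row/per-column scale layout below are all assumptions; real MXFP microscaling applies a shared scale per small block of elements rather than per row), a runnable Triton scaled matmul looks like:

import torch
import triton
import triton.language as tl


@triton.jit
def scaled_matmul_kernel(
    a_ptr, b_ptr, c_ptr,
    a_scale_ptr, b_scale_ptr,   # hypothetical: one fp32 scale per row of A / column of B
    M, N, K,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
):
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    rk = tl.arange(0, BLOCK_K)
    a_ptrs = a_ptr + rm[:, None] * stride_am + rk[None, :] * stride_ak
    b_ptrs = b_ptr + rk[:, None] * stride_bk + rn[None, :] * stride_bn
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        # Mask the K tail (and the M/N edges) so partial tiles read zeros.
        a_mask = (rm[:, None] < M) & (rk[None, :] + k < K)
        b_mask = (rk[:, None] + k < K) & (rn[None, :] < N)
        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
        acc += tl.dot(a, b)                 # fp16 inputs, fp32 accumulate
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    # Apply the dequantization scales to the accumulated tile.
    sa = tl.load(a_scale_ptr + rm, mask=rm < M, other=1.0)
    sb = tl.load(b_scale_ptr + rn, mask=rn < N, other=1.0)
    acc = acc * sa[:, None] * sb[None, :]
    c_ptrs = c_ptr + rm[:, None] * stride_cm + rn[None, :] * stride_cn
    tl.store(c_ptrs, acc, mask=(rm[:, None] < M) & (rn[None, :] < N))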
#!/usr/bin/env python3
import torch
import triton
import triton.language as tl


@triton.jit
def scaled_dot_kernel(
    # Pointers to matrices
    a_ptr, b_ptr, output_ptr,
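Hypothetical host-side glue and smoke test for a kernel shaped like these previews, reusing the scaled_matmul_kernel sketch above (block sizes are arbitrary choices, not the gist's):

def scaled_matmul(a, b, a_scale, b_scale):
    M, K = a.shape
    _, N = b.shape
    c = torch.empty((M, N), device=a.device, dtype=torch.float32)
    grid = (triton.cdiv(M, 64), triton.cdiv(N, 64))
    scaled_matmul_kernel[grid](
        a, b, c, a_scale, b_scale, M, N, K,
        a.stride(0), a.stride(1),
        b.stride(0), b.stride(1),
        c.stride(0), c.stride(1),
        BLOCK_M=64, BLOCK_N=64, BLOCK_K=32,
    )
    return c

# Smoke test against a plain PyTorch reference.
a = torch.randn(256, 128, device="cuda", dtype=torch.float16)
b = torch.randn(128, 192, device="cuda", dtype=torch.float16)
sa = torch.rand(256, device="cuda") + 0.5
sb = torch.rand(192, device="cuda") + 0.5
ref = (a.float() @ b.float()) * sa[:, None] * sb[None, :]
torch.testing.assert_close(scaled_matmul(a, b, sa, sb), ref, rtol=1e-2, atol=1e-2)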
@ita9naiwa
ita9naiwa / simple_mma.cu
Created October 5, 2025 12:14
// simple.cu - Direct PTX Tensor Core GEMM using mma.m16n8k16
// Demonstrates: cp.async, ldmatrix, mma instruction, 8-row interleaved output
// Target: Ampere+ GPUs (sm_80+)
#include <cuda_fp16.h>
#include <cstdio>
#include <cmath>
// ============================================================================
// CPU Reference Implementation
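The preview ends at the CPU-reference banner. As a speculative sketch of the two pieces it points at, not the gist's actual code (the shared-memory staging via cp.async and ldmatrix is omitted, and the fragment layouts are only summarized in comments), the reference GEMM and the bare mma.m16n8k16 wrapper could look like:

// Compile with: nvcc -arch=sm_80 sketch.cu
#include <cuda_fp16.h>
#include <cstdio>

// CPU reference: row-major C[M,N] = A[M,K] * B[K,N], fp16 inputs, fp32 accumulate.
void gemm_ref(const __half* A, const __half* B, float* C, int M, int N, int K) {
    for (int m = 0; m < M; ++m)
        for (int n = 0; n < N; ++n) {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k)
                acc += __half2float(A[m * K + k]) * __half2float(B[k * N + n]);
            C[m * N + n] = acc;
        }
}

// One warp-wide tensor-core tile: D = A*B + C for a 16x16 (A) by 16x8 (B) tile.
// Per thread: a = 4 .b32 regs (8 fp16), b = 2 .b32 regs (4 fp16), c = 4 f32 accumulators.
__device__ inline void mma_m16n8k16(float c[4], const unsigned a[4], const unsigned b[2]) {
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%0,%1,%2,%3};\n"
        : "+f"(c[0]), "+f"(c[1]), "+f"(c[2]), "+f"(c[3])
        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]));
}

// Tiny driver exercising only the CPU reference.
int main() {
    const int M = 2, N = 2, K = 2;
    __half A[M * K], B[K * N];
    float C[M * N];
    for (int i = 0; i < M * K; ++i) A[i] = __float2half(1.0f + i);
    for (int i = 0; i < K * N; ++i) B[i] = __float2half(0.5f * i);
    gemm_ref(A, B, C, M, N, K);
    printf("C = [%g %g; %g %g]\n", C[0], C[1], C[2], C[3]);
    return 0;
}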
@ita9naiwa
ita9naiwa / runner.sh
Created October 20, 2025 21:38
vllm_benchmark
# Llama-3.1-8B throughput run: 1024 in / 1024 out, batch 128, fp16.
in_len=1024; out_len=1024; batch_size=128;
model=meta-llama/Llama-3.1-8B-Instruct

# VLLM_USE_V1=1                - run on the vLLM V1 engine
# VLLM_DISABLE_COMPILE_CACHE=1 - recompile instead of reusing cached artifacts
# TRITON_PRINT_AUTOTUNING=1    - log the winning Triton autotune config per kernel
VLLM_USE_V1=1 VLLM_DISABLE_COMPILE_CACHE=1 TRITON_PRINT_AUTOTUNING=1 \
  python3 vllm_benchmark.py \
    --input-len $in_len --output-len $out_len \
    --model $model --dtype float16 --batch-size $batch_size \
    --num_iters_warmup 5 --num_iters 5
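A hypothetical variation, assuming vllm_benchmark.py accepts the same flags: sweep the batch size to watch how throughput and the autotuned configs change.

for batch_size in 1 8 32 128; do
  VLLM_USE_V1=1 VLLM_DISABLE_COMPILE_CACHE=1 TRITON_PRINT_AUTOTUNING=1 \
    python3 vllm_benchmark.py \
      --input-len $in_len --output-len $out_len \
      --model $model --dtype float16 --batch-size $batch_size \
      --num_iters_warmup 5 --num_iters 5
done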