
hal.executable public @prefill_bs1$async_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts =
hal.executable public @_initializer_0_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2
hal.executable public @__builtin_fill_i64 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [21474836
# wget https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b/prefill_args_bs4_128_stride_32/cs_f16.npy
# wget https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b/prefill_args_bs4_128_stride_32/seq_block_ids.npy
# wget https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b/prefill_args_bs4_128_stride_32/seq_lens.npy
# wget https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b/prefill_args_bs4_128_stride_32/tokens.npy
import numpy as np
import torch
prefills = ['cs_f16','seq_block_ids','seq_lens','tokens']
for prefill in prefills:
    # NOTE: the loop body is missing from the original snippet; this is an
    # assumed minimal completion that loads each downloaded array and wraps
    # it as a torch tensor.
    arr = np.load(f'{prefill}.npy')
    tensor = torch.from_numpy(arr)
    print(prefill, tensor.dtype, tensor.shape)

# Download the f32 npy files, then use this script to cast the f32 arrays
# down to 16-bit:
# wget https://gist.github.com/aviator19941/380acabc77aeb4749fac14262e17db69
# wget https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b/prefill_args_bs4_128_stride_32/cs_f16.npy
# wget https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b/prefill_args_bs4_128_stride_32/seq_block_ids.npy
# wget https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b/prefill_args_bs4_128_stride_32/seq_lens.npy
# wget https://sharkpublic.blob.core.windows.net/sharkpublic/halo-models/llm-dev/llama3_8b/prefill_args_bs4_128_stride_32/tokens.npy
# pip install numpy==1.26
# pip install bfloat16
import numpy as np
from bfloat16 import bfloat16
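The cast loop itself is truncated here; a minimal sketch of the likely body, continuing from the imports above and assuming only float32 payloads get downcast (the integer arrays pass through unchanged):

prefills = ['cs_f16', 'seq_block_ids', 'seq_lens', 'tokens']
for prefill in prefills:
    arr = np.load(f'{prefill}.npy')
    if arr.dtype == np.float32:
        # Downcast f32 data to bf16 using the numpy dtype provided by the
        # bfloat16 package.
        arr = arr.astype(bfloat16)
    np.save(f'{prefill}_bf16.npy', arr)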
The following MLIR file has been truncated.
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__au
(.venv) ➜ llama /home/chi/src/iree-build-trace/tools/iree-compile \
fp8.mlir \
--iree-hip-target=gfx942 \
-o=fp8_tracy.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'
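Once compiled, the module can be exercised with the downloaded prefill arguments. A sketch, assuming the exported function is named prefill_bs1 (inferred from the dispatch name above) and that the inputs are passed in this order; both are assumptions:

iree-run-module \
  --module=fp8_tracy.vmfb \
  --device=hip \
  --parameters=model=llama3_8b_fp8.irpa \
  --function=prefill_bs1 \
  --input=@tokens.npy \
  --input=@seq_lens.npy \
  --input=@seq_block_ids.npy \
  --input=@cs_f16.npy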
iree-compile --help
OVERVIEW: IREE compilation driver
USAGE: iree-compile [options] <input file or '-' for stdin>
OPTIONS:
CUDA HAL Target:
--iree-cuda-target=<string> - CUDA target as expected by LLVM NVPTX backend; e.g., 'sm_80'/'sm_90' for targeting Ampere/Hopper GPUs. Additionally this also supports architecture code names like 'turing'/'ampere' or some product names like 'a100'/'rtx3090ti' for a better experience. See https://iree.dev/guides/deployment-configurations/gpu-cuda for more details.
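For example, to target an Ampere GPU (input and output file names are placeholders):

iree-compile model.mlir --iree-hal-target-device=cuda --iree-cuda-target=sm_80 -o model.vmfb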
torch-mlir-opt -pass-pipeline='builtin.module(func.func(torch-match-quantized-custom-ops), torchdynamo-export-to-torch-backend-pipeline{extra-library=})' /tmp/UnnammedModule.mlir --debug
Args: torch-mlir-opt -pass-pipeline=builtin.module(func.func(torch-match-quantized-custom-ops), torchdynamo-export-to-torch-backend-pipeline{extra-library=}) /tmp/UnnammedModule.mlir --debug
Load new dialect in Context builtin
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ShapedType)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::MemRefLayoutAttrInterface)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::TypedAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ElementsAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::DistinctAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::BytecodeOpInterface)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::SymbolOpInterface)
import torch
from torch_mlir_e2e_test.annotations import export, annotate_args

class NonzeroDecomposeModule(torch.nn.Module):
    def __init__(self):
        super().__init__()

    @export
    @annotate_args(
        [
            None,
            ([-1], torch.bool, True),
        ]
    )
    def forward(self, a):
        # The original snippet is truncated after the annotation list; a
        # forward that exercises the nonzero decomposition is assumed here.
        return torch.nonzero(a)
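A hedged sketch of how such a module is typically registered in the torch-mlir e2e test suite (the imports and test-case conventions below follow that suite, but this exact test is an assumption):

import torch
from torch_mlir_e2e_test.framework import TestUtils
from torch_mlir_e2e_test.registry import register_test_case

@register_test_case(module_factory=lambda: NonzeroDecomposeModule())
def NonzeroDecomposeModule_basic(module, tu: TestUtils):
    # Run the module on a boolean vector; the harness compares the result
    # against eager-mode torch.nonzero.
    module.forward(torch.tensor([True, False, True, False, True]))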