This file has been truncated, but you can view the full file.
// -----// IR Dump Before AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__au
import struct

def check_bf16_nan(filename):
    """
    Checks if the given BF16 binary file contains any NaN values.
    Assumes the file is in big-endian byte order.
    Returns True if any NaN is found, False otherwise.
    """
    has_nan = False
    with open(filename, 'rb') as f:
        data = f.read()
    # Each bf16 value occupies 2 bytes; decode as big-endian uint16.
    for (bits,) in struct.iter_unpack('>H', data):
        # bf16 NaN: exponent bits (0x7F80) all set and mantissa bits (0x007F) nonzero.
        if (bits & 0x7F80) == 0x7F80 and (bits & 0x007F) != 0:
            has_nan = True
            break
    return has_nan
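A small driver for the check above (the command-line argument is just the path of whatever raw bf16 dump you want to inspect; this driver is an addition, not part of the original snippet):

import sys

if __name__ == "__main__":
    # Usage: python check_bf16_nan.py <path-to-raw-bf16-dump>
    print("NaN found:", check_bf16_nan(sys.argv[1]))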
iree-compile faulty.mlir \
  --iree-hip-target=gfx942 \
  --iree-hal-target-device=hip \
  --iree-dispatch-creation-enable-aggressive-fusion=true \
  --iree-global-opt-propagate-transposes=true \
  --iree-opt-aggressively-propagate-transposes=true \
  --iree-opt-data-tiling=false \
  --iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true \
  --iree-opt-strip-assertions \
  -o=init.vmfb
iree-run-module --hip_use_streams=true --device=hip://0 --function=faulty --module=init.vmfb [email protected] [email protected]
module {
func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> {
%int26 = torch.constant.int 26
%int15 = torch.constant.int 15
%0 = torch.prims.convert_element_type %arg0, %int26 : !torch.vtensor<[32,4096],bf16>, !torch.int -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%1 = torch.prims.convert_element_type %arg1, %int26 : !torch.vtensor<[4096,4096],bf16>, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%2 = torch.aten.mm %0, %1 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[32,4096],f8E4M3FNUZ>
%3 = torch.prims.convert_element_type %2, %int15 : !torch.vtensor<[32,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[32,4096],bf16>
return %3 : !torch.vtensor<[32,4096],bf16>
  }
}
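Reading the IR: dtype code 26 maps to f8E4M3FNUZ and 15 to bf16 (as the convert_element_type results show), so @faulty quantizes both operands to f8E4M3FNUZ, multiplies them with torch.aten.mm, and casts the product back to bf16. A rough PyTorch reference of that computation is sketched below; it is an emulation only (eager float8 matmul support is limited, so the multiply upcasts to f32), not the exact lowering IREE performs, and torch.float8_e4m3fnuz availability depends on the PyTorch build.

import torch

def faulty_reference(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # convert_element_type %arg, 26 : quantize both operands to float8_e4m3fnuz
    a_f8 = a.to(torch.float8_e4m3fnuz)
    b_f8 = b.to(torch.float8_e4m3fnuz)
    # torch.aten.mm on the f8 operands, emulated here by upcasting to f32
    acc = a_f8.to(torch.float32) @ b_f8.to(torch.float32)
    # convert_element_type %mm, 15 : cast the result back to bfloat16
    return acc.to(torch.bfloat16)

a = torch.randn(32, 4096, dtype=torch.bfloat16)
b = torch.randn(4096, 4096, dtype=torch.bfloat16)
out = faulty_reference(a, b)
print(torch.isnan(out).any())  # compare against the NaNs seen from the compiled module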
This file has been truncated, but you can view the full file.
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__au
This file has been truncated, but you can view the full file.
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
func.func @matmul(%arg0: tensor<1024x1024xbf16>, %arg1: tensor<1024x1024xbf16>) -> tensor<1024x1024xf32> {
%0 = tensor.empty() : tensor<1024x1024xf32>
%c0_i32 = arith.constant 0 : i32
%1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%2 = linalg.matmul ins(%arg0, %arg1 : tensor<1024x1024xbf16>, tensor<1024x1024xbf16>) outs(%1 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
return %2 : tensor<1024x1024xf32>
}
}
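For comparison, the dispatch above is an unquantized bf16 x bf16 matmul that accumulates into a zero-filled f32 tensor. A minimal PyTorch sanity-check sketch of the same computation (emulating the f32 accumulator by upcasting before the multiply, not the IREE codegen path):

import torch

a = torch.randn(1024, 1024, dtype=torch.bfloat16)
b = torch.randn(1024, 1024, dtype=torch.bfloat16)
# Emulate the linalg.matmul f32 accumulator by upcasting the bf16 inputs.
ref = a.to(torch.float32) @ b.to(torch.float32)
print(ref.dtype, ref.shape)  # torch.float32 torch.Size([1024, 1024])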
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
%2 = hal.interface.binding.subspan layout(<bindings =
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment