// [file truncated]
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()>
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>
module {
  func.func @attention(%arg1: tensor<20x4096x64xf16>, %arg2: tensor<20x4096x64xf16>, %arg3: tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> {
    %c0 = arith.constant 0 : index
    %scale = arith.constant 1.250000e-01 : f16
    %7 = tensor.empty() : tensor<20x4096x64xf16>
    %8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]}
           ins(%arg1, %arg2, %arg3, %scale : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16)
           outs(%7 : tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16>
    return %8 : tensor<20x4096x64xf16>
  }
}
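Reading the five indexing maps (#map through #map4): d0 is the batch, d1 the query rows, d2 the head dimension, d3 the key rows, and d4 the value columns, so the op computes softmax(Q @ K^T * scale) @ V per batch. A minimal NumPy sketch of that semantics, with small stand-in shapes; this is an illustrative reference, not IREE's actual lowering:

import numpy as np

def attention_ref(q, k, v, scale):
    # Q:(d0,d1,d2) contracted with K:(d0,d3,d2) over d2.
    s = np.einsum("bmk,bnk->bmn", q, k) * scale
    s = s - s.max(axis=-1, keepdims=True)   # numerically stable softmax
    p = np.exp(s)
    p = p / p.sum(axis=-1, keepdims=True)
    # P:(d0,d1,d3) contracted with V:(d0,d3,d4) -> (d0,d1,d4).
    return np.einsum("bmn,bnv->bmv", p, v)

q = k = v = np.random.rand(2, 128, 64).astype(np.float16)  # small stand-in for 20x4096x64
print(attention_ref(q, k, v, np.float16(0.125)).shape)     # (2, 128, 64)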
// [file truncated]
// -----// IR Dump After ConvolutionToIGEMMPass (iree-codegen-convolution-to-igemm) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.bind
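For context on the pass name: ConvolutionToIGEMMPass rewrites the convolution as an implicit GEMM, i.e. the input is read through an im2col view so the convolution body becomes a plain matrix multiply. A minimal NumPy sketch of the im2col + GEMM equivalence; the shapes and names below are hypothetical, not taken from the dispatch above, and padding is omitted for brevity:

import numpy as np

def conv2d_nhwc_via_igemm(x, w, stride=2):
    # x: (N, H, W, C), w: (KH, KW, C, F); no padding for brevity.
    n, h, w_in, c = x.shape
    kh, kw, _, f = w.shape
    oh = (h - kh) // stride + 1
    ow = (w_in - kw) // stride + 1
    # im2col: each receptive field becomes one row of a 2-D matrix.
    cols = np.empty((n * oh * ow, kh * kw * c), dtype=x.dtype)
    row = 0
    for b in range(n):
        for i in range(oh):
            for j in range(ow):
                patch = x[b, i*stride:i*stride+kh, j*stride:j*stride+kw, :]
                cols[row] = patch.reshape(-1)
                row += 1
    # The convolution is now a single GEMM against the flattened filter.
    out = cols @ w.reshape(kh * kw * c, f)
    return out.reshape(n, oh, ow, f)

x = np.random.rand(1, 8, 8, 3).astype(np.float32)
w = np.random.rand(3, 3, 3, 4).astype(np.float32)
print(conv2d_nhwc_via_igemm(x, w).shape)  # (1, 3, 3, 4)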
func.func @matmul(%lhs: tensor<654x321xi8>, %rhs: tensor<321x234xi8>) -> tensor<654x234xi32> {
  %init_acc = tensor.empty() : tensor<654x234xi32>
  %c0_acc_type = arith.constant 0 : i32
  %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<654x234xi32>) -> tensor<654x234xi32>
  %result = linalg.matmul ins(%lhs, %rhs : tensor<654x321xi8>, tensor<321x234xi8>) outs(%acc : tensor<654x234xi32>) -> tensor<654x234xi32>
  return %result : tensor<654x234xi32>
}
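The fill + matmul pair above is the standard quantized-accumulator idiom: linalg.fill seeds the i32 accumulator with zeros, then linalg.matmul widens each i8 operand and accumulates the products in i32. The same computation in NumPy, with random operands, purely as an illustration:

import numpy as np

lhs = np.random.randint(-128, 128, size=(654, 321), dtype=np.int8)
rhs = np.random.randint(-128, 128, size=(321, 234), dtype=np.int8)

acc = np.zeros((654, 234), dtype=np.int32)          # linalg.fill with 0 : i32
acc += lhs.astype(np.int32) @ rhs.astype(np.int32)  # i8 x i8 products, i32 accumulate
print(acc.shape, acc.dtype)                         # (654, 234) int32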
#map = affine_map<(d0, d1, d2) -> (d2, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d1, d2)>
#nested = #iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>
#nested1 = #iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>
#nested2 = #iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#translation = #iree_codegen.tr
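A note on reading #iree_vector_ext.nested_layout: per dimension, the shape the layout distributes is the product of its five tile sizes (subgroup_tile * batch_tile * outer_tile * thread_tile * element_tile). Checking the three layouts above in Python; this is my reading of the attribute, not compiler output:

def distributed_shape(subgroup, batch, outer, thread, element):
    # Elementwise product of the nested tile sizes, one entry per dimension.
    return tuple(s * b * o * t * e
                 for s, b, o, t, e in zip(subgroup, batch, outer, thread, element))

print(distributed_shape([1, 1], [4, 1], [1, 1], [16, 8], [1, 8]))  # #nested  -> (64, 64)
print(distributed_shape([2, 1], [2, 4], [1, 1], [16, 4], [1, 4]))  # #nested1 -> (64, 64)
print(distributed_shape([1, 1], [4, 4], [1, 1], [16, 4], [1, 4]))  # #nested2 -> (64, 64)

All three describe 64x64 tiles, just distributed differently across subgroups and threads.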
// [file truncated]
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max
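One detail worth decoding in the target attribute: dot = dp4xi8toi32 advertises a packed dot product that multiplies four i8 pairs and accumulates the sum into an i32. Emulated in NumPy as a semantics sketch, not the compiler's actual lowering:

import numpy as np

def dp4a(a4, b4, acc):
    # Widen both i8 vectors to i32, dot them, add to the accumulator.
    return acc + int(np.dot(a4.astype(np.int32), b4.astype(np.int32)))

a = np.array([1, -2, 3, -4], dtype=np.int8)
b = np.array([5, 6, -7, 8], dtype=np.int8)
print(dp4a(a, b, acc=0))  # 5 - 12 - 21 - 32 = -60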
// [file truncated]
module @compiled_scheduled_unet {
  util.global private @_params.unet.conv_in.weight {noinline} = #stream.parameter.named<"model"::"unet.conv_in.weight"> : tensor<320x4x3x3xf16>
  util.global private @_params.unet.conv_in.bias {noinline} = #stream.parameter.named<"model"::"unet.conv_in.bias"> : tensor<320xf16>
  util.global private @_params.unet.time_embedding.linear_1.weight {noinline} = #stream.parameter.named<"model"::"unet.time_embedding.linear_1.weight"> : tensor<1280x320xf16>
  util.global private @_params.unet.time_embedding.linear_1.bias {noinline} = #stream.parameter.named<"model"::"unet.time_embedding.linear_1.bias"> : tensor<1280xf16>
  util.global private @_params.unet.time_embedding.linear_2.weight {noinline} = #stream.parameter.named<"model"::"unet.time_embedding.linear_2.weight"> : tensor<1280x1280xf16>
  util.global private @_params.unet.time_embedding.linear_2.bias {noinline} = #stream.parameter.named<"model"::"unet.time_embedding.linear_2.bias"> : tensor<1280xf16>
  util.global private @_par
Args: iree-opt before_fuse_and_hoist.mlir --pass-pipeline=builtin.module(func.func(iree-codegen-gpu-fuse-and-hoist-parallel-loops)) --debug
ImplicitTypeIDRegistry::lookupOrInsert(mlir::chlo::ChloDialect)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::stablehlo::StablehloDialect)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::vhlo::VhloDialect)
Load new dialect in Context builtin
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ShapedType)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::MemRefLayoutAttrInterface)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::TypedAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ElementsAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::DistinctAttr)
// [file truncated]
// -----// IR Dump After ConvertTorchOnnxToTorch (convert-torch-onnx-to-torch) //----- //
func.func @torch_jit(%arg0: !torch.vtensor<[1,128,4,256],f32>) -> !torch.vtensor<[1,257,4,256],f32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "1.12.1"} {
  %0 = torch.vtensor.literal(dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided__> : tensor<257x128x1x1xf32>) : !torch.vtensor<[257,128,1,1],f32>
  %1 = torch.vtensor.literal(dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided___1> : tensor<257xf32>) : !torch.vtensor<[257],f32>
  %int0 = torch.constant.int 0
  %int0_0 = torch.constant.int 0
  %2 = torch.prim.ListConstruct %int0, %int0_0 : (!torch.int, !torch.int) -> !torch.list<int>
  %int1 = torch.constant.int 1
  %int1_1 = torch.constant.int 1
  %int1_2 = torch.constant.int 1
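From the shapes alone, this fragment is setting up a 1x1 convolution: a 257x128x1x1 weight and a 257-element bias applied to a [1,128,4,256] input, with the list constructs assembling zero pads and unit strides/dilations. A hedged PyTorch equivalent follows; the dense_resource weight and bias values were elided from the dump, so random stand-ins are used:

import torch
import torch.nn.functional as F

x = torch.randn(1, 128, 4, 256)   # matches !torch.vtensor<[1,128,4,256],f32>
w = torch.randn(257, 128, 1, 1)   # stand-in for the elided dense_resource weight
b = torch.randn(257)              # stand-in for the elided bias

# pads = [0, 0] and the unit int constants match the operands being built above.
y = F.conv2d(x, w, b, stride=1, padding=0, dilation=1)
print(y.shape)  # torch.Size([1, 257, 4, 256])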