// [file truncated]
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()>
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>
module {
  func.func @attention(%arg1: tensor<20x4096x64xf16>, %arg2: tensor<20x4096x64xf16>, %arg3: tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> {
    %c0 = arith.constant 0 : index
    %scale = arith.constant 1.250000e-01 : f16
    %7 = tensor.empty() : tensor<20x4096x64xf16>
    %8 = iree_linalg_ext.attention {indexing_maps = [#map, #map1, #map2, #map3, #map4]}
           ins(%arg1, %arg2, %arg3, %scale : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16)
           outs(%7 : tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16>
    return %8 : tensor<20x4096x64xf16>
  }
}
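Reading the five indexing maps (#map through #map4): d0 is the batch, d1 the query rows, d2 the head dimension, d3 the key rows, and d4 the value columns, so the op computes softmax(Q @ K^T * scale) @ V per batch. A minimal NumPy sketch of that semantics, with small stand-in shapes; this is an illustrative reference, not IREE's actual lowering:

import numpy as np

def attention_ref(q, k, v, scale):
    # Q:(d0,d1,d2) contracted with K:(d0,d3,d2) over d2.
    s = np.einsum("bmk,bnk->bmn", q, k) * scale
    s = s - s.max(axis=-1, keepdims=True)   # numerically stable softmax
    p = np.exp(s)
    p = p / p.sum(axis=-1, keepdims=True)
    # P:(d0,d1,d3) contracted with V:(d0,d3,d4) -> (d0,d1,d4).
    return np.einsum("bmn,bnv->bmv", p, v)

q = k = v = np.random.rand(2, 128, 64).astype(np.float16)  # small stand-in for 20x4096x64
print(attention_ref(q, k, v, np.float16(0.125)).shape)     # (2, 128, 64)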
// [file truncated]
// -----// IR Dump After ConvolutionToIGEMMPass (iree-codegen-convolution-to-igemm) //----- //
func.func @conv_nhwc_unaligned_stride_2() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.bind
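For context on the pass name: ConvolutionToIGEMMPass rewrites the convolution as an implicit GEMM, i.e. the input is read through an im2col view so the convolution body becomes a plain matrix multiply. A minimal NumPy sketch of the im2col + GEMM equivalence; the shapes and names below are hypothetical, not taken from the dispatch above, and padding is omitted for brevity:

import numpy as np

def conv2d_nhwc_via_igemm(x, w, stride=2):
    # x: (N, H, W, C), w: (KH, KW, C, F); no padding for brevity.
    n, h, w_in, c = x.shape
    kh, kw, _, f = w.shape
    oh = (h - kh) // stride + 1
    ow = (w_in - kw) // stride + 1
    # im2col: each receptive field becomes one row of a 2-D matrix.
    cols = np.empty((n * oh * ow, kh * kw * c), dtype=x.dtype)
    row = 0
    for b in range(n):
        for i in range(oh):
            for j in range(ow):
                patch = x[b, i*stride:i*stride+kh, j*stride:j*stride+kw, :]
                cols[row] = patch.reshape(-1)
                row += 1
    # The convolution is now a single GEMM against the flattened filter.
    out = cols @ w.reshape(kh * kw * c, f)
    return out.reshape(n, oh, ow, f)

x = np.random.rand(1, 8, 8, 3).astype(np.float32)
w = np.random.rand(3, 3, 3, 4).astype(np.float32)
print(conv2d_nhwc_via_igemm(x, w).shape)  # (1, 3, 3, 4)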
func.func @matmul(%lhs: tensor<654x321xi8>, %rhs: tensor<321x234xi8>) -> tensor<654x234xi32> {
  %init_acc = tensor.empty() : tensor<654x234xi32>
  %c0_acc_type = arith.constant 0 : i32
  %acc = linalg.fill ins(%c0_acc_type : i32) outs(%init_acc : tensor<654x234xi32>) -> tensor<654x234xi32>
  %result = linalg.matmul ins(%lhs, %rhs : tensor<654x321xi8>, tensor<321x234xi8>) outs(%acc : tensor<654x234xi32>) -> tensor<654x234xi32>
  return %result : tensor<654x234xi32>
}
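The fill + matmul pair above is the standard quantized-accumulator idiom: linalg.fill seeds the i32 accumulator with zeros, then linalg.matmul widens each i8 operand and accumulates the products in i32. The same computation in NumPy, with random operands, purely as an illustration:

import numpy as np

lhs = np.random.randint(-128, 128, size=(654, 321), dtype=np.int8)
rhs = np.random.randint(-128, 128, size=(321, 234), dtype=np.int8)

acc = np.zeros((654, 234), dtype=np.int32)          # linalg.fill with 0 : i32
acc += lhs.astype(np.int32) @ rhs.astype(np.int32)  # i8 x i8 products, i32 accumulate
print(acc.shape, acc.dtype)                         # (654, 234) int32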
#map = affine_map<(d0, d1, d2) -> (d2, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d1, d2)>
#nested = #iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>
#nested1 = #iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>
#nested2 = #iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#translation = #iree_codegen.tr
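A note on reading #iree_vector_ext.nested_layout: per dimension, the shape the layout distributes is the product of its five tile sizes (subgroup_tile * batch_tile * outer_tile * thread_tile * element_tile). Checking the three layouts above in Python; this is my reading of the attribute, not compiler output:

def distributed_shape(subgroup, batch, outer, thread, element):
    # Elementwise product of the nested tile sizes, one entry per dimension.
    return tuple(s * b * o * t * e
                 for s, b, o, t, e in zip(subgroup, batch, outer, thread, element))

print(distributed_shape([1, 1], [4, 1], [1, 1], [16, 8], [1, 8]))  # #nested  -> (64, 64)
print(distributed_shape([2, 1], [2, 4], [1, 1], [16, 4], [1, 4]))  # #nested1 -> (64, 64)
print(distributed_shape([1, 1], [4, 4], [1, 1], [16, 4], [1, 4]))  # #nested2 -> (64, 64)

All three describe 64x64 tiles, just distributed differently across subgroups and threads.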
// [file truncated]
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max
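One detail worth decoding in the target attribute: dot = dp4xi8toi32 advertises a packed dot product that multiplies four i8 pairs and accumulates the sum into an i32. Emulated in NumPy as a semantics sketch, not the compiler's actual lowering:

import numpy as np

def dp4a(a4, b4, acc):
    # Widen both i8 vectors to i32, dot them, add to the accumulator.
    return acc + int(np.dot(a4.astype(np.int32), b4.astype(np.int32)))

a = np.array([1, -2, 3, -4], dtype=np.int8)
b = np.array([5, 6, -7, 8], dtype=np.int8)
print(dp4a(a, b, acc=0))  # 5 - 12 - 21 - 32 = -60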
// [file truncated]
module @compiled_scheduled_unet {
  util.global private @_params.unet.conv_in.weight {noinline} = #stream.parameter.named<"model"::"unet.conv_in.weight"> : tensor<320x4x3x3xf16>
  util.global private @_params.unet.conv_in.bias {noinline} = #stream.parameter.named<"model"::"unet.conv_in.bias"> : tensor<320xf16>
  util.global private @_params.unet.time_embedding.linear_1.weight {noinline} = #stream.parameter.named<"model"::"unet.time_embedding.linear_1.weight"> : tensor<1280x320xf16>
  util.global private @_params.unet.time_embedding.linear_1.bias {noinline} = #stream.parameter.named<"model"::"unet.time_embedding.linear_1.bias"> : tensor<1280xf16>
  util.global private @_params.unet.time_embedding.linear_2.weight {noinline} = #stream.parameter.named<"model"::"unet.time_embedding.linear_2.weight"> : tensor<1280x1280xf16>
  util.global private @_params.unet.time_embedding.linear_2.bias {noinline} = #stream.parameter.named<"model"::"unet.time_embedding.linear_2.bias"> : tensor<1280xf16>
  util.global private @_par
Args: iree-opt before_fuse_and_hoist.mlir --pass-pipeline=builtin.module(func.func(iree-codegen-gpu-fuse-and-hoist-parallel-loops)) --debug
ImplicitTypeIDRegistry::lookupOrInsert(mlir::chlo::ChloDialect)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::stablehlo::StablehloDialect)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::vhlo::VhloDialect)
Load new dialect in Context builtin
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ShapedType)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::MemRefLayoutAttrInterface)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::TypedAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ElementsAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::DistinctAttr)
// [file truncated]
// -----// IR Dump After ConvertTorchOnnxToTorch (convert-torch-onnx-to-torch) //----- //
func.func @torch_jit(%arg0: !torch.vtensor<[1,128,4,256],f32>) -> !torch.vtensor<[1,257,4,256],f32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "1.12.1"} {
  %0 = torch.vtensor.literal(dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided__> : tensor<257x128x1x1xf32>) : !torch.vtensor<[257,128,1,1],f32>
  %1 = torch.vtensor.literal(dense_resource<__onnx_constant_not_found_possibly_due_to_being_elided___1> : tensor<257xf32>) : !torch.vtensor<[257],f32>
  %int0 = torch.constant.int 0
  %int0_0 = torch.constant.int 0
  %2 = torch.prim.ListConstruct %int0, %int0_0 : (!torch.int, !torch.int) -> !torch.list<int>
  %int1 = torch.constant.int 1
  %int1_1 = torch.constant.int 1
  %int1_2 = torch.constant.int 1
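From the shapes alone, this fragment is setting up a 1x1 convolution: a 257x128x1x1 weight and a 257-element bias applied to a [1,128,4,256] input, with the list constructs assembling zero pads and unit strides/dilations. A hedged PyTorch equivalent follows; the dense_resource weight and bias values were elided from the dump, so random stand-ins are used:

import torch
import torch.nn.functional as F

x = torch.randn(1, 128, 4, 256)   # matches !torch.vtensor<[1,128,4,256],f32>
w = torch.randn(257, 128, 1, 1)   # stand-in for the elided dense_resource weight
b = torch.randn(257)              # stand-in for the elided bias

# pads = [0, 0] and the unit int constants match the operands being built above.
y = F.conv2d(x, w, b, stride=1, padding=0, dilation=1)
print(y.shape)  # torch.Size([1, 257, 4, 256])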