: 1709405007:0;git clone --depth 1 https://github.com/junegunn/fzf.git ~/.fzf\
~/.fzf/install
: 1709405026:0;source ~/.zshrc
: 1709405061:0;hipcc --version
: 1709405083:0;hipcc -O3
: 1709405161:0;sudo dnf install radeontop
: 1709405239:0;sudo dnf install cargo
: 1709405609:0;sudo apt-get install lld
: 1709405648:0;sudo dnf install libdrm-dev
: 1709405653:0;sudo dnf install libdrm-devel
// (file truncated; the full file is available in the original gist)
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d1)>
module @module {
util.global private @__auto.time_embedding.linear_1.premul_input = #stream.parameter.named<"model"::"time_embedding.linear_1.premul_input"> : tensor<1x320xf16>
util.global private @__auto.time_embedding.linear_1.weight = #stream.parameter.named<"model"::"time_embedding.linear_1.weight"> : tensor<1280x320xf16>
util.global private @__auto.time_embedding.linear_1.bias = #stream.parameter.named<"model"::"time_embedding.linear_1.bias"> : tensor<1280xf16>
util.global private @__auto.time_embedding.linear_2.premul_input = #stream.parameter.named<"model"::"time_embedding.linear_2.premul_input"> : tensor<1x1280xf16>
util.global private @__auto.time_embedding.linear_2.weight = #stream.parameter.named<"model"::"time_embedding.linear_2.weight"> : tensor<1280x1280xf16>
util.global private @__auto.time_embedding.linear_2.bias = #stream.parameter.named<"model"::"time_embedding.linear_2.bias"> : tensor<1280xf16>
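For orientation, the globals above hold the parameters of a two-layer time-embedding MLP (320 -> 1280 -> 1280). A minimal NumPy sketch of the corresponding forward pass, assuming the premul_input tensors are elementwise input scales and a SiLU activation between the two linears (both are assumptions; neither is visible in this excerpt):

import numpy as np

def time_embedding(t, premul1, w1, b1, premul2, w2, b2):
    # t: (1, 320); weights are stored (out_features, in_features), as in the globals above
    h = (t * premul1) @ w1.T + b1      # linear_1 -> (1, 1280)
    h = h / (1.0 + np.exp(-h))         # SiLU (assumed; not shown in the IR)
    return (h * premul2) @ w2.T + b2   # linear_2 -> (1, 1280)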
func.func @encode_prompts$async_dispatch_10_softmax_12x64x64xf32_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>} {
%cst = arith.constant dense<0.000000e+00> : vector<64xf32>
%cst_0 = arith.constant dense<0xFFC00000> : vector<64xf32>
%c0 = arith.constant 0 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%cst_2 = arith.constant 0xFFC00000 : f32
%0 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
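The dispatch name encodes the op and shape: a softmax over a 12x64x64 f32 tensor. The dense<0.000000e+00> and dense<0xFFC00000> splats are the reduction identities (0xFFC00000 is a quiet NaN, the identity for a maxnumf reduction). A minimal NumPy sketch of the numerically stable softmax this dispatch computes, assuming the reduction runs over the last dimension:

import numpy as np

def softmax_last_dim(x):
    # x: (12, 64, 64) f32; subtract the running max so exp() cannot overflow
    m = np.max(x, axis=-1, keepdims=True)
    e = np.exp(x - m)
    return e / np.sum(e, axis=-1, keepdims=True)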
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
hal.executable private @encode {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @encode() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>} {
// (file truncated; the full file is available in the original gist)
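The export name @matvec_fp16 above suggests a half-precision matrix-vector product. A minimal NumPy sketch of that computation, assuming f16 operands with f32 accumulation (a common pattern for such dispatches; the actual body is truncated above):

import numpy as np

def matvec_fp16(a, x):
    # a: (m, k) float16, x: (k,) float16; accumulate in float32, store f16
    acc = a.astype(np.float32) @ x.astype(np.float32)
    return acc.astype(np.float16)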
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @encode() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>} {
%cst = arith.constant 1.280000e+03 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = arith.index_castui %0 : i32 to index
hal.executable public @encode_prompts$async_dispatch_178 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_cou
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d2, d0, d3, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#config1 = #iree_gpu.lowering_config<{thread = [0, 0, 0, 4],
thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]],
subgroup_basis = [[1, 1, 1, 1], [0, 1, 2, 3]],
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d2, d0, d3, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#config2 = #iree_gpu.lowering_config<{thread = [0, 0, 0, 4],
thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]],
subgroup_basis = [[1, 1, 1, 1], [0, 1, 2, 3]],
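Both configs distribute a 4-D iteration space: thread = [0, 0, 0, 4] assigns each thread a tile of 4 elements along the innermost dimension, thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]] lays 64 threads along that dimension (the second list maps basis entries to dimensions), and subgroup_basis describes a single subgroup. A minimal Python sketch of the per-thread slice this implies, assuming straightforward delinearization (thread_slice is a hypothetical helper, not an IREE API):

def thread_slice(tid, thread_tile=(0, 0, 0, 4), basis=(1, 1, 1, 64)):
    # Delinearize tid over the basis, innermost dimension fastest.
    idx = []
    for extent in reversed(basis):
        idx.append(tid % extent)
        tid //= extent
    idx.reverse()
    # Tile size 0 means "not tiled at this level"; 4 means 4 contiguous elements.
    return [(i * t, i * t + t) if t else None for i, t in zip(idx, thread_tile)]

# Example: thread 5 covers elements [20, 24) of the innermost dimension:
# thread_slice(5) -> [None, None, None, (20, 24)]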
func.func @prefill_bs1$async_dispatch_19_attention_8x4x1xDx32x128xf8E4M3FNUZ_generic() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c32_i64 = arith.constant 32 : i64
%c67108864 = arith.constant 67108864 : index
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
%cst_0 = arith.constant 1.44269502 : f32
%cst_1 = arith.constant 0.000000e+00 : f8E4M3FNUZ
%cst_2 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
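In the attention dispatch above, cst_0 = 1.44269502 is log2(e): flash-attention-style kernels usually rewrite exp(x) as exp2(x * log2(e)) so the exponential maps to the cheaper hardware exp2 instruction, and the f8E4M3FNUZ zero together with the 32x128 f32 accumulator suggests f8 inputs accumulated in f32. A minimal NumPy sketch of that rewrite (illustrative only; the kernel body is not shown here):

import numpy as np

LOG2E = 1.44269502  # log2(e), the cst_0 constant above

def softmax_via_exp2(scores):
    # exp(x) == 2**(x * log2(e)); subtract the row max first for stability
    m = np.max(scores, axis=-1, keepdims=True)
    e = np.exp2((scores - m) * LOG2E)
    return e / np.sum(e, axis=-1, keepdims=True)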