zhuangh · November 22, 2023 07:54
diff --git a/matmul_gtx1060.ir b/matmul_gtx1060.ir
 IR module {
  // Computes one 16x16 f32 tile as a running tt.dot over a loop that goes from 0 to 1024
  // in steps of 16. No program id is read anywhere in this function, so the tile is
  // addressed directly from the three base pointers.
  //   %arg0, %arg1 : f32 pointers read inside the loop (first and second dot operand)
  //   %arg2        : f32 pointer that receives the accumulated tile
  //   %arg3..%arg5 : i32 factors multiplied with the row index for %arg0 / %arg1 / %arg2
  //                  (presumably the row strides of the three matrices -- TODO confirm
  //                  against the Python kernel, which is not part of this dump)
  tt.func public @matmul_kernel_0d1d2d3d4c5d6c7d8c(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
    %c16_i32 = arith.constant 16 : i32
    %c1024_i32 = arith.constant 1024 : i32
    %c0_i32 = arith.constant 0 : i32
    %cst = arith.constant dense<16> : tensor<16x16xi32>
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32>
    // %0 is the index range [0, 16); %1 is the same range shaped as a 16x1 column.
    %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
    %1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
    // %9 = %arg0 + row * %arg3 + col: one pointer per element of the 16x16 tile read from %arg0.
    %2 = tt.splat %arg3 : (i32) -> tensor<16x1xi32>
    %3 = arith.muli %1, %2 : tensor<16x1xi32>
    %4 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>>
    %5 = tt.addptr %4, %3 : tensor<16x1x!tt.ptr<f32>>, tensor<16x1xi32>
    %6 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<16xi32>) -> tensor<1x16xi32>
    %7 = tt.broadcast %5 : (tensor<16x1x!tt.ptr<f32>>) -> tensor<16x16x!tt.ptr<f32>>
    %8 = tt.broadcast %6 : (tensor<1x16xi32>) -> tensor<16x16xi32>
    %9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32>
    // %15 = %arg1 + row * %arg4 + col: the matching pointer tile for %arg1.
    %10 = tt.splat %arg4 : (i32) -> tensor<16x1xi32>
    %11 = arith.muli %1, %10 : tensor<16x1xi32>
    %12 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>>
    %13 = tt.addptr %12, %11 : tensor<16x1x!tt.ptr<f32>>, tensor<16x1xi32>
    %14 = tt.broadcast %13 : (tensor<16x1x!tt.ptr<f32>>) -> tensor<16x16x!tt.ptr<f32>>
    %15 = tt.addptr %14, %8 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32>
    // Per-iteration advance of the %arg1 pointers: 16 * %arg4 elements.
    // (The %arg0 pointers advance by the constant tensor %cst = 16 instead.)
    %16 = arith.muli %arg4, %c16_i32 : i32
    %17 = tt.splat %16 : (i32) -> tensor<16x16xi32>
    // Loop-carried state: %arg7 is the accumulator, %arg8 / %arg9 are the two pointer tiles.
    %18:3 = scf.for %arg6 = %c0_i32 to %c1024_i32 step %c16_i32 iter_args(%arg7 = %cst_0, %arg8 = %9, %arg9 = %15) -> (tensor<16x16xf32>, tensor<16x16x!tt.ptr<f32>>, tensor<16x16x!tt.ptr<f32>>)  : i32 {
      // Unmasked loads of both operand tiles, then accumulate their product into %arg7.
      %25 = tt.load %arg8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16xf32>
      %26 = tt.load %arg9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16xf32>
      %27 = tt.dot %25, %26, %arg7 {allowTF32 = true} : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32>
      // Step both pointer tiles to the next block of the loop dimension.
      %28 = tt.addptr %arg8, %cst : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32>
      %29 = tt.addptr %arg9, %17 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32>
      scf.yield %27, %28, %29 : tensor<16x16xf32>, tensor<16x16x!tt.ptr<f32>>, tensor<16x16x!tt.ptr<f32>>
    }
    // %24 = %arg2 + row * %arg5 + col: destination pointers for the accumulated tile.
    %19 = tt.splat %arg5 : (i32) -> tensor<16x1xi32>
    %20 = arith.muli %1, %19 : tensor<16x1xi32>
    %21 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>>
    %22 = tt.addptr %21, %20 : tensor<16x1x!tt.ptr<f32>>, tensor<16x1xi32>
    %23 = tt.broadcast %22 : (tensor<16x1x!tt.ptr<f32>>) -> tensor<16x16x!tt.ptr<f32>>
    %24 = tt.addptr %23, %8 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32>
    // Unmasked store of the final accumulator (%18#0).
    tt.store %24, %18#0 {cache = 1 : i32, evict = 1 : i32} : tensor<16x16xf32>
    tt.return
  }
 }

 // Layout aliases referenced by the TritonGPU module that follows.
 // #blocked: sizePerThread 1x2, threadsPerWarp 4x8 (32 threads), warpsPerCTA 4x1 (4 warps),
 // order [1, 0]. It is the layout of the pointer, index and mask tensors in the kernel.
 TTGIR #blocked = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 // #blocked1: sizePerThread 1x1, threadsPerWarp 2x16 (32 threads), same warpsPerCTA and order.
 // It is the layout of the accumulator and the parent of both dot-operand layouts.
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
 // #shared: layout of the 3x16x16 staging buffers and of the slices taken from them.
 // vec, perPhase and maxPhase are all 1 (presumably meaning no swizzling -- TODO confirm).
 #shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
 module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
  // TritonGPU-level form of the kernel: one 16x16 f32 tile accumulated over a loop from 0 to
  // 1024 in steps of 16, with every tensor carrying an explicit layout.
  // The operand tiles are staged through two 3-slot buffers (tensor<3x16x16xf32, #shared>):
  // the code ahead of the loop fills slots 0 and 1, and each iteration issues the copy for a
  // later tile while multiplying an earlier one.
  //   %arg0, %arg1 : f32 pointers copied into the staging buffers (first / second dot operand)
  //   %arg2        : f32 pointer that receives the accumulated tile
  //   %arg3..%arg5 : i32 factors multiplied with the row index for %arg0 / %arg1 / %arg2
  //                  (presumably row strides -- TODO confirm against the Python kernel)
  tt.func public @matmul_kernel_0d1d2d3d4c5d6c7d8c(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
    %c1_i32 = arith.constant 1 : i32
    %c2_i32 = arith.constant 2 : i32
    %cst = arith.constant dense<true> : tensor<16x16xi1, #blocked>
    %c16_i32 = arith.constant 16 : i32
    %c3_i32 = arith.constant 3 : i32
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #blocked1>
    %cst_1 = arith.constant dense<16> : tensor<16x16xi32, #blocked>
    %c1024_i32 = arith.constant 1024 : i32
    %c0_i32 = arith.constant 0 : i32
    // Row indices (%1, 16x1) built from the dim-1 slice of #blocked.
    %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
    %1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
    // %10 = %arg0 + row * %arg3 + col. The column indices (%7, 1x16) need their own
    // make_range because they use the dim-0 slice of #blocked.
    %2 = tt.splat %arg3 : (i32) -> tensor<16x1xi32, #blocked>
    %3 = arith.muli %1, %2 : tensor<16x1xi32, #blocked>
    %4 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>, #blocked>
    %5 = tt.addptr %4, %3 : tensor<16x1x!tt.ptr<f32>, #blocked>, tensor<16x1xi32, #blocked>
    %6 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x16xi32, #blocked>
    %8 = tt.broadcast %5 : (tensor<16x1x!tt.ptr<f32>, #blocked>) -> tensor<16x16x!tt.ptr<f32>, #blocked>
    %9 = tt.broadcast %7 : (tensor<1x16xi32, #blocked>) -> tensor<16x16xi32, #blocked>
    %10 = tt.addptr %8, %9 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
    // %16 = %arg1 + row * %arg4 + col.
    %11 = tt.splat %arg4 : (i32) -> tensor<16x1xi32, #blocked>
    %12 = arith.muli %1, %11 : tensor<16x1xi32, #blocked>
    %13 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>, #blocked>
    %14 = tt.addptr %13, %12 : tensor<16x1x!tt.ptr<f32>, #blocked>, tensor<16x1xi32, #blocked>
    %15 = tt.broadcast %14 : (tensor<16x1x!tt.ptr<f32>, #blocked>) -> tensor<16x16x!tt.ptr<f32>, #blocked>
    %16 = tt.addptr %15, %9 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
    // %18 = 16 * %arg4: per-tile advance of the %arg1 pointers (%cst_1 = 16 advances the
    // %arg0 pointers).
    %17 = arith.muli %arg4, %c16_i32 : i32
    %18 = tt.splat %17 : (i32) -> tensor<16x16xi32, #blocked>
    // Prologue, first tile: allocate both 3-slot buffers and copy the tiles at %10 / %16
    // into slot 0 under the all-true mask %cst. Each copy is committed as its own group.
    %19 = triton_gpu.alloc_tensor : tensor<3x16x16xf32, #shared>
    %20 = triton_gpu.insert_slice_async %10, %19, %c0_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
    triton_gpu.async_commit_group
    %21 = triton_gpu.alloc_tensor : tensor<3x16x16xf32, #shared>
    %22 = triton_gpu.insert_slice_async %16, %21, %c0_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
    triton_gpu.async_commit_group
    // Prologue, second tile: advance both pointer tiles once and copy into slot 1.
    %23 = tt.addptr %10, %cst_1 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
    %24 = tt.addptr %16, %18 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
    %25 = triton_gpu.insert_slice_async %23, %20, %c1_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
    triton_gpu.async_commit_group
    %26 = triton_gpu.insert_slice_async %24, %22, %c1_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
    triton_gpu.async_commit_group
    // Four groups are committed at this point. NOTE(review): num = 2 presumably means "at most
    // the two newest groups may still be pending", i.e. both slot-0 copies are done -- confirm
    // against the async_wait op documentation.
    triton_gpu.async_wait {num = 2 : i32}
    // 16x16 views of slot 0 of each buffer.
    %27 = triton_gpu.extract_slice %25[0, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared>
    %28 = triton_gpu.extract_slice %26[0, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared>
    // First half of the shared dimension (columns 0..7 of %27, rows 0..7 of %28), converted to
    // the dot-operand layouts ahead of the loop.
    %29 = triton_gpu.extract_slice %27[0, 0] [16, 8] [1, 1] : tensor<16x16xf32, #shared> to tensor<16x8xf32, #shared>
    %30 = triton_gpu.convert_layout %29 : (tensor<16x8xf32, #shared>) -> tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>
    %31 = triton_gpu.extract_slice %28[0, 0] [8, 16] [1, 1] : tensor<16x16xf32, #shared> to tensor<8x16xf32, #shared>
    %32 = triton_gpu.convert_layout %31 : (tensor<8x16xf32, #shared>) -> tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>
    // Loop-carried values:
    //   %arg7          accumulator
    //   %arg8, %arg9   pointer tiles that are advanced and yielded but not otherwise read in the body
    //   %arg10, %arg11 the two 3-slot staging buffers
    //   %arg12, %arg13 16x16 views of the slot being multiplied in this iteration
    //   %arg14, %arg15 pointers used by the most recently issued copies
    //   %arg16         loop offset of the most recently issued copy (starts at 16)
    //   %arg17         insert counter (starts at 2); %arg18 extract counter (starts at 1)
    //   %arg19, %arg20 first-half dot operands converted at the end of the previous iteration
    %33:14 = scf.for %arg6 = %c0_i32 to %c1024_i32 step %c16_i32 iter_args(%arg7 = %cst_0, %arg8 = %10, %arg9 = %16, %arg10 = %25, %arg11 = %26, %arg12 = %27, %arg13 = %28, %arg14 = %23, %arg15 = %24, %arg16 = %c16_i32, %arg17 = %c2_i32, %arg18 = %c1_i32, %arg19 = %30, %arg20 = %32) -> (tensor<16x16xf32, #blocked1>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<3x16x16xf32, #shared>, tensor<3x16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, i32, i32, i32, tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>, tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>)  : i32 {
      // Second half of the shared dimension of the current tiles (columns 8..15 / rows 8..15).
      %41 = triton_gpu.extract_slice %arg12[0, 8] [16, 8] [1, 1] : tensor<16x16xf32, #shared> to tensor<16x8xf32, #shared>
      %42 = triton_gpu.convert_layout %41 : (tensor<16x8xf32, #shared>) -> tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>
      %43 = triton_gpu.extract_slice %arg13[8, 0] [8, 16] [1, 1] : tensor<16x16xf32, #shared> to tensor<8x16xf32, #shared>
      %44 = triton_gpu.convert_layout %43 : (tensor<8x16xf32, #shared>) -> tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>
      // Two chained 16x8 * 8x16 dots: the carried first half, then the second half just converted.
      %45 = tt.dot %arg19, %arg20, %arg7 {allowTF32 = true} : tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>> * tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>> -> tensor<16x16xf32, #blocked1>
      %46 = tt.dot %42, %44, %45 {allowTF32 = true} : tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>> * tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>> -> tensor<16x16xf32, #blocked1>
      // %47 / %48 are only yielded back into %arg8 / %arg9.
      %47 = tt.addptr %arg8, %cst_1 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
      %48 = tt.addptr %arg9, %18 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
      // %50 is true while the tile about to be fetched starts below 1024; it masks both copies
      // issued in this iteration.
      %49 = arith.addi %arg16, %c16_i32 : i32
      %50 = arith.cmpi slt, %49, %c1024_i32 : i32
      // %51 = slot to write (%arg17 mod 3); %52 = slot to read for the next iteration (%arg18 mod 3).
      %51 = arith.remsi %arg17, %c3_i32 : i32
      %52 = arith.remsi %arg18, %c3_i32 : i32
      // Advance the fetch pointers and issue the masked copies into slot %51 of each buffer.
      %53 = tt.addptr %arg14, %cst_1 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
      %54 = tt.addptr %arg15, %18 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
      %55 = tt.splat %50 : (i1) -> tensor<16x16xi1, #blocked>
      %56 = triton_gpu.insert_slice_async %53, %arg10, %51, %55 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
      triton_gpu.async_commit_group
      %57 = triton_gpu.insert_slice_async %54, %arg11, %51, %55 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
      triton_gpu.async_commit_group
      // Same num = 2 wait as ahead of the loop, placed before slot %52 is sliced.
      triton_gpu.async_wait {num = 2 : i32}
      // Views of slot %52; they become %arg12 / %arg13 of the next iteration.
      %58 = triton_gpu.extract_slice %56[%52, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared>
      %59 = triton_gpu.extract_slice %57[%52, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared>
      %60 = arith.addi %arg17, %c1_i32 : i32
      %61 = arith.addi %arg18, %c1_i32 : i32
      // Convert the first-half operands of the next tile now; they are yielded as %arg19 / %arg20.
      %62 = triton_gpu.extract_slice %58[0, 0] [16, 8] [1, 1] : tensor<16x16xf32, #shared> to tensor<16x8xf32, #shared>
      %63 = triton_gpu.convert_layout %62 : (tensor<16x8xf32, #shared>) -> tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>
      %64 = triton_gpu.extract_slice %59[0, 0] [8, 16] [1, 1] : tensor<16x16xf32, #shared> to tensor<8x16xf32, #shared>
      %65 = triton_gpu.convert_layout %64 : (tensor<8x16xf32, #shared>) -> tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>
      scf.yield %46, %47, %48, %56, %57, %58, %59, %53, %54, %49, %60, %61, %63, %65 : tensor<16x16xf32, #blocked1>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<3x16x16xf32, #shared>, tensor<3x16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, i32, i32, i32, tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>, tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>
    }
    // num = 0: presumably waits until no copy group is outstanding -- TODO confirm.
    triton_gpu.async_wait {num = 0 : i32}
    // %39 = %arg2 + row * %arg5 + col: destination pointers for the accumulated tile.
    %34 = tt.splat %arg5 : (i32) -> tensor<16x1xi32, #blocked>
    %35 = arith.muli %1, %34 : tensor<16x1xi32, #blocked>
    %36 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>, #blocked>
    %37 = tt.addptr %36, %35 : tensor<16x1x!tt.ptr<f32>, #blocked>, tensor<16x1xi32, #blocked>
    %38 = tt.broadcast %37 : (tensor<16x1x!tt.ptr<f32>, #blocked>) -> tensor<16x16x!tt.ptr<f32>, #blocked>
    %39 = tt.addptr %38, %9 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
    // The accumulator is in #blocked1; convert it to #blocked so it matches the store pointers.
    %40 = triton_gpu.convert_layout %33#0 : (tensor<16x16xf32, #blocked1>) -> tensor<16x16xf32, #blocked>
    tt.store %39, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<16x16xf32, #blocked>
    tt.return
  }
 }

 LLIR ; ModuleID = 'LLVMDialectModule'
 source_filename = "LLVMDialectModule"

 ; Externally defined, zero-length byte array in address space 3 (the shared address space
 ; on NVPTX). The kernel below addresses it at byte offsets 0, 1024, 3072 and 4096, which
 ; matches two back-to-back staging buffers of 3 x 16 x 16 floats (3072 bytes each).
 ; NOTE(review): the real size is presumably supplied at kernel launch -- confirm.
 @global_smem = external addrspace(3) global [0 x i8]

 define void @matmul_kernel_0d1d2d3d4c5d6c7d8c(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, i32 %5) local_unnamed_addr !dbg !5 {
  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
  %8 = and i32 %7, 31, !dbg !8
  %9 = lshr i32 %7, 5, !dbg !8
  %10 = and i32 %9, 3, !dbg !8
  %11 = lshr i32 %8, 3, !dbg !8
  %12 = shl nuw nsw i32 %10, 2, !dbg !8
  %13 = or i32 %12, %11, !dbg !8
  %14 = shl i32 %7, 1, !dbg !9
  %15 = and i32 %14, 14, !dbg !9
  %16 = mul i32 %13, %3, !dbg !10
  %17 = sext i32 %16 to i64, !dbg !11
  %18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !11
  %19 = zext i32 %15 to i64, !dbg !12
  %20 = getelementptr float, ptr addrspace(1) %18, i64 %19, !dbg !12
  %21 = mul i32 %13, %4, !dbg !13
  %22 = sext i32 %21 to i64, !dbg !14
  %23 = getelementptr float, ptr addrspace(1) %1, i64 %22, !dbg !14
  %24 = getelementptr float, ptr addrspace(1) %23, i64 %19, !dbg !15
  %25 = shl i32 %4, 4, !dbg !16
  %26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %20, i1 true) #2, !dbg !17
  %27 = extractvalue { i32, i32 } %26, 0, !dbg !17
  %28 = extractvalue { i32, i32 } %26, 1, !dbg !17
  %29 = shl nuw nsw i32 %13, 4, !dbg !17
  %30 = or i32 %29, %15, !dbg !17
  %31 = zext i32 %30 to i64, !dbg !17
  %32 = getelementptr float, ptr addrspace(3) @global_smem, i64 %31, !dbg !17
  %33 = getelementptr float, ptr addrspace(3) %32, i64 1, !dbg !17
  store i32 %27, ptr addrspace(3) %32, align 4, !dbg !17
  store i32 %28, ptr addrspace(3) %33, align 4, !dbg !17
  %34 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %24, i1 true) #2, !dbg !18
  %35 = extractvalue { i32, i32 } %34, 0, !dbg !18
  %36 = extractvalue { i32, i32 } %34, 1, !dbg !18
  %37 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 3072), i64 %31, !dbg !18
  %38 = getelementptr float, ptr addrspace(3) %37, i64 1, !dbg !18
  store i32 %35, ptr addrspace(3) %37, align 4, !dbg !18
  store i32 %36, ptr addrspace(3) %38, align 4, !dbg !18
  %39 = getelementptr float, ptr addrspace(1) %20, i64 16, !dbg !19
  %40 = sext i32 %25 to i64, !dbg !20
  %41 = getelementptr float, ptr addrspace(1) %24, i64 %40, !dbg !20
  %42 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %39, i1 true) #2, !dbg !17
  %43 = extractvalue { i32, i32 } %42, 0, !dbg !17
  %44 = extractvalue { i32, i32 } %42, 1, !dbg !17
  tail call void @llvm.nvvm.barrier0(), !dbg !17
  %45 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 1024), i64 %31, !dbg !17
  %46 = getelementptr float, ptr addrspace(3) %45, i64 1, !dbg !17
  store i32 %43, ptr addrspace(3) %45, align 4, !dbg !17
  store i32 %44, ptr addrspace(3) %46, align 4, !dbg !17
  %47 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %41, i1 true) #2, !dbg !18
  %48 = extractvalue { i32, i32 } %47, 0, !dbg !18
  %49 = extractvalue { i32, i32 } %47, 1, !dbg !18
  %50 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 4096), i64 %31, !dbg !18
  %51 = getelementptr float, ptr addrspace(3) %50, i64 1, !dbg !18
  store i32 %48, ptr addrspace(3) %50, align 4, !dbg !18
  store i32 %49, ptr addrspace(3) %51, align 4, !dbg !18
  tail call void @llvm.nvvm.barrier0(), !dbg !17
  %52 = lshr i32 %7, 4, !dbg !17
  %53 = and i32 %52, 7, !dbg !17
  %54 = shl nuw nsw i32 %53, 4, !dbg !17
  %55 = zext i32 %54 to i64, !dbg !17
  %56 = and i32 %7, 15, !dbg !18
  %57 = zext i32 %56 to i64, !dbg !18
  br label %58, !dbg !21

 58:                                               ; preds = %6, %58
  %.pn5864 = phi ptr addrspace(3) [ getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 3072), %6 ], [ %230, %58 ]
  %.pn63 = phi ptr addrspace(3) [ @global_smem, %6 ], [ %224, %58 ]
  %59 = phi i32 [ 1, %6 ], [ %237, %58 ]
  %60 = phi i32 [ 2, %6 ], [ %236, %58 ]
  %61 = phi i32 [ 16, %6 ], [ %208, %58 ]
  %.pn762 = phi ptr addrspace(1) [ %41, %6 ], [ %211, %58 ]
  %.pn361 = phi ptr addrspace(1) [ %39, %6 ], [ %210, %58 ]
  %62 = phi { ptr addrspace(3), i32, i32, i32, i32 } [ { ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 3072), i32 16, i32 1, i32 0, i32 0 }, %6 ], [ %235, %58 ]
  %63 = phi { ptr addrspace(3), i32, i32, i32, i32 } [ { ptr addrspace(3) @global_smem, i32 16, i32 1, i32 0, i32 0 }, %6 ], [ %229, %58 ]
  %64 = phi { float, float } [ zeroinitializer, %6 ], [ %207, %58 ]
  %65 = phi i32 [ 0, %6 ], [ %238, %58 ]
  %.pn55.in = getelementptr float, ptr addrspace(3) %.pn5864, i64 %57, !dbg !18
  %.pn39.in = getelementptr float, ptr addrspace(3) %.pn63, i64 %55, !dbg !17
  %.pn41.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 112, !dbg !18
  %.pn43.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 96, !dbg !18
  %.pn45.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 80, !dbg !18
  %.pn47.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 64, !dbg !18
  %.pn49.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 48, !dbg !18
  %.pn51.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 32, !dbg !18
  %.pn53.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 16, !dbg !18
  %.pn9.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 135, !dbg !17
  %.pn11.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 7, !dbg !17
  %.pn13.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 134, !dbg !17
  %.pn15.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 6, !dbg !17
  %.pn17.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 133, !dbg !17
  %.pn19.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 5, !dbg !17
  %.pn21.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 132, !dbg !17
  %.pn23.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 4, !dbg !17
  %.pn25.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 131, !dbg !17
  %.pn27.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 3, !dbg !17
  %.pn29.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 130, !dbg !17
  %.pn31.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 2, !dbg !17
  %.pn33.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 129, !dbg !17
  %.pn35.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 1, !dbg !17
  %.pn37.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 128, !dbg !17
  %.pn55 = load float, ptr addrspace(3) %.pn55.in, align 4, !dbg !18
  %.pn53 = load float, ptr addrspace(3) %.pn53.in, align 4, !dbg !18
  %.pn51 = load float, ptr addrspace(3) %.pn51.in, align 4, !dbg !18
  %.pn49 = load float, ptr addrspace(3) %.pn49.in, align 4, !dbg !18
  %.pn47 = load float, ptr addrspace(3) %.pn47.in, align 4, !dbg !18
  %.pn45 = load float, ptr addrspace(3) %.pn45.in, align 4, !dbg !18
  %.pn43 = load float, ptr addrspace(3) %.pn43.in, align 4, !dbg !18
  %.pn41 = load float, ptr addrspace(3) %.pn41.in, align 4, !dbg !18
  %.pn39 = load float, ptr addrspace(3) %.pn39.in, align 4, !dbg !17
  %.pn37 = load float, ptr addrspace(3) %.pn37.in, align 4, !dbg !17
  %.pn35 = load float, ptr addrspace(3) %.pn35.in, align 4, !dbg !17
  %.pn33 = load float, ptr addrspace(3) %.pn33.in, align 4, !dbg !17
  %.pn31 = load float, ptr addrspace(3) %.pn31.in, align 4, !dbg !17
  %.pn29 = load float, ptr addrspace(3) %.pn29.in, align 4, !dbg !17
  %.pn27 = load float, ptr addrspace(3) %.pn27.in, align 4, !dbg !17
  %.pn25 = load float, ptr addrspace(3) %.pn25.in, align 4, !dbg !17
  %.pn23 = load float, ptr addrspace(3) %.pn23.in, align 4, !dbg !17
  %.pn21 = load float, ptr addrspace(3) %.pn21.in, align 4, !dbg !17
  %.pn19 = load float, ptr addrspace(3) %.pn19.in, align 4, !dbg !17
  %.pn17 = load float, ptr addrspace(3) %.pn17.in, align 4, !dbg !17
  %.pn15 = load float, ptr addrspace(3) %.pn15.in, align 4, !dbg !17
  %.pn13 = load float, ptr addrspace(3) %.pn13.in, align 4, !dbg !17
  %.pn11 = load float, ptr addrspace(3) %.pn11.in, align 4, !dbg !17
  %.pn9 = load float, ptr addrspace(3) %.pn9.in, align 4, !dbg !17
  %66 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %63, 0, !dbg !17
  %67 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %63, 1, !dbg !17
  %68 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %63, 2, !dbg !17
  %69 = shl i32 %68, 3, !dbg !17
  %70 = sext i32 %69 to i64, !dbg !17
  %71 = getelementptr float, ptr addrspace(3) %66, i64 %70, !dbg !17
  %72 = mul i32 %67, %53, !dbg !17
  %73 = sext i32 %72 to i64, !dbg !17
  %74 = getelementptr float, ptr addrspace(3) %71, i64 %73, !dbg !17
  %75 = load float, ptr addrspace(3) %74, align 4, !dbg !17
  %76 = shl i32 %67, 3, !dbg !17
  %77 = sext i32 %76 to i64, !dbg !17
  %78 = getelementptr float, ptr addrspace(3) %74, i64 %77, !dbg !17
  %79 = load float, ptr addrspace(3) %78, align 4, !dbg !17
  %80 = sext i32 %68 to i64, !dbg !17
  %81 = getelementptr float, ptr addrspace(3) %74, i64 %80, !dbg !17
  %82 = load float, ptr addrspace(3) %81, align 4, !dbg !17
  %83 = add i32 %76, %68, !dbg !17
  %84 = sext i32 %83 to i64, !dbg !17
  %85 = getelementptr float, ptr addrspace(3) %74, i64 %84, !dbg !17
  %86 = load float, ptr addrspace(3) %85, align 4, !dbg !17
  %87 = shl i32 %68, 1, !dbg !17
  %88 = sext i32 %87 to i64, !dbg !17
  %89 = getelementptr float, ptr addrspace(3) %74, i64 %88, !dbg !17
  %90 = load float, ptr addrspace(3) %89, align 4, !dbg !17
  %91 = add i32 %76, %87, !dbg !17
  %92 = sext i32 %91 to i64, !dbg !17
  %93 = getelementptr float, ptr addrspace(3) %74, i64 %92, !dbg !17
  %94 = load float, ptr addrspace(3) %93, align 4, !dbg !17
  %95 = mul i32 %68, 3, !dbg !17
  %96 = sext i32 %95 to i64, !dbg !17
  %97 = getelementptr float, ptr addrspace(3) %74, i64 %96, !dbg !17
  %98 = load float, ptr addrspace(3) %97, align 4, !dbg !17
  %99 = add i32 %76, %95, !dbg !17
  %100 = sext i32 %99 to i64, !dbg !17
  %101 = getelementptr float, ptr addrspace(3) %74, i64 %100, !dbg !17
  %102 = load float, ptr addrspace(3) %101, align 4, !dbg !17
  %103 = shl i32 %68, 2, !dbg !17
  %104 = sext i32 %103 to i64, !dbg !17
  %105 = getelementptr float, ptr addrspace(3) %74, i64 %104, !dbg !17
  %106 = load float, ptr addrspace(3) %105, align 4, !dbg !17
  %107 = add i32 %76, %103, !dbg !17
  %108 = sext i32 %107 to i64, !dbg !17
  %109 = getelementptr float, ptr addrspace(3) %74, i64 %108, !dbg !17
  %110 = load float, ptr addrspace(3) %109, align 4, !dbg !17
  %111 = mul i32 %68, 5, !dbg !17
  %112 = sext i32 %111 to i64, !dbg !17
  %113 = getelementptr float, ptr addrspace(3) %74, i64 %112, !dbg !17
  %114 = load float, ptr addrspace(3) %113, align 4, !dbg !17
  %115 = add i32 %76, %111, !dbg !17
  %116 = sext i32 %115 to i64, !dbg !17
  %117 = getelementptr float, ptr addrspace(3) %74, i64 %116, !dbg !17
  %118 = load float, ptr addrspace(3) %117, align 4, !dbg !17
  %119 = mul i32 %68, 6, !dbg !17
  %120 = sext i32 %119 to i64, !dbg !17
  %121 = getelementptr float, ptr addrspace(3) %74, i64 %120, !dbg !17
  %122 = load float, ptr addrspace(3) %121, align 4, !dbg !17
  %123 = add i32 %76, %119, !dbg !17
  %124 = sext i32 %123 to i64, !dbg !17
  %125 = getelementptr float, ptr addrspace(3) %74, i64 %124, !dbg !17
  %126 = load float, ptr addrspace(3) %125, align 4, !dbg !17
  %127 = mul i32 %68, 7, !dbg !17
  %128 = sext i32 %127 to i64, !dbg !17
  %129 = getelementptr float, ptr addrspace(3) %74, i64 %128, !dbg !17
  %130 = load float, ptr addrspace(3) %129, align 4, !dbg !17
  %131 = add i32 %76, %127, !dbg !17
  %132 = sext i32 %131 to i64, !dbg !17
  %133 = getelementptr float, ptr addrspace(3) %74, i64 %132, !dbg !17
  %134 = load float, ptr addrspace(3) %133, align 4, !dbg !17
  %135 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %62, 0, !dbg !18
  %136 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %62, 1, !dbg !18
  %137 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %62, 2, !dbg !18
  %138 = shl i32 %136, 3, !dbg !18
  %139 = sext i32 %138 to i64, !dbg !18
  %140 = getelementptr float, ptr addrspace(3) %135, i64 %139, !dbg !18
  %141 = mul i32 %137, %56, !dbg !18
  %142 = sext i32 %141 to i64, !dbg !18
  %143 = getelementptr float, ptr addrspace(3) %140, i64 %142, !dbg !18
  %144 = load float, ptr addrspace(3) %143, align 4, !dbg !18
  %145 = sext i32 %136 to i64, !dbg !18
  %146 = getelementptr float, ptr addrspace(3) %143, i64 %145, !dbg !18
  %147 = load float, ptr addrspace(3) %146, align 4, !dbg !18
  %148 = shl i32 %136, 1, !dbg !18
  %149 = sext i32 %148 to i64, !dbg !18
  %150 = getelementptr float, ptr addrspace(3) %143, i64 %149, !dbg !18
  %151 = load float, ptr addrspace(3) %150, align 4, !dbg !18
  %152 = mul i32 %136, 3, !dbg !18
  %153 = sext i32 %152 to i64, !dbg !18
  %154 = getelementptr float, ptr addrspace(3) %143, i64 %153, !dbg !18
  %155 = load float, ptr addrspace(3) %154, align 4, !dbg !18
  %156 = shl i32 %136, 2, !dbg !18
  %157 = sext i32 %156 to i64, !dbg !18
  %158 = getelementptr float, ptr addrspace(3) %143, i64 %157, !dbg !18
  %159 = load float, ptr addrspace(3) %158, align 4, !dbg !18
  %160 = mul i32 %136, 5, !dbg !18
  %161 = sext i32 %160 to i64, !dbg !18
  %162 = getelementptr float, ptr addrspace(3) %143, i64 %161, !dbg !18
  %163 = load float, ptr addrspace(3) %162, align 4, !dbg !18
  %164 = mul i32 %136, 6, !dbg !18
  %165 = sext i32 %164 to i64, !dbg !18
  %166 = getelementptr float, ptr addrspace(3) %143, i64 %165, !dbg !18
  %167 = load float, ptr addrspace(3) %166, align 4, !dbg !18
  %168 = mul i32 %136, 7, !dbg !18
  %169 = sext i32 %168 to i64, !dbg !18
  %170 = getelementptr float, ptr addrspace(3) %143, i64 %169, !dbg !18
  %171 = load float, ptr addrspace(3) %170, align 4, !dbg !18
  %172 = extractvalue { float, float } %64, 0, !dbg !22
  %173 = extractvalue { float, float } %64, 1, !dbg !22
  %174 = tail call float @llvm.fmuladd.f32(float %.pn39, float %.pn55, float %172), !dbg !22
  %175 = tail call float @llvm.fmuladd.f32(float %.pn37, float %.pn55, float %173), !dbg !22
  %176 = tail call float @llvm.fmuladd.f32(float %.pn35, float %.pn53, float %174), !dbg !22
  %177 = tail call float @llvm.fmuladd.f32(float %.pn33, float %.pn53, float %175), !dbg !22
  %178 = tail call float @llvm.fmuladd.f32(float %.pn31, float %.pn51, float %176), !dbg !22
  %179 = tail call float @llvm.fmuladd.f32(float %.pn29, float %.pn51, float %177), !dbg !22
  %180 = tail call float @llvm.fmuladd.f32(float %.pn27, float %.pn49, float %178), !dbg !22
  %181 = tail call float @llvm.fmuladd.f32(float %.pn25, float %.pn49, float %179), !dbg !22
  %182 = tail call float @llvm.fmuladd.f32(float %.pn23, float %.pn47, float %180), !dbg !22
  %183 = tail call float @llvm.fmuladd.f32(float %.pn21, float %.pn47, float %181), !dbg !22
  %184 = tail call float @llvm.fmuladd.f32(float %.pn19, float %.pn45, float %182), !dbg !22
  %185 = tail call float @llvm.fmuladd.f32(float %.pn17, float %.pn45, float %183), !dbg !22
  %186 = tail call float @llvm.fmuladd.f32(float %.pn15, float %.pn43, float %184), !dbg !22
  %187 = tail call float @llvm.fmuladd.f32(float %.pn13, float %.pn43, float %185), !dbg !22
  %188 = tail call float @llvm.fmuladd.f32(float %.pn11, float %.pn41, float %186), !dbg !22
  %189 = tail call float @llvm.fmuladd.f32(float %.pn9, float %.pn41, float %187), !dbg !22
  %190 = tail call float @llvm.fmuladd.f32(float %75, float %144, float %188), !dbg !22
  %191 = tail call float @llvm.fmuladd.f32(float %79, float %144, float %189), !dbg !22
  %192 = tail call float @llvm.fmuladd.f32(float %82, float %147, float %190), !dbg !22
  %193 = tail call float @llvm.fmuladd.f32(float %86, float %147, float %191), !dbg !22
  %194 = tail call float @llvm.fmuladd.f32(float %90, float %151, float %192), !dbg !22
  %195 = tail call float @llvm.fmuladd.f32(float %94, float %151, float %193), !dbg !22
  %196 = tail call float @llvm.fmuladd.f32(float %98, float %155, float %194), !dbg !22
  %197 = tail call float @llvm.fmuladd.f32(float %102, float %155, float %195), !dbg !22
  %198 = tail call float @llvm.fmuladd.f32(float %106, float %159, float %196), !dbg !22
  %199 = tail call float @llvm.fmuladd.f32(float %110, float %159, float %197), !dbg !22
  %200 = tail call float @llvm.fmuladd.f32(float %114, float %163, float %198), !dbg !22
  %201 = tail call float @llvm.fmuladd.f32(float %118, float %163, float %199), !dbg !22
  %202 = tail call float @llvm.fmuladd.f32(float %122, float %167, float %200), !dbg !22
  %203 = tail call float @llvm.fmuladd.f32(float %126, float %167, float %201), !dbg !22
  %204 = tail call float @llvm.fmuladd.f32(float %130, float %171, float %202), !dbg !22
  %205 = tail call float @llvm.fmuladd.f32(float %134, float %171, float %203), !dbg !22
  %206 = insertvalue { float, float } undef, float %204, 0, !dbg !22
  %207 = insertvalue { float, float } %206, float %205, 1, !dbg !22
  %208 = add nuw nsw i32 %61, 16, !dbg !21
  %209 = icmp ult i32 %61, 1008, !dbg !21
  %.urem = urem i32 %60, 3
  %.urem66 = urem i32 %59, 3
  %210 = getelementptr float, ptr addrspace(1) %.pn361, i64 16, !dbg !19
  %211 = getelementptr float, ptr addrspace(1) %.pn762, i64 %40, !dbg !20
  %212 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %210, i1 %209) #2, !dbg !17
  %213 = extractvalue { i32, i32 } %212, 0, !dbg !17
  %214 = extractvalue { i32, i32 } %212, 1, !dbg !17
  tail call void @llvm.nvvm.barrier0(), !dbg !17
  %215 = shl nuw nsw i32 %.urem, 8, !dbg !17
  %216 = zext i32 %215 to i64
  %gep = getelementptr float, ptr addrspace(3) %32, i64 %216, !dbg !17
  %217 = getelementptr float, ptr addrspace(3) %gep, i64 1, !dbg !17
  store i32 %213, ptr addrspace(3) %gep, align 4, !dbg !17
  store i32 %214, ptr addrspace(3) %217, align 4, !dbg !17
  %218 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %211, i1 %209) #2, !dbg !18
  %219 = extractvalue { i32, i32 } %218, 0, !dbg !18
  %220 = extractvalue { i32, i32 } %218, 1, !dbg !18
  %gep60 = getelementptr float, ptr addrspace(3) %37, i64 %216, !dbg !18
  %221 = getelementptr float, ptr addrspace(3) %gep60, i64 1, !dbg !18
  store i32 %219, ptr addrspace(3) %gep60, align 4, !dbg !18
  store i32 %220, ptr addrspace(3) %221, align 4, !dbg !18
  %222 = shl nuw nsw i32 %.urem66, 8, !dbg !17
  %223 = zext i32 %222 to i64
  %224 = getelementptr float, ptr addrspace(3) @global_smem, i64 %223, !dbg !17
  %225 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } undef, ptr addrspace(3) %224, 0, !dbg !17
  %226 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %225, i32 16, 1, !dbg !17
  %227 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %226, i32 1, 2, !dbg !17
  %228 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %227, i32 0, 3, !dbg !17
  %229 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %228, i32 0, 4, !dbg !17
  %230 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 3072), i64 %223, !dbg !18
  %231 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } undef, ptr addrspace(3) %230, 0, !dbg !18
  %232 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %231, i32 16, 1, !dbg !18
  %233 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %232, i32 1, 2, !dbg !18
  %234 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %233, i32 0, 3, !dbg !18
  %235 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %234, i32 0, 4, !dbg !18
  %236 = add nuw nsw i32 %60, 1, !dbg !21
  %237 = add nuw nsw i32 %59, 1, !dbg !21
  tail call void @llvm.nvvm.barrier0(), !dbg !17
  %238 = add nuw nsw i32 %65, 16, !dbg !21
  %239 = icmp ult i32 %65, 1008, !dbg !21
  br i1 %239, label %58, label %240, !dbg !21

 240:                                              ; preds = %58
  %241 = lshr i32 %8, 4, !dbg !23
  %242 = shl nuw nsw i32 %10, 1, !dbg !23
  %243 = or i32 %242, %241, !dbg !23
  %244 = mul i32 %13, %5, !dbg !24
  %245 = sext i32 %244 to i64, !dbg !25
  %246 = getelementptr float, ptr addrspace(1) %2, i64 %245, !dbg !25
  %247 = getelementptr float, ptr addrspace(1) %246, i64 %19, !dbg !26
  tail call void @llvm.nvvm.barrier0(), !dbg !23
  %248 = mul nuw nsw i32 %243, 18, !dbg !23
  %249 = add nuw nsw i32 %248, %56, !dbg !23
  %250 = zext i32 %249 to i64, !dbg !23
  %251 = getelementptr float, ptr addrspace(3) @global_smem, i64 %250, !dbg !23
  %252 = insertelement <1 x float> undef, float %204, i64 0, !dbg !23
  store <1 x float> %252, ptr addrspace(3) %251, align 4, !dbg !23
  %253 = add nuw nsw i32 %249, 144, !dbg !23
  %254 = zext i32 %253 to i64, !dbg !23
  %255 = getelementptr float, ptr addrspace(3) @global_smem, i64 %254, !dbg !23
  %256 = insertelement <1 x float> undef, float %205, i64 0, !dbg !23
  store <1 x float> %256, ptr addrspace(3) %255, align 4, !dbg !23
  tail call void @llvm.nvvm.barrier0(), !dbg !23
  %257 = mul nuw nsw i32 %13, 18, !dbg !23
  %258 = add nuw nsw i32 %257, %15, !dbg !23
  %259 = zext i32 %258 to i64, !dbg !23
  %260 = getelementptr float, ptr addrspace(3) @global_smem, i64 %259, !dbg !23
  %.extract = load i32, ptr addrspace(3) %260, align 8, !dbg !23
  %261 = getelementptr inbounds <2 x i32>, ptr addrspace(3) %260, i64 0, i64 1, !dbg !23
  %.extract57 = load i32, ptr addrspace(3) %261, align 4, !dbg !23
  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %.extract, i32 %.extract57, ptr addrspace(1) %247, i1 true) #2, !dbg !23
  ret void, !dbg !27
 }

 ; External declarations (no bodies) of the LLVM/NVVM intrinsics declared for
 ; this module. The trailing `#N` on each line selects an attribute group.

 ; Returns an i32 and takes no arguments. Judging by its name it reads the PTX
 ; special register tid.x (thread index along x) -- confirm in the NVPTX docs.
 ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
 declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

 ; Barrier intrinsic. The kernel calls it before and after its stores to
 ; addrspace(3) (@global_smem); see the NVPTX usage guide for exact semantics.
 ; Function Attrs: convergent nocallback nounwind
 declare void @llvm.nvvm.barrier0() #1

 ; Three-operand float multiply-add. The kernel chains calls to it, feeding each
 ; result in as the third (addend) operand of a later call.
 ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
 declare float @llvm.fmuladd.f32(float, float, float) #0

 ; Attribute groups referenced as `#N` elsewhere in this module.
 ; #0: used by the @llvm.nvvm.read.ptx.sreg.tid.x and @llvm.fmuladd.f32
 ;     declarations (memory(none), speculatable, willreturn, ...).
 ; #1: used by the @llvm.nvvm.barrier0 declaration; note `convergent`.
 ; #2: used at the inline-asm call sites (the predicated ld.global.v2.b32 and
 ;     st.global.v2.b32 wrappers).
 attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 attributes #1 = { convergent nocallback nounwind }
 attributes #2 = { nounwind }

 ; Named metadata: module flags, the debug-info compile unit list, and the
 ; NVVM annotations for the kernel function.
 ; NOTE(review): !nvvm.annotations lists !3 and !4 twice each; this looks
 ; redundant -- presumably harmless, confirm with the producer.
 !llvm.module.flags = !{!0}
 !llvm.dbg.cu = !{!1}
 !nvvm.annotations = !{!3, !4, !4, !3}

 ; !0: module flag "Debug Info Version" = 3.
 ; !1: compile unit with producer "triton"; its file is !2.
 ; !2: the source file that every debug location below refers to (via !5).
 ; !3: tags @matmul_kernel_0d1d2d3d4c5d6c7d8c with "kernel" = 1.
 ; !4: tags the same function with "maxntidx" = 128 (presumably the maximum
 ;     thread count along x -- confirm against the NVVM IR specification).
 ; !5: debug subprogram for the kernel, declared at line 171 of !2.
 ; !6/!7: subroutine type with an empty type list (no parameter type info).
 !0 = !{i32 2, !"Debug Info Version", i32 3}
 !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
 !2 = !DIFile(filename: "run_triton_matmul.py", directory: "/home/hzhuang/code")
 !3 = !{ptr @matmul_kernel_0d1d2d3d4c5d6c7d8c, !"kernel", i32 1}
 !4 = !{ptr @matmul_kernel_0d1d2d3d4c5d6c7d8c, !"maxntidx", i32 128}
 !5 = distinct !DISubprogram(name: "matmul_kernel_0d1d2d3d4c5d6c7d8c", linkageName: "matmul_kernel_0d1d2d3d4c5d6c7d8c", scope: !2, file: !2, line: 171, type: !6, scopeLine: 171, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
 !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
 !7 = !{}
 ; !8-!27: source locations (lines 183-194), all scoped to subprogram !5.
 ; In the kernel body, !17/!18 (lines 187/188) are attached to the predicated
 ; global loads and the addrspace(3) stores, !21 (line 186) to the loop
 ; counter, compare and branch, !22 (line 189, no column recorded) to the
 ; fmuladd chain, and !23-!27 (lines 193-194) to the final store and return.
 !8 = !DILocation(line: 183, column: 28, scope: !5)
 !9 = !DILocation(line: 183, column: 58, scope: !5)
 !10 = !DILocation(line: 183, column: 39, scope: !5)
 !11 = !DILocation(line: 183, column: 21, scope: !5)
 !12 = !DILocation(line: 183, column: 51, scope: !5)
 !13 = !DILocation(line: 184, column: 39, scope: !5)
 !14 = !DILocation(line: 184, column: 21, scope: !5)
 !15 = !DILocation(line: 184, column: 51, scope: !5)
 !16 = !DILocation(line: 191, column: 33, scope: !5)
 !17 = !DILocation(line: 187, column: 20, scope: !5)
 !18 = !DILocation(line: 188, column: 20, scope: !5)
 !19 = !DILocation(line: 190, column: 18, scope: !5)
 !20 = !DILocation(line: 191, column: 18, scope: !5)
 !21 = !DILocation(line: 186, column: 25, scope: !5)
 !22 = !DILocation(line: 189, scope: !5)
 !23 = !DILocation(line: 194, column: 21, scope: !5)
 !24 = !DILocation(line: 193, column: 39, scope: !5)
 !25 = !DILocation(line: 193, column: 21, scope: !5)
 !26 = !DILocation(line: 193, column: 51, scope: !5)
 !27 = !DILocation(line: 194, column: 4, scope: !5)

 ['__class__', '__class_getitem__', '__contains__', '__copy__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__missing__', '__ne__', '__new__', '__or__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'clear', 'copy', 'default_factory', 'fromkeys', 'get', 'items', 'keys', 'pop', 'popitem', 'setdefault', 'update', 'values']