Created
November 22, 2023 07:54
-
-
Save zhuangh/b1010323573bd55dc3a5a6710027f156 to your computer and use it in GitHub Desktop.
matmul_gtx1060.ir
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Triton IR (label "IR") for kernel @matmul_kernel_0d1d2d3d4c5d6c7d8c.
// Arguments: three f32 pointers (%arg0, %arg1, %arg2) and three i32 values
// (%arg3, %arg4, %arg5); every argument carries tt.divisibility = 16.
// The function accumulates one 16x16 f32 tile with tt.dot inside a loop that
// runs from 0 to 1024 in steps of 16, then stores that tile through pointers
// derived from %arg2. All loads and the store are issued without a mask.
// No program-id op appears: every address is derived only from the arguments
// and the 0..15 index range.
// NOTE(review): presumably %arg0/%arg1 are the two matmul operands, %arg2 the
// output and %arg3..%arg5 their row strides — confirm against the Python kernel.
IR module { | |
tt.func public @matmul_kernel_0d1d2d3d4c5d6c7d8c(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} { | |
// Constants: loop step (16), loop upper bound (1024), loop start (0),
// %cst = per-iteration increment of the %arg0 pointer tile (16 elements),
// %cst_0 = zero-filled initial accumulator.
%c16_i32 = arith.constant 16 : i32 | |
%c1024_i32 = arith.constant 1024 : i32 | |
%c0_i32 = arith.constant 0 : i32 | |
%cst = arith.constant dense<16> : tensor<16x16xi32> | |
%cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32> | |
// %0 is the index vector [0, 16); %1 is its 16x1 view (row index i).
%0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> | |
%1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32> | |
// %9: 16x16 pointer tile into %arg0, element (i, j) = %arg0 + i * %arg3 + j.
%2 = tt.splat %arg3 : (i32) -> tensor<16x1xi32> | |
%3 = arith.muli %1, %2 : tensor<16x1xi32> | |
%4 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>> | |
%5 = tt.addptr %4, %3 : tensor<16x1x!tt.ptr<f32>>, tensor<16x1xi32> | |
// %6 is the 1x16 view of %0 (column index j); %8 is its 16x16 broadcast and is
// reused for the %arg1 and %arg2 pointer tiles below.
%6 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<16xi32>) -> tensor<1x16xi32> | |
%7 = tt.broadcast %5 : (tensor<16x1x!tt.ptr<f32>>) -> tensor<16x16x!tt.ptr<f32>> | |
%8 = tt.broadcast %6 : (tensor<1x16xi32>) -> tensor<16x16xi32> | |
%9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32> | |
// %15: 16x16 pointer tile into %arg1, element (i, j) = %arg1 + i * %arg4 + j.
%10 = tt.splat %arg4 : (i32) -> tensor<16x1xi32> | |
%11 = arith.muli %1, %10 : tensor<16x1xi32> | |
%12 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>> | |
%13 = tt.addptr %12, %11 : tensor<16x1x!tt.ptr<f32>>, tensor<16x1xi32> | |
%14 = tt.broadcast %13 : (tensor<16x1x!tt.ptr<f32>>) -> tensor<16x16x!tt.ptr<f32>> | |
%15 = tt.addptr %14, %8 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32> | |
// %17: per-iteration increment of the %arg1 pointer tile, 16 * %arg4 elements.
%16 = arith.muli %arg4, %c16_i32 : i32 | |
%17 = tt.splat %16 : (i32) -> tensor<16x16xi32> | |
// Main loop. Loop-carried values: %arg7 = accumulator, %arg8 = pointer tile
// into %arg0, %arg9 = pointer tile into %arg1.
%18:3 = scf.for %arg6 = %c0_i32 to %c1024_i32 step %c16_i32 iter_args(%arg7 = %cst_0, %arg8 = %9, %arg9 = %15) -> (tensor<16x16xf32>, tensor<16x16x!tt.ptr<f32>>, tensor<16x16x!tt.ptr<f32>>) : i32 { | |
// Load both 16x16 operand tiles (no mask operand).
%25 = tt.load %arg8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16xf32> | |
%26 = tt.load %arg9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16xf32> | |
// Multiply the tiles with %arg7 as the accumulator operand; allowTF32 is set.
%27 = tt.dot %25, %26, %arg7 {allowTF32 = true} : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32> | |
// Advance the %arg0 tile by 16 elements and the %arg1 tile by 16 * %arg4 elements.
%28 = tt.addptr %arg8, %cst : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32> | |
%29 = tt.addptr %arg9, %17 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32> | |
scf.yield %27, %28, %29 : tensor<16x16xf32>, tensor<16x16x!tt.ptr<f32>>, tensor<16x16x!tt.ptr<f32>> | |
} | |
// %24: 16x16 pointer tile into %arg2, element (i, j) = %arg2 + i * %arg5 + j.
%19 = tt.splat %arg5 : (i32) -> tensor<16x1xi32> | |
%20 = arith.muli %1, %19 : tensor<16x1xi32> | |
%21 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>> | |
%22 = tt.addptr %21, %20 : tensor<16x1x!tt.ptr<f32>>, tensor<16x1xi32> | |
%23 = tt.broadcast %22 : (tensor<16x1x!tt.ptr<f32>>) -> tensor<16x16x!tt.ptr<f32>> | |
%24 = tt.addptr %23, %8 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32> | |
// Store the final accumulator (%18#0) through the %arg2 pointer tile (no mask operand).
tt.store %24, %18#0 {cache = 1 : i32, evict = 1 : i32} : tensor<16x16xf32> | |
tt.return | |
} | |
} | |
// TritonGPU IR (label "TTGIR") for kernel @matmul_kernel_0d1d2d3d4c5d6c7d8c,
// i.e. the kernel with an explicit layout attached to every tensor.
// Layout aliases used below:
//   #blocked  - index tensors, pointer tiles, copy masks and the stored result
//   #blocked1 - accumulator; also the parent of both dot-operand layouts
//   #shared   - the two 3x16x16 staging buffers and the slices taken from them
// The module is annotated with 4 warps and 32 threads per warp.
// Structure: two slices of each operand are copied into buffer slots 0 and 1
// before the loop; each loop iteration issues the copy for a later slice into
// slot (%arg17 rem 3) while computing on data read from an earlier slot, so the
// loop appears to be software-pipelined over a 3-slot buffer.
TTGIR #blocked = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> | |
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}> | |
#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}> | |
module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { | |
tt.func public @matmul_kernel_0d1d2d3d4c5d6c7d8c(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} { | |
// Constants. %cst = all-true mask for the pre-loop copies; %cst_0 = zero
// accumulator; %cst_1 = 16-element pointer increment; %c3_i32 = modulus applied
// to the slot counters (the staging buffers have 3 slots); %c1_i32 / %c2_i32 are
// slot indices and initial counter values.
%c1_i32 = arith.constant 1 : i32 | |
%c2_i32 = arith.constant 2 : i32 | |
%cst = arith.constant dense<true> : tensor<16x16xi1, #blocked> | |
%c16_i32 = arith.constant 16 : i32 | |
%c3_i32 = arith.constant 3 : i32 | |
%cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #blocked1> | |
%cst_1 = arith.constant dense<16> : tensor<16x16xi32, #blocked> | |
%c1024_i32 = arith.constant 1024 : i32 | |
%c0_i32 = arith.constant 0 : i32 | |
// %0 / %1: row index [0, 16) in the dim-1 slice of #blocked, expanded to 16x1.
%0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> | |
%1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked> | |
// %10: 16x16 pointer tile into %arg0, element (i, j) = %arg0 + i * %arg3 + j.
%2 = tt.splat %arg3 : (i32) -> tensor<16x1xi32, #blocked> | |
%3 = arith.muli %1, %2 : tensor<16x1xi32, #blocked> | |
%4 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>, #blocked> | |
%5 = tt.addptr %4, %3 : tensor<16x1x!tt.ptr<f32>, #blocked>, tensor<16x1xi32, #blocked> | |
// %6 / %7: column index [0, 16) in the dim-0 slice of #blocked, expanded to 1x16;
// %9 is its 16x16 broadcast, reused for the %arg1 and %arg2 pointer tiles.
%6 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> | |
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x16xi32, #blocked> | |
%8 = tt.broadcast %5 : (tensor<16x1x!tt.ptr<f32>, #blocked>) -> tensor<16x16x!tt.ptr<f32>, #blocked> | |
%9 = tt.broadcast %7 : (tensor<1x16xi32, #blocked>) -> tensor<16x16xi32, #blocked> | |
%10 = tt.addptr %8, %9 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked> | |
// %16: 16x16 pointer tile into %arg1, element (i, j) = %arg1 + i * %arg4 + j.
%11 = tt.splat %arg4 : (i32) -> tensor<16x1xi32, #blocked> | |
%12 = arith.muli %1, %11 : tensor<16x1xi32, #blocked> | |
%13 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>, #blocked> | |
%14 = tt.addptr %13, %12 : tensor<16x1x!tt.ptr<f32>, #blocked>, tensor<16x1xi32, #blocked> | |
%15 = tt.broadcast %14 : (tensor<16x1x!tt.ptr<f32>, #blocked>) -> tensor<16x16x!tt.ptr<f32>, #blocked> | |
%16 = tt.addptr %15, %9 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked> | |
// %18: per-iteration increment of the %arg1 pointer tile, 16 * %arg4 elements.
%17 = arith.muli %arg4, %c16_i32 : i32 | |
%18 = tt.splat %17 : (i32) -> tensor<16x16xi32, #blocked> | |
// Pre-loop copies, slot 0: allocate one 3x16x16 buffer per operand and insert the
// first tile of each (%10, %16) at index 0 under the all-true mask.
%19 = triton_gpu.alloc_tensor : tensor<3x16x16xf32, #shared> | |
%20 = triton_gpu.insert_slice_async %10, %19, %c0_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared> | |
triton_gpu.async_commit_group | |
%21 = triton_gpu.alloc_tensor : tensor<3x16x16xf32, #shared> | |
%22 = triton_gpu.insert_slice_async %16, %21, %c0_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared> | |
triton_gpu.async_commit_group | |
// Pre-loop copies, slot 1: advance both pointer tiles by one step and insert the
// second tile of each at index 1.
%23 = tt.addptr %10, %cst_1 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked> | |
%24 = tt.addptr %16, %18 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked> | |
%25 = triton_gpu.insert_slice_async %23, %20, %c1_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared> | |
triton_gpu.async_commit_group | |
%26 = triton_gpu.insert_slice_async %24, %22, %c1_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared> | |
triton_gpu.async_commit_group | |
// NOTE(review): presumably num = 2 means "wait until at most two commit groups
// are still pending", which would make the slot-0 copies complete here — confirm
// against the async_wait op definition.
triton_gpu.async_wait {num = 2 : i32} | |
// Take slot 0 of each buffer as the current 16x16 tiles.
%27 = triton_gpu.extract_slice %25[0, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared> | |
%28 = triton_gpu.extract_slice %26[0, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared> | |
// Convert the first half of each current tile to dot-operand layout ahead of the
// loop: columns 0..7 of the first operand (%30), rows 0..7 of the second (%32).
%29 = triton_gpu.extract_slice %27[0, 0] [16, 8] [1, 1] : tensor<16x16xf32, #shared> to tensor<16x8xf32, #shared> | |
%30 = triton_gpu.convert_layout %29 : (tensor<16x8xf32, #shared>) -> tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>> | |
%31 = triton_gpu.extract_slice %28[0, 0] [8, 16] [1, 1] : tensor<16x16xf32, #shared> to tensor<8x16xf32, #shared> | |
%32 = triton_gpu.convert_layout %31 : (tensor<8x16xf32, #shared>) -> tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>> | |
// Main loop, 0 to 1024 in steps of 16. Loop-carried values:
//   %arg7          accumulator (#blocked1)
//   %arg8, %arg9   pointer tiles advanced each iteration; only their advanced
//                  values (%47, %48) are yielded, nothing is loaded through them
//   %arg10, %arg11 the two 3-slot buffers
//   %arg12, %arg13 current 16x16 tiles taken from those buffers
//   %arg14, %arg15 pointer tiles of the most recently issued copies
//   %arg16         i32 offset of the most recently issued copy (starts at 16)
//   %arg17         counter selecting the slot to write (starts at 2)
//   %arg18         counter selecting the slot to read next (starts at 1)
//   %arg19, %arg20 first-half dot operands converted in the previous iteration
%33:14 = scf.for %arg6 = %c0_i32 to %c1024_i32 step %c16_i32 iter_args(%arg7 = %cst_0, %arg8 = %10, %arg9 = %16, %arg10 = %25, %arg11 = %26, %arg12 = %27, %arg13 = %28, %arg14 = %23, %arg15 = %24, %arg16 = %c16_i32, %arg17 = %c2_i32, %arg18 = %c1_i32, %arg19 = %30, %arg20 = %32) -> (tensor<16x16xf32, #blocked1>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<3x16x16xf32, #shared>, tensor<3x16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, i32, i32, i32, tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>, tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>) : i32 { | |
// Convert the second half of the current tiles: columns 8..15 of the first
// operand (%42), rows 8..15 of the second (%44).
%41 = triton_gpu.extract_slice %arg12[0, 8] [16, 8] [1, 1] : tensor<16x16xf32, #shared> to tensor<16x8xf32, #shared> | |
%42 = triton_gpu.convert_layout %41 : (tensor<16x8xf32, #shared>) -> tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>> | |
%43 = triton_gpu.extract_slice %arg13[8, 0] [8, 16] [1, 1] : tensor<16x16xf32, #shared> to tensor<8x16xf32, #shared> | |
%44 = triton_gpu.convert_layout %43 : (tensor<8x16xf32, #shared>) -> tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>> | |
// Two chained 16x8 * 8x16 dots: first halves (carried in) feed %45, which is the
// accumulator operand of the second-half dot %46.
%45 = tt.dot %arg19, %arg20, %arg7 {allowTF32 = true} : tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>> * tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>> -> tensor<16x16xf32, #blocked1> | |
%46 = tt.dot %42, %44, %45 {allowTF32 = true} : tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>> * tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>> -> tensor<16x16xf32, #blocked1> | |
%47 = tt.addptr %arg8, %cst_1 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked> | |
%48 = tt.addptr %arg9, %18 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked> | |
// %49 is the offset of the copy issued in this iteration; %50 is false once that
// offset reaches 1024 and is splatted into the copy mask %55 below.
%49 = arith.addi %arg16, %c16_i32 : i32 | |
%50 = arith.cmpi slt, %49, %c1024_i32 : i32 | |
// Slot indices: %51 = slot written now, %52 = slot read for the next iteration.
%51 = arith.remsi %arg17, %c3_i32 : i32 | |
%52 = arith.remsi %arg18, %c3_i32 : i32 | |
// Advance the copy pointers by one step and issue the masked copies into slot %51.
%53 = tt.addptr %arg14, %cst_1 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked> | |
%54 = tt.addptr %arg15, %18 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked> | |
%55 = tt.splat %50 : (i1) -> tensor<16x16xi1, #blocked> | |
%56 = triton_gpu.insert_slice_async %53, %arg10, %51, %55 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared> | |
triton_gpu.async_commit_group | |
%57 = triton_gpu.insert_slice_async %54, %arg11, %51, %55 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared> | |
triton_gpu.async_commit_group | |
// NOTE(review): same num = 2 wait as before the loop; presumably leaves only the
// two groups just committed pending — confirm.
triton_gpu.async_wait {num = 2 : i32} | |
// Tiles for the next iteration come from slot %52 of each buffer.
%58 = triton_gpu.extract_slice %56[%52, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared> | |
%59 = triton_gpu.extract_slice %57[%52, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared> | |
// Bump both slot counters by one.
%60 = arith.addi %arg17, %c1_i32 : i32 | |
%61 = arith.addi %arg18, %c1_i32 : i32 | |
// Convert the first half of the next tiles now; they are yielded as %arg19/%arg20.
%62 = triton_gpu.extract_slice %58[0, 0] [16, 8] [1, 1] : tensor<16x16xf32, #shared> to tensor<16x8xf32, #shared> | |
%63 = triton_gpu.convert_layout %62 : (tensor<16x8xf32, #shared>) -> tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>> | |
%64 = triton_gpu.extract_slice %59[0, 0] [8, 16] [1, 1] : tensor<16x16xf32, #shared> to tensor<8x16xf32, #shared> | |
%65 = triton_gpu.convert_layout %64 : (tensor<8x16xf32, #shared>) -> tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>> | |
scf.yield %46, %47, %48, %56, %57, %58, %59, %53, %54, %49, %60, %61, %63, %65 : tensor<16x16xf32, #blocked1>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<3x16x16xf32, #shared>, tensor<3x16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, i32, i32, i32, tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>, tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>> | |
} | |
// NOTE(review): num = 0 presumably waits for every pending copy group before the
// epilogue — confirm.
triton_gpu.async_wait {num = 0 : i32} | |
// %39: 16x16 pointer tile into %arg2, element (i, j) = %arg2 + i * %arg5 + j.
%34 = tt.splat %arg5 : (i32) -> tensor<16x1xi32, #blocked> | |
%35 = arith.muli %1, %34 : tensor<16x1xi32, #blocked> | |
%36 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>, #blocked> | |
%37 = tt.addptr %36, %35 : tensor<16x1x!tt.ptr<f32>, #blocked>, tensor<16x1xi32, #blocked> | |
%38 = tt.broadcast %37 : (tensor<16x1x!tt.ptr<f32>, #blocked>) -> tensor<16x16x!tt.ptr<f32>, #blocked> | |
%39 = tt.addptr %38, %9 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked> | |
// Convert the accumulator from #blocked1 to #blocked so it matches the layout of
// the pointer tile, then store it (no mask operand).
%40 = triton_gpu.convert_layout %33#0 : (tensor<16x16xf32, #blocked1>) -> tensor<16x16xf32, #blocked> | |
tt.store %39, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<16x16xf32, #blocked> | |
tt.return | |
} | |
} | |
; LLVM IR dump (label "LLIR") of the kernel; module header and globals.
LLIR ; ModuleID = 'LLVMDialectModule' | |
source_filename = "LLVMDialectModule" | |
; Zero-length external byte array in address space 3. The kernel below addresses
; it at byte offsets 0, 1024, 3072 and 4096.
; NOTE(review): address space 3 is presumably NVPTX shared memory (the kernel
; calls llvm.nvvm.* intrinsics) — confirm against the module's target triple.
@global_smem = external addrspace(3) global [0 x i8] | |
define void @matmul_kernel_0d1d2d3d4c5d6c7d8c(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, i32 %5) local_unnamed_addr !dbg !5 { | |
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 | |
%8 = and i32 %7, 31, !dbg !8 | |
%9 = lshr i32 %7, 5, !dbg !8 | |
%10 = and i32 %9, 3, !dbg !8 | |
%11 = lshr i32 %8, 3, !dbg !8 | |
%12 = shl nuw nsw i32 %10, 2, !dbg !8 | |
%13 = or i32 %12, %11, !dbg !8 | |
%14 = shl i32 %7, 1, !dbg !9 | |
%15 = and i32 %14, 14, !dbg !9 | |
%16 = mul i32 %13, %3, !dbg !10 | |
%17 = sext i32 %16 to i64, !dbg !11 | |
%18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !11 | |
%19 = zext i32 %15 to i64, !dbg !12 | |
%20 = getelementptr float, ptr addrspace(1) %18, i64 %19, !dbg !12 | |
%21 = mul i32 %13, %4, !dbg !13 | |
%22 = sext i32 %21 to i64, !dbg !14 | |
%23 = getelementptr float, ptr addrspace(1) %1, i64 %22, !dbg !14 | |
%24 = getelementptr float, ptr addrspace(1) %23, i64 %19, !dbg !15 | |
%25 = shl i32 %4, 4, !dbg !16 | |
%26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %20, i1 true) #2, !dbg !17 | |
%27 = extractvalue { i32, i32 } %26, 0, !dbg !17 | |
%28 = extractvalue { i32, i32 } %26, 1, !dbg !17 | |
%29 = shl nuw nsw i32 %13, 4, !dbg !17 | |
%30 = or i32 %29, %15, !dbg !17 | |
%31 = zext i32 %30 to i64, !dbg !17 | |
%32 = getelementptr float, ptr addrspace(3) @global_smem, i64 %31, !dbg !17 | |
%33 = getelementptr float, ptr addrspace(3) %32, i64 1, !dbg !17 | |
store i32 %27, ptr addrspace(3) %32, align 4, !dbg !17 | |
store i32 %28, ptr addrspace(3) %33, align 4, !dbg !17 | |
%34 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %24, i1 true) #2, !dbg !18 | |
%35 = extractvalue { i32, i32 } %34, 0, !dbg !18 | |
%36 = extractvalue { i32, i32 } %34, 1, !dbg !18 | |
%37 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 3072), i64 %31, !dbg !18 | |
%38 = getelementptr float, ptr addrspace(3) %37, i64 1, !dbg !18 | |
store i32 %35, ptr addrspace(3) %37, align 4, !dbg !18 | |
store i32 %36, ptr addrspace(3) %38, align 4, !dbg !18 | |
%39 = getelementptr float, ptr addrspace(1) %20, i64 16, !dbg !19 | |
%40 = sext i32 %25 to i64, !dbg !20 | |
%41 = getelementptr float, ptr addrspace(1) %24, i64 %40, !dbg !20 | |
%42 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %39, i1 true) #2, !dbg !17 | |
%43 = extractvalue { i32, i32 } %42, 0, !dbg !17 | |
%44 = extractvalue { i32, i32 } %42, 1, !dbg !17 | |
tail call void @llvm.nvvm.barrier0(), !dbg !17 | |
%45 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 1024), i64 %31, !dbg !17 | |
%46 = getelementptr float, ptr addrspace(3) %45, i64 1, !dbg !17 | |
store i32 %43, ptr addrspace(3) %45, align 4, !dbg !17 | |
store i32 %44, ptr addrspace(3) %46, align 4, !dbg !17 | |
%47 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %41, i1 true) #2, !dbg !18 | |
%48 = extractvalue { i32, i32 } %47, 0, !dbg !18 | |
%49 = extractvalue { i32, i32 } %47, 1, !dbg !18 | |
%50 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 4096), i64 %31, !dbg !18 | |
%51 = getelementptr float, ptr addrspace(3) %50, i64 1, !dbg !18 | |
store i32 %48, ptr addrspace(3) %50, align 4, !dbg !18 | |
store i32 %49, ptr addrspace(3) %51, align 4, !dbg !18 | |
tail call void @llvm.nvvm.barrier0(), !dbg !17 | |
%52 = lshr i32 %7, 4, !dbg !17 | |
%53 = and i32 %52, 7, !dbg !17 | |
%54 = shl nuw nsw i32 %53, 4, !dbg !17 | |
%55 = zext i32 %54 to i64, !dbg !17 | |
%56 = and i32 %7, 15, !dbg !18 | |
%57 = zext i32 %56 to i64, !dbg !18 | |
br label %58, !dbg !21 | |
58: ; preds = %6, %58 | |
%.pn5864 = phi ptr addrspace(3) [ getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 3072), %6 ], [ %230, %58 ] | |
%.pn63 = phi ptr addrspace(3) [ @global_smem, %6 ], [ %224, %58 ] | |
%59 = phi i32 [ 1, %6 ], [ %237, %58 ] | |
%60 = phi i32 [ 2, %6 ], [ %236, %58 ] | |
%61 = phi i32 [ 16, %6 ], [ %208, %58 ] | |
%.pn762 = phi ptr addrspace(1) [ %41, %6 ], [ %211, %58 ] | |
%.pn361 = phi ptr addrspace(1) [ %39, %6 ], [ %210, %58 ] | |
%62 = phi { ptr addrspace(3), i32, i32, i32, i32 } [ { ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 3072), i32 16, i32 1, i32 0, i32 0 }, %6 ], [ %235, %58 ] | |
%63 = phi { ptr addrspace(3), i32, i32, i32, i32 } [ { ptr addrspace(3) @global_smem, i32 16, i32 1, i32 0, i32 0 }, %6 ], [ %229, %58 ] | |
%64 = phi { float, float } [ zeroinitializer, %6 ], [ %207, %58 ] | |
%65 = phi i32 [ 0, %6 ], [ %238, %58 ] | |
%.pn55.in = getelementptr float, ptr addrspace(3) %.pn5864, i64 %57, !dbg !18 | |
%.pn39.in = getelementptr float, ptr addrspace(3) %.pn63, i64 %55, !dbg !17 | |
%.pn41.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 112, !dbg !18 | |
%.pn43.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 96, !dbg !18 | |
%.pn45.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 80, !dbg !18 | |
%.pn47.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 64, !dbg !18 | |
%.pn49.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 48, !dbg !18 | |
%.pn51.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 32, !dbg !18 | |
%.pn53.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 16, !dbg !18 | |
%.pn9.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 135, !dbg !17 | |
%.pn11.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 7, !dbg !17 | |
%.pn13.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 134, !dbg !17 | |
%.pn15.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 6, !dbg !17 | |
%.pn17.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 133, !dbg !17 | |
%.pn19.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 5, !dbg !17 | |
%.pn21.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 132, !dbg !17 | |
%.pn23.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 4, !dbg !17 | |
%.pn25.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 131, !dbg !17 | |
%.pn27.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 3, !dbg !17 | |
%.pn29.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 130, !dbg !17 | |
%.pn31.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 2, !dbg !17 | |
%.pn33.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 129, !dbg !17 | |
%.pn35.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 1, !dbg !17 | |
%.pn37.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 128, !dbg !17 | |
%.pn55 = load float, ptr addrspace(3) %.pn55.in, align 4, !dbg !18 | |
%.pn53 = load float, ptr addrspace(3) %.pn53.in, align 4, !dbg !18 | |
%.pn51 = load float, ptr addrspace(3) %.pn51.in, align 4, !dbg !18 | |
%.pn49 = load float, ptr addrspace(3) %.pn49.in, align 4, !dbg !18 | |
%.pn47 = load float, ptr addrspace(3) %.pn47.in, align 4, !dbg !18 | |
%.pn45 = load float, ptr addrspace(3) %.pn45.in, align 4, !dbg !18 | |
%.pn43 = load float, ptr addrspace(3) %.pn43.in, align 4, !dbg !18 | |
%.pn41 = load float, ptr addrspace(3) %.pn41.in, align 4, !dbg !18 | |
%.pn39 = load float, ptr addrspace(3) %.pn39.in, align 4, !dbg !17 | |
%.pn37 = load float, ptr addrspace(3) %.pn37.in, align 4, !dbg !17 | |
%.pn35 = load float, ptr addrspace(3) %.pn35.in, align 4, !dbg !17 | |
%.pn33 = load float, ptr addrspace(3) %.pn33.in, align 4, !dbg !17 | |
%.pn31 = load float, ptr addrspace(3) %.pn31.in, align 4, !dbg !17 | |
%.pn29 = load float, ptr addrspace(3) %.pn29.in, align 4, !dbg !17 | |
%.pn27 = load float, ptr addrspace(3) %.pn27.in, align 4, !dbg !17 | |
%.pn25 = load float, ptr addrspace(3) %.pn25.in, align 4, !dbg !17 | |
%.pn23 = load float, ptr addrspace(3) %.pn23.in, align 4, !dbg !17 | |
%.pn21 = load float, ptr addrspace(3) %.pn21.in, align 4, !dbg !17 | |
%.pn19 = load float, ptr addrspace(3) %.pn19.in, align 4, !dbg !17 | |
%.pn17 = load float, ptr addrspace(3) %.pn17.in, align 4, !dbg !17 | |
%.pn15 = load float, ptr addrspace(3) %.pn15.in, align 4, !dbg !17 | |
%.pn13 = load float, ptr addrspace(3) %.pn13.in, align 4, !dbg !17 | |
%.pn11 = load float, ptr addrspace(3) %.pn11.in, align 4, !dbg !17 | |
%.pn9 = load float, ptr addrspace(3) %.pn9.in, align 4, !dbg !17 | |
%66 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %63, 0, !dbg !17 | |
%67 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %63, 1, !dbg !17 | |
%68 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %63, 2, !dbg !17 | |
%69 = shl i32 %68, 3, !dbg !17 | |
%70 = sext i32 %69 to i64, !dbg !17 | |
%71 = getelementptr float, ptr addrspace(3) %66, i64 %70, !dbg !17 | |
%72 = mul i32 %67, %53, !dbg !17 | |
%73 = sext i32 %72 to i64, !dbg !17 | |
%74 = getelementptr float, ptr addrspace(3) %71, i64 %73, !dbg !17 | |
%75 = load float, ptr addrspace(3) %74, align 4, !dbg !17 | |
%76 = shl i32 %67, 3, !dbg !17 | |
%77 = sext i32 %76 to i64, !dbg !17 | |
%78 = getelementptr float, ptr addrspace(3) %74, i64 %77, !dbg !17 | |
%79 = load float, ptr addrspace(3) %78, align 4, !dbg !17 | |
%80 = sext i32 %68 to i64, !dbg !17 | |
%81 = getelementptr float, ptr addrspace(3) %74, i64 %80, !dbg !17 | |
%82 = load float, ptr addrspace(3) %81, align 4, !dbg !17 | |
%83 = add i32 %76, %68, !dbg !17 | |
%84 = sext i32 %83 to i64, !dbg !17 | |
%85 = getelementptr float, ptr addrspace(3) %74, i64 %84, !dbg !17 | |
%86 = load float, ptr addrspace(3) %85, align 4, !dbg !17 | |
%87 = shl i32 %68, 1, !dbg !17 | |
%88 = sext i32 %87 to i64, !dbg !17 | |
%89 = getelementptr float, ptr addrspace(3) %74, i64 %88, !dbg !17 | |
%90 = load float, ptr addrspace(3) %89, align 4, !dbg !17 | |
%91 = add i32 %76, %87, !dbg !17 | |
%92 = sext i32 %91 to i64, !dbg !17 | |
%93 = getelementptr float, ptr addrspace(3) %74, i64 %92, !dbg !17 | |
%94 = load float, ptr addrspace(3) %93, align 4, !dbg !17 | |
%95 = mul i32 %68, 3, !dbg !17 | |
%96 = sext i32 %95 to i64, !dbg !17 | |
%97 = getelementptr float, ptr addrspace(3) %74, i64 %96, !dbg !17 | |
%98 = load float, ptr addrspace(3) %97, align 4, !dbg !17 | |
%99 = add i32 %76, %95, !dbg !17 | |
%100 = sext i32 %99 to i64, !dbg !17 | |
%101 = getelementptr float, ptr addrspace(3) %74, i64 %100, !dbg !17 | |
%102 = load float, ptr addrspace(3) %101, align 4, !dbg !17 | |
%103 = shl i32 %68, 2, !dbg !17 | |
%104 = sext i32 %103 to i64, !dbg !17 | |
%105 = getelementptr float, ptr addrspace(3) %74, i64 %104, !dbg !17 | |
%106 = load float, ptr addrspace(3) %105, align 4, !dbg !17 | |
%107 = add i32 %76, %103, !dbg !17 | |
%108 = sext i32 %107 to i64, !dbg !17 | |
%109 = getelementptr float, ptr addrspace(3) %74, i64 %108, !dbg !17 | |
%110 = load float, ptr addrspace(3) %109, align 4, !dbg !17 | |
%111 = mul i32 %68, 5, !dbg !17 | |
%112 = sext i32 %111 to i64, !dbg !17 | |
%113 = getelementptr float, ptr addrspace(3) %74, i64 %112, !dbg !17 | |
%114 = load float, ptr addrspace(3) %113, align 4, !dbg !17 | |
%115 = add i32 %76, %111, !dbg !17 | |
%116 = sext i32 %115 to i64, !dbg !17 | |
%117 = getelementptr float, ptr addrspace(3) %74, i64 %116, !dbg !17 | |
%118 = load float, ptr addrspace(3) %117, align 4, !dbg !17 | |
%119 = mul i32 %68, 6, !dbg !17 | |
%120 = sext i32 %119 to i64, !dbg !17 | |
%121 = getelementptr float, ptr addrspace(3) %74, i64 %120, !dbg !17 | |
%122 = load float, ptr addrspace(3) %121, align 4, !dbg !17 | |
%123 = add i32 %76, %119, !dbg !17 | |
%124 = sext i32 %123 to i64, !dbg !17 | |
%125 = getelementptr float, ptr addrspace(3) %74, i64 %124, !dbg !17 | |
%126 = load float, ptr addrspace(3) %125, align 4, !dbg !17 | |
%127 = mul i32 %68, 7, !dbg !17 | |
%128 = sext i32 %127 to i64, !dbg !17 | |
%129 = getelementptr float, ptr addrspace(3) %74, i64 %128, !dbg !17 | |
%130 = load float, ptr addrspace(3) %129, align 4, !dbg !17 | |
%131 = add i32 %76, %127, !dbg !17 | |
%132 = sext i32 %131 to i64, !dbg !17 | |
%133 = getelementptr float, ptr addrspace(3) %74, i64 %132, !dbg !17 | |
%134 = load float, ptr addrspace(3) %133, align 4, !dbg !17 | |
%135 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %62, 0, !dbg !18 | |
%136 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %62, 1, !dbg !18 | |
%137 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %62, 2, !dbg !18 | |
%138 = shl i32 %136, 3, !dbg !18 | |
%139 = sext i32 %138 to i64, !dbg !18 | |
%140 = getelementptr float, ptr addrspace(3) %135, i64 %139, !dbg !18 | |
%141 = mul i32 %137, %56, !dbg !18 | |
%142 = sext i32 %141 to i64, !dbg !18 | |
%143 = getelementptr float, ptr addrspace(3) %140, i64 %142, !dbg !18 | |
%144 = load float, ptr addrspace(3) %143, align 4, !dbg !18 | |
%145 = sext i32 %136 to i64, !dbg !18 | |
%146 = getelementptr float, ptr addrspace(3) %143, i64 %145, !dbg !18 | |
%147 = load float, ptr addrspace(3) %146, align 4, !dbg !18 | |
%148 = shl i32 %136, 1, !dbg !18 | |
%149 = sext i32 %148 to i64, !dbg !18 | |
%150 = getelementptr float, ptr addrspace(3) %143, i64 %149, !dbg !18 | |
%151 = load float, ptr addrspace(3) %150, align 4, !dbg !18 | |
%152 = mul i32 %136, 3, !dbg !18 | |
%153 = sext i32 %152 to i64, !dbg !18 | |
%154 = getelementptr float, ptr addrspace(3) %143, i64 %153, !dbg !18 | |
%155 = load float, ptr addrspace(3) %154, align 4, !dbg !18 | |
%156 = shl i32 %136, 2, !dbg !18 | |
%157 = sext i32 %156 to i64, !dbg !18 | |
%158 = getelementptr float, ptr addrspace(3) %143, i64 %157, !dbg !18 | |
%159 = load float, ptr addrspace(3) %158, align 4, !dbg !18 | |
%160 = mul i32 %136, 5, !dbg !18 | |
%161 = sext i32 %160 to i64, !dbg !18 | |
%162 = getelementptr float, ptr addrspace(3) %143, i64 %161, !dbg !18 | |
%163 = load float, ptr addrspace(3) %162, align 4, !dbg !18 | |
%164 = mul i32 %136, 6, !dbg !18 | |
%165 = sext i32 %164 to i64, !dbg !18 | |
%166 = getelementptr float, ptr addrspace(3) %143, i64 %165, !dbg !18 | |
%167 = load float, ptr addrspace(3) %166, align 4, !dbg !18 | |
%168 = mul i32 %136, 7, !dbg !18 | |
%169 = sext i32 %168 to i64, !dbg !18 | |
%170 = getelementptr float, ptr addrspace(3) %143, i64 %169, !dbg !18 | |
%171 = load float, ptr addrspace(3) %170, align 4, !dbg !18 | |
%172 = extractvalue { float, float } %64, 0, !dbg !22 | |
%173 = extractvalue { float, float } %64, 1, !dbg !22 | |
%174 = tail call float @llvm.fmuladd.f32(float %.pn39, float %.pn55, float %172), !dbg !22 | |
%175 = tail call float @llvm.fmuladd.f32(float %.pn37, float %.pn55, float %173), !dbg !22 | |
%176 = tail call float @llvm.fmuladd.f32(float %.pn35, float %.pn53, float %174), !dbg !22 | |
%177 = tail call float @llvm.fmuladd.f32(float %.pn33, float %.pn53, float %175), !dbg !22 | |
%178 = tail call float @llvm.fmuladd.f32(float %.pn31, float %.pn51, float %176), !dbg !22 | |
%179 = tail call float @llvm.fmuladd.f32(float %.pn29, float %.pn51, float %177), !dbg !22 | |
%180 = tail call float @llvm.fmuladd.f32(float %.pn27, float %.pn49, float %178), !dbg !22 | |
%181 = tail call float @llvm.fmuladd.f32(float %.pn25, float %.pn49, float %179), !dbg !22 | |
%182 = tail call float @llvm.fmuladd.f32(float %.pn23, float %.pn47, float %180), !dbg !22 | |
%183 = tail call float @llvm.fmuladd.f32(float %.pn21, float %.pn47, float %181), !dbg !22 | |
%184 = tail call float @llvm.fmuladd.f32(float %.pn19, float %.pn45, float %182), !dbg !22 | |
%185 = tail call float @llvm.fmuladd.f32(float %.pn17, float %.pn45, float %183), !dbg !22 | |
%186 = tail call float @llvm.fmuladd.f32(float %.pn15, float %.pn43, float %184), !dbg !22 | |
%187 = tail call float @llvm.fmuladd.f32(float %.pn13, float %.pn43, float %185), !dbg !22 | |
%188 = tail call float @llvm.fmuladd.f32(float %.pn11, float %.pn41, float %186), !dbg !22 | |
%189 = tail call float @llvm.fmuladd.f32(float %.pn9, float %.pn41, float %187), !dbg !22 | |
%190 = tail call float @llvm.fmuladd.f32(float %75, float %144, float %188), !dbg !22 | |
%191 = tail call float @llvm.fmuladd.f32(float %79, float %144, float %189), !dbg !22 | |
%192 = tail call float @llvm.fmuladd.f32(float %82, float %147, float %190), !dbg !22 | |
%193 = tail call float @llvm.fmuladd.f32(float %86, float %147, float %191), !dbg !22 | |
%194 = tail call float @llvm.fmuladd.f32(float %90, float %151, float %192), !dbg !22 | |
%195 = tail call float @llvm.fmuladd.f32(float %94, float %151, float %193), !dbg !22 | |
%196 = tail call float @llvm.fmuladd.f32(float %98, float %155, float %194), !dbg !22 | |
%197 = tail call float @llvm.fmuladd.f32(float %102, float %155, float %195), !dbg !22 | |
%198 = tail call float @llvm.fmuladd.f32(float %106, float %159, float %196), !dbg !22 | |
%199 = tail call float @llvm.fmuladd.f32(float %110, float %159, float %197), !dbg !22 | |
%200 = tail call float @llvm.fmuladd.f32(float %114, float %163, float %198), !dbg !22 | |
%201 = tail call float @llvm.fmuladd.f32(float %118, float %163, float %199), !dbg !22 | |
%202 = tail call float @llvm.fmuladd.f32(float %122, float %167, float %200), !dbg !22 | |
%203 = tail call float @llvm.fmuladd.f32(float %126, float %167, float %201), !dbg !22 | |
%204 = tail call float @llvm.fmuladd.f32(float %130, float %171, float %202), !dbg !22 | |
%205 = tail call float @llvm.fmuladd.f32(float %134, float %171, float %203), !dbg !22 | |
%206 = insertvalue { float, float } undef, float %204, 0, !dbg !22 | |
%207 = insertvalue { float, float } %206, float %205, 1, !dbg !22 | |
%208 = add nuw nsw i32 %61, 16, !dbg !21 | |
%209 = icmp ult i32 %61, 1008, !dbg !21 | |
%.urem = urem i32 %60, 3 | |
%.urem66 = urem i32 %59, 3 | |
%210 = getelementptr float, ptr addrspace(1) %.pn361, i64 16, !dbg !19 | |
%211 = getelementptr float, ptr addrspace(1) %.pn762, i64 %40, !dbg !20 | |
%212 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %210, i1 %209) #2, !dbg !17 | |
%213 = extractvalue { i32, i32 } %212, 0, !dbg !17 | |
%214 = extractvalue { i32, i32 } %212, 1, !dbg !17 | |
tail call void @llvm.nvvm.barrier0(), !dbg !17 | |
%215 = shl nuw nsw i32 %.urem, 8, !dbg !17 | |
%216 = zext i32 %215 to i64 | |
%gep = getelementptr float, ptr addrspace(3) %32, i64 %216, !dbg !17 | |
%217 = getelementptr float, ptr addrspace(3) %gep, i64 1, !dbg !17 | |
store i32 %213, ptr addrspace(3) %gep, align 4, !dbg !17 | |
store i32 %214, ptr addrspace(3) %217, align 4, !dbg !17 | |
%218 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %211, i1 %209) #2, !dbg !18 | |
%219 = extractvalue { i32, i32 } %218, 0, !dbg !18 | |
%220 = extractvalue { i32, i32 } %218, 1, !dbg !18 | |
%gep60 = getelementptr float, ptr addrspace(3) %37, i64 %216, !dbg !18 | |
%221 = getelementptr float, ptr addrspace(3) %gep60, i64 1, !dbg !18 | |
store i32 %219, ptr addrspace(3) %gep60, align 4, !dbg !18 | |
store i32 %220, ptr addrspace(3) %221, align 4, !dbg !18 | |
%222 = shl nuw nsw i32 %.urem66, 8, !dbg !17 | |
%223 = zext i32 %222 to i64 | |
%224 = getelementptr float, ptr addrspace(3) @global_smem, i64 %223, !dbg !17 | |
%225 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } undef, ptr addrspace(3) %224, 0, !dbg !17 | |
%226 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %225, i32 16, 1, !dbg !17 | |
%227 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %226, i32 1, 2, !dbg !17 | |
%228 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %227, i32 0, 3, !dbg !17 | |
%229 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %228, i32 0, 4, !dbg !17 | |
%230 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 3072), i64 %223, !dbg !18 | |
%231 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } undef, ptr addrspace(3) %230, 0, !dbg !18 | |
%232 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %231, i32 16, 1, !dbg !18 | |
%233 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %232, i32 1, 2, !dbg !18 | |
%234 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %233, i32 0, 3, !dbg !18 | |
%235 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %234, i32 0, 4, !dbg !18 | |
%236 = add nuw nsw i32 %60, 1, !dbg !21 | |
%237 = add nuw nsw i32 %59, 1, !dbg !21 | |
tail call void @llvm.nvvm.barrier0(), !dbg !17 | |
%238 = add nuw nsw i32 %65, 16, !dbg !21 | |
%239 = icmp ult i32 %65, 1008, !dbg !21 | |
br i1 %239, label %58, label %240, !dbg !21 | |
240: ; preds = %58 | |
%241 = lshr i32 %8, 4, !dbg !23 | |
%242 = shl nuw nsw i32 %10, 1, !dbg !23 | |
%243 = or i32 %242, %241, !dbg !23 | |
%244 = mul i32 %13, %5, !dbg !24 | |
%245 = sext i32 %244 to i64, !dbg !25 | |
%246 = getelementptr float, ptr addrspace(1) %2, i64 %245, !dbg !25 | |
%247 = getelementptr float, ptr addrspace(1) %246, i64 %19, !dbg !26 | |
tail call void @llvm.nvvm.barrier0(), !dbg !23 | |
%248 = mul nuw nsw i32 %243, 18, !dbg !23 | |
%249 = add nuw nsw i32 %248, %56, !dbg !23 | |
%250 = zext i32 %249 to i64, !dbg !23 | |
%251 = getelementptr float, ptr addrspace(3) @global_smem, i64 %250, !dbg !23 | |
%252 = insertelement <1 x float> undef, float %204, i64 0, !dbg !23 | |
store <1 x float> %252, ptr addrspace(3) %251, align 4, !dbg !23 | |
%253 = add nuw nsw i32 %249, 144, !dbg !23 | |
%254 = zext i32 %253 to i64, !dbg !23 | |
%255 = getelementptr float, ptr addrspace(3) @global_smem, i64 %254, !dbg !23 | |
%256 = insertelement <1 x float> undef, float %205, i64 0, !dbg !23 | |
store <1 x float> %256, ptr addrspace(3) %255, align 4, !dbg !23 | |
tail call void @llvm.nvvm.barrier0(), !dbg !23 | |
%257 = mul nuw nsw i32 %13, 18, !dbg !23 | |
%258 = add nuw nsw i32 %257, %15, !dbg !23 | |
%259 = zext i32 %258 to i64, !dbg !23 | |
%260 = getelementptr float, ptr addrspace(3) @global_smem, i64 %259, !dbg !23 | |
%.extract = load i32, ptr addrspace(3) %260, align 8, !dbg !23 | |
%261 = getelementptr inbounds <2 x i32>, ptr addrspace(3) %260, i64 0, i64 1, !dbg !23 | |
%.extract57 = load i32, ptr addrspace(3) %261, align 4, !dbg !23 | |
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %.extract, i32 %.extract57, ptr addrspace(1) %247, i1 true) #2, !dbg !23 | |
ret void, !dbg !27 | |
} | |
; ---------------------------------------------------------------------------
; External intrinsic declarations and the attribute groups they refer to.
; Nothing here has a body; these are the LLVM/NVVM intrinsics declared for
; @matmul_kernel_0d1d2d3d4c5d6c7d8c.
; ---------------------------------------------------------------------------
; NVVM intrinsic returning an i32; by its name it reads the PTX special
; register tid.x (see the NVPTX guide). Tagged with attribute group #0.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) | |
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 | |
; NVVM barrier intrinsic. The kernel calls it around its shared-memory
; (addrspace(3)) stores and loads. It is the only declaration marked
; `convergent` (attribute group #1).
; Function Attrs: convergent nocallback nounwind | |
declare void @llvm.nvvm.barrier0() #1 | |
; Float multiply-add intrinsic (a * b + c per the LLVM LangRef). The kernel's
; accumulation chain is built from calls to it. Shares attribute group #0.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) | |
declare float @llvm.fmuladd.f32(float, float, float) #0 | |
; #0: attached to the tid.x and fmuladd declarations above; includes
;     memory(none) and speculatable.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } | |
; #1: attached to the barrier declaration only.
attributes #1 = { convergent nocallback nounwind } | |
; #2: not used by any declaration here; it is attached to the inline-asm call
;     sites in the kernel (the ld.global / st.global asm statements).
attributes #2 = { nounwind } | |
; ---------------------------------------------------------------------------
; Module-level named metadata.
; ---------------------------------------------------------------------------
; Single module flag: !0 (debug-info version).
!llvm.module.flags = !{!0} | |
; One compile unit: !1.
!llvm.dbg.cu = !{!1} | |
; NVVM annotations for the kernel. Note that !3 and !4 are each listed twice
; in this dump.
!nvvm.annotations = !{!3, !4, !4, !3} | |
!0 = !{i32 2, !"Debug Info Version", i32 3} | |
; Compile unit: producer string is "triton"; source file is !2.
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) | |
; The source file that all line/column numbers below refer to.
!2 = !DIFile(filename: "run_triton_matmul.py", directory: "/home/hzhuang/code") | |
; Marks @matmul_kernel_0d1d2d3d4c5d6c7d8c with the "kernel" annotation (value 1).
!3 = !{ptr @matmul_kernel_0d1d2d3d4c5d6c7d8c, !"kernel", i32 1} | |
; "maxntidx" annotation with value 128 for the same function (per the NVVM
; annotation convention this is the maximum thread count in x).
!4 = !{ptr @matmul_kernel_0d1d2d3d4c5d6c7d8c, !"maxntidx", i32 128} | |
; Debug subprogram for the kernel: defined at line 171 of !2. It is the scope
; of every DILocation below.
!5 = distinct !DISubprogram(name: "matmul_kernel_0d1d2d3d4c5d6c7d8c", linkageName: "matmul_kernel_0d1d2d3d4c5d6c7d8c", scope: !2, file: !2, line: 171, type: !6, scopeLine: 171, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) | |
; Subroutine type with an empty type list (!7): no parameter or return type
; debug info is recorded.
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) | |
!7 = !{} | |
; ---------------------------------------------------------------------------
; Source locations referenced by the `!dbg !N` attachments on instructions.
; All are in scope !5 and cover source lines 183-194 of run_triton_matmul.py.
; ---------------------------------------------------------------------------
!8 = !DILocation(line: 183, column: 28, scope: !5) | |
!9 = !DILocation(line: 183, column: 58, scope: !5) | |
!10 = !DILocation(line: 183, column: 39, scope: !5) | |
!11 = !DILocation(line: 183, column: 21, scope: !5) | |
!12 = !DILocation(line: 183, column: 51, scope: !5) | |
!13 = !DILocation(line: 184, column: 39, scope: !5) | |
!14 = !DILocation(line: 184, column: 21, scope: !5) | |
!15 = !DILocation(line: 184, column: 51, scope: !5) | |
!16 = !DILocation(line: 191, column: 33, scope: !5) | |
; !17 / !18: attached to the two predicated ld.global asm loads and the
; shared-memory stores that follow them.
!17 = !DILocation(line: 187, column: 20, scope: !5) | |
!18 = !DILocation(line: 188, column: 20, scope: !5) | |
; !19 / !20: attached to the getelementptrs that advance the two global
; pointers for the next loop iteration.
!19 = !DILocation(line: 190, column: 18, scope: !5) | |
!20 = !DILocation(line: 191, column: 18, scope: !5) | |
; !21: attached to the loop counter increments, compares and back-edge branch.
!21 = !DILocation(line: 186, column: 25, scope: !5) | |
; !22: attached to the fmuladd accumulation chain; this location has no
; column field.
!22 = !DILocation(line: 189, scope: !5) | |
; !23: attached to the epilogue that stages results through shared memory and
; issues the st.global asm store.
!23 = !DILocation(line: 194, column: 21, scope: !5) | |
; !24-!26: attached to the output pointer arithmetic (mul, sext, getelementptr).
!24 = !DILocation(line: 193, column: 39, scope: !5) | |
!25 = !DILocation(line: 193, column: 21, scope: !5) | |
!26 = !DILocation(line: 193, column: 51, scope: !5) | |
; !27: attached to the final `ret void`.
!27 = !DILocation(line: 194, column: 4, scope: !5) | |
['__class__', '__class_getitem__', '__contains__', '__copy__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__missing__', '__ne__', '__new__', '__or__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'clear', 'copy', 'default_factory', 'fromkeys', 'get', 'items', 'keys', 'pop', 'popitem', 'setdefault', 'update', 'values'] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment