Created
December 14, 2022 10:04
-
-
Save mratsim/0e1f1454d549e81312a5fb97ac717b7a to your computer and use it in GitHub Desktop.
Uint256 on Nvidia, codegen quality investigation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'build/nvidia/add_carry.cu' | |
source_filename = "build/nvidia/add_carry.cu" | |
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" | |
target triple = "nvptx64-nvidia-cuda" | |
%printf_args = type { i64 } | |
%printf_args.0 = type { i64 } | |
@.str = private unnamed_addr constant [27 x i8] c"32-bit Addition: %#016llx\0A\00", align 1 | |
@.str1 = private unnamed_addr constant [27 x i8] c"64-bit Addition: %#016llx\0A\00", align 1 | |
; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone | |
define dso_local void @_Z13addcKernelv() #0 { | |
%1 = alloca i64, align 8 | |
%2 = alloca i64, align 8 | |
%3 = alloca %printf_args, align 8 | |
%4 = alloca %printf_args.0, align 8 | |
store i64 0, i64* %1, align 8 | |
store i64 0, i64* %2, align 8 | |
%5 = call i64 asm ".reg .b32 r0;\0A\09.reg .b32 r1;\0A\09add.cc.u32 r0, 0xc2775652, 0x4c60baa8;\0A\09addc.cc.u32 r1, 0xa64ab78d, 0xb1da3ab6;\0A\09mov.b64 $0, {r0, r1}\0A\09;", "=l"() #1, !srcloc !7 | |
store i64 %5, i64* %1, align 8 | |
%6 = call i64 asm "add.u64 $0, 0xa64ab78dc2775652, 0xb1da3ab64c60baa8;\0A\09", "=l"() #1, !srcloc !8 | |
store i64 %6, i64* %2, align 8 | |
%7 = load i64, i64* %1, align 8 | |
%8 = getelementptr inbounds %printf_args, %printf_args* %3, i32 0, i32 0 | |
store i64 %7, i64* %8, align 8 | |
%9 = bitcast %printf_args* %3 to i8* | |
%10 = call i32 @vprintf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str, i64 0, i64 0), i8* %9) | |
%11 = load i64, i64* %2, align 8 | |
%12 = getelementptr inbounds %printf_args.0, %printf_args.0* %4, i32 0, i32 0 | |
store i64 %11, i64* %12, align 8 | |
%13 = bitcast %printf_args.0* %4 to i8* | |
%14 = call i32 @vprintf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str1, i64 0, i64 0), i8* %13) | |
ret void | |
} | |
declare i32 @vprintf(i8*, i8*) | |
attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_86" "target-features"="+ptx75,+sm_86" } | |
attributes #1 = { convergent nounwind readnone } | |
!llvm.module.flags = !{!0, !1, !2, !3} | |
!nvvm.annotations = !{!4} | |
!llvm.ident = !{!5, !6} | |
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]} | |
!1 = !{i32 1, !"wchar_size", i32 4} | |
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} | |
!3 = !{i32 7, !"frame-pointer", i32 2} | |
!4 = !{void ()* @_Z13addcKernelv, !"kernel", i32 1} | |
!5 = !{!"clang version 14.0.6"} | |
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} | |
!7 = !{i64 229, i64 245, i64 267, i64 314, i64 362, i64 391} | |
!8 = !{i64 430, i64 484} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Compile with | |
// NVCC: | |
// nvcc -arch=sm_86 -ptx build/nvidia/add_carry.cu -o build/nvidia/add_carry_nvcc.ptx | |
// Clang/LLVM with NVPTX backend | |
// clang++ -S -emit-llvm \ | |
// build/nvidia/add_carry.cu \ | |
// --cuda-gpu-arch=sm_86 \ | |
// -L/opt/cuda/lib64 \ | |
// -lcudart_static -ldl -lrt -pthread | |
// clang++ build/nvidia/add_carry.cu \ | |
// -o build/nvidia/add_carry \ | |
// --cuda-gpu-arch=sm_86 \ | |
// -L/opt/cuda/lib64 \ | |
// -lcudart_static -ldl -lrt -pthread | |
// llc -mcpu=sm_86 build/nvidia/add_carry-cuda-nvptx64-nvidia-cuda-sm_86.ll -o build/nvidia/add_carry_llvm.ptx | |
#include "cuda_runtime.h" | |
#include "device_launch_parameters.h" | |
#include <cstdint> | |
#include <stdio.h> | |
cudaError_t addc(); | |
__global__ void addcKernel() | |
{ | |
uint64_t result32bitAdd = 0; | |
uint64_t result64bitAdd = 0; | |
asm(".reg .b32 r0;\n\t" | |
".reg .b32 r1;\n\t" | |
"add.cc.u32 r0, 0xc2775652, 0x4c60baa8;\n\t" | |
"addc.cc.u32 r1, 0xa64ab78d, 0xb1da3ab6;\n\t" | |
"mov.b64 %0, {r0, r1}\n\t;" | |
: "=l"(result32bitAdd)); | |
asm("add.u64 %0, 0xa64ab78dc2775652, 0xb1da3ab64c60baa8;\n\t" | |
: "=l"(result64bitAdd)); | |
printf("32-bit Addition: %#016llx\n", result32bitAdd); | |
printf("64-bit Addition: %#016llx\n", result64bitAdd); | |
} | |
int main() | |
{ | |
cudaError_t cudaStatus = addc(); | |
if (cudaStatus != cudaSuccess) { | |
fprintf(stderr, "addWithCuda failed!"); | |
return 1; | |
} | |
cudaStatus = cudaDeviceReset(); | |
if (cudaStatus != cudaSuccess) { | |
fprintf(stderr, "cudaDeviceReset failed!"); | |
return 1; | |
} | |
getchar(); | |
return 0; | |
} | |
cudaError_t addc() | |
{ | |
cudaError_t cudaStatus; | |
cudaStatus = cudaSetDevice(0); | |
addcKernel <<<1, 1>>>(); | |
cudaStatus = cudaGetLastError(); | |
if (cudaStatus != cudaSuccess) { | |
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); | |
goto Error; | |
} | |
cudaStatus = cudaDeviceSynchronize(); | |
if (cudaStatus != cudaSuccess) { | |
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); | |
goto Error; | |
} | |
Error: | |
return cudaStatus; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// Generated by LLVM NVPTX Back-End | |
// | |
.version 7.1 | |
.target sm_86 | |
.address_size 64 | |
// .globl _Z13addcKernelv // -- Begin function _Z13addcKernelv | |
.extern .func (.param .b32 func_retval0) vprintf | |
( | |
.param .b64 vprintf_param_0, | |
.param .b64 vprintf_param_1 | |
) | |
; | |
.global .align 1 .b8 _$_str[27] = {51, 50, 45, 98, 105, 116, 32, 65, 100, 100, 105, 116, 105, 111, 110, 58, 32, 37, 35, 48, 49, 54, 108, 108, 120, 10, 0}; | |
.global .align 1 .b8 _$_str1[27] = {54, 52, 45, 98, 105, 116, 32, 65, 100, 100, 105, 116, 105, 111, 110, 58, 32, 37, 35, 48, 49, 54, 108, 108, 120, 10, 0}; | |
// @_Z13addcKernelv | |
.visible .entry _Z13addcKernelv() | |
{ | |
.local .align 8 .b8 __local_depot0[32]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .b32 %r<5>; | |
.reg .b64 %rd<12>; | |
// %bb.0: | |
mov.u64 %SPL, __local_depot0; | |
cvta.local.u64 %SP, %SPL; | |
mov.u64 %rd3, 0; | |
st.u64 [%SP+0], %rd3; | |
st.u64 [%SP+8], %rd3; | |
// begin inline asm | |
.reg .b32 r0; | |
.reg .b32 r1; | |
add.cc.u32 r0, 0xc2775652, 0x4c60baa8; | |
addc.cc.u32 r1, 0xa64ab78d, 0xb1da3ab6; | |
mov.b64 %rd1, {r0, r1} | |
; | |
// end inline asm | |
st.u64 [%SP+0], %rd1; | |
// begin inline asm | |
add.u64 %rd2, 0xa64ab78dc2775652, 0xb1da3ab64c60baa8; | |
// end inline asm | |
st.u64 [%SP+8], %rd2; | |
ld.u64 %rd4, [%SP+0]; | |
st.u64 [%SP+16], %rd4; | |
mov.u64 %rd5, _$_str; | |
cvta.global.u64 %rd6, %rd5; | |
add.u64 %rd7, %SP, 16; | |
{ // callseq 0, 0 | |
.reg .b32 temp_param_reg; | |
.param .b64 param0; | |
st.param.b64 [param0+0], %rd6; | |
.param .b64 param1; | |
st.param.b64 [param1+0], %rd7; | |
.param .b32 retval0; | |
call.uni (retval0), | |
vprintf, | |
( | |
param0, | |
param1 | |
); | |
ld.param.b32 %r1, [retval0+0]; | |
} // callseq 0 | |
ld.u64 %rd8, [%SP+8]; | |
st.u64 [%SP+24], %rd8; | |
mov.u64 %rd9, _$_str1; | |
cvta.global.u64 %rd10, %rd9; | |
add.u64 %rd11, %SP, 24; | |
{ // callseq 1, 0 | |
.reg .b32 temp_param_reg; | |
.param .b64 param0; | |
st.param.b64 [param0+0], %rd10; | |
.param .b64 param1; | |
st.param.b64 [param1+0], %rd11; | |
.param .b32 retval0; | |
call.uni (retval0), | |
vprintf, | |
( | |
param0, | |
param1 | |
); | |
ld.param.b32 %r3, [retval0+0]; | |
} // callseq 1 | |
ret; | |
// -- End function | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// Generated by NVIDIA NVVM Compiler | |
// | |
// Compiler Build ID: CL-31833905 | |
// Cuda compilation tools, release 11.8, V11.8.89 | |
// Based on NVVM 7.0.1 | |
// | |
.version 7.8 | |
.target sm_86 | |
.address_size 64 | |
// .globl _Z10addcKernelv | |
.extern .func (.param .b32 func_retval0) vprintf | |
( | |
.param .b64 vprintf_param_0, | |
.param .b64 vprintf_param_1 | |
) | |
; | |
.global .align 1 .b8 $str[27] = {51, 50, 45, 98, 105, 116, 32, 65, 100, 100, 105, 116, 105, 111, 110, 58, 32, 37, 35, 48, 49, 54, 108, 108, 120, 10, 0}; | |
.global .align 1 .b8 $str$1[27] = {54, 52, 45, 98, 105, 116, 32, 65, 100, 100, 105, 116, 105, 111, 110, 58, 32, 37, 35, 48, 49, 54, 108, 108, 120, 10, 0}; | |
.visible .entry _Z10addcKernelv() | |
{ | |
.local .align 8 .b8 __local_depot0[8]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .b32 %r<3>; | |
.reg .b64 %rd<9>; | |
mov.u64 %SPL, __local_depot0; | |
cvta.local.u64 %SP, %SPL; | |
add.u64 %rd3, %SP, 0; | |
add.u64 %rd4, %SPL, 0; | |
// begin inline asm | |
.reg .b32 r0; | |
.reg .b32 r1; | |
add.cc.u32 r0, 0xc2775652, 0x4c60baa8; | |
addc.cc.u32 r1, 0xa64ab78d, 0xb1da3ab6; | |
mov.b64 %rd1, {r0, r1} | |
; | |
// end inline asm | |
// begin inline asm | |
add.u64 %rd2, 0xa64ab78dc2775652, 0xb1da3ab64c60baa8; | |
// end inline asm | |
st.local.u64 [%rd4], %rd1; | |
mov.u64 %rd5, $str; | |
cvta.global.u64 %rd6, %rd5; | |
{ // callseq 0, 0 | |
.reg .b32 temp_param_reg; | |
.param .b64 param0; | |
st.param.b64 [param0+0], %rd6; | |
.param .b64 param1; | |
st.param.b64 [param1+0], %rd3; | |
.param .b32 retval0; | |
call.uni (retval0), | |
vprintf, | |
( | |
param0, | |
param1 | |
); | |
ld.param.b32 %r1, [retval0+0]; | |
} // callseq 0 | |
st.local.u64 [%rd4], %rd2; | |
mov.u64 %rd7, $str$1; | |
cvta.global.u64 %rd8, %rd7; | |
{ // callseq 1, 0 | |
.reg .b32 temp_param_reg; | |
.param .b64 param0; | |
st.param.b64 [param0+0], %rd8; | |
.param .b64 param1; | |
st.param.b64 [param1+0], %rd3; | |
.param .b32 retval0; | |
call.uni (retval0), | |
vprintf, | |
( | |
param0, | |
param1 | |
); | |
ld.param.b32 %r2, [retval0+0]; | |
} // callseq 1 | |
ret; | |
} | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'build/nvidia/wideint.cu' | |
source_filename = "build/nvidia/wideint.cu" | |
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" | |
target triple = "nvptx64-nvidia-cuda" | |
%printf_args = type { i32 } | |
@.str = private unnamed_addr constant [5 x i8] c"%02X\00", align 1 | |
; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone | |
define dso_local void @_Z12add256Kernelv() #0 { | |
%1 = alloca i256, align 8 | |
%2 = alloca i256, align 8 | |
%3 = alloca i256, align 8 | |
%4 = alloca i32, align 4 | |
%5 = alloca %printf_args, align 8 | |
store i256 43520, i256* %1, align 8 | |
store i256 1, i256* %2, align 8 | |
store i256 0, i256* %3, align 8 | |
%6 = load i256, i256* %1, align 8 | |
%7 = load i256, i256* %2, align 8 | |
%8 = add nsw i256 %6, %7 | |
store i256 %8, i256* %3, align 8 | |
store i32 0, i32* %4, align 4 | |
br label %9 | |
9: ; preds = %22, %0 | |
%10 = load i32, i32* %4, align 4 | |
%11 = icmp slt i32 %10, 32 | |
br i1 %11, label %12, label %25 | |
12: ; preds = %9 | |
%13 = bitcast i256* %3 to i8* | |
%14 = load i32, i32* %4, align 4 | |
%15 = sext i32 %14 to i64 | |
%16 = getelementptr inbounds i8, i8* %13, i64 %15 | |
%17 = load i8, i8* %16, align 1 | |
%18 = zext i8 %17 to i32 | |
%19 = getelementptr inbounds %printf_args, %printf_args* %5, i32 0, i32 0 | |
store i32 %18, i32* %19, align 4 | |
%20 = bitcast %printf_args* %5 to i8* | |
%21 = call i32 @vprintf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i8* %20) | |
br label %22 | |
22: ; preds = %12 | |
%23 = load i32, i32* %4, align 4 | |
%24 = add nsw i32 %23, 1 | |
store i32 %24, i32* %4, align 4 | |
br label %9, !llvm.loop !7 | |
25: ; preds = %9 | |
ret void | |
} | |
declare i32 @vprintf(i8*, i8*) | |
attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_86" "target-features"="+ptx72,+sm_86" } | |
!llvm.module.flags = !{!0, !1, !2, !3} | |
!nvvm.annotations = !{!4} | |
!llvm.ident = !{!5, !6} | |
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 2]} | |
!1 = !{i32 1, !"wchar_size", i32 4} | |
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} | |
!3 = !{i32 7, !"frame-pointer", i32 2} | |
!4 = !{void ()* @_Z12add256Kernelv, !"kernel", i32 1} | |
!5 = !{!"clang version 13.0.1"} | |
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} | |
!7 = distinct !{!7, !8} | |
!8 = !{!"llvm.loop.mustprogress"} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Compile with LLVM | |
// /usr/lib/llvm13/bin/clang++ -S -emit-llvm \ | |
// build/nvidia/wideint.cu \ | |
// --cuda-gpu-arch=sm_86 \ | |
// -L/opt/cuda/lib64 \ | |
// -lcudart_static -ldl -lrt -pthread | |
// /usr/lib/llvm13/bin/clang++ build/nvidia/wideint.cu \ | |
// -o build/nvidia/wideint \ | |
// --cuda-gpu-arch=sm_86 \ | |
// -L/opt/cuda/lib64 \ | |
// -lcudart_static -ldl -lrt -pthread | |
// llc -mcpu=sm_86 build/nvidia/wideint-cuda-nvptx64-nvidia-cuda-sm_86.ll -o build/nvidia/wideint_llvm.ptx | |
#include "cuda_runtime.h" | |
#include "device_launch_parameters.h" | |
#include <cstdint> | |
#include <stdio.h> | |
typedef _ExtInt(256) u256; | |
cudaError_t add256(); | |
__global__ void add256Kernel() { | |
u256 a = 0xAA00; | |
u256 b = 0x1; | |
u256 c = 0; | |
c = a + b; | |
for (int i = 0; i < 32; i++) { | |
printf("%02X", ((unsigned char*)(&c))[i]); | |
} | |
} | |
int main() | |
{ | |
cudaError_t cudaStatus = add256(); | |
if (cudaStatus != cudaSuccess) { | |
fprintf(stderr, "addWithCuda failed!"); | |
return 1; | |
} | |
cudaStatus = cudaDeviceReset(); | |
if (cudaStatus != cudaSuccess) { | |
fprintf(stderr, "cudaDeviceReset failed!"); | |
return 1; | |
} | |
getchar(); | |
return 0; | |
} | |
cudaError_t add256() | |
{ | |
cudaError_t cudaStatus; | |
cudaStatus = cudaSetDevice(0); | |
add256Kernel<<<1, 1>>>(); | |
cudaStatus = cudaGetLastError(); | |
if (cudaStatus != cudaSuccess) { | |
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); | |
goto Error; | |
} | |
cudaStatus = cudaDeviceSynchronize(); | |
if (cudaStatus != cudaSuccess) { | |
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); | |
goto Error; | |
} | |
Error: | |
return cudaStatus; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// Generated by LLVM NVPTX Back-End | |
// | |
.version 7.1 | |
.target sm_86 | |
.address_size 64 | |
// .globl _Z12add256Kernelv // -- Begin function _Z12add256Kernelv | |
.extern .func (.param .b32 func_retval0) vprintf | |
( | |
.param .b64 vprintf_param_0, | |
.param .b64 vprintf_param_1 | |
) | |
; | |
.global .align 1 .b8 _$_str[5] = {37, 48, 50, 88, 0}; | |
// @_Z12add256Kernelv | |
.visible .entry _Z12add256Kernelv() | |
{ | |
.local .align 8 .b8 __local_depot0[112]; | |
.reg .b64 %SP; | |
.reg .b64 %SPL; | |
.reg .pred %p<13>; | |
.reg .b32 %r<15>; | |
.reg .b64 %rd<35>; | |
// %bb.0: | |
mov.u64 %SPL, __local_depot0; | |
cvta.local.u64 %SP, %SPL; | |
mov.u64 %rd1, 0; | |
st.u64 [%SP+24], %rd1; | |
st.u64 [%SP+16], %rd1; | |
st.u64 [%SP+8], %rd1; | |
mov.u64 %rd2, 43520; | |
st.u64 [%SP+0], %rd2; | |
st.u64 [%SP+56], %rd1; | |
st.u64 [%SP+48], %rd1; | |
st.u64 [%SP+40], %rd1; | |
mov.u64 %rd3, 1; | |
st.u64 [%SP+32], %rd3; | |
st.u64 [%SP+88], %rd1; | |
st.u64 [%SP+80], %rd1; | |
st.u64 [%SP+72], %rd1; | |
st.u64 [%SP+64], %rd1; | |
ld.u64 %rd4, [%SP+24]; | |
ld.u64 %rd5, [%SP+16]; | |
ld.u64 %rd6, [%SP+8]; | |
ld.u64 %rd7, [%SP+0]; | |
ld.u64 %rd8, [%SP+56]; | |
ld.u64 %rd9, [%SP+48]; | |
ld.u64 %rd10, [%SP+40]; | |
ld.u64 %rd11, [%SP+32]; | |
add.s64 %rd12, %rd7, %rd11; | |
setp.lt.u64 %p1, %rd12, %rd11; | |
setp.lt.u64 %p2, %rd12, %rd7; | |
selp.u64 %rd13, 1, 0, %p2; | |
selp.b64 %rd14, 1, %rd13, %p1; | |
add.s64 %rd15, %rd6, %rd10; | |
add.s64 %rd16, %rd15, %rd14; | |
setp.eq.s64 %p3, %rd16, %rd10; | |
setp.lt.u64 %p4, %rd16, %rd10; | |
selp.u32 %r1, -1, 0, %p4; | |
selp.u32 %r2, -1, 0, %p1; | |
selp.b32 %r3, %r2, %r1, %p3; | |
and.b32 %r4, %r3, 1; | |
setp.eq.b32 %p5, %r4, 1; | |
setp.eq.s64 %p6, %rd16, %rd6; | |
setp.lt.u64 %p7, %rd16, %rd6; | |
selp.u32 %r5, -1, 0, %p7; | |
selp.u32 %r6, -1, 0, %p2; | |
selp.b32 %r7, %r6, %r5, %p6; | |
cvt.u64.u32 %rd17, %r7; | |
and.b64 %rd18, %rd17, 1; | |
selp.b64 %rd19, 1, %rd18, %p5; | |
add.s64 %rd20, %rd5, %rd9; | |
add.s64 %rd21, %rd20, %rd19; | |
setp.lt.u64 %p8, %rd21, %rd19; | |
setp.lt.u64 %p9, %rd21, %rd20; | |
selp.u64 %rd22, 1, 0, %p9; | |
selp.b64 %rd23, 1, %rd22, %p8; | |
setp.lt.u64 %p10, %rd20, %rd9; | |
setp.lt.u64 %p11, %rd20, %rd5; | |
selp.u64 %rd24, 1, 0, %p11; | |
selp.b64 %rd25, 1, %rd24, %p10; | |
add.s64 %rd26, %rd4, %rd8; | |
add.s64 %rd27, %rd26, %rd25; | |
add.s64 %rd28, %rd27, %rd23; | |
st.u64 [%SP+64], %rd12; | |
st.u64 [%SP+72], %rd16; | |
st.u64 [%SP+80], %rd21; | |
st.u64 [%SP+88], %rd28; | |
mov.u32 %r8, 0; | |
st.u32 [%SP+96], %r8; | |
bra.uni LBB0_1; | |
LBB0_1: // =>This Inner Loop Header: Depth=1 | |
ld.u32 %r9, [%SP+96]; | |
setp.gt.s32 %p12, %r9, 31; | |
@%p12 bra LBB0_4; | |
bra.uni LBB0_2; | |
LBB0_2: // in Loop: Header=BB0_1 Depth=1 | |
ld.s32 %rd29, [%SP+96]; | |
add.u64 %rd30, %SP, 64; | |
add.s64 %rd31, %rd30, %rd29; | |
ld.u8 %r10, [%rd31]; | |
st.u32 [%SP+104], %r10; | |
mov.u64 %rd32, _$_str; | |
cvta.global.u64 %rd33, %rd32; | |
add.u64 %rd34, %SP, 104; | |
{ // callseq 0, 0 | |
.reg .b32 temp_param_reg; | |
.param .b64 param0; | |
st.param.b64 [param0+0], %rd33; | |
.param .b64 param1; | |
st.param.b64 [param1+0], %rd34; | |
.param .b32 retval0; | |
call.uni (retval0), | |
vprintf, | |
( | |
param0, | |
param1 | |
); | |
ld.param.b32 %r11, [retval0+0]; | |
} // callseq 0 | |
bra.uni LBB0_3; | |
LBB0_3: // in Loop: Header=BB0_1 Depth=1 | |
ld.u32 %r13, [%SP+96]; | |
add.s32 %r14, %r13, 1; | |
st.u32 [%SP+96], %r14; | |
bra.uni LBB0_1; | |
LBB0_4: | |
ret; | |
// -- End function | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment