Last active
August 16, 2021 13:06
-
-
Save MasonProtter/7de88773b44f35cad796fd66e253a78a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#+BEGIN_SRC jupyter-julia
using CUDA
"""
    StaticString{N} <: AbstractString

Fixed-length string holding exactly `N` characters inline in an `NTuple{N,Char}`.

The length lives in the type, and the only field is a tuple of `Char`s, so every
concrete `StaticString{N}` is an `isbits` type. All elements of an array must share
the same `N` for the array's element type to be concrete.

The minimal `AbstractString` interface is implemented below with one code unit per
character (UTF-32 style), so generic string functions such as `length`, `==`,
`collect` and `print` work on a `StaticString`.
"""
struct StaticString{N} <: AbstractString
    chars::NTuple{N,Char}
end

# --- AbstractString interface: every character is exactly one UTF-32 code unit ---

Base.ncodeunits(::StaticString{N}) where {N} = N
Base.codeunit(::StaticString) = UInt32
Base.codeunit(s::StaticString, i::Integer) = UInt32(s.chars[i])

# Every in-range index starts a character because code units and characters coincide.
Base.isvalid(s::StaticString, i::Integer) = 1 <= i <= ncodeunits(s)

# The length is known from the type; skip the generic per-index `isvalid` scan.
Base.length(::StaticString{N}) where {N} = N

function Base.iterate(s::StaticString, i::Integer=1)
    return i > ncodeunits(s) ? nothing : (s.chars[i], i + 1)
end
"""
    @s_str

String macro: `s"abc"` expands to a `StaticString{3}` built from the characters of
the literal. The character tuple and its length are computed once, at
macro-expansion time, and spliced into the generated constructor call.

As with every non-standard string literal, the macro receives the raw text between
the quotes; escape sequences are not processed here.
"""
macro s_str(s)
    chars = Tuple(s)
    N = length(chars)
    # Nothing is escaped: `StaticString` must resolve in the module that defines this
    # macro, so the literal also works in modules that have not imported the type.
    return :(StaticString{$N}($chars))
end
# Convert to a heap-allocated `String` by concatenating the stored characters.
# `join` consumes the tuple directly, so no intermediate `Vector{Char}` is built.
Base.String(s::StaticString) = join(s.chars)
# Display as `StaticString{N}("...")`, e.g. `StaticString{3}("abc")`.
# The pieces are written straight to `io` rather than interpolated into a temporary
# string first; the emitted text is unchanged.
function Base.show(io::IO, s::StaticString{N}) where {N}
    print(io, "StaticString{", N, "}(\"", String(s), "\")")
    return nothing
end
# Concatenation: splat both character tuples into one. The result length `N + M` is
# computed from the type parameters, so the return type depends only on input types.
function Base.:(*)(s1::StaticString{N}, s2::StaticString{M}) where {N,M}
    return StaticString{N + M}((s1.chars..., s2.chars...))
end
# Broadcast the concatenation over two GPU arrays and dump the PTX of the kernel.
# Within one array every literal must have the same length so the element type is a
# single concrete `StaticString{N}`: "hello  " carries two trailing spaces to reach
# the 7 characters of "goodbye" (4 + 7 = 11 characters per result element).
@device_code_ptx cu([s"abc ", s"123 "]) .* cu([s"hello  ", s"goodbye"])
#+END_SRC
#+RESULTS:
#+BEGIN_EXAMPLE
// PTX CompilerJob of kernel broadcast_kernel(CUDA.CuKernelContext, CuDeviceArray{StaticString{11},1,CUDA.AS.Global}, Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64}},typeof(*),Tuple{Base.Broadcast.Extruded{CuDeviceArray{StaticString{4},1,CUDA.AS.Global},Tuple{Bool},Tuple{Int64}},Base.Broadcast.Extruded{CuDeviceArray{StaticString{7},1,CUDA.AS.Global},Tuple{Bool},Tuple{Int64}}}}, Int64) for sm_75 | |
// | |
// Generated by LLVM NVPTX Back-End | |
// | |
.version 6.3 | |
.target sm_75 | |
.address_size 64 | |
// .globl _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64 // -- Begin function _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64 | |
.weak .global .align 8 .u64 exception_flag; | |
// @_Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64 | |
.visible .entry _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64( | |
.param .align 8 .b8 _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_0[16], | |
.param .align 8 .b8 _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_1[72], | |
.param .u64 _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_2 | |
) | |
{ | |
.reg .pred %p<10>; | |
.reg .b16 %rs<6>; | |
.reg .b32 %r<28>; | |
.reg .b64 %rd<63>; | |
// %bb.0: // %top | |
ld.param.u64 %rd39, [_Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_2]; | |
setp.lt.s64 %p1, %rd39, 1; | |
@%p1 bra LBB0_8; | |
// %bb.1: // %L12.preheader | |
mov.b64 %rd37, _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_0; | |
mov.b64 %rd38, _Z27julia_broadcast_kernel_342715CuKernelContext13CuDeviceArrayI12StaticStringILi11EELi1E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64EE2__5TupleI8ExtrudedI13CuDeviceArrayI12StaticStringILi4EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EE8ExtrudedI13CuDeviceArrayI12StaticStringILi7EELi1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEE5Int64_param_1; | |
ld.param.u64 %rd1, [%rd38+8]; | |
ld.param.u64 %rd3, [%rd38+40]; | |
ld.param.u64 %rd4, [%rd38+56]; | |
ld.param.u8 %rs1, [%rd38+16]; | |
ld.param.u8 %rs2, [%rd38+48]; | |
ld.param.u64 %rd5, [%rd37]; | |
ld.param.u64 %rd6, [%rd37+8]; | |
setp.gt.s64 %p2, %rd39, 0; | |
selp.b64 %rd62, %rd39, 0, %p2; | |
mov.u32 %r1, %tid.x; | |
add.s32 %r2, %r1, 1; | |
cvt.u64.u32 %rd8, %r2; | |
mov.u32 %r3, %ctaid.x; | |
cvt.u64.u32 %rd9, %r3; | |
mov.u32 %r4, %ntid.x; | |
cvt.u64.u32 %rd10, %r4; | |
mov.u32 %r5, %nctaid.x; | |
cvt.u64.u32 %rd11, %r5; | |
and.b16 %rs3, %rs1, 255; | |
setp.eq.s16 %p3, %rs3, 0; | |
@%p3 bra LBB0_3; | |
// %bb.2: // %L12.preheader36 | |
mul.lo.s64 %rd40, %rd10, %rd9; | |
add.s64 %rd61, %rd8, %rd40; | |
add.s64 %rd41, %rd61, -1; | |
mul.lo.s64 %rd13, %rd11, %rd10; | |
shl.b64 %rd42, %rd41, 4; | |
add.s64 %rd43, %rd1, %rd42; | |
add.s64 %rd60, %rd43, 8; | |
shl.b64 %rd15, %rd13, 4; | |
mul.lo.s64 %rd44, %rd41, 44; | |
add.s64 %rd45, %rd6, %rd44; | |
add.s64 %rd59, %rd45, 20; | |
mul.lo.s64 %rd17, %rd13, 44; | |
and.b16 %rs4, %rs2, 255; | |
setp.eq.s16 %p5, %rs4, 0; | |
LBB0_7: // %L12 | |
// =>This Inner Loop Header: Depth=1 | |
setp.ge.s64 %p4, %rd5, %rd61; | |
@%p4 bra LBB0_6; | |
bra.uni LBB0_8; | |
LBB0_6: // %L63 | |
// in Loop: Header=BB0_7 Depth=1 | |
ld.global.u32 %r6, [%rd60+-8]; | |
ld.global.u32 %r7, [%rd60+-4]; | |
ld.global.u32 %r8, [%rd60]; | |
ld.global.u32 %r9, [%rd60+4]; | |
selp.b64 %rd46, %rd4, %rd61, %p5; | |
mul.lo.s64 %rd47, %rd46, 28; | |
add.s64 %rd48, %rd3, %rd47; | |
ld.global.u32 %r10, [%rd48+-28]; | |
ld.global.u32 %r11, [%rd48+-24]; | |
ld.global.u32 %r12, [%rd48+-20]; | |
ld.global.u32 %r13, [%rd48+-16]; | |
ld.global.u32 %r14, [%rd48+-12]; | |
ld.global.u32 %r15, [%rd48+-8]; | |
ld.global.u32 %r16, [%rd48+-4]; | |
st.global.u32 [%rd59+-20], %r6; | |
st.global.u32 [%rd59+-16], %r7; | |
st.global.u32 [%rd59+-12], %r8; | |
st.global.u32 [%rd59+-8], %r9; | |
st.global.u32 [%rd59+-4], %r10; | |
st.global.u32 [%rd59], %r11; | |
st.global.u32 [%rd59+4], %r12; | |
st.global.u32 [%rd59+8], %r13; | |
st.global.u32 [%rd59+12], %r14; | |
st.global.u32 [%rd59+16], %r15; | |
st.global.u32 [%rd59+20], %r16; | |
add.s64 %rd62, %rd62, -1; | |
add.s64 %rd61, %rd61, %rd13; | |
add.s64 %rd60, %rd60, %rd15; | |
add.s64 %rd59, %rd59, %rd17; | |
setp.eq.s64 %p6, %rd62, 0; | |
@%p6 bra LBB0_8; | |
bra.uni LBB0_7; | |
LBB0_3: // %L12.preheader.split.us | |
ld.param.u64 %rd2, [%rd38+24]; | |
shl.b64 %rd49, %rd2, 4; | |
add.s64 %rd18, %rd1, %rd49; | |
mul.lo.s64 %rd50, %rd10, %rd9; | |
add.s64 %rd57, %rd8, %rd50; | |
mul.lo.s64 %rd51, %rd57, 44; | |
mul.lo.s64 %rd20, %rd11, %rd10; | |
add.s64 %rd52, %rd51, %rd6; | |
add.s64 %rd56, %rd52, -24; | |
mul.lo.s64 %rd22, %rd20, 44; | |
and.b16 %rs5, %rs2, 255; | |
setp.eq.s16 %p8, %rs5, 0; | |
LBB0_4: // %L12.us | |
// =>This Inner Loop Header: Depth=1 | |
setp.lt.s64 %p7, %rd5, %rd57; | |
@%p7 bra LBB0_8; | |
// %bb.5: // %L63.us | |
// in Loop: Header=BB0_4 Depth=1 | |
ld.global.u32 %r17, [%rd18+-16]; | |
ld.global.u32 %r18, [%rd18+-12]; | |
ld.global.u32 %r19, [%rd18+-8]; | |
ld.global.u32 %r20, [%rd18+-4]; | |
selp.b64 %rd53, %rd4, %rd57, %p8; | |
mul.lo.s64 %rd54, %rd53, 28; | |
add.s64 %rd55, %rd3, %rd54; | |
ld.global.u32 %r21, [%rd55+-28]; | |
ld.global.u32 %r22, [%rd55+-24]; | |
ld.global.u32 %r23, [%rd55+-20]; | |
ld.global.u32 %r24, [%rd55+-16]; | |
ld.global.u32 %r25, [%rd55+-12]; | |
ld.global.u32 %r26, [%rd55+-8]; | |
ld.global.u32 %r27, [%rd55+-4]; | |
st.global.u32 [%rd56+-20], %r17; | |
st.global.u32 [%rd56+-16], %r18; | |
st.global.u32 [%rd56+-12], %r19; | |
st.global.u32 [%rd56+-8], %r20; | |
st.global.u32 [%rd56+-4], %r21; | |
st.global.u32 [%rd56], %r22; | |
st.global.u32 [%rd56+4], %r23; | |
st.global.u32 [%rd56+8], %r24; | |
st.global.u32 [%rd56+12], %r25; | |
st.global.u32 [%rd56+16], %r26; | |
st.global.u32 [%rd56+20], %r27; | |
add.s64 %rd62, %rd62, -1; | |
add.s64 %rd57, %rd57, %rd20; | |
add.s64 %rd56, %rd56, %rd22; | |
setp.eq.s64 %p9, %rd62, 0; | |
@%p9 bra LBB0_8; | |
bra.uni LBB0_4; | |
LBB0_8: // %L62 | |
ret; | |
// -- End function | |
} | |
#+END_EXAMPLE
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment