Created
April 20, 2022 18:03
-
-
Save davidberard98/78bbdc7a7e52b8466940e96b2f24cf13 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
srun: error: ioctl(TIOCGWINSZ): Inappropriate ioctl for device | |
srun: error: Not using a pseudo-terminal, disregarding --pty option | |
[DUMP graph_fuser.cpp:2323] Before Fusion: | |
[DUMP graph_fuser.cpp:2323] graph(%t1.1 : Tensor, | |
[DUMP graph_fuser.cpp:2323] %t2.1 : Tensor, | |
[DUMP graph_fuser.cpp:2323] %t3.1 : Tensor, | |
[DUMP graph_fuser.cpp:2323] %t4.1 : Tensor, | |
[DUMP graph_fuser.cpp:2323] %i1.1 : int, | |
[DUMP graph_fuser.cpp:2323] %i2.1 : int): | |
[DUMP graph_fuser.cpp:2323] %9 : int = prim::Constant[value=-1]() # /fsx/users/dberard/pytorch/33-repro.py:8:28 | |
[DUMP graph_fuser.cpp:2323] %8 : bool = prim::Constant[value=0]() # /fsx/users/dberard/pytorch/33-repro.py:8:32 | |
[DUMP graph_fuser.cpp:2323] %7 : int[] = prim::Constant[value=[1, 12, 64, 4096]]() | |
[DUMP graph_fuser.cpp:2323] %6 : int[] = prim::Constant[value=[12, 64, 4096]]() | |
[DUMP graph_fuser.cpp:2323] %10 : Tensor = prim::profile[profiled_type=Double(requires_grad=0, device=cuda:0), seen_none=0](%t3.1) | |
[DUMP graph_fuser.cpp:2323] %11 : Tensor = prim::profile[profiled_type=Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%t4.1) | |
[DUMP graph_fuser.cpp:2323] %v1.1 : Tensor = aten::sub(%10, %11, %i2.1) # /fsx/users/dberard/pytorch/33-repro.py:4:9 | |
[DUMP graph_fuser.cpp:2323] %13 : Tensor = prim::profile[profiled_type=Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v1.1) | |
[DUMP graph_fuser.cpp:2323] %14 : Tensor = prim::profile[profiled_type=Double(requires_grad=0, device=cuda:0), seen_none=0](%t2.1) | |
[DUMP graph_fuser.cpp:2323] %v2.1 : Tensor = aten::mul(%13, %14) # /fsx/users/dberard/pytorch/33-repro.py:5:9 | |
[DUMP graph_fuser.cpp:2323] %16 : Tensor = prim::profile[profiled_type=Float(12, 64, 4096, strides=[262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%t1.1) | |
[DUMP graph_fuser.cpp:2323] %v3.1 : Tensor = aten::reshape(%16, %7) # /fsx/users/dberard/pytorch/33-repro.py:6:9 | |
[DUMP graph_fuser.cpp:2323] %18 : Tensor = prim::profile[profiled_type=Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v3.1) | |
[DUMP graph_fuser.cpp:2323] %19 : Tensor = prim::profile[profiled_type=Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v2.1) | |
[DUMP graph_fuser.cpp:2323] %v4.1 : Tensor = aten::add(%18, %19, %i1.1) # /fsx/users/dberard/pytorch/33-repro.py:7:9 | |
[DUMP graph_fuser.cpp:2323] %21 : Tensor = prim::profile[profiled_type=Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v4.1) | |
[DUMP graph_fuser.cpp:2323] %v5.1 : Tensor = aten::_softmax(%21, %9, %8) # /fsx/users/dberard/pytorch/33-repro.py:8:9 | |
[DUMP graph_fuser.cpp:2323] %23 : Tensor = prim::profile[profiled_type=Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v5.1) | |
[DUMP graph_fuser.cpp:2323] %v6.1 : Tensor = aten::reshape(%23, %6) # /fsx/users/dberard/pytorch/33-repro.py:9:9 | |
[DUMP graph_fuser.cpp:2323] %25 : Tensor = prim::profile[profiled_type=Float(12, 64, 4096, strides=[262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v6.1) | |
[DUMP graph_fuser.cpp:2323] return (%25) | |
[DUMP graph_fuser.cpp:2403] Before Compilation: | |
[DUMP graph_fuser.cpp:2403] graph(%t1.1 : Tensor, | |
[DUMP graph_fuser.cpp:2403] %t2.1 : Tensor, | |
[DUMP graph_fuser.cpp:2403] %t3.1 : Tensor, | |
[DUMP graph_fuser.cpp:2403] %t4.1 : Tensor, | |
[DUMP graph_fuser.cpp:2403] %i1.1 : int, | |
[DUMP graph_fuser.cpp:2403] %i2.1 : int): | |
[DUMP graph_fuser.cpp:2403] %7 : int[] = prim::Constant[value=[1, 12, 64, 4096]]() | |
[DUMP graph_fuser.cpp:2403] %6 : int[] = prim::Constant[value=[12, 64, 4096]]() | |
[DUMP graph_fuser.cpp:2403] %v3.1 : Tensor = aten::reshape(%t1.1, %7) # /fsx/users/dberard/pytorch/33-repro.py:6:9 | |
[DUMP graph_fuser.cpp:2403] %36 : bool = prim::CudaFusionGuard[types=[Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), Double(requires_grad=0, device=cuda:0), Double(requires_grad=0, device=cuda:0), Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0)]](%v3.1, %t2.1, %t3.1, %t4.1) | |
[DUMP graph_fuser.cpp:2403] %35 : Tensor = prim::If(%36) | |
[DUMP graph_fuser.cpp:2403] block0(): | |
[DUMP graph_fuser.cpp:2403] %v5.4 : Tensor = prim::CudaFusionGroup_0(%v3.1, %t2.1, %t3.1, %t4.1, %i1.1, %i2.1) | |
[DUMP graph_fuser.cpp:2403] -> (%v5.4) | |
[DUMP graph_fuser.cpp:2403] block1(): | |
[DUMP graph_fuser.cpp:2403] %v5.1 : Tensor = prim::FallbackGraph_1(%v3.1, %t2.1, %t3.1, %t4.1, %i1.1, %i2.1) | |
[DUMP graph_fuser.cpp:2403] -> (%v5.1) | |
[DUMP graph_fuser.cpp:2403] %v6.1 : Tensor = aten::reshape(%35, %6) # /fsx/users/dberard/pytorch/33-repro.py:9:9 | |
[DUMP graph_fuser.cpp:2403] return (%v6.1) | |
[DUMP graph_fuser.cpp:2403] with prim::CudaFusionGroup_0 = graph(%4 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), | |
[DUMP graph_fuser.cpp:2403] %9 : Double(requires_grad=0, device=cuda:0), | |
[DUMP graph_fuser.cpp:2403] %11 : Double(requires_grad=0, device=cuda:0), | |
[DUMP graph_fuser.cpp:2403] %12 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0), | |
[DUMP graph_fuser.cpp:2403] %6 : int, | |
[DUMP graph_fuser.cpp:2403] %13 : int): | |
[DUMP graph_fuser.cpp:2403] %2 : bool = prim::Constant[value=0]() # /fsx/users/dberard/pytorch/33-repro.py:8:32 | |
[DUMP graph_fuser.cpp:2403] %1 : int = prim::Constant[value=-1]() # /fsx/users/dberard/pytorch/33-repro.py:8:28 | |
[DUMP graph_fuser.cpp:2403] %v1.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::sub(%11, %12, %13) # /fsx/users/dberard/pytorch/33-repro.py:4:9 | |
[DUMP graph_fuser.cpp:2403] %v2.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::mul(%v1.1, %9) # /fsx/users/dberard/pytorch/33-repro.py:5:9 | |
[DUMP graph_fuser.cpp:2403] %v4.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::add(%4, %v2.1, %6) # /fsx/users/dberard/pytorch/33-repro.py:7:9 | |
[DUMP graph_fuser.cpp:2403] %v5.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::_softmax(%v4.1, %1, %2) # /fsx/users/dberard/pytorch/33-repro.py:8:9 | |
[DUMP graph_fuser.cpp:2403] return (%v5.1) | |
[DUMP graph_fuser.cpp:2403] with prim::FallbackGraph_1 = graph(%v3.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), | |
[DUMP graph_fuser.cpp:2403] %t2.1 : Double(requires_grad=0, device=cuda:0), | |
[DUMP graph_fuser.cpp:2403] %t3.1 : Double(requires_grad=0, device=cuda:0), | |
[DUMP graph_fuser.cpp:2403] %t4.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0), | |
[DUMP graph_fuser.cpp:2403] %i1.1 : int, | |
[DUMP graph_fuser.cpp:2403] %i2.1 : int): | |
[DUMP graph_fuser.cpp:2403] %6 : bool = prim::Constant[value=0]() # /fsx/users/dberard/pytorch/33-repro.py:8:32 | |
[DUMP graph_fuser.cpp:2403] %7 : int = prim::Constant[value=-1]() # /fsx/users/dberard/pytorch/33-repro.py:8:28 | |
[DUMP graph_fuser.cpp:2403] %v1.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::sub(%t3.1, %t4.1, %i2.1) # /fsx/users/dberard/pytorch/33-repro.py:4:9 | |
[DUMP graph_fuser.cpp:2403] %v2.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::mul(%v1.1, %t2.1) # /fsx/users/dberard/pytorch/33-repro.py:5:9 | |
[DUMP graph_fuser.cpp:2403] %v4.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::add(%v3.1, %v2.1, %i1.1) # /fsx/users/dberard/pytorch/33-repro.py:7:9 | |
[DUMP graph_fuser.cpp:2403] %v5.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::_softmax(%v4.1, %7, %6) # /fsx/users/dberard/pytorch/33-repro.py:8:9 | |
[DUMP graph_fuser.cpp:2403] return (%v5.1) | |
===== Reduction Stats ========
total_reduction_numel: 4096
total_iteration_numel: 768
inner_most_dimension_numel: 4096
vectorize_factor: 2
n_tensor_inputs: 2
max_input_dtype_size: 4
max_persistent_buffer_size: 32768
max_multi_reduction_factor: 4
block(512, 1, 1)
===== Reduction Parameters ========
Tag: Inner Persistent Heuristic.
Red On Fastest Dim
Persistent Kernel
Batches per block: 4
Iteration Domain: blockIdx.x / unroll / factor 2
Inner Reduction Domain: cross block - threadIdx.x / pad to warp / persistent batch - 4 / vectorize / factor 2
Launch Parameters: BlockDim.x = -1, BlockDim.y = 1, BlockDim.z = -1, GridDim.x = -1, GridDim.y = -1, GridDim.z = -1, Smem Size = 0
====================================
===== Reduction Stats ========
total_reduction_numel: 4096
total_iteration_numel: 768
inner_most_dimension_numel: 4096
vectorize_factor: 2
n_tensor_inputs: 2
max_input_dtype_size: 4
max_persistent_buffer_size: 32768
max_multi_reduction_factor: 4
block(512, 1, 1)
===== Reduction Parameters ========
Tag: Inner Persistent Heuristic.
Red On Fastest Dim
Persistent Kernel
Batches per block: 4
Iteration Domain: blockIdx.x / unroll / factor 2
Inner Reduction Domain: cross block - threadIdx.x / pad to warp / persistent batch - 4 / vectorize / factor 2
Launch Parameters: BlockDim.x = -1, BlockDim.y = 1, BlockDim.z = -1, GridDim.x = -1, GridDim.y = -1, GridDim.z = -1, Smem Size = 0
====================================
Inputs: | |
T0_g[ iblockIdx.x238{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS239{1}, iS237{2}, iS245{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}, iS242{4}, iS244{1}, iS241{2} ], float | |
T1_g[ 0 ], double | |
T2_g[ 0 ], double | |
T3_g[ sbS322{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbS323{1}, sbS321{2}, iS329{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}, iS326{4}, iS328{1}, iS325{2} ], float | |
i6, int64_t | |
i7, int64_t | |
Outputs: | |
T20_g[ iblockIdx.x166{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS167{1}, iUR165{2}, ithreadIdx.x359{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS356{4}, iUS358{1}, iV355{2} ] produce_pos( 2), double | |
%kernel_math { | |
T21_l[ iblockIdx.x214{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS215{1}, iUR213{2}, ithreadIdx.x221{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS218{4}, iUS220{1}, iV217{2} ] ca_pos( 2 ) | |
= T0_g[ iblockIdx.x238{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS239{1}, iS237{2}, iS245{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}, iS242{4}, iS244{1}, iS241{2} ]; | |
T11_l[ iblockIdx.x190{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS191{1}, iS189{2}, ithreadIdx.x197{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS194{4}, iUS196{1}, iS193{2} ] ca_pos( 7 ) produce_pos( 2) | |
= (double)(T21_l[ iblockIdx.x214{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS215{1}, iUR213{2}, ithreadIdx.x221{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS218{4}, iUS220{1}, iV217{2} ] ca_pos( 2 )); | |
T23_l[ 0 ] | |
= T2_g[ 0 ]; | |
T4_l[ bblockIdx.x274{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS275{1}, bS273{2}, bthreadIdx.x281{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS278{4}, bUS280{1}, bS277{2} ] = broadcast( T23_l[ 0 ] ) | |
T24_l[ sbblockIdx.x310{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS311{1}, sbUR309{2}, ithreadIdx.x317{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS314{4}, iUS316{1}, iV313{2} ] | |
= T3_g[ sbS322{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbS323{1}, sbS321{2}, iS329{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}, iS326{4}, iS328{1}, iS325{2} ]; | |
d10 = (double)(i7); | |
T5_l[ sbblockIdx.x298{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS299{1}, sbS297{2}, ithreadIdx.x305{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS302{4}, iUS304{1}, iS301{2} ] ca_pos( 7 ) | |
= T24_l[ sbblockIdx.x310{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS311{1}, sbUR309{2}, ithreadIdx.x317{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS314{4}, iUS316{1}, iV313{2} ] | |
* d10; | |
T6_l[ sbblockIdx.x286{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS287{1}, sbS285{2}, ithreadIdx.x293{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS290{4}, iUS292{1}, iS289{2} ] ca_pos( 7 ) produce_pos( 7) | |
= (double)(T5_l[ sbblockIdx.x298{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS299{1}, sbS297{2}, ithreadIdx.x305{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS302{4}, iUS304{1}, iS301{2} ] ca_pos( 7 )); | |
T7_l[ sbblockIdx.x250{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS251{1}, sbS249{2}, ithreadIdx.x257{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS254{4}, iUS256{1}, iS253{2} ] ca_pos( 7 ) produce_pos( 7) | |
= T4_l[ bblockIdx.x274{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS275{1}, bS273{2}, bthreadIdx.x281{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS278{4}, bUS280{1}, bS277{2} ] | |
- T6_l[ sbblockIdx.x286{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS287{1}, sbS285{2}, ithreadIdx.x293{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS290{4}, iUS292{1}, iS289{2} ] ca_pos( 7 ) produce_pos( 7); | |
T22_l[ 0 ] | |
= T1_g[ 0 ]; | |
T8_l[ bblockIdx.x262{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS263{1}, bS261{2}, bthreadIdx.x269{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS266{4}, bUS268{1}, bS265{2} ] = broadcast( T22_l[ 0 ] ) | |
T9_l[ sbblockIdx.x226{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS227{1}, sbS225{2}, ithreadIdx.x233{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS230{4}, iUS232{1}, iS229{2} ] ca_pos( 7 ) produce_pos( 7) | |
= T7_l[ sbblockIdx.x250{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS251{1}, sbS249{2}, ithreadIdx.x257{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS254{4}, iUS256{1}, iS253{2} ] ca_pos( 7 ) produce_pos( 7) | |
* T8_l[ bblockIdx.x262{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS263{1}, bS261{2}, bthreadIdx.x269{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS266{4}, bUS268{1}, bS265{2} ]; | |
d19 = (double)(i6); | |
T10_l[ sbblockIdx.x202{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS203{1}, sbS201{2}, ithreadIdx.x209{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS206{4}, iUS208{1}, iS205{2} ] ca_pos( 7 ) produce_pos( 7) | |
= T9_l[ sbblockIdx.x226{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS227{1}, sbS225{2}, ithreadIdx.x233{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS230{4}, iUS232{1}, iS229{2} ] ca_pos( 7 ) produce_pos( 7) | |
* d19; | |
T12_l[ iblockIdx.x178{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS179{1}, iS177{2}, ithreadIdx.x185{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS182{4}, iUS184{1}, iS181{2} ] ca_pos( 3 ) produce_pos( 7) | |
= T11_l[ iblockIdx.x190{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS191{1}, iS189{2}, ithreadIdx.x197{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS194{4}, iUS196{1}, iS193{2} ] ca_pos( 7 ) produce_pos( 2) | |
+ T10_l[ sbblockIdx.x202{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS203{1}, sbS201{2}, ithreadIdx.x209{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS206{4}, iUS208{1}, iS205{2} ] ca_pos( 7 ) produce_pos( 7); | |
T26_l[ iblockIdx.x108{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS109{1}, iS107{2}, ithreadIdx.x115{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS112{4}rf, rUS114{1}rf, rS111{2}rf ] ca_pos( 4 ) produce_pos( 3) = reduction( T12_l[ iblockIdx.x178{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS179{1}, iS177{2}, ithreadIdx.x185{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS182{4}, iUS184{1}, iS181{2} ] ca_pos( 3 ) produce_pos( 7), op = fmax, initial value = double(-inf), fused = 0 ) | |
T13_l[ iblockIdx.x123{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS124{1}, iS122{2}, rthreadIdx.x125{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) = reduction( T26_l[ iblockIdx.x108{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS109{1}, iS107{2}, ithreadIdx.x115{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS112{4}rf, rUS114{1}rf, rS111{2}rf ] ca_pos( 4 ) produce_pos( 3), op = fmax, initial value = double(-inf), fused = 0 ) | |
T14_l[ iblockIdx.x130{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS131{1}, iS129{2}, bthreadIdx.x365{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS362{4}, bUS364{1}, bS361{2} ] ca_pos( 3 ) produce_pos( 3) = broadcast( T13_l[ iblockIdx.x123{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS124{1}, iS122{2}, rthreadIdx.x125{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) ) | |
T15_l[ iblockIdx.x136{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS137{1}, iS135{2}, ithreadIdx.x335{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS332{4}, iUS334{1}, iS331{2} ] ca_pos( 7 ) produce_pos( 3) | |
= T12_l[ iblockIdx.x178{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS179{1}, iS177{2}, ithreadIdx.x185{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS182{4}, iUS184{1}, iS181{2} ] ca_pos( 3 ) produce_pos( 7) | |
- T14_l[ iblockIdx.x130{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS131{1}, iS129{2}, bthreadIdx.x365{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS362{4}, bUS364{1}, bS361{2} ] ca_pos( 3 ) produce_pos( 3); | |
T16_l[ iblockIdx.x142{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS143{1}, iS141{2}, ithreadIdx.x341{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS338{4}, iUS340{1}, iS337{2} ] ca_pos( 3 ) produce_pos( 7) | |
= exp(T15_l[ iblockIdx.x136{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS137{1}, iS135{2}, ithreadIdx.x335{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS332{4}, iUS334{1}, iS331{2} ] ca_pos( 7 ) produce_pos( 3)); | |
T27_l[ iblockIdx.x386{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS387{1}, iS385{2}, ithreadIdx.x393{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS390{4}rf, rUS392{1}rf, rS389{2}rf ] ca_pos( 4 ) produce_pos( 3) = reduction( T16_l[ iblockIdx.x142{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS143{1}, iS141{2}, ithreadIdx.x341{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS338{4}, iUS340{1}, iS337{2} ] ca_pos( 3 ) produce_pos( 7), op = add, initial value = double(0), fused = 0 ) | |
T17_l[ iblockIdx.x401{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS402{1}, iS400{2}, rthreadIdx.x403{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) = reduction( T27_l[ iblockIdx.x386{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS387{1}, iS385{2}, ithreadIdx.x393{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS390{4}rf, rUS392{1}rf, rS389{2}rf ] ca_pos( 4 ) produce_pos( 3), op = add, initial value = double(0), fused = 0 ) | |
T18_l[ iblockIdx.x160{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS161{1}, iS159{2}, bthreadIdx.x377{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS374{4}, bUS376{1}, bS373{2} ] ca_pos( 3 ) produce_pos( 3) = broadcast( T17_l[ iblockIdx.x401{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS402{1}, iS400{2}, rthreadIdx.x403{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) ) | |
T19_l[ iblockIdx.x172{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS173{1}, iS171{2}, bthreadIdx.x371{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS368{4}, bUS370{1}, bS367{2} ] ca_pos( 3 ) produce_pos( 3) | |
= reciprocal(T18_l[ iblockIdx.x160{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS161{1}, iS159{2}, bthreadIdx.x377{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS374{4}, bUS376{1}, bS373{2} ] ca_pos( 3 ) produce_pos( 3)); | |
T25_l[ iblockIdx.x154{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS155{1}, iS153{2}, ithreadIdx.x353{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS350{4}, iUS352{1}, iS349{2} ] ca_pos( 2 ) produce_pos( 3) | |
= T16_l[ iblockIdx.x142{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS143{1}, iS141{2}, ithreadIdx.x341{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS338{4}, iUS340{1}, iS337{2} ] ca_pos( 3 ) produce_pos( 7) | |
* T19_l[ iblockIdx.x172{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS173{1}, iS171{2}, bthreadIdx.x371{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS368{4}, bUS370{1}, bS367{2} ] ca_pos( 3 ) produce_pos( 3); | |
T20_g[ iblockIdx.x166{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS167{1}, iUR165{2}, ithreadIdx.x359{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS356{4}, iUS358{1}, iV355{2} ] produce_pos( 2) | |
= T25_l[ iblockIdx.x154{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS155{1}, iS153{2}, ithreadIdx.x353{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS350{4}, iUS352{1}, iS349{2} ] ca_pos( 2 ) produce_pos( 3); | |
} | |
======= Codegen output for kernel: kernel1 ======= | |
__global__ void kernel1(Tensor<float, 4> T0, Tensor<double, 0> T1, Tensor<double, 0> T2, Tensor<float, 4> T3, int64_t i6, int64_t i7, Tensor<double, 4> T20) { | |
alignas(8) extern __shared__ char array[]; | |
void* shared_mem = array; | |
NVFUSER_DEFINE_MAGIC_ZERO | |
double T22[1]; | |
T22[0] = 0; | |
T22[0] | |
= T1[0]; | |
double T8[1]; | |
T8[0] | |
= T22[0]; | |
double T23[1]; | |
T23[0] = 0; | |
T23[0] | |
= T2[0]; | |
double T4[1]; | |
T4[0] | |
= T23[0]; | |
double d19; | |
d19 = (double)(i6); | |
double d10; | |
d10 = (double)(i7); | |
Array<float, ((4 * 1) * 2), 2> T24; | |
#pragma unroll | |
for(nvfuser_index_t i295 = 0; i295 < 4; ++i295) { | |
T24.set(0); | |
} | |
NVFUSER_UPDATE_MAGIC_ZERO | |
if (((((((ceilDiv((ceilDiv(T0.size[3], 2)), 4)) * 3) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3])) { | |
#pragma unroll | |
for(nvfuser_index_t i288 = 0; i288 < 4; ++i288) { | |
loadGlobalToLocal<float, 2, false>(&T24[(i288 * 2)], &T3[((((i288 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2)]); | |
} | |
NVFUSER_UPDATE_MAGIC_ZERO | |
} else { | |
#pragma unroll | |
for(nvfuser_index_t i288 = 0; i288 < 4; ++i288) { | |
int64_t i361; | |
i361 = (((i288 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2; | |
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))) && ((((((i288 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3]))) { | |
loadGlobalToLocal<float, 2, false>(&T24[(i288 * 2)], &T3[i361]); | |
} | |
} | |
NVFUSER_UPDATE_MAGIC_ZERO | |
} | |
if ((((((((ceilDiv((ceilDiv(T0.size[3], 2)), 4)) * 3) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3]) && (((((nvfuser_index_t)blockIdx.x) * 2) + 1) < (1 * (T0.size[1] * T0.size[2]))))) { | |
Array<float, (((2 * 4) * 1) * 2), 2> T21; | |
#pragma unroll | |
for(nvfuser_index_t i304 = 0; i304 < 2; ++i304) { | |
#pragma unroll | |
for(nvfuser_index_t i302 = 0; i302 < 4; ++i302) { | |
T21.set(0); | |
} | |
} | |
NVFUSER_UPDATE_MAGIC_ZERO | |
#pragma unroll | |
for(nvfuser_index_t i282 = 0; i282 < 2; ++i282) { | |
#pragma unroll | |
for(nvfuser_index_t i284 = 0; i284 < 4; ++i284) { | |
loadGlobalToLocal<float, 2, false>(&T21[(i282 * 8) + (i284 * 2)], &T0[(((((nvfuser_index_t)blockIdx.x) * 2) + i282) * T0.size[3]) + ((((i284 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2)]); | |
} | |
} | |
NVFUSER_UPDATE_MAGIC_ZERO | |
Array<double, (((2 * 4) * 1) * 2), 2> T25; | |
#pragma unroll | |
for(nvfuser_index_t i260 = 0; i260 < 2; ++i260) { | |
double T12[((4 * 1) * 2)]; | |
#pragma unroll | |
for(nvfuser_index_t i279 = 0; i279 < 4; ++i279) { | |
#pragma unroll | |
for(nvfuser_index_t i281 = 0; i281 < 2; ++i281) { | |
float T5[1]; | |
T5[0] | |
= T24[((i279 * 2) + i281)] | |
* (float) d10; | |
double T6[1]; | |
T6[0] | |
= (double)(T5[0]); | |
double T7[1]; | |
T7[0] | |
= T4[0] | |
- T6[0]; | |
double T11[1]; | |
T11[0] | |
= (double)(T21[(i260 * 8) + ((i279 * 2) + i281)]); | |
double T9[1]; | |
T9[0] | |
= T7[0] | |
* T8[0]; | |
double T10[1]; | |
T10[0] | |
= T9[0] | |
* d19; | |
T12[((i279 * 2) + i281)] | |
= T11[0] | |
+ T10[0]; | |
} | |
} | |
double T13[1]; | |
T13[0] = NEG_INFINITY; | |
double T26[1]; | |
T26[0] = NEG_INFINITY; | |
#pragma unroll | |
for(nvfuser_index_t i275 = 0; i275 < 4; ++i275) { | |
#pragma unroll | |
for(nvfuser_index_t i277 = 0; i277 < 2; ++i277) { | |
T26[0] = fmax( | |
T26[0], | |
T12[((i275 * 2) + i277)]); | |
} | |
} | |
warp::warpReduceTIDX<false>( | |
T13[0], | |
T26[0], | |
[](double &a, double b) { a = fmax(a, b); }, | |
threadIdx, | |
blockDim, | |
static_cast<double*>(shared_mem), | |
true, | |
double(NEG_INFINITY)); | |
double T14[1]; | |
broadcast::blockBroadcast<true, false, false>( | |
T14[0], | |
T13[0], | |
static_cast<double*>(shared_mem), | |
true); | |
// Alias Allocation - register | |
auto& T16 = T12; | |
#pragma unroll | |
for(nvfuser_index_t i271 = 0; i271 < 4; ++i271) { | |
#pragma unroll | |
for(nvfuser_index_t i273 = 0; i273 < 2; ++i273) { | |
double T15[1]; | |
T15[0] | |
= T12[((i271 * 2) + i273)] | |
- T14[0]; | |
T16[((i271 * 2) + i273)] | |
= exp(T15[0]); | |
} | |
} | |
double T17[1]; | |
T17[0] = 0; | |
double T27[1]; | |
T27[0] = 0; | |
#pragma unroll | |
for(nvfuser_index_t i267 = 0; i267 < 4; ++i267) { | |
#pragma unroll | |
for(nvfuser_index_t i269 = 0; i269 < 2; ++i269) { | |
T27[0] | |
= T27[0] | |
+ T16[((i267 * 2) + i269)]; | |
} | |
} | |
warp::warpReduceTIDX<false>( | |
T17[0], | |
T27[0], | |
[](double &a, double b) { a = a + b; }, | |
threadIdx, | |
blockDim, | |
static_cast<double*>(shared_mem), | |
true, | |
double(0)); | |
double T18[1]; | |
broadcast::blockBroadcast<true, false, false>( | |
T18[0], | |
T17[0], | |
static_cast<double*>(shared_mem), | |
true); | |
double T19[1]; | |
T19[0] | |
= reciprocal(T18[0]); | |
#pragma unroll | |
for(nvfuser_index_t i262 = 0; i262 < 4; ++i262) { | |
#pragma unroll | |
for(nvfuser_index_t i264 = 0; i264 < 2; ++i264) { | |
T25[(i260 * 8) + ((i262 * 2) + i264)] | |
= T16[((i262 * 2) + i264)] | |
* T19[0]; | |
} | |
} | |
} | |
NVFUSER_UPDATE_MAGIC_ZERO | |
#pragma unroll | |
for(nvfuser_index_t i255 = 0; i255 < 2; ++i255) { | |
#pragma unroll | |
for(nvfuser_index_t i257 = 0; i257 < 4; ++i257) { | |
loadLocalToGlobal<double, 2, false>( &T20[(((((nvfuser_index_t)blockIdx.x) * 2) + i255) * T0.size[3]) + ((((i257 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2)], &T25[(i255 * 8) + (i257 * 2)]); | |
} | |
} | |
NVFUSER_UPDATE_MAGIC_ZERO | |
} else { | |
Array<float, (((2 * 4) * 1) * 2), 2> T21; | |
#pragma unroll | |
for(nvfuser_index_t i304 = 0; i304 < 2; ++i304) { | |
#pragma unroll | |
for(nvfuser_index_t i302 = 0; i302 < 4; ++i302) { | |
T21.set(0); | |
} | |
} | |
NVFUSER_UPDATE_MAGIC_ZERO | |
#pragma unroll | |
for(nvfuser_index_t i282 = 0; i282 < 2; ++i282) { | |
int64_t i1224; | |
i1224 = (((nvfuser_index_t)blockIdx.x) * 2) + i282; | |
#pragma unroll | |
for(nvfuser_index_t i284 = 0; i284 < 4; ++i284) { | |
int64_t i661; | |
i661 = (((i284 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2; | |
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))) && ((((((i284 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3])) && (i1224 < (1 * (T0.size[1] * T0.size[2]))))) { | |
loadGlobalToLocal<float, 2, false>(&T21[(i282 * 8) + (i284 * 2)], &T0[(((((nvfuser_index_t)blockIdx.x) * 2) + i282) * T0.size[3]) + i661]); | |
} | |
} | |
} | |
NVFUSER_UPDATE_MAGIC_ZERO | |
Array<double, (((2 * 4) * 1) * 2), 2> T25; | |
#pragma unroll | |
for(nvfuser_index_t i260 = 0; i260 < 2; ++i260) { | |
int64_t i1272; | |
i1272 = (((nvfuser_index_t)blockIdx.x) * 2) + i260; | |
double T12[((4 * 1) * 2)]; | |
#pragma unroll | |
for(nvfuser_index_t i279 = 0; i279 < 4; ++i279) { | |
#pragma unroll | |
for(nvfuser_index_t i281 = 0; i281 < 2; ++i281) { | |
float T5[1]; | |
T5[0] | |
= T24[((i279 * 2) + i281)] | |
* (float) d10; | |
double T6[1]; | |
T6[0] | |
= (double)(T5[0]); | |
double T7[1]; | |
T7[0] | |
= T4[0] | |
- T6[0]; | |
double T11[1]; | |
T11[0] | |
= (double)(T21[(i260 * 8) + ((i279 * 2) + i281)]); | |
double T9[1]; | |
T9[0] | |
= T7[0] | |
* T8[0]; | |
double T10[1]; | |
T10[0] | |
= T9[0] | |
* d19; | |
T12[((i279 * 2) + i281)] | |
= T11[0] | |
+ T10[0]; | |
} | |
} | |
double T13[1]; | |
T13[0] = NEG_INFINITY; | |
double T26[1]; | |
T26[0] = NEG_INFINITY; | |
#pragma unroll | |
for(nvfuser_index_t i275 = 0; i275 < 4; ++i275) { | |
#pragma unroll | |
for(nvfuser_index_t i277 = 0; i277 < 2; ++i277) { | |
if ((((((((i275 * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + (i277 + nvfuser_zero)) < T0.size[3]) && (i1272 < (1 * (T0.size[1] * T0.size[2])))) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))))) { | |
T26[0] = fmax( | |
T26[0], | |
T12[((i275 * 2) + i277)]); | |
} | |
} | |
} | |
warp::warpReduceTIDX<false>( | |
T13[0], | |
T26[0], | |
[](double &a, double b) { a = fmax(a, b); }, | |
threadIdx, | |
blockDim, | |
static_cast<double*>(shared_mem), | |
true, | |
double(NEG_INFINITY)); | |
double T14[1]; | |
broadcast::blockBroadcast<true, false, false>( | |
T14[0], | |
T13[0], | |
static_cast<double*>(shared_mem), | |
true); | |
// Alias Allocation - register | |
auto& T16 = T12; | |
#pragma unroll | |
for(nvfuser_index_t i271 = 0; i271 < 4; ++i271) { | |
#pragma unroll | |
for(nvfuser_index_t i273 = 0; i273 < 2; ++i273) { | |
double T15[1]; | |
T15[0] | |
= T12[((i271 * 2) + i273)] | |
- T14[0]; | |
T16[((i271 * 2) + i273)] | |
= exp(T15[0]); | |
} | |
} | |
double T17[1]; | |
T17[0] = 0; | |
double T27[1]; | |
T27[0] = 0; | |
#pragma unroll | |
for(nvfuser_index_t i267 = 0; i267 < 4; ++i267) { | |
#pragma unroll | |
for(nvfuser_index_t i269 = 0; i269 < 2; ++i269) { | |
if ((((((((i267 * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + (i269 + nvfuser_zero)) < T0.size[3]) && (i1272 < (1 * (T0.size[1] * T0.size[2])))) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))))) { | |
T27[0] | |
= T27[0] | |
+ T16[((i267 * 2) + i269)]; | |
} | |
} | |
} | |
warp::warpReduceTIDX<false>( | |
T17[0], | |
T27[0], | |
[](double &a, double b) { a = a + b; }, | |
threadIdx, | |
blockDim, | |
static_cast<double*>(shared_mem), | |
true, | |
double(0)); | |
double T18[1]; | |
broadcast::blockBroadcast<true, false, false>( | |
T18[0], | |
T17[0], | |
static_cast<double*>(shared_mem), | |
true); | |
double T19[1]; | |
T19[0] | |
= reciprocal(T18[0]); | |
#pragma unroll | |
for(nvfuser_index_t i262 = 0; i262 < 4; ++i262) { | |
#pragma unroll | |
for(nvfuser_index_t i264 = 0; i264 < 2; ++i264) { | |
T25[(i260 * 8) + ((i262 * 2) + i264)] | |
= T16[((i262 * 2) + i264)] | |
* T19[0]; | |
} | |
} | |
} | |
NVFUSER_UPDATE_MAGIC_ZERO | |
#pragma unroll | |
for(nvfuser_index_t i255 = 0; i255 < 2; ++i255) { | |
int64_t i1334; | |
i1334 = (((nvfuser_index_t)blockIdx.x) * 2) + i255; | |
#pragma unroll | |
for(nvfuser_index_t i257 = 0; i257 < 4; ++i257) { | |
int64_t i903; | |
i903 = (((i257 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2; | |
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))) && ((((((i257 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3])) && (i1334 < (1 * (T0.size[1] * T0.size[2]))))) { | |
loadLocalToGlobal<double, 2, false>( &T20[(((((nvfuser_index_t)blockIdx.x) * 2) + i255) * T0.size[3]) + i903], &T25[(i255 * 8) + (i257 * 2)]); | |
} | |
} | |
} | |
NVFUSER_UPDATE_MAGIC_ZERO | |
} | |
} | |
======================================
ptxas info : 982 bytes gmem
ptxas info : Compiling entry function '_ZN11CudaCodeGen7kernel1ENS_6TensorIfLi4EEENS0_IdLi0EEES2_S1_xxNS0_IdLi4EEE' for 'sm_80'
ptxas info : Function properties for _ZN11CudaCodeGen7kernel1ENS_6TensorIfLi4EEENS0_IdLi0EEES2_S1_xxNS0_IdLi4EEE
ptxas . 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 135 registers, 16 bytes smem, 504 bytes cmem[0], 96 bytes cmem[2]
Launch Parameters: BlockDim.x = 512, BlockDim.y = -1, BlockDim.z = -1, GridDim.x = 384, GridDim.y = -1, GridDim.z = -1, Smem Size = 4096
Traceback (most recent call last):
  File "/fsx/users/dberard/pytorch/33-repro.py", line 21, in <module>
    fn_s(t1, t2, t3, t4, 1, 1)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: CUDA driver error: too many resources requested for launch
srun: error: dev-st-p4d24xlarge-1: task 0: Exited with exit code 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment