@davidberard98
Created April 20, 2022 18:03
srun: error: ioctl(TIOCGWINSZ): Inappropriate ioctl for device
srun: error: Not using a pseudo-terminal, disregarding --pty option
[DUMP graph_fuser.cpp:2323] Before Fusion:
[DUMP graph_fuser.cpp:2323] graph(%t1.1 : Tensor,
[DUMP graph_fuser.cpp:2323] %t2.1 : Tensor,
[DUMP graph_fuser.cpp:2323] %t3.1 : Tensor,
[DUMP graph_fuser.cpp:2323] %t4.1 : Tensor,
[DUMP graph_fuser.cpp:2323] %i1.1 : int,
[DUMP graph_fuser.cpp:2323] %i2.1 : int):
[DUMP graph_fuser.cpp:2323] %9 : int = prim::Constant[value=-1]() # /fsx/users/dberard/pytorch/33-repro.py:8:28
[DUMP graph_fuser.cpp:2323] %8 : bool = prim::Constant[value=0]() # /fsx/users/dberard/pytorch/33-repro.py:8:32
[DUMP graph_fuser.cpp:2323] %7 : int[] = prim::Constant[value=[1, 12, 64, 4096]]()
[DUMP graph_fuser.cpp:2323] %6 : int[] = prim::Constant[value=[12, 64, 4096]]()
[DUMP graph_fuser.cpp:2323] %10 : Tensor = prim::profile[profiled_type=Double(requires_grad=0, device=cuda:0), seen_none=0](%t3.1)
[DUMP graph_fuser.cpp:2323] %11 : Tensor = prim::profile[profiled_type=Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%t4.1)
[DUMP graph_fuser.cpp:2323] %v1.1 : Tensor = aten::sub(%10, %11, %i2.1) # /fsx/users/dberard/pytorch/33-repro.py:4:9
[DUMP graph_fuser.cpp:2323] %13 : Tensor = prim::profile[profiled_type=Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v1.1)
[DUMP graph_fuser.cpp:2323] %14 : Tensor = prim::profile[profiled_type=Double(requires_grad=0, device=cuda:0), seen_none=0](%t2.1)
[DUMP graph_fuser.cpp:2323] %v2.1 : Tensor = aten::mul(%13, %14) # /fsx/users/dberard/pytorch/33-repro.py:5:9
[DUMP graph_fuser.cpp:2323] %16 : Tensor = prim::profile[profiled_type=Float(12, 64, 4096, strides=[262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%t1.1)
[DUMP graph_fuser.cpp:2323] %v3.1 : Tensor = aten::reshape(%16, %7) # /fsx/users/dberard/pytorch/33-repro.py:6:9
[DUMP graph_fuser.cpp:2323] %18 : Tensor = prim::profile[profiled_type=Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v3.1)
[DUMP graph_fuser.cpp:2323] %19 : Tensor = prim::profile[profiled_type=Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v2.1)
[DUMP graph_fuser.cpp:2323] %v4.1 : Tensor = aten::add(%18, %19, %i1.1) # /fsx/users/dberard/pytorch/33-repro.py:7:9
[DUMP graph_fuser.cpp:2323] %21 : Tensor = prim::profile[profiled_type=Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v4.1)
[DUMP graph_fuser.cpp:2323] %v5.1 : Tensor = aten::_softmax(%21, %9, %8) # /fsx/users/dberard/pytorch/33-repro.py:8:9
[DUMP graph_fuser.cpp:2323] %23 : Tensor = prim::profile[profiled_type=Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v5.1)
[DUMP graph_fuser.cpp:2323] %v6.1 : Tensor = aten::reshape(%23, %6) # /fsx/users/dberard/pytorch/33-repro.py:9:9
[DUMP graph_fuser.cpp:2323] %25 : Tensor = prim::profile[profiled_type=Float(12, 64, 4096, strides=[262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v6.1)
[DUMP graph_fuser.cpp:2323] return (%25)
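For readability, here is a minimal sketch of the scripted function being compiled, reconstructed from the profiled graph above. The shapes and dtypes come from the prim::profile annotations; the exact contents of /fsx/users/dberard/pytorch/33-repro.py are an assumption, not the original file.

import torch

def fn(t1, t2, t3, t4, i1: int, i2: int):
    v1 = torch.sub(t3, t4, alpha=i2)      # aten::sub   (33-repro.py:4)
    v2 = v1 * t2                          # aten::mul   (33-repro.py:5)
    v3 = t1.reshape(1, 12, 64, 4096)      # aten::reshape (33-repro.py:6)
    v4 = torch.add(v3, v2, alpha=i1)      # aten::add   (33-repro.py:7)
    v5 = torch._softmax(v4, -1, False)    # aten::_softmax (33-repro.py:8)
    v6 = v5.reshape(12, 64, 4096)         # aten::reshape (33-repro.py:9)
    return v6

fn_s = torch.jit.script(fn)

# Inputs matching the profiled types (assumed values, shapes from the graph):
t1 = torch.rand(12, 64, 4096, device="cuda")               # Float(12, 64, 4096)
t2 = torch.rand((), dtype=torch.double, device="cuda")      # Double, 0-dim
t3 = torch.rand((), dtype=torch.double, device="cuda")      # Double, 0-dim
t4 = torch.rand(1, 1, 1, 4096, device="cuda")               # Float(1, 1, 1, 4096)

# Repeated calls so the profiling executor eventually takes the nvFuser path;
# the failing call corresponds to 33-repro.py:21 in the traceback below.
for _ in range(5):
    fn_s(t1, t2, t3, t4, 1, 1)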
[DUMP graph_fuser.cpp:2403] Before Compilation:
[DUMP graph_fuser.cpp:2403] graph(%t1.1 : Tensor,
[DUMP graph_fuser.cpp:2403] %t2.1 : Tensor,
[DUMP graph_fuser.cpp:2403] %t3.1 : Tensor,
[DUMP graph_fuser.cpp:2403] %t4.1 : Tensor,
[DUMP graph_fuser.cpp:2403] %i1.1 : int,
[DUMP graph_fuser.cpp:2403] %i2.1 : int):
[DUMP graph_fuser.cpp:2403] %7 : int[] = prim::Constant[value=[1, 12, 64, 4096]]()
[DUMP graph_fuser.cpp:2403] %6 : int[] = prim::Constant[value=[12, 64, 4096]]()
[DUMP graph_fuser.cpp:2403] %v3.1 : Tensor = aten::reshape(%t1.1, %7) # /fsx/users/dberard/pytorch/33-repro.py:6:9
[DUMP graph_fuser.cpp:2403] %36 : bool = prim::CudaFusionGuard[types=[Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), Double(requires_grad=0, device=cuda:0), Double(requires_grad=0, device=cuda:0), Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0)]](%v3.1, %t2.1, %t3.1, %t4.1)
[DUMP graph_fuser.cpp:2403] %35 : Tensor = prim::If(%36)
[DUMP graph_fuser.cpp:2403] block0():
[DUMP graph_fuser.cpp:2403] %v5.4 : Tensor = prim::CudaFusionGroup_0(%v3.1, %t2.1, %t3.1, %t4.1, %i1.1, %i2.1)
[DUMP graph_fuser.cpp:2403] -> (%v5.4)
[DUMP graph_fuser.cpp:2403] block1():
[DUMP graph_fuser.cpp:2403] %v5.1 : Tensor = prim::FallbackGraph_1(%v3.1, %t2.1, %t3.1, %t4.1, %i1.1, %i2.1)
[DUMP graph_fuser.cpp:2403] -> (%v5.1)
[DUMP graph_fuser.cpp:2403] %v6.1 : Tensor = aten::reshape(%35, %6) # /fsx/users/dberard/pytorch/33-repro.py:9:9
[DUMP graph_fuser.cpp:2403] return (%v6.1)
[DUMP graph_fuser.cpp:2403] with prim::CudaFusionGroup_0 = graph(%4 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %9 : Double(requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %11 : Double(requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %12 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %6 : int,
[DUMP graph_fuser.cpp:2403] %13 : int):
[DUMP graph_fuser.cpp:2403] %2 : bool = prim::Constant[value=0]() # /fsx/users/dberard/pytorch/33-repro.py:8:32
[DUMP graph_fuser.cpp:2403] %1 : int = prim::Constant[value=-1]() # /fsx/users/dberard/pytorch/33-repro.py:8:28
[DUMP graph_fuser.cpp:2403] %v1.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::sub(%11, %12, %13) # /fsx/users/dberard/pytorch/33-repro.py:4:9
[DUMP graph_fuser.cpp:2403] %v2.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::mul(%v1.1, %9) # /fsx/users/dberard/pytorch/33-repro.py:5:9
[DUMP graph_fuser.cpp:2403] %v4.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::add(%4, %v2.1, %6) # /fsx/users/dberard/pytorch/33-repro.py:7:9
[DUMP graph_fuser.cpp:2403] %v5.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::_softmax(%v4.1, %1, %2) # /fsx/users/dberard/pytorch/33-repro.py:8:9
[DUMP graph_fuser.cpp:2403] return (%v5.1)
[DUMP graph_fuser.cpp:2403] with prim::FallbackGraph_1 = graph(%v3.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %t2.1 : Double(requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %t3.1 : Double(requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %t4.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %i1.1 : int,
[DUMP graph_fuser.cpp:2403] %i2.1 : int):
[DUMP graph_fuser.cpp:2403] %6 : bool = prim::Constant[value=0]() # /fsx/users/dberard/pytorch/33-repro.py:8:32
[DUMP graph_fuser.cpp:2403] %7 : int = prim::Constant[value=-1]() # /fsx/users/dberard/pytorch/33-repro.py:8:28
[DUMP graph_fuser.cpp:2403] %v1.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::sub(%t3.1, %t4.1, %i2.1) # /fsx/users/dberard/pytorch/33-repro.py:4:9
[DUMP graph_fuser.cpp:2403] %v2.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::mul(%v1.1, %t2.1) # /fsx/users/dberard/pytorch/33-repro.py:5:9
[DUMP graph_fuser.cpp:2403] %v4.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::add(%v3.1, %v2.1, %i1.1) # /fsx/users/dberard/pytorch/33-repro.py:7:9
[DUMP graph_fuser.cpp:2403] %v5.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::_softmax(%v4.1, %7, %6) # /fsx/users/dberard/pytorch/33-repro.py:8:9
[DUMP graph_fuser.cpp:2403] return (%v5.1)
===== Reduction Stats ========
total_reduction_numel: 4096
total_iteration_numel: 768
inner_most_dimension_numel: 4096
vectorize_factor: 2
n_tensor_inputs: 2
max_input_dtype_size: 4
max_persistent_buffer_size: 32768
max_multi_reduction_factor: 4
block(512, 1, 1)
===== Reduction Parameters ========
Tag: Inner Persistent Heuristic.
Red On Fastest Dim
Persistent Kernel
Batches per block: 4
Iteration Domain: blockIdx.x / unroll / factor 2
Inner Reduction Domain: cross block - threadIdx.x / pad to warp / persistent batch - 4 / vectorize / factor 2
Launch Parameters: BlockDim.x = -1, BlockDim.y = 1, BlockDim.z = -1, GridDim.x = -1, GridDim.y = -1, GridDim.z = -1, Smem Size = 0
====================================
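The block shape in these stats follows from the splits listed in the parameters: the 4096-wide inner reduction is divided by the vectorize factor (2) and the persistent batch (4), and the 768 iteration rows are divided by the unroll factor (2). A quick sanity check of that arithmetic (editor's sketch, not part of the log; the 384 figure matches GridDim.x in the launch parameters reported further down):

from math import ceil

inner_numel = 4096        # inner_most_dimension_numel
vectorize = 2             # vectorize_factor
persistent_batch = 4      # "Batches per block: 4"
iter_numel = 768          # total_iteration_numel (12 * 64 softmax rows)
iter_unroll = 2           # "Iteration Domain: blockIdx.x / unroll / factor 2"

threads_x = ceil(ceil(inner_numel / vectorize) / persistent_batch)
grid_x = ceil(iter_numel / iter_unroll)
print(threads_x, grid_x)  # 512 384 -> block(512, 1, 1), GridDim.x = 384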
Inputs:
T0_g[ iblockIdx.x238{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS239{1}, iS237{2}, iS245{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}, iS242{4}, iS244{1}, iS241{2} ], float
T1_g[ 0 ], double
T2_g[ 0 ], double
T3_g[ sbS322{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbS323{1}, sbS321{2}, iS329{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}, iS326{4}, iS328{1}, iS325{2} ], float
i6, int64_t
i7, int64_t
Outputs:
T20_g[ iblockIdx.x166{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS167{1}, iUR165{2}, ithreadIdx.x359{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS356{4}, iUS358{1}, iV355{2} ] produce_pos( 2), double
%kernel_math {
T21_l[ iblockIdx.x214{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS215{1}, iUR213{2}, ithreadIdx.x221{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS218{4}, iUS220{1}, iV217{2} ] ca_pos( 2 )
= T0_g[ iblockIdx.x238{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS239{1}, iS237{2}, iS245{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}, iS242{4}, iS244{1}, iS241{2} ];
T11_l[ iblockIdx.x190{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS191{1}, iS189{2}, ithreadIdx.x197{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS194{4}, iUS196{1}, iS193{2} ] ca_pos( 7 ) produce_pos( 2)
= (double)(T21_l[ iblockIdx.x214{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS215{1}, iUR213{2}, ithreadIdx.x221{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS218{4}, iUS220{1}, iV217{2} ] ca_pos( 2 ));
T23_l[ 0 ]
= T2_g[ 0 ];
T4_l[ bblockIdx.x274{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS275{1}, bS273{2}, bthreadIdx.x281{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS278{4}, bUS280{1}, bS277{2} ] = broadcast( T23_l[ 0 ] )
T24_l[ sbblockIdx.x310{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS311{1}, sbUR309{2}, ithreadIdx.x317{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS314{4}, iUS316{1}, iV313{2} ]
= T3_g[ sbS322{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbS323{1}, sbS321{2}, iS329{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}, iS326{4}, iS328{1}, iS325{2} ];
d10 = (double)(i7);
T5_l[ sbblockIdx.x298{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS299{1}, sbS297{2}, ithreadIdx.x305{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS302{4}, iUS304{1}, iS301{2} ] ca_pos( 7 )
= T24_l[ sbblockIdx.x310{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS311{1}, sbUR309{2}, ithreadIdx.x317{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS314{4}, iUS316{1}, iV313{2} ]
* d10;
T6_l[ sbblockIdx.x286{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS287{1}, sbS285{2}, ithreadIdx.x293{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS290{4}, iUS292{1}, iS289{2} ] ca_pos( 7 ) produce_pos( 7)
= (double)(T5_l[ sbblockIdx.x298{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS299{1}, sbS297{2}, ithreadIdx.x305{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS302{4}, iUS304{1}, iS301{2} ] ca_pos( 7 ));
T7_l[ sbblockIdx.x250{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS251{1}, sbS249{2}, ithreadIdx.x257{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS254{4}, iUS256{1}, iS253{2} ] ca_pos( 7 ) produce_pos( 7)
= T4_l[ bblockIdx.x274{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS275{1}, bS273{2}, bthreadIdx.x281{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS278{4}, bUS280{1}, bS277{2} ]
- T6_l[ sbblockIdx.x286{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS287{1}, sbS285{2}, ithreadIdx.x293{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS290{4}, iUS292{1}, iS289{2} ] ca_pos( 7 ) produce_pos( 7);
T22_l[ 0 ]
= T1_g[ 0 ];
T8_l[ bblockIdx.x262{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS263{1}, bS261{2}, bthreadIdx.x269{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS266{4}, bUS268{1}, bS265{2} ] = broadcast( T22_l[ 0 ] )
T9_l[ sbblockIdx.x226{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS227{1}, sbS225{2}, ithreadIdx.x233{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS230{4}, iUS232{1}, iS229{2} ] ca_pos( 7 ) produce_pos( 7)
= T7_l[ sbblockIdx.x250{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS251{1}, sbS249{2}, ithreadIdx.x257{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS254{4}, iUS256{1}, iS253{2} ] ca_pos( 7 ) produce_pos( 7)
* T8_l[ bblockIdx.x262{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS263{1}, bS261{2}, bthreadIdx.x269{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS266{4}, bUS268{1}, bS265{2} ];
d19 = (double)(i6);
T10_l[ sbblockIdx.x202{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS203{1}, sbS201{2}, ithreadIdx.x209{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS206{4}, iUS208{1}, iS205{2} ] ca_pos( 7 ) produce_pos( 7)
= T9_l[ sbblockIdx.x226{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS227{1}, sbS225{2}, ithreadIdx.x233{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS230{4}, iUS232{1}, iS229{2} ] ca_pos( 7 ) produce_pos( 7)
* d19;
T12_l[ iblockIdx.x178{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS179{1}, iS177{2}, ithreadIdx.x185{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS182{4}, iUS184{1}, iS181{2} ] ca_pos( 3 ) produce_pos( 7)
= T11_l[ iblockIdx.x190{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS191{1}, iS189{2}, ithreadIdx.x197{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS194{4}, iUS196{1}, iS193{2} ] ca_pos( 7 ) produce_pos( 2)
+ T10_l[ sbblockIdx.x202{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS203{1}, sbS201{2}, ithreadIdx.x209{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS206{4}, iUS208{1}, iS205{2} ] ca_pos( 7 ) produce_pos( 7);
T26_l[ iblockIdx.x108{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS109{1}, iS107{2}, ithreadIdx.x115{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS112{4}rf, rUS114{1}rf, rS111{2}rf ] ca_pos( 4 ) produce_pos( 3) = reduction( T12_l[ iblockIdx.x178{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS179{1}, iS177{2}, ithreadIdx.x185{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS182{4}, iUS184{1}, iS181{2} ] ca_pos( 3 ) produce_pos( 7), op = fmax, initial value = double(-inf), fused = 0 )
T13_l[ iblockIdx.x123{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS124{1}, iS122{2}, rthreadIdx.x125{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) = reduction( T26_l[ iblockIdx.x108{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS109{1}, iS107{2}, ithreadIdx.x115{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS112{4}rf, rUS114{1}rf, rS111{2}rf ] ca_pos( 4 ) produce_pos( 3), op = fmax, initial value = double(-inf), fused = 0 )
T14_l[ iblockIdx.x130{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS131{1}, iS129{2}, bthreadIdx.x365{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS362{4}, bUS364{1}, bS361{2} ] ca_pos( 3 ) produce_pos( 3) = broadcast( T13_l[ iblockIdx.x123{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS124{1}, iS122{2}, rthreadIdx.x125{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) )
T15_l[ iblockIdx.x136{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS137{1}, iS135{2}, ithreadIdx.x335{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS332{4}, iUS334{1}, iS331{2} ] ca_pos( 7 ) produce_pos( 3)
= T12_l[ iblockIdx.x178{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS179{1}, iS177{2}, ithreadIdx.x185{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS182{4}, iUS184{1}, iS181{2} ] ca_pos( 3 ) produce_pos( 7)
- T14_l[ iblockIdx.x130{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS131{1}, iS129{2}, bthreadIdx.x365{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS362{4}, bUS364{1}, bS361{2} ] ca_pos( 3 ) produce_pos( 3);
T16_l[ iblockIdx.x142{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS143{1}, iS141{2}, ithreadIdx.x341{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS338{4}, iUS340{1}, iS337{2} ] ca_pos( 3 ) produce_pos( 7)
= exp(T15_l[ iblockIdx.x136{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS137{1}, iS135{2}, ithreadIdx.x335{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS332{4}, iUS334{1}, iS331{2} ] ca_pos( 7 ) produce_pos( 3));
T27_l[ iblockIdx.x386{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS387{1}, iS385{2}, ithreadIdx.x393{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS390{4}rf, rUS392{1}rf, rS389{2}rf ] ca_pos( 4 ) produce_pos( 3) = reduction( T16_l[ iblockIdx.x142{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS143{1}, iS141{2}, ithreadIdx.x341{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS338{4}, iUS340{1}, iS337{2} ] ca_pos( 3 ) produce_pos( 7), op = add, initial value = double(0), fused = 0 )
T17_l[ iblockIdx.x401{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS402{1}, iS400{2}, rthreadIdx.x403{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) = reduction( T27_l[ iblockIdx.x386{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS387{1}, iS385{2}, ithreadIdx.x393{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS390{4}rf, rUS392{1}rf, rS389{2}rf ] ca_pos( 4 ) produce_pos( 3), op = add, initial value = double(0), fused = 0 )
T18_l[ iblockIdx.x160{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS161{1}, iS159{2}, bthreadIdx.x377{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS374{4}, bUS376{1}, bS373{2} ] ca_pos( 3 ) produce_pos( 3) = broadcast( T17_l[ iblockIdx.x401{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS402{1}, iS400{2}, rthreadIdx.x403{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) )
T19_l[ iblockIdx.x172{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS173{1}, iS171{2}, bthreadIdx.x371{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS368{4}, bUS370{1}, bS367{2} ] ca_pos( 3 ) produce_pos( 3)
= reciprocal(T18_l[ iblockIdx.x160{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS161{1}, iS159{2}, bthreadIdx.x377{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS374{4}, bUS376{1}, bS373{2} ] ca_pos( 3 ) produce_pos( 3));
T25_l[ iblockIdx.x154{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS155{1}, iS153{2}, ithreadIdx.x353{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS350{4}, iUS352{1}, iS349{2} ] ca_pos( 2 ) produce_pos( 3)
= T16_l[ iblockIdx.x142{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS143{1}, iS141{2}, ithreadIdx.x341{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS338{4}, iUS340{1}, iS337{2} ] ca_pos( 3 ) produce_pos( 7)
* T19_l[ iblockIdx.x172{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS173{1}, iS171{2}, bthreadIdx.x371{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS368{4}, bUS370{1}, bS367{2} ] ca_pos( 3 ) produce_pos( 3);
T20_g[ iblockIdx.x166{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS167{1}, iUR165{2}, ithreadIdx.x359{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS356{4}, iUS358{1}, iV355{2} ] produce_pos( 2)
= T25_l[ iblockIdx.x154{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS155{1}, iS153{2}, ithreadIdx.x353{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS350{4}, iUS352{1}, iS349{2} ] ca_pos( 2 ) produce_pos( 3);
}
======= Codegen output for kernel: kernel1 =======
__global__ void kernel1(Tensor<float, 4> T0, Tensor<double, 0> T1, Tensor<double, 0> T2, Tensor<float, 4> T3, int64_t i6, int64_t i7, Tensor<double, 4> T20) {
alignas(8) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO
double T22[1];
T22[0] = 0;
T22[0]
= T1[0];
double T8[1];
T8[0]
= T22[0];
double T23[1];
T23[0] = 0;
T23[0]
= T2[0];
double T4[1];
T4[0]
= T23[0];
double d19;
d19 = (double)(i6);
double d10;
d10 = (double)(i7);
Array<float, ((4 * 1) * 2), 2> T24;
#pragma unroll
for(nvfuser_index_t i295 = 0; i295 < 4; ++i295) {
T24.set(0);
}
NVFUSER_UPDATE_MAGIC_ZERO
if (((((((ceilDiv((ceilDiv(T0.size[3], 2)), 4)) * 3) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3])) {
#pragma unroll
for(nvfuser_index_t i288 = 0; i288 < 4; ++i288) {
loadGlobalToLocal<float, 2, false>(&T24[(i288 * 2)], &T3[((((i288 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2)]);
}
NVFUSER_UPDATE_MAGIC_ZERO
} else {
#pragma unroll
for(nvfuser_index_t i288 = 0; i288 < 4; ++i288) {
int64_t i361;
i361 = (((i288 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))) && ((((((i288 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3]))) {
loadGlobalToLocal<float, 2, false>(&T24[(i288 * 2)], &T3[i361]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
}
if ((((((((ceilDiv((ceilDiv(T0.size[3], 2)), 4)) * 3) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3]) && (((((nvfuser_index_t)blockIdx.x) * 2) + 1) < (1 * (T0.size[1] * T0.size[2]))))) {
Array<float, (((2 * 4) * 1) * 2), 2> T21;
#pragma unroll
for(nvfuser_index_t i304 = 0; i304 < 2; ++i304) {
#pragma unroll
for(nvfuser_index_t i302 = 0; i302 < 4; ++i302) {
T21.set(0);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
for(nvfuser_index_t i282 = 0; i282 < 2; ++i282) {
#pragma unroll
for(nvfuser_index_t i284 = 0; i284 < 4; ++i284) {
loadGlobalToLocal<float, 2, false>(&T21[(i282 * 8) + (i284 * 2)], &T0[(((((nvfuser_index_t)blockIdx.x) * 2) + i282) * T0.size[3]) + ((((i284 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
Array<double, (((2 * 4) * 1) * 2), 2> T25;
#pragma unroll
for(nvfuser_index_t i260 = 0; i260 < 2; ++i260) {
double T12[((4 * 1) * 2)];
#pragma unroll
for(nvfuser_index_t i279 = 0; i279 < 4; ++i279) {
#pragma unroll
for(nvfuser_index_t i281 = 0; i281 < 2; ++i281) {
float T5[1];
T5[0]
= T24[((i279 * 2) + i281)]
* (float) d10;
double T6[1];
T6[0]
= (double)(T5[0]);
double T7[1];
T7[0]
= T4[0]
- T6[0];
double T11[1];
T11[0]
= (double)(T21[(i260 * 8) + ((i279 * 2) + i281)]);
double T9[1];
T9[0]
= T7[0]
* T8[0];
double T10[1];
T10[0]
= T9[0]
* d19;
T12[((i279 * 2) + i281)]
= T11[0]
+ T10[0];
}
}
double T13[1];
T13[0] = NEG_INFINITY;
double T26[1];
T26[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i275 = 0; i275 < 4; ++i275) {
#pragma unroll
for(nvfuser_index_t i277 = 0; i277 < 2; ++i277) {
T26[0] = fmax(
T26[0],
T12[((i275 * 2) + i277)]);
}
}
warp::warpReduceTIDX<false>(
T13[0],
T26[0],
[](double &a, double b) { a = fmax(a, b); },
threadIdx,
blockDim,
static_cast<double*>(shared_mem),
true,
double(NEG_INFINITY));
double T14[1];
broadcast::blockBroadcast<true, false, false>(
T14[0],
T13[0],
static_cast<double*>(shared_mem),
true);
// Alias Allocation - register
auto& T16 = T12;
#pragma unroll
for(nvfuser_index_t i271 = 0; i271 < 4; ++i271) {
#pragma unroll
for(nvfuser_index_t i273 = 0; i273 < 2; ++i273) {
double T15[1];
T15[0]
= T12[((i271 * 2) + i273)]
- T14[0];
T16[((i271 * 2) + i273)]
= exp(T15[0]);
}
}
double T17[1];
T17[0] = 0;
double T27[1];
T27[0] = 0;
#pragma unroll
for(nvfuser_index_t i267 = 0; i267 < 4; ++i267) {
#pragma unroll
for(nvfuser_index_t i269 = 0; i269 < 2; ++i269) {
T27[0]
= T27[0]
+ T16[((i267 * 2) + i269)];
}
}
warp::warpReduceTIDX<false>(
T17[0],
T27[0],
[](double &a, double b) { a = a + b; },
threadIdx,
blockDim,
static_cast<double*>(shared_mem),
true,
double(0));
double T18[1];
broadcast::blockBroadcast<true, false, false>(
T18[0],
T17[0],
static_cast<double*>(shared_mem),
true);
double T19[1];
T19[0]
= reciprocal(T18[0]);
#pragma unroll
for(nvfuser_index_t i262 = 0; i262 < 4; ++i262) {
#pragma unroll
for(nvfuser_index_t i264 = 0; i264 < 2; ++i264) {
T25[(i260 * 8) + ((i262 * 2) + i264)]
= T16[((i262 * 2) + i264)]
* T19[0];
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
for(nvfuser_index_t i255 = 0; i255 < 2; ++i255) {
#pragma unroll
for(nvfuser_index_t i257 = 0; i257 < 4; ++i257) {
loadLocalToGlobal<double, 2, false>( &T20[(((((nvfuser_index_t)blockIdx.x) * 2) + i255) * T0.size[3]) + ((((i257 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2)], &T25[(i255 * 8) + (i257 * 2)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
} else {
Array<float, (((2 * 4) * 1) * 2), 2> T21;
#pragma unroll
for(nvfuser_index_t i304 = 0; i304 < 2; ++i304) {
#pragma unroll
for(nvfuser_index_t i302 = 0; i302 < 4; ++i302) {
T21.set(0);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
for(nvfuser_index_t i282 = 0; i282 < 2; ++i282) {
int64_t i1224;
i1224 = (((nvfuser_index_t)blockIdx.x) * 2) + i282;
#pragma unroll
for(nvfuser_index_t i284 = 0; i284 < 4; ++i284) {
int64_t i661;
i661 = (((i284 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))) && ((((((i284 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3])) && (i1224 < (1 * (T0.size[1] * T0.size[2]))))) {
loadGlobalToLocal<float, 2, false>(&T21[(i282 * 8) + (i284 * 2)], &T0[(((((nvfuser_index_t)blockIdx.x) * 2) + i282) * T0.size[3]) + i661]);
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
Array<double, (((2 * 4) * 1) * 2), 2> T25;
#pragma unroll
for(nvfuser_index_t i260 = 0; i260 < 2; ++i260) {
int64_t i1272;
i1272 = (((nvfuser_index_t)blockIdx.x) * 2) + i260;
double T12[((4 * 1) * 2)];
#pragma unroll
for(nvfuser_index_t i279 = 0; i279 < 4; ++i279) {
#pragma unroll
for(nvfuser_index_t i281 = 0; i281 < 2; ++i281) {
float T5[1];
T5[0]
= T24[((i279 * 2) + i281)]
* (float) d10;
double T6[1];
T6[0]
= (double)(T5[0]);
double T7[1];
T7[0]
= T4[0]
- T6[0];
double T11[1];
T11[0]
= (double)(T21[(i260 * 8) + ((i279 * 2) + i281)]);
double T9[1];
T9[0]
= T7[0]
* T8[0];
double T10[1];
T10[0]
= T9[0]
* d19;
T12[((i279 * 2) + i281)]
= T11[0]
+ T10[0];
}
}
double T13[1];
T13[0] = NEG_INFINITY;
double T26[1];
T26[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i275 = 0; i275 < 4; ++i275) {
#pragma unroll
for(nvfuser_index_t i277 = 0; i277 < 2; ++i277) {
if ((((((((i275 * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + (i277 + nvfuser_zero)) < T0.size[3]) && (i1272 < (1 * (T0.size[1] * T0.size[2])))) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))))) {
T26[0] = fmax(
T26[0],
T12[((i275 * 2) + i277)]);
}
}
}
warp::warpReduceTIDX<false>(
T13[0],
T26[0],
[](double &a, double b) { a = fmax(a, b); },
threadIdx,
blockDim,
static_cast<double*>(shared_mem),
true,
double(NEG_INFINITY));
double T14[1];
broadcast::blockBroadcast<true, false, false>(
T14[0],
T13[0],
static_cast<double*>(shared_mem),
true);
// Alias Allocation - register
auto& T16 = T12;
#pragma unroll
for(nvfuser_index_t i271 = 0; i271 < 4; ++i271) {
#pragma unroll
for(nvfuser_index_t i273 = 0; i273 < 2; ++i273) {
double T15[1];
T15[0]
= T12[((i271 * 2) + i273)]
- T14[0];
T16[((i271 * 2) + i273)]
= exp(T15[0]);
}
}
double T17[1];
T17[0] = 0;
double T27[1];
T27[0] = 0;
#pragma unroll
for(nvfuser_index_t i267 = 0; i267 < 4; ++i267) {
#pragma unroll
for(nvfuser_index_t i269 = 0; i269 < 2; ++i269) {
if ((((((((i267 * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + (i269 + nvfuser_zero)) < T0.size[3]) && (i1272 < (1 * (T0.size[1] * T0.size[2])))) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))))) {
T27[0]
= T27[0]
+ T16[((i267 * 2) + i269)];
}
}
}
warp::warpReduceTIDX<false>(
T17[0],
T27[0],
[](double &a, double b) { a = a + b; },
threadIdx,
blockDim,
static_cast<double*>(shared_mem),
true,
double(0));
double T18[1];
broadcast::blockBroadcast<true, false, false>(
T18[0],
T17[0],
static_cast<double*>(shared_mem),
true);
double T19[1];
T19[0]
= reciprocal(T18[0]);
#pragma unroll
for(nvfuser_index_t i262 = 0; i262 < 4; ++i262) {
#pragma unroll
for(nvfuser_index_t i264 = 0; i264 < 2; ++i264) {
T25[(i260 * 8) + ((i262 * 2) + i264)]
= T16[((i262 * 2) + i264)]
* T19[0];
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
for(nvfuser_index_t i255 = 0; i255 < 2; ++i255) {
int64_t i1334;
i1334 = (((nvfuser_index_t)blockIdx.x) * 2) + i255;
#pragma unroll
for(nvfuser_index_t i257 = 0; i257 < 4; ++i257) {
int64_t i903;
i903 = (((i257 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))) && ((((((i257 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3])) && (i1334 < (1 * (T0.size[1] * T0.size[2]))))) {
loadLocalToGlobal<double, 2, false>( &T20[(((((nvfuser_index_t)blockIdx.x) * 2) + i255) * T0.size[3]) + i903], &T25[(i255 * 8) + (i257 * 2)]);
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
}
}
======================================
ptxas info : 982 bytes gmem
ptxas info : Compiling entry function '_ZN11CudaCodeGen7kernel1ENS_6TensorIfLi4EEENS0_IdLi0EEES2_S1_xxNS0_IdLi4EEE' for 'sm_80'
ptxas info : Function properties for _ZN11CudaCodeGen7kernel1ENS_6TensorIfLi4EEENS0_IdLi0EEES2_S1_xxNS0_IdLi4EEE
ptxas . 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 135 registers, 16 bytes smem, 504 bytes cmem[0], 96 bytes cmem[2]
Launch Parameters: BlockDim.x = 512, BlockDim.y = -1, BlockDim.z = -1, GridDim.x = 384, GridDim.y = -1, GridDim.z = -1, Smem Size = 4096
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/33-repro.py", line 21, in <module>
fn_s(t1, t2, t3, t4, 1, 1)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: CUDA driver error: too many resources requested for launch
srun: error: dev-st-p4d24xlarge-1: task 0: Exited with exit code 1
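The failure is consistent with simple register-pressure arithmetic from the ptxas and launch lines above (an inference from those numbers, not something stated in the log): 135 registers per thread at 512 threads per block requests more registers than an sm_80 thread block can be granted, so the driver rejects the launch.

# Back-of-the-envelope check of the launch failure (editor's inference, not part of the log).
registers_per_thread = 135          # "ptxas info : Used 135 registers"
threads_per_block = 512             # "BlockDim.x = 512" in the launch parameters
sm80_regs_per_block = 64 * 1024     # per-block register limit on sm_80

requested = registers_per_thread * threads_per_block
print(requested, sm80_regs_per_block)   # 69120 > 65536
assert requested > sm80_regs_per_block  # hence "too many resources requested for launch"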