@davidberard98
Created April 20, 2022 18:03
srun: error: ioctl(TIOCGWINSZ): Inappropriate ioctl for device
srun: error: Not using a pseudo-terminal, disregarding --pty option
[DUMP graph_fuser.cpp:2323] Before Fusion:
[DUMP graph_fuser.cpp:2323] graph(%t1.1 : Tensor,
[DUMP graph_fuser.cpp:2323] %t2.1 : Tensor,
[DUMP graph_fuser.cpp:2323] %t3.1 : Tensor,
[DUMP graph_fuser.cpp:2323] %t4.1 : Tensor,
[DUMP graph_fuser.cpp:2323] %i1.1 : int,
[DUMP graph_fuser.cpp:2323] %i2.1 : int):
[DUMP graph_fuser.cpp:2323] %9 : int = prim::Constant[value=-1]() # /fsx/users/dberard/pytorch/33-repro.py:8:28
[DUMP graph_fuser.cpp:2323] %8 : bool = prim::Constant[value=0]() # /fsx/users/dberard/pytorch/33-repro.py:8:32
[DUMP graph_fuser.cpp:2323] %7 : int[] = prim::Constant[value=[1, 12, 64, 4096]]()
[DUMP graph_fuser.cpp:2323] %6 : int[] = prim::Constant[value=[12, 64, 4096]]()
[DUMP graph_fuser.cpp:2323] %10 : Tensor = prim::profile[profiled_type=Double(requires_grad=0, device=cuda:0), seen_none=0](%t3.1)
[DUMP graph_fuser.cpp:2323] %11 : Tensor = prim::profile[profiled_type=Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%t4.1)
[DUMP graph_fuser.cpp:2323] %v1.1 : Tensor = aten::sub(%10, %11, %i2.1) # /fsx/users/dberard/pytorch/33-repro.py:4:9
[DUMP graph_fuser.cpp:2323] %13 : Tensor = prim::profile[profiled_type=Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v1.1)
[DUMP graph_fuser.cpp:2323] %14 : Tensor = prim::profile[profiled_type=Double(requires_grad=0, device=cuda:0), seen_none=0](%t2.1)
[DUMP graph_fuser.cpp:2323] %v2.1 : Tensor = aten::mul(%13, %14) # /fsx/users/dberard/pytorch/33-repro.py:5:9
[DUMP graph_fuser.cpp:2323] %16 : Tensor = prim::profile[profiled_type=Float(12, 64, 4096, strides=[262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%t1.1)
[DUMP graph_fuser.cpp:2323] %v3.1 : Tensor = aten::reshape(%16, %7) # /fsx/users/dberard/pytorch/33-repro.py:6:9
[DUMP graph_fuser.cpp:2323] %18 : Tensor = prim::profile[profiled_type=Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v3.1)
[DUMP graph_fuser.cpp:2323] %19 : Tensor = prim::profile[profiled_type=Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v2.1)
[DUMP graph_fuser.cpp:2323] %v4.1 : Tensor = aten::add(%18, %19, %i1.1) # /fsx/users/dberard/pytorch/33-repro.py:7:9
[DUMP graph_fuser.cpp:2323] %21 : Tensor = prim::profile[profiled_type=Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v4.1)
[DUMP graph_fuser.cpp:2323] %v5.1 : Tensor = aten::_softmax(%21, %9, %8) # /fsx/users/dberard/pytorch/33-repro.py:8:9
[DUMP graph_fuser.cpp:2323] %23 : Tensor = prim::profile[profiled_type=Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v5.1)
[DUMP graph_fuser.cpp:2323] %v6.1 : Tensor = aten::reshape(%23, %6) # /fsx/users/dberard/pytorch/33-repro.py:9:9
[DUMP graph_fuser.cpp:2323] %25 : Tensor = prim::profile[profiled_type=Float(12, 64, 4096, strides=[262144, 4096, 1], requires_grad=0, device=cuda:0), seen_none=0](%v6.1)
[DUMP graph_fuser.cpp:2323] return (%25)
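For readability, here is a minimal sketch of the scripted function being compiled, reconstructed from the profiled graph above. The shapes and dtypes come from the prim::profile annotations; the exact contents of /fsx/users/dberard/pytorch/33-repro.py are an assumption, not the original file.

import torch

def fn(t1, t2, t3, t4, i1: int, i2: int):
    v1 = torch.sub(t3, t4, alpha=i2)      # aten::sub   (33-repro.py:4)
    v2 = v1 * t2                          # aten::mul   (33-repro.py:5)
    v3 = t1.reshape(1, 12, 64, 4096)      # aten::reshape (33-repro.py:6)
    v4 = torch.add(v3, v2, alpha=i1)      # aten::add   (33-repro.py:7)
    v5 = torch._softmax(v4, -1, False)    # aten::_softmax (33-repro.py:8)
    v6 = v5.reshape(12, 64, 4096)         # aten::reshape (33-repro.py:9)
    return v6

fn_s = torch.jit.script(fn)

# Inputs matching the profiled types (assumed values, shapes from the graph):
t1 = torch.rand(12, 64, 4096, device="cuda")               # Float(12, 64, 4096)
t2 = torch.rand((), dtype=torch.double, device="cuda")      # Double, 0-dim
t3 = torch.rand((), dtype=torch.double, device="cuda")      # Double, 0-dim
t4 = torch.rand(1, 1, 1, 4096, device="cuda")               # Float(1, 1, 1, 4096)

# Repeated calls so the profiling executor eventually takes the nvFuser path;
# the failing call corresponds to 33-repro.py:21 in the traceback below.
for _ in range(5):
    fn_s(t1, t2, t3, t4, 1, 1)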
[DUMP graph_fuser.cpp:2403] Before Compilation:
[DUMP graph_fuser.cpp:2403] graph(%t1.1 : Tensor,
[DUMP graph_fuser.cpp:2403] %t2.1 : Tensor,
[DUMP graph_fuser.cpp:2403] %t3.1 : Tensor,
[DUMP graph_fuser.cpp:2403] %t4.1 : Tensor,
[DUMP graph_fuser.cpp:2403] %i1.1 : int,
[DUMP graph_fuser.cpp:2403] %i2.1 : int):
[DUMP graph_fuser.cpp:2403] %7 : int[] = prim::Constant[value=[1, 12, 64, 4096]]()
[DUMP graph_fuser.cpp:2403] %6 : int[] = prim::Constant[value=[12, 64, 4096]]()
[DUMP graph_fuser.cpp:2403] %v3.1 : Tensor = aten::reshape(%t1.1, %7) # /fsx/users/dberard/pytorch/33-repro.py:6:9
[DUMP graph_fuser.cpp:2403] %36 : bool = prim::CudaFusionGuard[types=[Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0), Double(requires_grad=0, device=cuda:0), Double(requires_grad=0, device=cuda:0), Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0)]](%v3.1, %t2.1, %t3.1, %t4.1)
[DUMP graph_fuser.cpp:2403] %35 : Tensor = prim::If(%36)
[DUMP graph_fuser.cpp:2403] block0():
[DUMP graph_fuser.cpp:2403] %v5.4 : Tensor = prim::CudaFusionGroup_0(%v3.1, %t2.1, %t3.1, %t4.1, %i1.1, %i2.1)
[DUMP graph_fuser.cpp:2403] -> (%v5.4)
[DUMP graph_fuser.cpp:2403] block1():
[DUMP graph_fuser.cpp:2403] %v5.1 : Tensor = prim::FallbackGraph_1(%v3.1, %t2.1, %t3.1, %t4.1, %i1.1, %i2.1)
[DUMP graph_fuser.cpp:2403] -> (%v5.1)
[DUMP graph_fuser.cpp:2403] %v6.1 : Tensor = aten::reshape(%35, %6) # /fsx/users/dberard/pytorch/33-repro.py:9:9
[DUMP graph_fuser.cpp:2403] return (%v6.1)
[DUMP graph_fuser.cpp:2403] with prim::CudaFusionGroup_0 = graph(%4 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %9 : Double(requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %11 : Double(requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %12 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %6 : int,
[DUMP graph_fuser.cpp:2403] %13 : int):
[DUMP graph_fuser.cpp:2403] %2 : bool = prim::Constant[value=0]() # /fsx/users/dberard/pytorch/33-repro.py:8:32
[DUMP graph_fuser.cpp:2403] %1 : int = prim::Constant[value=-1]() # /fsx/users/dberard/pytorch/33-repro.py:8:28
[DUMP graph_fuser.cpp:2403] %v1.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::sub(%11, %12, %13) # /fsx/users/dberard/pytorch/33-repro.py:4:9
[DUMP graph_fuser.cpp:2403] %v2.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::mul(%v1.1, %9) # /fsx/users/dberard/pytorch/33-repro.py:5:9
[DUMP graph_fuser.cpp:2403] %v4.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::add(%4, %v2.1, %6) # /fsx/users/dberard/pytorch/33-repro.py:7:9
[DUMP graph_fuser.cpp:2403] %v5.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::_softmax(%v4.1, %1, %2) # /fsx/users/dberard/pytorch/33-repro.py:8:9
[DUMP graph_fuser.cpp:2403] return (%v5.1)
[DUMP graph_fuser.cpp:2403] with prim::FallbackGraph_1 = graph(%v3.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %t2.1 : Double(requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %t3.1 : Double(requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %t4.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0),
[DUMP graph_fuser.cpp:2403] %i1.1 : int,
[DUMP graph_fuser.cpp:2403] %i2.1 : int):
[DUMP graph_fuser.cpp:2403] %6 : bool = prim::Constant[value=0]() # /fsx/users/dberard/pytorch/33-repro.py:8:32
[DUMP graph_fuser.cpp:2403] %7 : int = prim::Constant[value=-1]() # /fsx/users/dberard/pytorch/33-repro.py:8:28
[DUMP graph_fuser.cpp:2403] %v1.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::sub(%t3.1, %t4.1, %i2.1) # /fsx/users/dberard/pytorch/33-repro.py:4:9
[DUMP graph_fuser.cpp:2403] %v2.1 : Float(1, 1, 1, 4096, strides=[4096, 4096, 4096, 1], requires_grad=0, device=cuda:0) = aten::mul(%v1.1, %t2.1) # /fsx/users/dberard/pytorch/33-repro.py:5:9
[DUMP graph_fuser.cpp:2403] %v4.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::add(%v3.1, %v2.1, %i1.1) # /fsx/users/dberard/pytorch/33-repro.py:7:9
[DUMP graph_fuser.cpp:2403] %v5.1 : Float(1, 12, 64, 4096, strides=[3145728, 262144, 4096, 1], requires_grad=0, device=cuda:0) = aten::_softmax(%v4.1, %7, %6) # /fsx/users/dberard/pytorch/33-repro.py:8:9
[DUMP graph_fuser.cpp:2403] return (%v5.1)
===== Reduction Stats ========
total_reduction_numel: 4096
total_iteration_numel: 768
inner_most_dimension_numel: 4096
vectorize_factor: 2
n_tensor_inputs: 2
max_input_dtype_size: 4
max_persistent_buffer_size: 32768
max_multi_reduction_factor: 4
block(512, 1, 1)
===== Reduction Parameters ========
Tag: Inner Persistent Heuristic.
Red On Fastest Dim
Persistent Kernel
Batches per block: 4
Iteration Domain: blockIdx.x / unroll / factor 2
Inner Reduction Domain: cross block - threadIdx.x / pad to warp / persistent batch - 4 / vectorize / factor 2
Launch Parameters: BlockDim.x = -1, BlockDim.y = 1, BlockDim.z = -1, GridDim.x = -1, GridDim.y = -1, GridDim.z = -1, Smem Size = 0
====================================
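The block shape in these stats follows from the splits listed in the parameters: the 4096-wide inner reduction is divided by the vectorize factor (2) and the persistent batch (4), and the 768 iteration rows are divided by the unroll factor (2). A quick sanity check of that arithmetic (editor's sketch, not part of the log; the 384 figure matches GridDim.x in the launch parameters reported further down):

from math import ceil

inner_numel = 4096        # inner_most_dimension_numel
vectorize = 2             # vectorize_factor
persistent_batch = 4      # "Batches per block: 4"
iter_numel = 768          # total_iteration_numel (12 * 64 softmax rows)
iter_unroll = 2           # "Iteration Domain: blockIdx.x / unroll / factor 2"

threads_x = ceil(ceil(inner_numel / vectorize) / persistent_batch)
grid_x = ceil(iter_numel / iter_unroll)
print(threads_x, grid_x)  # 512 384 -> block(512, 1, 1), GridDim.x = 384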
Inputs:
T0_g[ iblockIdx.x238{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS239{1}, iS237{2}, iS245{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}, iS242{4}, iS244{1}, iS241{2} ], float
T1_g[ 0 ], double
T2_g[ 0 ], double
T3_g[ sbS322{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbS323{1}, sbS321{2}, iS329{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}, iS326{4}, iS328{1}, iS325{2} ], float
i6, int64_t
i7, int64_t
Outputs:
T20_g[ iblockIdx.x166{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS167{1}, iUR165{2}, ithreadIdx.x359{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS356{4}, iUS358{1}, iV355{2} ] produce_pos( 2), double
%kernel_math {
T21_l[ iblockIdx.x214{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS215{1}, iUR213{2}, ithreadIdx.x221{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS218{4}, iUS220{1}, iV217{2} ] ca_pos( 2 )
= T0_g[ iblockIdx.x238{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS239{1}, iS237{2}, iS245{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}, iS242{4}, iS244{1}, iS241{2} ];
T11_l[ iblockIdx.x190{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS191{1}, iS189{2}, ithreadIdx.x197{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS194{4}, iUS196{1}, iS193{2} ] ca_pos( 7 ) produce_pos( 2)
= (double)(T21_l[ iblockIdx.x214{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS215{1}, iUR213{2}, ithreadIdx.x221{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS218{4}, iUS220{1}, iV217{2} ] ca_pos( 2 ));
T23_l[ 0 ]
= T2_g[ 0 ];
T4_l[ bblockIdx.x274{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS275{1}, bS273{2}, bthreadIdx.x281{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS278{4}, bUS280{1}, bS277{2} ] = broadcast( T23_l[ 0 ] )
T24_l[ sbblockIdx.x310{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS311{1}, sbUR309{2}, ithreadIdx.x317{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS314{4}, iUS316{1}, iV313{2} ]
= T3_g[ sbS322{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbS323{1}, sbS321{2}, iS329{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}, iS326{4}, iS328{1}, iS325{2} ];
d10 = (double)(i7);
T5_l[ sbblockIdx.x298{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS299{1}, sbS297{2}, ithreadIdx.x305{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS302{4}, iUS304{1}, iS301{2} ] ca_pos( 7 )
= T24_l[ sbblockIdx.x310{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS311{1}, sbUR309{2}, ithreadIdx.x317{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS314{4}, iUS316{1}, iV313{2} ]
* d10;
T6_l[ sbblockIdx.x286{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS287{1}, sbS285{2}, ithreadIdx.x293{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS290{4}, iUS292{1}, iS289{2} ] ca_pos( 7 ) produce_pos( 7)
= (double)(T5_l[ sbblockIdx.x298{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS299{1}, sbS297{2}, ithreadIdx.x305{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS302{4}, iUS304{1}, iS301{2} ] ca_pos( 7 ));
T7_l[ sbblockIdx.x250{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS251{1}, sbS249{2}, ithreadIdx.x257{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS254{4}, iUS256{1}, iS253{2} ] ca_pos( 7 ) produce_pos( 7)
= T4_l[ bblockIdx.x274{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS275{1}, bS273{2}, bthreadIdx.x281{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS278{4}, bUS280{1}, bS277{2} ]
- T6_l[ sbblockIdx.x286{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS287{1}, sbS285{2}, ithreadIdx.x293{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS290{4}, iUS292{1}, iS289{2} ] ca_pos( 7 ) produce_pos( 7);
T22_l[ 0 ]
= T1_g[ 0 ];
T8_l[ bblockIdx.x262{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS263{1}, bS261{2}, bthreadIdx.x269{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS266{4}, bUS268{1}, bS265{2} ] = broadcast( T22_l[ 0 ] )
T9_l[ sbblockIdx.x226{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS227{1}, sbS225{2}, ithreadIdx.x233{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS230{4}, iUS232{1}, iS229{2} ] ca_pos( 7 ) produce_pos( 7)
= T7_l[ sbblockIdx.x250{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS251{1}, sbS249{2}, ithreadIdx.x257{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS254{4}, iUS256{1}, iS253{2} ] ca_pos( 7 ) produce_pos( 7)
* T8_l[ bblockIdx.x262{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, bUS263{1}, bS261{2}, bthreadIdx.x269{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS266{4}, bUS268{1}, bS265{2} ];
d19 = (double)(i6);
T10_l[ sbblockIdx.x202{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS203{1}, sbS201{2}, ithreadIdx.x209{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS206{4}, iUS208{1}, iS205{2} ] ca_pos( 7 ) produce_pos( 7)
= T9_l[ sbblockIdx.x226{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS227{1}, sbS225{2}, ithreadIdx.x233{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS230{4}, iUS232{1}, iS229{2} ] ca_pos( 7 ) produce_pos( 7)
* d19;
T12_l[ iblockIdx.x178{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS179{1}, iS177{2}, ithreadIdx.x185{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS182{4}, iUS184{1}, iS181{2} ] ca_pos( 3 ) produce_pos( 7)
= T11_l[ iblockIdx.x190{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS191{1}, iS189{2}, ithreadIdx.x197{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS194{4}, iUS196{1}, iS193{2} ] ca_pos( 7 ) produce_pos( 2)
+ T10_l[ sbblockIdx.x202{( ceilDiv(( ceilDiv(( 1 * ( 1 * 1 ) ), 2) ), 1) )}, sbUS203{1}, sbS201{2}, ithreadIdx.x209{( ceilDiv(( ceilDiv(( ceilDiv(i5, 2) ), 4) ), 1) )}_p, iS206{4}, iUS208{1}, iS205{2} ] ca_pos( 7 ) produce_pos( 7);
T26_l[ iblockIdx.x108{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS109{1}, iS107{2}, ithreadIdx.x115{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS112{4}rf, rUS114{1}rf, rS111{2}rf ] ca_pos( 4 ) produce_pos( 3) = reduction( T12_l[ iblockIdx.x178{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS179{1}, iS177{2}, ithreadIdx.x185{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS182{4}, iUS184{1}, iS181{2} ] ca_pos( 3 ) produce_pos( 7), op = fmax, initial value = double(-inf), fused = 0 )
T13_l[ iblockIdx.x123{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS124{1}, iS122{2}, rthreadIdx.x125{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) = reduction( T26_l[ iblockIdx.x108{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS109{1}, iS107{2}, ithreadIdx.x115{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS112{4}rf, rUS114{1}rf, rS111{2}rf ] ca_pos( 4 ) produce_pos( 3), op = fmax, initial value = double(-inf), fused = 0 )
T14_l[ iblockIdx.x130{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS131{1}, iS129{2}, bthreadIdx.x365{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS362{4}, bUS364{1}, bS361{2} ] ca_pos( 3 ) produce_pos( 3) = broadcast( T13_l[ iblockIdx.x123{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS124{1}, iS122{2}, rthreadIdx.x125{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) )
T15_l[ iblockIdx.x136{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS137{1}, iS135{2}, ithreadIdx.x335{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS332{4}, iUS334{1}, iS331{2} ] ca_pos( 7 ) produce_pos( 3)
= T12_l[ iblockIdx.x178{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS179{1}, iS177{2}, ithreadIdx.x185{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS182{4}, iUS184{1}, iS181{2} ] ca_pos( 3 ) produce_pos( 7)
- T14_l[ iblockIdx.x130{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS131{1}, iS129{2}, bthreadIdx.x365{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS362{4}, bUS364{1}, bS361{2} ] ca_pos( 3 ) produce_pos( 3);
T16_l[ iblockIdx.x142{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS143{1}, iS141{2}, ithreadIdx.x341{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS338{4}, iUS340{1}, iS337{2} ] ca_pos( 3 ) produce_pos( 7)
= exp(T15_l[ iblockIdx.x136{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS137{1}, iS135{2}, ithreadIdx.x335{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS332{4}, iUS334{1}, iS331{2} ] ca_pos( 7 ) produce_pos( 3));
T27_l[ iblockIdx.x386{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS387{1}, iS385{2}, ithreadIdx.x393{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS390{4}rf, rUS392{1}rf, rS389{2}rf ] ca_pos( 4 ) produce_pos( 3) = reduction( T16_l[ iblockIdx.x142{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS143{1}, iS141{2}, ithreadIdx.x341{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS338{4}, iUS340{1}, iS337{2} ] ca_pos( 3 ) produce_pos( 7), op = add, initial value = double(0), fused = 0 )
T17_l[ iblockIdx.x401{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS402{1}, iS400{2}, rthreadIdx.x403{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) = reduction( T27_l[ iblockIdx.x386{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS387{1}, iS385{2}, ithreadIdx.x393{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}rf_p, rS390{4}rf, rUS392{1}rf, rS389{2}rf ] ca_pos( 4 ) produce_pos( 3), op = add, initial value = double(0), fused = 0 )
T18_l[ iblockIdx.x160{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS161{1}, iS159{2}, bthreadIdx.x377{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS374{4}, bUS376{1}, bS373{2} ] ca_pos( 3 ) produce_pos( 3) = broadcast( T17_l[ iblockIdx.x401{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS402{1}, iS400{2}, rthreadIdx.x403{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p ] ca_pos( 3 ) produce_pos( 4) )
T19_l[ iblockIdx.x172{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS173{1}, iS171{2}, bthreadIdx.x371{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS368{4}, bUS370{1}, bS367{2} ] ca_pos( 3 ) produce_pos( 3)
= reciprocal(T18_l[ iblockIdx.x160{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS161{1}, iS159{2}, bthreadIdx.x377{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS374{4}, bUS376{1}, bS373{2} ] ca_pos( 3 ) produce_pos( 3));
T25_l[ iblockIdx.x154{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS155{1}, iS153{2}, ithreadIdx.x353{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS350{4}, iUS352{1}, iS349{2} ] ca_pos( 2 ) produce_pos( 3)
= T16_l[ iblockIdx.x142{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS143{1}, iS141{2}, ithreadIdx.x341{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS338{4}, iUS340{1}, iS337{2} ] ca_pos( 3 ) produce_pos( 7)
* T19_l[ iblockIdx.x172{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS173{1}, iS171{2}, bthreadIdx.x371{( ceilDiv(( ceilDiv(( ceilDiv(1, 2) ), 4) ), 1) )}_p, bS368{4}, bUS370{1}, bS367{2} ] ca_pos( 3 ) produce_pos( 3);
T20_g[ iblockIdx.x166{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS167{1}, iUR165{2}, ithreadIdx.x359{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS356{4}, iUS358{1}, iV355{2} ] produce_pos( 2)
= T25_l[ iblockIdx.x154{( ceilDiv(( ceilDiv(( 1 * ( i2 * i3 ) ), 2) ), 1) )}, iUS155{1}, iS153{2}, ithreadIdx.x353{( ceilDiv(( ceilDiv(( ceilDiv(i4, 2) ), 4) ), 1) )}_p, iS350{4}, iUS352{1}, iS349{2} ] ca_pos( 2 ) produce_pos( 3);
}
======= Codegen output for kernel: kernel1 =======
__global__ void kernel1(Tensor<float, 4> T0, Tensor<double, 0> T1, Tensor<double, 0> T2, Tensor<float, 4> T3, int64_t i6, int64_t i7, Tensor<double, 4> T20) {
alignas(8) extern __shared__ char array[];
void* shared_mem = array;
NVFUSER_DEFINE_MAGIC_ZERO
double T22[1];
T22[0] = 0;
T22[0]
= T1[0];
double T8[1];
T8[0]
= T22[0];
double T23[1];
T23[0] = 0;
T23[0]
= T2[0];
double T4[1];
T4[0]
= T23[0];
double d19;
d19 = (double)(i6);
double d10;
d10 = (double)(i7);
Array<float, ((4 * 1) * 2), 2> T24;
#pragma unroll
for(nvfuser_index_t i295 = 0; i295 < 4; ++i295) {
T24.set(0);
}
NVFUSER_UPDATE_MAGIC_ZERO
if (((((((ceilDiv((ceilDiv(T0.size[3], 2)), 4)) * 3) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3])) {
#pragma unroll
for(nvfuser_index_t i288 = 0; i288 < 4; ++i288) {
loadGlobalToLocal<float, 2, false>(&T24[(i288 * 2)], &T3[((((i288 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2)]);
}
NVFUSER_UPDATE_MAGIC_ZERO
} else {
#pragma unroll
for(nvfuser_index_t i288 = 0; i288 < 4; ++i288) {
int64_t i361;
i361 = (((i288 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))) && ((((((i288 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3]))) {
loadGlobalToLocal<float, 2, false>(&T24[(i288 * 2)], &T3[i361]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
}
if ((((((((ceilDiv((ceilDiv(T0.size[3], 2)), 4)) * 3) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3]) && (((((nvfuser_index_t)blockIdx.x) * 2) + 1) < (1 * (T0.size[1] * T0.size[2]))))) {
Array<float, (((2 * 4) * 1) * 2), 2> T21;
#pragma unroll
for(nvfuser_index_t i304 = 0; i304 < 2; ++i304) {
#pragma unroll
for(nvfuser_index_t i302 = 0; i302 < 4; ++i302) {
T21.set(0);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
for(nvfuser_index_t i282 = 0; i282 < 2; ++i282) {
#pragma unroll
for(nvfuser_index_t i284 = 0; i284 < 4; ++i284) {
loadGlobalToLocal<float, 2, false>(&T21[(i282 * 8) + (i284 * 2)], &T0[(((((nvfuser_index_t)blockIdx.x) * 2) + i282) * T0.size[3]) + ((((i284 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
Array<double, (((2 * 4) * 1) * 2), 2> T25;
#pragma unroll
for(nvfuser_index_t i260 = 0; i260 < 2; ++i260) {
double T12[((4 * 1) * 2)];
#pragma unroll
for(nvfuser_index_t i279 = 0; i279 < 4; ++i279) {
#pragma unroll
for(nvfuser_index_t i281 = 0; i281 < 2; ++i281) {
float T5[1];
T5[0]
= T24[((i279 * 2) + i281)]
* (float) d10;
double T6[1];
T6[0]
= (double)(T5[0]);
double T7[1];
T7[0]
= T4[0]
- T6[0];
double T11[1];
T11[0]
= (double)(T21[(i260 * 8) + ((i279 * 2) + i281)]);
double T9[1];
T9[0]
= T7[0]
* T8[0];
double T10[1];
T10[0]
= T9[0]
* d19;
T12[((i279 * 2) + i281)]
= T11[0]
+ T10[0];
}
}
double T13[1];
T13[0] = NEG_INFINITY;
double T26[1];
T26[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i275 = 0; i275 < 4; ++i275) {
#pragma unroll
for(nvfuser_index_t i277 = 0; i277 < 2; ++i277) {
T26[0] = fmax(
T26[0],
T12[((i275 * 2) + i277)]);
}
}
warp::warpReduceTIDX<false>(
T13[0],
T26[0],
[](double &a, double b) { a = fmax(a, b); },
threadIdx,
blockDim,
static_cast<double*>(shared_mem),
true,
double(NEG_INFINITY));
double T14[1];
broadcast::blockBroadcast<true, false, false>(
T14[0],
T13[0],
static_cast<double*>(shared_mem),
true);
// Alias Allocation - register
auto& T16 = T12;
#pragma unroll
for(nvfuser_index_t i271 = 0; i271 < 4; ++i271) {
#pragma unroll
for(nvfuser_index_t i273 = 0; i273 < 2; ++i273) {
double T15[1];
T15[0]
= T12[((i271 * 2) + i273)]
- T14[0];
T16[((i271 * 2) + i273)]
= exp(T15[0]);
}
}
double T17[1];
T17[0] = 0;
double T27[1];
T27[0] = 0;
#pragma unroll
for(nvfuser_index_t i267 = 0; i267 < 4; ++i267) {
#pragma unroll
for(nvfuser_index_t i269 = 0; i269 < 2; ++i269) {
T27[0]
= T27[0]
+ T16[((i267 * 2) + i269)];
}
}
warp::warpReduceTIDX<false>(
T17[0],
T27[0],
[](double &a, double b) { a = a + b; },
threadIdx,
blockDim,
static_cast<double*>(shared_mem),
true,
double(0));
double T18[1];
broadcast::blockBroadcast<true, false, false>(
T18[0],
T17[0],
static_cast<double*>(shared_mem),
true);
double T19[1];
T19[0]
= reciprocal(T18[0]);
#pragma unroll
for(nvfuser_index_t i262 = 0; i262 < 4; ++i262) {
#pragma unroll
for(nvfuser_index_t i264 = 0; i264 < 2; ++i264) {
T25[(i260 * 8) + ((i262 * 2) + i264)]
= T16[((i262 * 2) + i264)]
* T19[0];
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
for(nvfuser_index_t i255 = 0; i255 < 2; ++i255) {
#pragma unroll
for(nvfuser_index_t i257 = 0; i257 < 4; ++i257) {
loadLocalToGlobal<double, 2, false>( &T20[(((((nvfuser_index_t)blockIdx.x) * 2) + i255) * T0.size[3]) + ((((i257 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2)], &T25[(i255 * 8) + (i257 * 2)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
} else {
Array<float, (((2 * 4) * 1) * 2), 2> T21;
#pragma unroll
for(nvfuser_index_t i304 = 0; i304 < 2; ++i304) {
#pragma unroll
for(nvfuser_index_t i302 = 0; i302 < 4; ++i302) {
T21.set(0);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
for(nvfuser_index_t i282 = 0; i282 < 2; ++i282) {
int64_t i1224;
i1224 = (((nvfuser_index_t)blockIdx.x) * 2) + i282;
#pragma unroll
for(nvfuser_index_t i284 = 0; i284 < 4; ++i284) {
int64_t i661;
i661 = (((i284 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))) && ((((((i284 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3])) && (i1224 < (1 * (T0.size[1] * T0.size[2]))))) {
loadGlobalToLocal<float, 2, false>(&T21[(i282 * 8) + (i284 * 2)], &T0[(((((nvfuser_index_t)blockIdx.x) * 2) + i282) * T0.size[3]) + i661]);
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
Array<double, (((2 * 4) * 1) * 2), 2> T25;
#pragma unroll
for(nvfuser_index_t i260 = 0; i260 < 2; ++i260) {
int64_t i1272;
i1272 = (((nvfuser_index_t)blockIdx.x) * 2) + i260;
double T12[((4 * 1) * 2)];
#pragma unroll
for(nvfuser_index_t i279 = 0; i279 < 4; ++i279) {
#pragma unroll
for(nvfuser_index_t i281 = 0; i281 < 2; ++i281) {
float T5[1];
T5[0]
= T24[((i279 * 2) + i281)]
* (float) d10;
double T6[1];
T6[0]
= (double)(T5[0]);
double T7[1];
T7[0]
= T4[0]
- T6[0];
double T11[1];
T11[0]
= (double)(T21[(i260 * 8) + ((i279 * 2) + i281)]);
double T9[1];
T9[0]
= T7[0]
* T8[0];
double T10[1];
T10[0]
= T9[0]
* d19;
T12[((i279 * 2) + i281)]
= T11[0]
+ T10[0];
}
}
double T13[1];
T13[0] = NEG_INFINITY;
double T26[1];
T26[0] = NEG_INFINITY;
#pragma unroll
for(nvfuser_index_t i275 = 0; i275 < 4; ++i275) {
#pragma unroll
for(nvfuser_index_t i277 = 0; i277 < 2; ++i277) {
if ((((((((i275 * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + (i277 + nvfuser_zero)) < T0.size[3]) && (i1272 < (1 * (T0.size[1] * T0.size[2])))) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))))) {
T26[0] = fmax(
T26[0],
T12[((i275 * 2) + i277)]);
}
}
}
warp::warpReduceTIDX<false>(
T13[0],
T26[0],
[](double &a, double b) { a = fmax(a, b); },
threadIdx,
blockDim,
static_cast<double*>(shared_mem),
true,
double(NEG_INFINITY));
double T14[1];
broadcast::blockBroadcast<true, false, false>(
T14[0],
T13[0],
static_cast<double*>(shared_mem),
true);
// Alias Allocation - register
auto& T16 = T12;
#pragma unroll
for(nvfuser_index_t i271 = 0; i271 < 4; ++i271) {
#pragma unroll
for(nvfuser_index_t i273 = 0; i273 < 2; ++i273) {
double T15[1];
T15[0]
= T12[((i271 * 2) + i273)]
- T14[0];
T16[((i271 * 2) + i273)]
= exp(T15[0]);
}
}
double T17[1];
T17[0] = 0;
double T27[1];
T27[0] = 0;
#pragma unroll
for(nvfuser_index_t i267 = 0; i267 < 4; ++i267) {
#pragma unroll
for(nvfuser_index_t i269 = 0; i269 < 2; ++i269) {
if ((((((((i267 * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + (i269 + nvfuser_zero)) < T0.size[3]) && (i1272 < (1 * (T0.size[1] * T0.size[2])))) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))))) {
T27[0]
= T27[0]
+ T16[((i267 * 2) + i269)];
}
}
}
warp::warpReduceTIDX<false>(
T17[0],
T27[0],
[](double &a, double b) { a = a + b; },
threadIdx,
blockDim,
static_cast<double*>(shared_mem),
true,
double(0));
double T18[1];
broadcast::blockBroadcast<true, false, false>(
T18[0],
T17[0],
static_cast<double*>(shared_mem),
true);
double T19[1];
T19[0]
= reciprocal(T18[0]);
#pragma unroll
for(nvfuser_index_t i262 = 0; i262 < 4; ++i262) {
#pragma unroll
for(nvfuser_index_t i264 = 0; i264 < 2; ++i264) {
T25[(i260 * 8) + ((i262 * 2) + i264)]
= T16[((i262 * 2) + i264)]
* T19[0];
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
for(nvfuser_index_t i255 = 0; i255 < 2; ++i255) {
int64_t i1334;
i1334 = (((nvfuser_index_t)blockIdx.x) * 2) + i255;
#pragma unroll
for(nvfuser_index_t i257 = 0; i257 < 4; ++i257) {
int64_t i903;
i903 = (((i257 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv((ceilDiv((ceilDiv(T0.size[3], 2)), 4)), 1))) && ((((((i257 + nvfuser_zero) * (ceilDiv((ceilDiv(T0.size[3], 2)), 4))) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3])) && (i1334 < (1 * (T0.size[1] * T0.size[2]))))) {
loadLocalToGlobal<double, 2, false>( &T20[(((((nvfuser_index_t)blockIdx.x) * 2) + i255) * T0.size[3]) + i903], &T25[(i255 * 8) + (i257 * 2)]);
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
}
}
======================================
ptxas info : 982 bytes gmem
ptxas info : Compiling entry function '_ZN11CudaCodeGen7kernel1ENS_6TensorIfLi4EEENS0_IdLi0EEES2_S1_xxNS0_IdLi4EEE' for 'sm_80'
ptxas info : Function properties for _ZN11CudaCodeGen7kernel1ENS_6TensorIfLi4EEENS0_IdLi0EEES2_S1_xxNS0_IdLi4EEE
ptxas . 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 135 registers, 16 bytes smem, 504 bytes cmem[0], 96 bytes cmem[2]
Launch Parameters: BlockDim.x = 512, BlockDim.y = -1, BlockDim.z = -1, GridDim.x = 384, GridDim.y = -1, GridDim.z = -1, Smem Size = 4096
Traceback (most recent call last):
File "/fsx/users/dberard/pytorch/33-repro.py", line 21, in <module>
fn_s(t1, t2, t3, t4, 1, 1)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: CUDA driver error: too many resources requested for launch
srun: error: dev-st-p4d24xlarge-1: task 0: Exited with exit code 1
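The failure is consistent with simple register-pressure arithmetic from the ptxas and launch lines above (an inference from those numbers, not something stated in the log): 135 registers per thread at 512 threads per block requests more registers than an sm_80 thread block can be granted, so the driver rejects the launch.

# Back-of-the-envelope check of the launch failure (editor's inference, not part of the log).
registers_per_thread = 135          # "ptxas info : Used 135 registers"
threads_per_block = 512             # "BlockDim.x = 512" in the launch parameters
sm80_regs_per_block = 64 * 1024     # per-block register limit on sm_80

requested = registers_per_thread * threads_per_block
print(requested, sm80_regs_per_block)   # 69120 > 65536
assert requested > sm80_regs_per_block  # hence "too many resources requested for launch"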