Skip to content

Instantly share code, notes, and snippets.

@leslie-fang-intel
Created March 6, 2023 09:40
Show Gist options
  • Select an option

  • Save leslie-fang-intel/8d4e6185aafb86ce93a03e5d6481139d to your computer and use it in GitHub Desktop.

Select an option

Save leslie-fang-intel/8d4e6185aafb86ce93a03e5d6481139d to your computer and use it in GitHub Desktop.
from ctypes import c_void_p, c_long
import torch
import math
import random
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
# Short alias for the ATen operator namespace.
aten = torch.ops.aten
# Guard helper taken from torch._C._dynamo.guards.
# NOTE(review): presumably checks a tensor's size/stride against expected
# values; its call sites are not visible in this part of the file — confirm.
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
# Single AsyncCompile instance; every kernel_cpp_N assignment in this file
# registers its C++ source through this object's .cpp() method.
async_compile = AsyncCompile()
# kernel_cpp_0: serial layout-permutation kernel (the source has no OpenMP
# pragma, only `#pragma GCC ivdep` hints).
#
# For each of 64 outer indices i0 it copies a 3 x 49 slab of in_ptr0 into
# out_ptr0 with the two inner axes swapped (49 x 3):
#     out_ptr0[i1 + 3*i2 + 147*i0] = in_ptr0[i2 + 49*i1 + 147*i0]
#
# The first i1 loop is written as `i1 < 0`, i.e. it has a zero trip count, so
# the 16x16 transpose_mxn path and the scalar tail nested inside it never
# execute; all work is done by the final scalar i1/i2 loop.
#
# NOTE(review): the 64 x 3 x 49 extents look like a 64x3x7x7 convolution
# weight being rewritten to a channels-last layout — confirm against the
# caller, which is not visible here.
# NOTE(review): the #include refers to a machine-local /tmp cache path, so
# this source only compiles where that generated header exists.
kernel_cpp_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(const float* __restrict__ in_ptr0,
float* __restrict__ out_ptr0)
{
{
#pragma GCC ivdep
for(long i0=0; i0<64; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<0; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<3; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (16*i2) + (49*i1_inner) + (147*i0) + (784*i1));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (48*i2) + (147*i0), 3);
}
#pragma GCC ivdep
for(long i2=48; i2<49; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr0[i2 + (49*i1_inner) + (147*i0) + (784*i1)];
out_ptr0[i1_inner + (3*i2) + (16*i1) + (147*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=0; i1<3; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<49; i2+=1)
{
auto tmp0 = in_ptr0[i2 + (49*i1) + (147*i0)];
out_ptr0[i1 + (3*i2) + (147*i0)] = tmp0;
}
}
}
}
}
''')
# kernel_cpp_1: two work-shared loops inside one 56-thread OpenMP parallel
# region. Below, h = (ks1 - 1) / 2 and q = (ks1 - 1) / 4 (integer division).
#
# Stage 1 (in place on in_out_ptr0): the row count is
#     ks0 + ks0*h*h + 2*ks0*h  ==  ks0 * (h + 1)^2
# and each row holds 64 floats, processed as 4 vectors of 16. Per element c:
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     x = clamp_min(y, 0)
# The scalar tail loop that follows (`i1 = 64; i1 < 64`) has a zero trip
# count and never executes.
# NOTE(review): this has the shape of inference batch-norm (mean, variance,
# weight, bias in in_ptr0..3) followed by ReLU — confirm against the caller.
#
# Stage 2 (reads in_out_ptr0, writes out_ptr0): for every i0 < ks0, channel
# i1 < 64 and output position (i2, i3) with i2, i3 < q + 1, it takes the
# maximum over the 3x3 window of input positions (2*i2 + {-1,0,1},
# 2*i3 + {-1,0,1}). A tap whose row or column falls outside [0, h + 1) is
# replaced by -infinity instead of being loaded. The `(t != t) ? t : max(...)`
# form keeps a NaN once one has been accumulated. The result is stored at
#     out_ptr0[i1 + 64*(i3 + i2*(q + 1) + i0*(q + 1)^2)].
# Stage 2 consumes the values written by stage 1 through the same pointer.
# NOTE(review): this corresponds to a 3x3, stride-2, padding-1 max-pool on a
# channels-last tensor — confirm against the originating graph.
# NOTE(review): num_threads(56) and the /tmp include path are baked into the
# source and are specific to the machine that generated it.
kernel_cpp_1 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 2))*((((-1) + ks1) / 2)))) + (2*ks0*((((-1) + ks1) / 2))); i0+=1)
{
for(long i1=0; i1<4; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (64*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (64*i0));
}
#pragma omp simd simdlen(8)
for(long i1=64; i1<64; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (64*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (64*i0)] = tmp15;
}
}
}
{
#pragma omp for
for(long i0=0; i0<ks0; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<64; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<1 + ((((-1) + ks1) / 4)); i2+=1)
{
#pragma GCC ivdep
for(long i3=0; i3<1 + ((((-1) + ks1) / 4)); i3+=1)
{
auto tmp0 = static_cast<long>((-1) + (2*i2));
auto tmp1 = static_cast<long>(0);
auto tmp2 = tmp0 >= tmp1;
auto tmp3 = static_cast<long>(1 + ((((-1) + ks1) / 2)));
auto tmp4 = tmp0 < tmp3;
auto tmp5 = tmp2 & tmp4;
auto tmp6 = static_cast<long>((-1) + (2*i3));
auto tmp7 = tmp6 >= tmp1;
auto tmp8 = tmp6 < tmp3;
auto tmp9 = tmp7 & tmp8;
auto tmp10 = tmp5 & tmp9;
auto tmp11 = [&]
{
auto tmp12 = in_out_ptr0[(-128) + i1 + ((-64)*((((-1) + ks1) / 2))) + (64*i0) + (128*i2) + (128*i3) + (64*i0*(((((-1) + ks1) / 2))*((((-1) + ks1) / 2)))) + (128*i0*((((-1) + ks1) / 2))) + (128*i2*((((-1) + ks1) / 2)))];
return tmp12;
}
;
auto tmp13 = tmp10 ? tmp11() : -std::numeric_limits<decltype(tmp11())>::infinity();
auto tmp14 = static_cast<long>(2*i3);
auto tmp15 = tmp14 >= tmp1;
auto tmp16 = tmp14 < tmp3;
auto tmp17 = tmp15 & tmp16;
auto tmp18 = tmp5 & tmp17;
auto tmp19 = [&]
{
auto tmp20 = in_out_ptr0[(-64) + i1 + ((-64)*((((-1) + ks1) / 2))) + (64*i0) + (128*i2) + (128*i3) + (64*i0*(((((-1) + ks1) / 2))*((((-1) + ks1) / 2)))) + (128*i0*((((-1) + ks1) / 2))) + (128*i2*((((-1) + ks1) / 2)))];
return tmp20;
}
;
auto tmp21 = tmp18 ? tmp19() : -std::numeric_limits<decltype(tmp19())>::infinity();
auto tmp22 = (tmp13 != tmp13) ? tmp13 : std::max(tmp21, tmp13);
auto tmp23 = static_cast<long>(1 + (2*i3));
auto tmp24 = tmp23 >= tmp1;
auto tmp25 = tmp23 < tmp3;
auto tmp26 = tmp24 & tmp25;
auto tmp27 = tmp5 & tmp26;
auto tmp28 = [&]
{
auto tmp29 = in_out_ptr0[i1 + ((-64)*((((-1) + ks1) / 2))) + (64*i0) + (128*i2) + (128*i3) + (64*i0*(((((-1) + ks1) / 2))*((((-1) + ks1) / 2)))) + (128*i0*((((-1) + ks1) / 2))) + (128*i2*((((-1) + ks1) / 2)))];
return tmp29;
}
;
auto tmp30 = tmp27 ? tmp28() : -std::numeric_limits<decltype(tmp28())>::infinity();
auto tmp31 = (tmp22 != tmp22) ? tmp22 : std::max(tmp30, tmp22);
auto tmp32 = static_cast<long>(2*i2);
auto tmp33 = tmp32 >= tmp1;
auto tmp34 = tmp32 < tmp3;
auto tmp35 = tmp33 & tmp34;
auto tmp36 = tmp35 & tmp9;
auto tmp37 = [&]
{
auto tmp38 = in_out_ptr0[(-64) + i1 + (64*i0) + (128*i2) + (128*i3) + (64*i0*(((((-1) + ks1) / 2))*((((-1) + ks1) / 2)))) + (128*i0*((((-1) + ks1) / 2))) + (128*i2*((((-1) + ks1) / 2)))];
return tmp38;
}
;
auto tmp39 = tmp36 ? tmp37() : -std::numeric_limits<decltype(tmp37())>::infinity();
auto tmp40 = (tmp31 != tmp31) ? tmp31 : std::max(tmp39, tmp31);
auto tmp41 = tmp35 & tmp17;
auto tmp42 = [&]
{
auto tmp43 = in_out_ptr0[i1 + (64*i0) + (128*i2) + (128*i3) + (64*i0*(((((-1) + ks1) / 2))*((((-1) + ks1) / 2)))) + (128*i0*((((-1) + ks1) / 2))) + (128*i2*((((-1) + ks1) / 2)))];
return tmp43;
}
;
auto tmp44 = tmp41 ? tmp42() : -std::numeric_limits<decltype(tmp42())>::infinity();
auto tmp45 = (tmp40 != tmp40) ? tmp40 : std::max(tmp44, tmp40);
auto tmp46 = tmp35 & tmp26;
auto tmp47 = [&]
{
auto tmp48 = in_out_ptr0[64 + i1 + (64*i0) + (128*i2) + (128*i3) + (64*i0*(((((-1) + ks1) / 2))*((((-1) + ks1) / 2)))) + (128*i0*((((-1) + ks1) / 2))) + (128*i2*((((-1) + ks1) / 2)))];
return tmp48;
}
;
auto tmp49 = tmp46 ? tmp47() : -std::numeric_limits<decltype(tmp47())>::infinity();
auto tmp50 = (tmp45 != tmp45) ? tmp45 : std::max(tmp49, tmp45);
auto tmp51 = static_cast<long>(1 + (2*i2));
auto tmp52 = tmp51 >= tmp1;
auto tmp53 = tmp51 < tmp3;
auto tmp54 = tmp52 & tmp53;
auto tmp55 = tmp54 & tmp9;
auto tmp56 = [&]
{
auto tmp57 = in_out_ptr0[i1 + (64*i0) + (64*((((-1) + ks1) / 2))) + (128*i2) + (128*i3) + (64*i0*(((((-1) + ks1) / 2))*((((-1) + ks1) / 2)))) + (128*i0*((((-1) + ks1) / 2))) + (128*i2*((((-1) + ks1) / 2)))];
return tmp57;
}
;
auto tmp58 = tmp55 ? tmp56() : -std::numeric_limits<decltype(tmp56())>::infinity();
auto tmp59 = (tmp50 != tmp50) ? tmp50 : std::max(tmp58, tmp50);
auto tmp60 = tmp54 & tmp17;
auto tmp61 = [&]
{
auto tmp62 = in_out_ptr0[64 + i1 + (64*i0) + (64*((((-1) + ks1) / 2))) + (128*i2) + (128*i3) + (64*i0*(((((-1) + ks1) / 2))*((((-1) + ks1) / 2)))) + (128*i0*((((-1) + ks1) / 2))) + (128*i2*((((-1) + ks1) / 2)))];
return tmp62;
}
;
auto tmp63 = tmp60 ? tmp61() : -std::numeric_limits<decltype(tmp61())>::infinity();
auto tmp64 = (tmp59 != tmp59) ? tmp59 : std::max(tmp63, tmp59);
auto tmp65 = tmp54 & tmp26;
auto tmp66 = [&]
{
auto tmp67 = in_out_ptr0[128 + i1 + (64*i0) + (64*((((-1) + ks1) / 2))) + (128*i2) + (128*i3) + (64*i0*(((((-1) + ks1) / 2))*((((-1) + ks1) / 2)))) + (128*i0*((((-1) + ks1) / 2))) + (128*i2*((((-1) + ks1) / 2)))];
return tmp67;
}
;
auto tmp68 = tmp65 ? tmp66() : -std::numeric_limits<decltype(tmp66())>::infinity();
auto tmp69 = (tmp64 != tmp64) ? tmp64 : std::max(tmp68, tmp64);
out_ptr0[i1 + (64*i0) + (64*i2) + (64*i3) + (64*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (64*i2*((((-1) + ks1) / 4))) + (128*i0*((((-1) + ks1) / 4)))] = tmp69;
}
}
}
}
}
}
}
''')
# kernel_cpp_2: in-place normalise + clamp over 64-wide rows, followed by a
# single-threaded repack of in_ptr4 into out_ptr0. Both stages live inside one
# 56-thread OpenMP parallel region. Below, q = (ks1 - 1) / 4 (integer division).
#
# Stage 1 (`#pragma omp for`, in place on in_out_ptr0): the row count is
#     ks0 + ks0*q*q + 2*ks0*q  ==  ks0 * (q + 1)^2
# and each row of 64 floats is handled as 4 vectors of 16. Per element c:
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     x = clamp_min(y, 0)
# The scalar tail loop (`i1 = 64; i1 < 64`) has a zero trip count.
# NOTE(review): looks like inference batch-norm (mean, variance, weight, bias)
# plus ReLU — confirm against the caller.
#
# Stage 2 (`#pragma omp single`, executed by one thread): for each of 64 outer
# indices i0 it copies a 64 x 9 slab of in_ptr4 into out_ptr0 with the two
# inner axes swapped (9 x 64). With c = i1_inner + 16*i1:
#     out_ptr0[c + 64*i2 + 576*i0] = in_ptr4[i2 + 9*c + 576*i0]
# The vectorised transpose_mxn loop (`i2 < 0`) and the trailing i1 loop
# (`i1 = 64; i1 < 64`) both have zero trip counts; only the scalar
# i2 / i1_inner loop does work.
# NOTE(review): the 64 x 64 x 9 extents suggest a 64x64x3x3 convolution weight
# being rewritten to channels-last — confirm against the caller.
kernel_cpp_2 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (2*ks0*((((-1) + ks1) / 4))); i0+=1)
{
for(long i1=0; i1<4; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (64*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (64*i0));
}
#pragma omp simd simdlen(8)
for(long i1=64; i1<64; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (64*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (64*i0)] = tmp15;
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long i0=0; i0<64; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<4; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (576*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (576*i0) + (1024*i2), 64);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (576*i0)];
out_ptr0[i1_inner + (16*i1) + (64*i2) + (576*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=64; i1<64; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (576*i0)];
out_ptr0[i1 + (64*i2) + (576*i0)] = tmp0;
}
}
}
}
}
}
}
''')
# kernel_cpp_3: in-place normalise + clamp over 64-wide rows of in_out_ptr0,
# work-shared with `#pragma omp for` inside a 56-thread parallel region.
# Below, q = (ks1 - 1) / 4 (integer division).
#
# The row count is ks0 + ks0*q*q + 2*ks0*q == ks0 * (q + 1)^2; each row of 64
# floats is handled as 4 vectors of 16. Per element c:
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     x = clamp_min(y, 0)
# The scalar tail loop (`i1 = 64; i1 < 64`) has a zero trip count and never
# executes.
# NOTE(review): looks like inference batch-norm (mean, variance, weight, bias
# in in_ptr0..3) followed by ReLU — confirm against the caller.
kernel_cpp_3 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (2*ks0*((((-1) + ks1) / 4))); i0+=1)
{
for(long i1=0; i1<4; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (64*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (64*i0));
}
#pragma omp simd simdlen(8)
for(long i1=64; i1<64; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (64*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (64*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_4: "normalise two inputs, add them, then clamp at 0" over rows of
# 256 floats, as two work-shared loops inside one 56-thread OpenMP parallel
# region. Below, q = (ks1 - 1) / 4 (integer division) and n = (q + 1)^2.
#
# Stage 1 (in place on in_out_ptr0): the row count is
#     ks0 + ks0*q*q + 2*ks0*q  ==  ks0 * n
# and each row is handled as 16 vectors of 16 floats. Per element c:
#     a = (in_out[c] - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c]
#     b = (in_ptr4[c] - in_ptr5[c]) * (1 / sqrt(in_ptr6[c] + 1e-05)) * in_ptr7[c] + in_ptr8[c]
#     in_out[c] = a + b
# in_ptr4 is indexed per row like in_out_ptr0; the other inputs are indexed
# per channel only. The scalar tail (`i1 = 256; i1 < 256`) has a zero trip
# count.
# NOTE(review): looks like two inference batch-norms feeding a residual add —
# confirm against the caller.
#
# Stage 2 (in place on in_out_ptr0): for each i0 < ks0 and each group of 16
# channels, full blocks of 16 rows are transposed into a 16x16 scratch tile,
# clamped with clamp_min(x, 0) and transposed back; the remaining n % 16 rows
# are clamped element by element. The net effect is an element-wise clamp at 0.
#
# Fix relative to the generated original: the scalar clamp was written as
# `tmp0 * (tmp0>0)`, which evaluates to NaN for tmp0 == -inf (-inf * 0), so
# rows handled by the scalar tail could disagree with rows handled by the
# clamp_min tile path. It is now `std::max(tmp0, decltype(tmp0)(0))`, which
# returns 0 for -inf and still returns tmp0 when tmp0 is NaN (NaN < 0 is
# false). std::max is already relied on by kernel_cpp_1 under the same header.
# The zero-trip `i1 = 256` loop gets the same form purely for consistency.
kernel_cpp_4 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const float* __restrict__ in_ptr7,
const float* __restrict__ in_ptr8,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (2*ks0*((((-1) + ks1) / 4))); i0+=1)
{
for(long i1=0; i1<16; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr4 + (16*i1) + (256*i0));
auto tmp16 = at::vec::Vectorized<float>::loadu(in_ptr5 + 16*i1);
auto tmp18 = at::vec::Vectorized<float>::loadu(in_ptr6 + 16*i1);
auto tmp24 = at::vec::Vectorized<float>::loadu(in_ptr7 + 16*i1);
auto tmp26 = at::vec::Vectorized<float>::loadu(in_ptr8 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp17 = tmp15 - tmp16;
auto tmp19 = tmp18 + tmp4;
auto tmp20 = tmp19.sqrt();
auto tmp21 = tmp20.reciprocal();
auto tmp22 = tmp21 * tmp8;
auto tmp23 = tmp17 * tmp22;
auto tmp25 = tmp23 * tmp24;
auto tmp27 = tmp25 + tmp26;
auto tmp28 = tmp14 + tmp27;
tmp28.store(in_out_ptr0 + (16*i1) + (256*i0));
}
#pragma omp simd simdlen(8)
for(long i1=256; i1<256; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (256*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp15 = in_ptr4[i1 + (256*i0)];
auto tmp16 = in_ptr5[i1];
auto tmp18 = in_ptr6[i1];
auto tmp24 = in_ptr7[i1];
auto tmp26 = in_ptr8[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp17 = tmp15 - tmp16;
auto tmp19 = tmp18 + tmp4;
auto tmp20 = std::sqrt(tmp19);
auto tmp21 = 1 / tmp20;
auto tmp22 = tmp21 * tmp8;
auto tmp23 = tmp17 * tmp22;
auto tmp25 = tmp23 * tmp24;
auto tmp27 = tmp25 + tmp26;
auto tmp28 = tmp14 + tmp27;
in_out_ptr0[i1 + (256*i0)] = tmp28;
}
}
}
{
#pragma omp for
for(long i0=0; i0<ks0; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<16; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<((1 + (((((-1) + ks1) / 4))*((((-1) + ks1) / 4))) + (2*((((-1) + ks1) / 4)))) / 16); i2+=1)
{
float tmp0[16*16] __attribute__ ((aligned (16)));
at::vec::transpose_mxn<float,16,16>(in_out_ptr0 + (16*i1) + (256*i0) + (4096*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4))), 256, tmp0, 16);
float tmp3[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp1 = at::vec::Vectorized<float>::loadu(tmp0 + 16*i1_inner);
auto tmp2 = at::vec::clamp_min(tmp1, decltype(tmp1)(0));
tmp2.store(tmp3 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp3, 16, in_out_ptr0 + (16*i1) + (256*i0) + (4096*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4))), 256);
}
#pragma GCC ivdep
for(long i2=16*(((1 + (((((-1) + ks1) / 4))*((((-1) + ks1) / 4))) + (2*((((-1) + ks1) / 4)))) / 16)); i2<1 + (((((-1) + ks1) / 4))*((((-1) + ks1) / 4))) + (2*((((-1) + ks1) / 4))); i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_out_ptr0[i1_inner + (16*i1) + (256*i0) + (256*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4)))];
auto tmp1 = std::max(tmp0, decltype(tmp0)(0));
in_out_ptr0[i1_inner + (16*i1) + (256*i0) + (256*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4)))] = tmp1;
}
}
}
#pragma GCC ivdep
for(long i1=256; i1<256; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<1 + (((((-1) + ks1) / 4))*((((-1) + ks1) / 4))) + (2*((((-1) + ks1) / 4))); i2+=1)
{
auto tmp0 = in_out_ptr0[i1 + (256*i0) + (256*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4)))];
auto tmp1 = std::max(tmp0, decltype(tmp0)(0));
in_out_ptr0[i1 + (256*i0) + (256*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4)))] = tmp1;
}
}
}
}
}
}
''')
# kernel_cpp_5: in-place normalise + clamp over 64-wide rows, followed by a
# single-threaded repack of in_ptr4 into out_ptr0; the C++ source has the same
# structure as kernel_cpp_2. Both stages live inside one 56-thread OpenMP
# parallel region. Below, q = (ks1 - 1) / 4 (integer division).
#
# Stage 1 (`#pragma omp for`, in place on in_out_ptr0): the row count is
#     ks0 + ks0*q*q + 2*ks0*q  ==  ks0 * (q + 1)^2
# and each row of 64 floats is handled as 4 vectors of 16. Per element c:
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     x = clamp_min(y, 0)
# The scalar tail loop (`i1 = 64; i1 < 64`) has a zero trip count.
# NOTE(review): looks like inference batch-norm plus ReLU — confirm against
# the caller.
#
# Stage 2 (`#pragma omp single`, executed by one thread): for each of 64 outer
# indices i0 it copies a 64 x 9 slab of in_ptr4 into out_ptr0 with the two
# inner axes swapped (9 x 64). With c = i1_inner + 16*i1:
#     out_ptr0[c + 64*i2 + 576*i0] = in_ptr4[i2 + 9*c + 576*i0]
# The transpose_mxn loop (`i2 < 0`) and the trailing i1 loop
# (`i1 = 64; i1 < 64`) both have zero trip counts.
# NOTE(review): the extents suggest a 64x64x3x3 convolution weight being
# rewritten to channels-last — confirm against the caller.
kernel_cpp_5 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (2*ks0*((((-1) + ks1) / 4))); i0+=1)
{
for(long i1=0; i1<4; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (64*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (64*i0));
}
#pragma omp simd simdlen(8)
for(long i1=64; i1<64; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (64*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (64*i0)] = tmp15;
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long i0=0; i0<64; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<4; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (576*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (576*i0) + (1024*i2), 64);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (576*i0)];
out_ptr0[i1_inner + (16*i1) + (64*i2) + (576*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=64; i1<64; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (576*i0)];
out_ptr0[i1 + (64*i2) + (576*i0)] = tmp0;
}
}
}
}
}
}
}
''')
# kernel_cpp_6: in-place normalise + clamp over 64-wide rows of in_out_ptr0;
# the C++ source has the same structure as kernel_cpp_3. The row loop is
# work-shared with `#pragma omp for` inside a 56-thread parallel region.
# Below, q = (ks1 - 1) / 4 (integer division).
#
# The row count is ks0 + ks0*q*q + 2*ks0*q == ks0 * (q + 1)^2; each row of 64
# floats is handled as 4 vectors of 16. Per element c:
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     x = clamp_min(y, 0)
# The scalar tail loop (`i1 = 64; i1 < 64`) has a zero trip count and never
# executes.
# NOTE(review): looks like inference batch-norm followed by ReLU — confirm
# against the caller.
kernel_cpp_6 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (2*ks0*((((-1) + ks1) / 4))); i0+=1)
{
for(long i1=0; i1<4; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (64*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (64*i0));
}
#pragma omp simd simdlen(8)
for(long i1=64; i1<64; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (64*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (64*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_7: normalise in_ptr0, add the value already held in in_out_ptr0,
# clamp at 0 and store back into in_out_ptr0, over rows of 256 floats. The row
# loop is work-shared with `#pragma omp for` inside a 56-thread parallel
# region. Below, q = (ks1 - 1) / 4 (integer division).
#
# The row count is ks0 + ks0*q*q + 2*ks0*q == ks0 * (q + 1)^2; each row is
# handled as 16 vectors of 16 floats. Per row r and channel c:
#     y = (in_ptr0[r, c] - in_ptr1[c]) * (1 / sqrt(in_ptr2[c] + 1e-05)) * 1 * in_ptr3[c] + in_ptr4[c]
#     in_out[r, c] = clamp_min(y + in_out[r, c], 0)
# The scalar tail loop (`i1 = 256; i1 < 256`) has a zero trip count and never
# executes.
# NOTE(review): looks like inference batch-norm + residual add + ReLU, with
# in_out_ptr0 carrying the residual — confirm against the caller.
kernel_cpp_7 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (2*ks0*((((-1) + ks1) / 4))); i0+=1)
{
for(long i1=0; i1<16; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (16*i1) + (256*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr4 + 16*i1);
auto tmp15 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = at::vec::clamp_min(tmp16, decltype(tmp16)(0));
tmp17.store(in_out_ptr0 + (16*i1) + (256*i0));
}
#pragma omp simd simdlen(8)
for(long i1=256; i1<256; i1+=1)
{
auto tmp0 = in_ptr0[i1 + (256*i0)];
auto tmp1 = in_ptr1[i1];
auto tmp3 = in_ptr2[i1];
auto tmp11 = in_ptr3[i1];
auto tmp13 = in_ptr4[i1];
auto tmp15 = in_out_ptr0[i1 + (256*i0)];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = tmp16 * (tmp16>0);
in_out_ptr0[i1 + (256*i0)] = tmp17;
}
}
}
}
}
''')
# kernel_cpp_8: in-place normalise + clamp over 64-wide rows, followed by a
# single-threaded repack of in_ptr4 into out_ptr0; the C++ source has the same
# structure as kernel_cpp_2 and kernel_cpp_5. Both stages live inside one
# 56-thread OpenMP parallel region. Below, q = (ks1 - 1) / 4 (integer division).
#
# Stage 1 (`#pragma omp for`, in place on in_out_ptr0): the row count is
#     ks0 + ks0*q*q + 2*ks0*q  ==  ks0 * (q + 1)^2
# and each row of 64 floats is handled as 4 vectors of 16. Per element c:
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     x = clamp_min(y, 0)
# The scalar tail loop (`i1 = 64; i1 < 64`) has a zero trip count.
# NOTE(review): looks like inference batch-norm plus ReLU — confirm against
# the caller.
#
# Stage 2 (`#pragma omp single`, executed by one thread): for each of 64 outer
# indices i0 it copies a 64 x 9 slab of in_ptr4 into out_ptr0 with the two
# inner axes swapped (9 x 64). With c = i1_inner + 16*i1:
#     out_ptr0[c + 64*i2 + 576*i0] = in_ptr4[i2 + 9*c + 576*i0]
# The transpose_mxn loop (`i2 < 0`) and the trailing i1 loop
# (`i1 = 64; i1 < 64`) both have zero trip counts.
# NOTE(review): the extents suggest a 64x64x3x3 convolution weight being
# rewritten to channels-last — confirm against the caller.
kernel_cpp_8 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (2*ks0*((((-1) + ks1) / 4))); i0+=1)
{
for(long i1=0; i1<4; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (64*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (64*i0));
}
#pragma omp simd simdlen(8)
for(long i1=64; i1<64; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (64*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (64*i0)] = tmp15;
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long i0=0; i0<64; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<4; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (576*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (576*i0) + (1024*i2), 64);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (576*i0)];
out_ptr0[i1_inner + (16*i1) + (64*i2) + (576*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=64; i1<64; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (576*i0)];
out_ptr0[i1 + (64*i2) + (576*i0)] = tmp0;
}
}
}
}
}
}
}
''')
# kernel_cpp_9: in-place normalise + clamp over 64-wide rows of in_out_ptr0;
# the C++ source has the same structure as kernel_cpp_3 and kernel_cpp_6. The
# row loop is work-shared with `#pragma omp for` inside a 56-thread parallel
# region. Below, q = (ks1 - 1) / 4 (integer division).
#
# The row count is ks0 + ks0*q*q + 2*ks0*q == ks0 * (q + 1)^2; each row of 64
# floats is handled as 4 vectors of 16. Per element c:
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     x = clamp_min(y, 0)
# The scalar tail loop (`i1 = 64; i1 < 64`) has a zero trip count and never
# executes.
# NOTE(review): looks like inference batch-norm followed by ReLU — confirm
# against the caller.
kernel_cpp_9 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (2*ks0*((((-1) + ks1) / 4))); i0+=1)
{
for(long i1=0; i1<4; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (64*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (64*i0));
}
#pragma omp simd simdlen(8)
for(long i1=64; i1<64; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (64*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (64*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_10: fused per-channel normalization + residual add + ReLU, written
# in place into in_out_ptr0.
#
# For every i0 in [0, ks0), channel c in [0, 256) and position i2 in [0, S),
# with S = ((ks1 - 1) / 4 + 1)**2 (integer division) and
# e = c + 256*i2 + 256*S*i0, the kernel computes
#   y = (in_ptr0[e] - in_ptr1[c]) * (1 / sqrt(in_ptr2[c] + 1e-05)) * in_ptr3[c] + in_ptr4[c]
#   in_out_ptr0[e] = relu(y + in_out_ptr0[e])
# NOTE(review): in_ptr1..in_ptr4 look like batch-norm mean / variance / weight /
# bias in inference mode -- confirm against the traced graph.
#
# Structure: the i0 loop is split across 56 OpenMP threads. Positions are handled
# 16 at a time through 16x16 transposes into stack buffers; the remaining
# S % 16 positions go through the scalar tail loop. The final `i1=256; i1<256`
# loop has an empty range and never runs.
#
# The scalar ReLU is written as `(x < 0) ? 0 : x` so the tail agrees with
# at::vec::clamp_min in the vector path: `x * (x>0)` evaluates to NaN for
# x == -inf (inf * 0) and to -0.0 for negative x.
kernel_cpp_10 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<16; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<((1 + (((((-1) + ks1) / 4))*((((-1) + ks1) / 4))) + (2*((((-1) + ks1) / 4)))) / 16); i2+=1)
{
float tmp0[16*16] __attribute__ ((aligned (16)));
at::vec::transpose_mxn<float,16,16>(in_ptr0 + (16*i1) + (256*i0) + (4096*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4))), 256, tmp0, 16);
float tmp16[16*16] __attribute__ ((aligned (16)));
at::vec::transpose_mxn<float,16,16>(in_out_ptr0 + (16*i1) + (256*i0) + (4096*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4))), 256, tmp16, 16);
float tmp20[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp1 = at::vec::Vectorized<float>::loadu(tmp0 + 16*i1_inner);
auto tmp2 = at::vec::Vectorized<float>(in_ptr1[i1_inner + (16*i1)]);
auto tmp4 = at::vec::Vectorized<float>(in_ptr2[i1_inner + (16*i1)]);
auto tmp12 = at::vec::Vectorized<float>(in_ptr3[i1_inner + (16*i1)]);
auto tmp14 = at::vec::Vectorized<float>(in_ptr4[i1_inner + (16*i1)]);
auto tmp17 = at::vec::Vectorized<float>::loadu(tmp16 + 16*i1_inner);
auto tmp3 = tmp1 - tmp2;
auto tmp5 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp6 = tmp4 + tmp5;
auto tmp7 = tmp6.sqrt();
auto tmp8 = tmp7.reciprocal();
auto tmp9 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp10 = tmp8 * tmp9;
auto tmp11 = tmp3 * tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp15 = tmp13 + tmp14;
auto tmp18 = tmp15 + tmp17;
auto tmp19 = at::vec::clamp_min(tmp18, decltype(tmp18)(0));
tmp19.store(tmp20 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp20, 16, in_out_ptr0 + (16*i1) + (256*i0) + (4096*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4))), 256);
}
#pragma GCC ivdep
for(long i2=16*(((1 + (((((-1) + ks1) / 4))*((((-1) + ks1) / 4))) + (2*((((-1) + ks1) / 4)))) / 16)); i2<1 + (((((-1) + ks1) / 4))*((((-1) + ks1) / 4))) + (2*((((-1) + ks1) / 4))); i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr0[i1_inner + (16*i1) + (256*i0) + (256*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4)))];
auto tmp1 = in_ptr1[i1_inner + (16*i1)];
auto tmp3 = in_ptr2[i1_inner + (16*i1)];
auto tmp11 = in_ptr3[i1_inner + (16*i1)];
auto tmp13 = in_ptr4[i1_inner + (16*i1)];
auto tmp15 = in_out_ptr0[i1_inner + (16*i1) + (256*i0) + (256*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4)))];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = (tmp16 < 0) ? decltype(tmp16)(0) : tmp16;
in_out_ptr0[i1_inner + (16*i1) + (256*i0) + (256*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4)))] = tmp17;
}
}
}
#pragma GCC ivdep
for(long i1=256; i1<256; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<1 + (((((-1) + ks1) / 4))*((((-1) + ks1) / 4))) + (2*((((-1) + ks1) / 4))); i2+=1)
{
auto tmp0 = in_ptr0[i1 + (256*i0) + (256*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4)))];
auto tmp1 = in_ptr1[i1];
auto tmp3 = in_ptr2[i1];
auto tmp11 = in_ptr3[i1];
auto tmp13 = in_ptr4[i1];
auto tmp15 = in_out_ptr0[i1 + (256*i0) + (256*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4)))];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = (tmp16 < 0) ? decltype(tmp16)(0) : tmp16;
in_out_ptr0[i1 + (256*i0) + (256*i2) + (256*i0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (512*i0*((((-1) + ks1) / 4)))] = tmp17;
}
}
}
}
}
}
''')
# kernel_cpp_11: two independent pieces of work inside one OpenMP parallel region.
#
# 1) In-place per-channel normalization + ReLU on in_out_ptr0, viewed as
#    R rows of 128 floats with R = ks0 * ((ks1 - 1) / 4 + 1)**2 (integer division):
#      x = in_out_ptr0[c + 128*row]
#      in_out_ptr0[c + 128*row] =
#          clamp_min((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
#    Rows are distributed with `omp for`; each row is processed as 8 vectors of 16.
#    The scalar `i1=128; i1<128` loop has an empty range and never runs.
#    NOTE(review): in_ptr0..in_ptr3 look like batch-norm mean / variance / weight /
#    bias in inference mode -- confirm against the traced graph.
#
# 2) Under `omp single`, copies in_ptr4 into out_ptr0 while swapping the two
#    innermost index dimensions: in_ptr4[i2 + 9*c + 1152*i0] is written to
#    out_ptr0[c + 128*i2 + 1152*i0] for i0 in [0, 128), c in [0, 128), i2 in [0, 9).
#    The vectorized `i2<0` loop and the `i1=128; i1<128` loop never run.
#    NOTE(review): presumably a 128x128x3x3 convolution weight being re-laid-out
#    to channels-last -- confirm against the caller.
kernel_cpp_11 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 4))*((((-1) + ks1) / 4)))) + (2*ks0*((((-1) + ks1) / 4))); i0+=1)
{
for(long i1=0; i1<8; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (128*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (128*i0));
}
#pragma omp simd simdlen(8)
for(long i1=128; i1<128; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (128*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (128*i0)] = tmp15;
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long i0=0; i0<128; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<8; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (1152*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (1152*i0) + (2048*i2), 128);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (1152*i0)];
out_ptr0[i1_inner + (16*i1) + (128*i2) + (1152*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=128; i1<128; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (1152*i0)];
out_ptr0[i1 + (128*i2) + (1152*i0)] = tmp0;
}
}
}
}
}
}
}
''')
# kernel_cpp_12: in-place per-channel normalization + ReLU on in_out_ptr0.
#
# in_out_ptr0 is walked as R rows of 128 floats, with
# R = ks0 * ((ks1 - 1) / 8 + 1)**2 (integer division). For each element:
#   x = in_out_ptr0[c + 128*row]
#   in_out_ptr0[c + 128*row] =
#       clamp_min((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
# Rows are distributed over 56 OpenMP threads; each row is 8 vectors of 16 lanes.
# The scalar `i1=128; i1<128` loop has an empty range and never runs.
# NOTE(review): in_ptr0..in_ptr3 look like batch-norm mean / variance / weight /
# bias in inference mode -- confirm against the traced graph.
kernel_cpp_12 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (2*ks0*((((-1) + ks1) / 8))); i0+=1)
{
for(long i1=0; i1<8; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (128*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (128*i0));
}
#pragma omp simd simdlen(8)
for(long i1=128; i1<128; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (128*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (128*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_13: sum of two per-channel normalizations followed by ReLU, in place.
#
# Pass 1 (row-wise, R = ks0 * ((ks1 - 1) / 8 + 1)**2 rows of 512 floats):
#   a = (in_out_ptr0[e] - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c]
#   b = (in_ptr4[e]     - in_ptr5[c]) * (1 / sqrt(in_ptr6[c] + 1e-05)) * in_ptr7[c] + in_ptr8[c]
#   in_out_ptr0[e] = a + b            with e = c + 512*row
# The scalar `i1=512; i1<512` loop has an empty range and never runs.
# NOTE(review): this looks like two inference-mode batch-norms (main branch and
# a downsample/shortcut branch) being added -- confirm against the traced graph.
#
# Pass 2 (after the implicit barrier of the first `omp for`): ReLU applied in
# place to in_out_ptr0, iterating i0 in [0, ks0), 32 channel tiles of 16, and
# positions in groups of 16 via 16x16 transposes; the remaining S % 16 positions
# (S = ((ks1 - 1) / 8 + 1)**2) go through the scalar tail loop. The final
# `i1=512; i1<512` loop never runs.
#
# The scalar ReLU is written as `(x < 0) ? 0 : x` so the tail agrees with
# at::vec::clamp_min in the vector path: `x * (x>0)` evaluates to NaN for
# x == -inf (inf * 0) and to -0.0 for negative x.
kernel_cpp_13 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const float* __restrict__ in_ptr7,
const float* __restrict__ in_ptr8,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (2*ks0*((((-1) + ks1) / 8))); i0+=1)
{
for(long i1=0; i1<32; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (512*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr4 + (16*i1) + (512*i0));
auto tmp16 = at::vec::Vectorized<float>::loadu(in_ptr5 + 16*i1);
auto tmp18 = at::vec::Vectorized<float>::loadu(in_ptr6 + 16*i1);
auto tmp24 = at::vec::Vectorized<float>::loadu(in_ptr7 + 16*i1);
auto tmp26 = at::vec::Vectorized<float>::loadu(in_ptr8 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp17 = tmp15 - tmp16;
auto tmp19 = tmp18 + tmp4;
auto tmp20 = tmp19.sqrt();
auto tmp21 = tmp20.reciprocal();
auto tmp22 = tmp21 * tmp8;
auto tmp23 = tmp17 * tmp22;
auto tmp25 = tmp23 * tmp24;
auto tmp27 = tmp25 + tmp26;
auto tmp28 = tmp14 + tmp27;
tmp28.store(in_out_ptr0 + (16*i1) + (512*i0));
}
#pragma omp simd simdlen(8)
for(long i1=512; i1<512; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (512*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp15 = in_ptr4[i1 + (512*i0)];
auto tmp16 = in_ptr5[i1];
auto tmp18 = in_ptr6[i1];
auto tmp24 = in_ptr7[i1];
auto tmp26 = in_ptr8[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp17 = tmp15 - tmp16;
auto tmp19 = tmp18 + tmp4;
auto tmp20 = std::sqrt(tmp19);
auto tmp21 = 1 / tmp20;
auto tmp22 = tmp21 * tmp8;
auto tmp23 = tmp17 * tmp22;
auto tmp25 = tmp23 * tmp24;
auto tmp27 = tmp25 + tmp26;
auto tmp28 = tmp14 + tmp27;
in_out_ptr0[i1 + (512*i0)] = tmp28;
}
}
}
{
#pragma omp for
for(long i0=0; i0<ks0; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<32; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<((1 + (((((-1) + ks1) / 8))*((((-1) + ks1) / 8))) + (2*((((-1) + ks1) / 8)))) / 16); i2+=1)
{
float tmp0[16*16] __attribute__ ((aligned (16)));
at::vec::transpose_mxn<float,16,16>(in_out_ptr0 + (16*i1) + (512*i0) + (8192*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8))), 512, tmp0, 16);
float tmp3[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp1 = at::vec::Vectorized<float>::loadu(tmp0 + 16*i1_inner);
auto tmp2 = at::vec::clamp_min(tmp1, decltype(tmp1)(0));
tmp2.store(tmp3 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp3, 16, in_out_ptr0 + (16*i1) + (512*i0) + (8192*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8))), 512);
}
#pragma GCC ivdep
for(long i2=16*(((1 + (((((-1) + ks1) / 8))*((((-1) + ks1) / 8))) + (2*((((-1) + ks1) / 8)))) / 16)); i2<1 + (((((-1) + ks1) / 8))*((((-1) + ks1) / 8))) + (2*((((-1) + ks1) / 8))); i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_out_ptr0[i1_inner + (16*i1) + (512*i0) + (512*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8)))];
auto tmp1 = (tmp0 < 0) ? decltype(tmp0)(0) : tmp0;
in_out_ptr0[i1_inner + (16*i1) + (512*i0) + (512*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8)))] = tmp1;
}
}
}
#pragma GCC ivdep
for(long i1=512; i1<512; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<1 + (((((-1) + ks1) / 8))*((((-1) + ks1) / 8))) + (2*((((-1) + ks1) / 8))); i2+=1)
{
auto tmp0 = in_out_ptr0[i1 + (512*i0) + (512*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8)))];
auto tmp1 = (tmp0 < 0) ? decltype(tmp0)(0) : tmp0;
in_out_ptr0[i1 + (512*i0) + (512*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8)))] = tmp1;
}
}
}
}
}
}
''')
# kernel_cpp_14: two independent pieces of work inside one OpenMP parallel region.
#
# 1) In-place per-channel normalization + ReLU on in_out_ptr0, viewed as
#    R rows of 128 floats with R = ks0 * ((ks1 - 1) / 8 + 1)**2 (integer division):
#      x = in_out_ptr0[c + 128*row]
#      in_out_ptr0[c + 128*row] =
#          clamp_min((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
#    Rows are distributed with `omp for`; each row is processed as 8 vectors of 16.
#    The scalar `i1=128; i1<128` loop has an empty range and never runs.
#    NOTE(review): in_ptr0..in_ptr3 look like batch-norm mean / variance / weight /
#    bias in inference mode -- confirm against the traced graph.
#
# 2) Under `omp single`, copies in_ptr4 into out_ptr0 while swapping the two
#    innermost index dimensions: in_ptr4[i2 + 9*c + 1152*i0] is written to
#    out_ptr0[c + 128*i2 + 1152*i0] for i0 in [0, 128), c in [0, 128), i2 in [0, 9).
#    The vectorized `i2<0` loop and the `i1=128; i1<128` loop never run.
#    NOTE(review): presumably a 128x128x3x3 convolution weight being re-laid-out
#    to channels-last -- confirm against the caller.
kernel_cpp_14 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (2*ks0*((((-1) + ks1) / 8))); i0+=1)
{
for(long i1=0; i1<8; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (128*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (128*i0));
}
#pragma omp simd simdlen(8)
for(long i1=128; i1<128; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (128*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (128*i0)] = tmp15;
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long i0=0; i0<128; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<8; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (1152*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (1152*i0) + (2048*i2), 128);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (1152*i0)];
out_ptr0[i1_inner + (16*i1) + (128*i2) + (1152*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=128; i1<128; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (1152*i0)];
out_ptr0[i1 + (128*i2) + (1152*i0)] = tmp0;
}
}
}
}
}
}
}
''')
# kernel_cpp_15: in-place per-channel normalization + ReLU on in_out_ptr0.
#
# in_out_ptr0 is walked as R rows of 128 floats, with
# R = ks0 * ((ks1 - 1) / 8 + 1)**2 (integer division). For each element:
#   x = in_out_ptr0[c + 128*row]
#   in_out_ptr0[c + 128*row] =
#       clamp_min((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
# Rows are distributed over 56 OpenMP threads; each row is 8 vectors of 16 lanes.
# The scalar `i1=128; i1<128` loop has an empty range and never runs.
# NOTE(review): in_ptr0..in_ptr3 look like batch-norm mean / variance / weight /
# bias in inference mode -- confirm against the traced graph.
kernel_cpp_15 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (2*ks0*((((-1) + ks1) / 8))); i0+=1)
{
for(long i1=0; i1<8; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (128*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (128*i0));
}
#pragma omp simd simdlen(8)
for(long i1=128; i1<128; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (128*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (128*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_16: per-channel normalization of in_ptr0, added to in_out_ptr0,
# followed by ReLU, written back into in_out_ptr0.
#
# Both buffers are walked as R rows of 512 floats, with
# R = ks0 * ((ks1 - 1) / 8 + 1)**2 (integer division). For e = c + 512*row:
#   y = (in_ptr0[e] - in_ptr1[c]) * (1 / sqrt(in_ptr2[c] + 1e-05)) * in_ptr3[c] + in_ptr4[c]
#   in_out_ptr0[e] = clamp_min(y + in_out_ptr0[e], 0)
# Rows are distributed over 56 OpenMP threads; each row is 32 vectors of 16 lanes.
# The scalar `i1=512; i1<512` loop has an empty range and never runs.
# NOTE(review): in_ptr1..in_ptr4 look like batch-norm mean / variance / weight /
# bias, with in_out_ptr0 as the residual input -- confirm against the traced graph.
kernel_cpp_16 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (2*ks0*((((-1) + ks1) / 8))); i0+=1)
{
for(long i1=0; i1<32; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (16*i1) + (512*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr4 + 16*i1);
auto tmp15 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (512*i0));
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = at::vec::clamp_min(tmp16, decltype(tmp16)(0));
tmp17.store(in_out_ptr0 + (16*i1) + (512*i0));
}
#pragma omp simd simdlen(8)
for(long i1=512; i1<512; i1+=1)
{
auto tmp0 = in_ptr0[i1 + (512*i0)];
auto tmp1 = in_ptr1[i1];
auto tmp3 = in_ptr2[i1];
auto tmp11 = in_ptr3[i1];
auto tmp13 = in_ptr4[i1];
auto tmp15 = in_out_ptr0[i1 + (512*i0)];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = tmp16 * (tmp16>0);
in_out_ptr0[i1 + (512*i0)] = tmp17;
}
}
}
}
}
''')
# kernel_cpp_17: two independent pieces of work inside one OpenMP parallel region.
#
# 1) In-place per-channel normalization + ReLU on in_out_ptr0, viewed as
#    R rows of 128 floats with R = ks0 * ((ks1 - 1) / 8 + 1)**2 (integer division):
#      x = in_out_ptr0[c + 128*row]
#      in_out_ptr0[c + 128*row] =
#          clamp_min((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
#    Rows are distributed with `omp for`; each row is processed as 8 vectors of 16.
#    The scalar `i1=128; i1<128` loop has an empty range and never runs.
#    NOTE(review): in_ptr0..in_ptr3 look like batch-norm mean / variance / weight /
#    bias in inference mode -- confirm against the traced graph.
#
# 2) Under `omp single`, copies in_ptr4 into out_ptr0 while swapping the two
#    innermost index dimensions: in_ptr4[i2 + 9*c + 1152*i0] is written to
#    out_ptr0[c + 128*i2 + 1152*i0] for i0 in [0, 128), c in [0, 128), i2 in [0, 9).
#    The vectorized `i2<0` loop and the `i1=128; i1<128` loop never run.
#    NOTE(review): presumably a 128x128x3x3 convolution weight being re-laid-out
#    to channels-last -- confirm against the caller.
kernel_cpp_17 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (2*ks0*((((-1) + ks1) / 8))); i0+=1)
{
for(long i1=0; i1<8; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (128*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (128*i0));
}
#pragma omp simd simdlen(8)
for(long i1=128; i1<128; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (128*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (128*i0)] = tmp15;
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long i0=0; i0<128; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<8; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (1152*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (1152*i0) + (2048*i2), 128);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (1152*i0)];
out_ptr0[i1_inner + (16*i1) + (128*i2) + (1152*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=128; i1<128; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (1152*i0)];
out_ptr0[i1 + (128*i2) + (1152*i0)] = tmp0;
}
}
}
}
}
}
}
''')
# kernel_cpp_18: in-place per-channel normalization + ReLU on in_out_ptr0.
#
# in_out_ptr0 is walked as R rows of 128 floats, with
# R = ks0 * ((ks1 - 1) / 8 + 1)**2 (integer division). For each element:
#   x = in_out_ptr0[c + 128*row]
#   in_out_ptr0[c + 128*row] =
#       clamp_min((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
# Rows are distributed over 56 OpenMP threads; each row is 8 vectors of 16 lanes.
# The scalar `i1=128; i1<128` loop has an empty range and never runs.
# NOTE(review): in_ptr0..in_ptr3 look like batch-norm mean / variance / weight /
# bias in inference mode -- confirm against the traced graph.
kernel_cpp_18 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (2*ks0*((((-1) + ks1) / 8))); i0+=1)
{
for(long i1=0; i1<8; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (128*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (128*i0));
}
#pragma omp simd simdlen(8)
for(long i1=128; i1<128; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (128*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (128*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_19: fused per-channel normalization + residual add + ReLU, written
# in place into in_out_ptr0.
#
# For every i0 in [0, ks0), channel c in [0, 512) and position i2 in [0, S),
# with S = ((ks1 - 1) / 8 + 1)**2 (integer division) and
# e = c + 512*i2 + 512*S*i0, the kernel computes
#   y = (in_ptr0[e] - in_ptr1[c]) * (1 / sqrt(in_ptr2[c] + 1e-05)) * in_ptr3[c] + in_ptr4[c]
#   in_out_ptr0[e] = relu(y + in_out_ptr0[e])
# NOTE(review): in_ptr1..in_ptr4 look like batch-norm mean / variance / weight /
# bias in inference mode -- confirm against the traced graph.
#
# Structure: the i0 loop is split across 56 OpenMP threads. Positions are handled
# 16 at a time through 16x16 transposes into stack buffers; the remaining
# S % 16 positions go through the scalar tail loop. The final `i1=512; i1<512`
# loop has an empty range and never runs.
#
# The scalar ReLU is written as `(x < 0) ? 0 : x` so the tail agrees with
# at::vec::clamp_min in the vector path: `x * (x>0)` evaluates to NaN for
# x == -inf (inf * 0) and to -0.0 for negative x.
kernel_cpp_19 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<32; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<((1 + (((((-1) + ks1) / 8))*((((-1) + ks1) / 8))) + (2*((((-1) + ks1) / 8)))) / 16); i2+=1)
{
float tmp0[16*16] __attribute__ ((aligned (16)));
at::vec::transpose_mxn<float,16,16>(in_ptr0 + (16*i1) + (512*i0) + (8192*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8))), 512, tmp0, 16);
float tmp16[16*16] __attribute__ ((aligned (16)));
at::vec::transpose_mxn<float,16,16>(in_out_ptr0 + (16*i1) + (512*i0) + (8192*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8))), 512, tmp16, 16);
float tmp20[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp1 = at::vec::Vectorized<float>::loadu(tmp0 + 16*i1_inner);
auto tmp2 = at::vec::Vectorized<float>(in_ptr1[i1_inner + (16*i1)]);
auto tmp4 = at::vec::Vectorized<float>(in_ptr2[i1_inner + (16*i1)]);
auto tmp12 = at::vec::Vectorized<float>(in_ptr3[i1_inner + (16*i1)]);
auto tmp14 = at::vec::Vectorized<float>(in_ptr4[i1_inner + (16*i1)]);
auto tmp17 = at::vec::Vectorized<float>::loadu(tmp16 + 16*i1_inner);
auto tmp3 = tmp1 - tmp2;
auto tmp5 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp6 = tmp4 + tmp5;
auto tmp7 = tmp6.sqrt();
auto tmp8 = tmp7.reciprocal();
auto tmp9 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp10 = tmp8 * tmp9;
auto tmp11 = tmp3 * tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp15 = tmp13 + tmp14;
auto tmp18 = tmp15 + tmp17;
auto tmp19 = at::vec::clamp_min(tmp18, decltype(tmp18)(0));
tmp19.store(tmp20 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp20, 16, in_out_ptr0 + (16*i1) + (512*i0) + (8192*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8))), 512);
}
#pragma GCC ivdep
for(long i2=16*(((1 + (((((-1) + ks1) / 8))*((((-1) + ks1) / 8))) + (2*((((-1) + ks1) / 8)))) / 16)); i2<1 + (((((-1) + ks1) / 8))*((((-1) + ks1) / 8))) + (2*((((-1) + ks1) / 8))); i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr0[i1_inner + (16*i1) + (512*i0) + (512*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8)))];
auto tmp1 = in_ptr1[i1_inner + (16*i1)];
auto tmp3 = in_ptr2[i1_inner + (16*i1)];
auto tmp11 = in_ptr3[i1_inner + (16*i1)];
auto tmp13 = in_ptr4[i1_inner + (16*i1)];
auto tmp15 = in_out_ptr0[i1_inner + (16*i1) + (512*i0) + (512*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8)))];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = (tmp16 < 0) ? decltype(tmp16)(0) : tmp16;
in_out_ptr0[i1_inner + (16*i1) + (512*i0) + (512*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8)))] = tmp17;
}
}
}
#pragma GCC ivdep
for(long i1=512; i1<512; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<1 + (((((-1) + ks1) / 8))*((((-1) + ks1) / 8))) + (2*((((-1) + ks1) / 8))); i2+=1)
{
auto tmp0 = in_ptr0[i1 + (512*i0) + (512*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8)))];
auto tmp1 = in_ptr1[i1];
auto tmp3 = in_ptr2[i1];
auto tmp11 = in_ptr3[i1];
auto tmp13 = in_ptr4[i1];
auto tmp15 = in_out_ptr0[i1 + (512*i0) + (512*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8)))];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = (tmp16 < 0) ? decltype(tmp16)(0) : tmp16;
in_out_ptr0[i1 + (512*i0) + (512*i2) + (512*i0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (1024*i0*((((-1) + ks1) / 8)))] = tmp17;
}
}
}
}
}
}
''')
# kernel_cpp_20: two independent pieces of work inside one OpenMP parallel region.
#
# 1) In-place per-channel normalization + ReLU on in_out_ptr0, viewed as
#    R rows of 128 floats with R = ks0 * ((ks1 - 1) / 8 + 1)**2 (integer division):
#      x = in_out_ptr0[c + 128*row]
#      in_out_ptr0[c + 128*row] =
#          clamp_min((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
#    Rows are distributed with `omp for`; each row is processed as 8 vectors of 16.
#    The scalar `i1=128; i1<128` loop has an empty range and never runs.
#    NOTE(review): in_ptr0..in_ptr3 look like batch-norm mean / variance / weight /
#    bias in inference mode -- confirm against the traced graph.
#
# 2) Under `omp single`, copies in_ptr4 into out_ptr0 while swapping the two
#    innermost index dimensions: in_ptr4[i2 + 9*c + 1152*i0] is written to
#    out_ptr0[c + 128*i2 + 1152*i0] for i0 in [0, 128), c in [0, 128), i2 in [0, 9).
#    The vectorized `i2<0` loop and the `i1=128; i1<128` loop never run.
#    NOTE(review): presumably a 128x128x3x3 convolution weight being re-laid-out
#    to channels-last -- confirm against the caller.
kernel_cpp_20 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (2*ks0*((((-1) + ks1) / 8))); i0+=1)
{
for(long i1=0; i1<8; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (128*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (128*i0));
}
#pragma omp simd simdlen(8)
for(long i1=128; i1<128; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (128*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (128*i0)] = tmp15;
}
}
}
#pragma omp single
{
{
#pragma GCC ivdep
for(long i0=0; i0<128; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<8; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (1152*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (1152*i0) + (2048*i2), 128);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (1152*i0)];
out_ptr0[i1_inner + (16*i1) + (128*i2) + (1152*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=128; i1<128; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (1152*i0)];
out_ptr0[i1 + (128*i2) + (1152*i0)] = tmp0;
}
}
}
}
}
}
}
''')
# kernel_cpp_21: in-place per-channel normalization followed by a clamp at zero,
# applied to `in_out_ptr0` viewed as rows of 128 contiguous floats.
#
# For every row i0 and channel c the embedded C++ computes
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     in_out_ptr0[c + 128*i0] = clamp_min(y, 0)
# NOTE(review): presumably inference-mode batch norm + ReLU, with in_ptr0..in_ptr3
# being mean, variance, weight and bias -- confirm against the calling graph.
#
# Row count: ks0 + ks0*s*s + 2*ks0*s == ks0 * (s + 1)**2 with s = (ks1 - 1) / 8
# (integer division). Rows are distributed by an `omp for` inside a
# `parallel num_threads(56)` region.
#
# Each row is covered by 8 Vectorized<float> steps at offsets 16*i1. The scalar
# loop that follows runs from 128 to 128, i.e. it never executes.
kernel_cpp_21 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (2*ks0*((((-1) + ks1) / 8))); i0+=1)
{
for(long i1=0; i1<8; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (128*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (128*i0));
}
#pragma omp simd simdlen(8)
for(long i1=128; i1<128; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (128*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (128*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_22: normalizes `in_ptr0` per channel, adds the current contents of
# `in_out_ptr0`, clamps at zero and writes the result back to `in_out_ptr0`.
# Both buffers are indexed as rows of 512 contiguous floats.
#
# For every row i0 and channel c the embedded C++ computes
#     y = (in_ptr0[c + 512*i0] - in_ptr1[c]) * (1 / sqrt(in_ptr2[c] + 1e-05)) * 1
#           * in_ptr3[c] + in_ptr4[c]
#     in_out_ptr0[c + 512*i0] = clamp_min(y + in_out_ptr0[c + 512*i0], 0)
# NOTE(review): presumably batch norm + residual add + ReLU, with in_ptr1..in_ptr4
# being mean, variance, weight and bias -- confirm against the calling graph.
#
# Row count: ks0 * (s + 1)**2 with s = (ks1 - 1) / 8 (integer division), split by
# an `omp for` inside a `parallel num_threads(56)` region.
#
# Each row is covered by 32 Vectorized<float> steps at offsets 16*i1. The scalar
# loop that follows runs from 512 to 512, i.e. it never executes.
kernel_cpp_22 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (2*ks0*((((-1) + ks1) / 8))); i0+=1)
{
for(long i1=0; i1<32; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (16*i1) + (512*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr4 + 16*i1);
auto tmp15 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (512*i0));
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = at::vec::clamp_min(tmp16, decltype(tmp16)(0));
tmp17.store(in_out_ptr0 + (16*i1) + (512*i0));
}
#pragma omp simd simdlen(8)
for(long i1=512; i1<512; i1+=1)
{
auto tmp0 = in_ptr0[i1 + (512*i0)];
auto tmp1 = in_ptr1[i1];
auto tmp3 = in_ptr2[i1];
auto tmp11 = in_ptr3[i1];
auto tmp13 = in_ptr4[i1];
auto tmp15 = in_out_ptr0[i1 + (512*i0)];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = tmp16 * (tmp16>0);
in_out_ptr0[i1 + (512*i0)] = tmp17;
}
}
}
}
}
''')
# kernel_cpp_23: two independent `omp for` loops inside one
# `parallel num_threads(56)` region.
#
# Loop 1 -- in-place per-channel normalization + clamp at zero on `in_out_ptr0`,
# viewed as rows of 256 contiguous floats:
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     in_out_ptr0[c + 256*i0] = clamp_min(y, 0)
# Row count is ks0 * (s + 1)**2 with s = (ks1 - 1) / 8 (integer division). Each
# row takes 16 Vectorized<float> steps; the scalar loop from 256 to 256 never
# executes.
# NOTE(review): presumably batch norm + ReLU (mean, variance, weight, bias in
# in_ptr0..in_ptr3) -- confirm against the calling graph.
#
# Loop 2 -- element-wise copy from `in_ptr4` to `out_ptr0` that swaps the two
# innermost index roles. With c = i1_inner + 16*i1 (0..255) and i2 in 0..8:
#     out_ptr0[c + 256*i2 + 2304*i0] = in_ptr4[i2 + 9*c + 2304*i0]
# for i0 in 0..255, i.e. each 256x9 slab is written out as 9x256.
# NOTE(review): looks like a weight re-layout for a 3x3 convolution -- confirm.
# The vectorized transpose branch (`i2<0`) and the trailing `i1=256; i1<256`
# loop both have zero iterations; only the scalar copy runs.
kernel_cpp_23 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 8))*((((-1) + ks1) / 8)))) + (2*ks0*((((-1) + ks1) / 8))); i0+=1)
{
for(long i1=0; i1<16; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
}
#pragma omp simd simdlen(8)
for(long i1=256; i1<256; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (256*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (256*i0)] = tmp15;
}
}
}
{
#pragma omp for
for(long i0=0; i0<256; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<16; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (2304*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (2304*i0) + (4096*i2), 256);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (2304*i0)];
out_ptr0[i1_inner + (16*i1) + (256*i2) + (2304*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=256; i1<256; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (2304*i0)];
out_ptr0[i1 + (256*i2) + (2304*i0)] = tmp0;
}
}
}
}
}
}
''')
# kernel_cpp_24: in-place per-channel normalization followed by a clamp at zero,
# applied to `in_out_ptr0` viewed as rows of 256 contiguous floats.
#
# For every row i0 and channel c the embedded C++ computes
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     in_out_ptr0[c + 256*i0] = clamp_min(y, 0)
# NOTE(review): presumably inference-mode batch norm + ReLU, with in_ptr0..in_ptr3
# being mean, variance, weight and bias -- confirm against the calling graph.
#
# Row count: ks0 * (s + 1)**2 with s = (ks1 - 1) / 16 (integer division), split
# by an `omp for` inside a `parallel num_threads(56)` region.
#
# Each row is covered by 16 Vectorized<float> steps at offsets 16*i1. The scalar
# loop that follows runs from 256 to 256, i.e. it never executes.
kernel_cpp_24 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2*ks0*((((-1) + ks1) / 16))); i0+=1)
{
for(long i1=0; i1<16; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
}
#pragma omp simd simdlen(8)
for(long i1=256; i1<256; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (256*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (256*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_25: two `omp for` loops inside one `parallel num_threads(56)`
# region, both operating in place on `in_out_ptr0` (rows of 1024 floats).
#
# Loop 1 -- sum of two per-channel normalizations, no clamp. For every row i0
# and channel c:
#     a = (in_out_ptr0[c + 1024*i0] - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1
#           * in_ptr2[c] + in_ptr3[c]
#     b = (in_ptr4[c + 1024*i0] - in_ptr5[c]) * (1 / sqrt(in_ptr6[c] + 1e-05)) * 1
#           * in_ptr7[c] + in_ptr8[c]
#     in_out_ptr0[c + 1024*i0] = a + b
# Row count is ks0 * (s + 1)**2 with s = (ks1 - 1) / 16 (integer division).
# Each row takes 64 Vectorized<float> steps; the scalar loop from 1024 to 1024
# never executes.
# NOTE(review): presumably two batch norms (main path + shortcut) being added
# -- confirm against the calling graph.
#
# Loop 2 -- clamp at zero over the same buffer, iterated as
# i0 < ks0, 64 channel groups of 16, and P = (s + 1)**2 positions per i0.
# Full groups of 16 positions are handled by transposing a 16x16 tile (leading
# dimension 1024) into a scratch buffer, applying clamp_min, and transposing it
# back to the same addresses. The remaining P % 16 positions are handled by a
# scalar loop that computes x * (x > 0). The final `i1=1024; i1<1024` loop never
# executes.
# NOTE(review): the tile path uses at::vec::clamp_min while the scalar tail
# uses x * (x > 0). For x = -inf the product is inf * 0 = NaN, whereas a clamp
# would presumably give 0 -- confirm whether this divergence can matter here.
kernel_cpp_25 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const float* __restrict__ in_ptr7,
const float* __restrict__ in_ptr8,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2*ks0*((((-1) + ks1) / 16))); i0+=1)
{
for(long i1=0; i1<64; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (1024*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr4 + (16*i1) + (1024*i0));
auto tmp16 = at::vec::Vectorized<float>::loadu(in_ptr5 + 16*i1);
auto tmp18 = at::vec::Vectorized<float>::loadu(in_ptr6 + 16*i1);
auto tmp24 = at::vec::Vectorized<float>::loadu(in_ptr7 + 16*i1);
auto tmp26 = at::vec::Vectorized<float>::loadu(in_ptr8 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp17 = tmp15 - tmp16;
auto tmp19 = tmp18 + tmp4;
auto tmp20 = tmp19.sqrt();
auto tmp21 = tmp20.reciprocal();
auto tmp22 = tmp21 * tmp8;
auto tmp23 = tmp17 * tmp22;
auto tmp25 = tmp23 * tmp24;
auto tmp27 = tmp25 + tmp26;
auto tmp28 = tmp14 + tmp27;
tmp28.store(in_out_ptr0 + (16*i1) + (1024*i0));
}
#pragma omp simd simdlen(8)
for(long i1=1024; i1<1024; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (1024*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp15 = in_ptr4[i1 + (1024*i0)];
auto tmp16 = in_ptr5[i1];
auto tmp18 = in_ptr6[i1];
auto tmp24 = in_ptr7[i1];
auto tmp26 = in_ptr8[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp17 = tmp15 - tmp16;
auto tmp19 = tmp18 + tmp4;
auto tmp20 = std::sqrt(tmp19);
auto tmp21 = 1 / tmp20;
auto tmp22 = tmp21 * tmp8;
auto tmp23 = tmp17 * tmp22;
auto tmp25 = tmp23 * tmp24;
auto tmp27 = tmp25 + tmp26;
auto tmp28 = tmp14 + tmp27;
in_out_ptr0[i1 + (1024*i0)] = tmp28;
}
}
}
{
#pragma omp for
for(long i0=0; i0<ks0; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<64; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<((1 + (((((-1) + ks1) / 16))*((((-1) + ks1) / 16))) + (2*((((-1) + ks1) / 16)))) / 16); i2+=1)
{
float tmp0[16*16] __attribute__ ((aligned (16)));
at::vec::transpose_mxn<float,16,16>(in_out_ptr0 + (16*i1) + (1024*i0) + (16384*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16))), 1024, tmp0, 16);
float tmp3[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp1 = at::vec::Vectorized<float>::loadu(tmp0 + 16*i1_inner);
auto tmp2 = at::vec::clamp_min(tmp1, decltype(tmp1)(0));
tmp2.store(tmp3 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp3, 16, in_out_ptr0 + (16*i1) + (1024*i0) + (16384*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16))), 1024);
}
#pragma GCC ivdep
for(long i2=16*(((1 + (((((-1) + ks1) / 16))*((((-1) + ks1) / 16))) + (2*((((-1) + ks1) / 16)))) / 16)); i2<1 + (((((-1) + ks1) / 16))*((((-1) + ks1) / 16))) + (2*((((-1) + ks1) / 16))); i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_out_ptr0[i1_inner + (16*i1) + (1024*i0) + (1024*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16)))];
auto tmp1 = tmp0 * (tmp0>0);
in_out_ptr0[i1_inner + (16*i1) + (1024*i0) + (1024*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16)))] = tmp1;
}
}
}
#pragma GCC ivdep
for(long i1=1024; i1<1024; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<1 + (((((-1) + ks1) / 16))*((((-1) + ks1) / 16))) + (2*((((-1) + ks1) / 16))); i2+=1)
{
auto tmp0 = in_out_ptr0[i1 + (1024*i0) + (1024*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16)))];
auto tmp1 = tmp0 * (tmp0>0);
in_out_ptr0[i1 + (1024*i0) + (1024*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16)))] = tmp1;
}
}
}
}
}
}
''')
# kernel_cpp_26: two independent `omp for` loops inside one
# `parallel num_threads(56)` region.
#
# Loop 1 -- in-place per-channel normalization + clamp at zero on `in_out_ptr0`,
# viewed as rows of 256 contiguous floats:
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     in_out_ptr0[c + 256*i0] = clamp_min(y, 0)
# Row count is ks0 * (s + 1)**2 with s = (ks1 - 1) / 16 (integer division). Each
# row takes 16 Vectorized<float> steps; the scalar loop from 256 to 256 never
# executes.
# NOTE(review): presumably batch norm + ReLU (mean, variance, weight, bias in
# in_ptr0..in_ptr3) -- confirm against the calling graph.
#
# Loop 2 -- element-wise copy from `in_ptr4` to `out_ptr0` that swaps the two
# innermost index roles. With c = i1_inner + 16*i1 (0..255) and i2 in 0..8:
#     out_ptr0[c + 256*i2 + 2304*i0] = in_ptr4[i2 + 9*c + 2304*i0]
# for i0 in 0..255, i.e. each 256x9 slab is written out as 9x256.
# NOTE(review): looks like a weight re-layout for a 3x3 convolution -- confirm.
# The vectorized transpose branch (`i2<0`) and the trailing `i1=256; i1<256`
# loop both have zero iterations; only the scalar copy runs.
kernel_cpp_26 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2*ks0*((((-1) + ks1) / 16))); i0+=1)
{
for(long i1=0; i1<16; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
}
#pragma omp simd simdlen(8)
for(long i1=256; i1<256; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (256*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (256*i0)] = tmp15;
}
}
}
{
#pragma omp for
for(long i0=0; i0<256; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<16; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (2304*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (2304*i0) + (4096*i2), 256);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (2304*i0)];
out_ptr0[i1_inner + (16*i1) + (256*i2) + (2304*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=256; i1<256; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (2304*i0)];
out_ptr0[i1 + (256*i2) + (2304*i0)] = tmp0;
}
}
}
}
}
}
''')
# kernel_cpp_27: in-place per-channel normalization followed by a clamp at zero,
# applied to `in_out_ptr0` viewed as rows of 256 contiguous floats.
#
# For every row i0 and channel c the embedded C++ computes
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     in_out_ptr0[c + 256*i0] = clamp_min(y, 0)
# NOTE(review): presumably inference-mode batch norm + ReLU, with in_ptr0..in_ptr3
# being mean, variance, weight and bias -- confirm against the calling graph.
#
# Row count: ks0 * (s + 1)**2 with s = (ks1 - 1) / 16 (integer division), split
# by an `omp for` inside a `parallel num_threads(56)` region.
#
# Each row is covered by 16 Vectorized<float> steps at offsets 16*i1. The scalar
# loop that follows runs from 256 to 256, i.e. it never executes.
kernel_cpp_27 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2*ks0*((((-1) + ks1) / 16))); i0+=1)
{
for(long i1=0; i1<16; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
}
#pragma omp simd simdlen(8)
for(long i1=256; i1<256; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (256*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (256*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_28: in-place per-channel normalization of `in_out_ptr0`, plus an
# element-wise add of `in_ptr4`, followed by a clamp at zero. Both buffers are
# indexed as rows of 1024 contiguous floats.
#
# For every row i0 and channel c the embedded C++ computes
#     y = (in_out_ptr0[c + 1024*i0] - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1
#           * in_ptr2[c] + in_ptr3[c]
#     in_out_ptr0[c + 1024*i0] = clamp_min(y + in_ptr4[c + 1024*i0], 0)
# NOTE(review): presumably batch norm + residual add + ReLU, with in_ptr0..in_ptr3
# being mean, variance, weight and bias -- confirm against the calling graph.
#
# Row count: ks0 * (s + 1)**2 with s = (ks1 - 1) / 16 (integer division), split
# by an `omp for` inside a `parallel num_threads(56)` region.
#
# Each row is covered by 64 Vectorized<float> steps at offsets 16*i1. The scalar
# loop that follows runs from 1024 to 1024, i.e. it never executes.
kernel_cpp_28 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2*ks0*((((-1) + ks1) / 16))); i0+=1)
{
for(long i1=0; i1<64; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (1024*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr4 + (16*i1) + (1024*i0));
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = at::vec::clamp_min(tmp16, decltype(tmp16)(0));
tmp17.store(in_out_ptr0 + (16*i1) + (1024*i0));
}
#pragma omp simd simdlen(8)
for(long i1=1024; i1<1024; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (1024*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp15 = in_ptr4[i1 + (1024*i0)];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = tmp16 * (tmp16>0);
in_out_ptr0[i1 + (1024*i0)] = tmp17;
}
}
}
}
}
''')
# kernel_cpp_29: two independent `omp for` loops inside one
# `parallel num_threads(56)` region.
#
# Loop 1 -- in-place per-channel normalization + clamp at zero on `in_out_ptr0`,
# viewed as rows of 256 contiguous floats:
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     in_out_ptr0[c + 256*i0] = clamp_min(y, 0)
# Row count is ks0 * (s + 1)**2 with s = (ks1 - 1) / 16 (integer division). Each
# row takes 16 Vectorized<float> steps; the scalar loop from 256 to 256 never
# executes.
# NOTE(review): presumably batch norm + ReLU (mean, variance, weight, bias in
# in_ptr0..in_ptr3) -- confirm against the calling graph.
#
# Loop 2 -- element-wise copy from `in_ptr4` to `out_ptr0` that swaps the two
# innermost index roles. With c = i1_inner + 16*i1 (0..255) and i2 in 0..8:
#     out_ptr0[c + 256*i2 + 2304*i0] = in_ptr4[i2 + 9*c + 2304*i0]
# for i0 in 0..255, i.e. each 256x9 slab is written out as 9x256.
# NOTE(review): looks like a weight re-layout for a 3x3 convolution -- confirm.
# The vectorized transpose branch (`i2<0`) and the trailing `i1=256; i1<256`
# loop both have zero iterations; only the scalar copy runs.
kernel_cpp_29 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2*ks0*((((-1) + ks1) / 16))); i0+=1)
{
for(long i1=0; i1<16; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
}
#pragma omp simd simdlen(8)
for(long i1=256; i1<256; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (256*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (256*i0)] = tmp15;
}
}
}
{
#pragma omp for
for(long i0=0; i0<256; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<16; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (2304*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (2304*i0) + (4096*i2), 256);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (2304*i0)];
out_ptr0[i1_inner + (16*i1) + (256*i2) + (2304*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=256; i1<256; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (2304*i0)];
out_ptr0[i1 + (256*i2) + (2304*i0)] = tmp0;
}
}
}
}
}
}
''')
# kernel_cpp_30: in-place per-channel normalization followed by a clamp at zero,
# applied to `in_out_ptr0` viewed as rows of 256 contiguous floats.
#
# For every row i0 and channel c the embedded C++ computes
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     in_out_ptr0[c + 256*i0] = clamp_min(y, 0)
# NOTE(review): presumably inference-mode batch norm + ReLU, with in_ptr0..in_ptr3
# being mean, variance, weight and bias -- confirm against the calling graph.
#
# Row count: ks0 * (s + 1)**2 with s = (ks1 - 1) / 16 (integer division), split
# by an `omp for` inside a `parallel num_threads(56)` region.
#
# Each row is covered by 16 Vectorized<float> steps at offsets 16*i1. The scalar
# loop that follows runs from 256 to 256, i.e. it never executes.
kernel_cpp_30 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2*ks0*((((-1) + ks1) / 16))); i0+=1)
{
for(long i1=0; i1<16; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
}
#pragma omp simd simdlen(8)
for(long i1=256; i1<256; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (256*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (256*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_31: per-channel normalization of `in_ptr0`, plus an element-wise
# add of `in_out_ptr0`, followed by a clamp at zero, written back to
# `in_out_ptr0`. Both buffers are addressed with a channel stride of 1 and a
# position stride of 1024.
#
# With s = (ks1 - 1) / 16 (integer division) and P = (s + 1)**2 positions per
# i0, the element for (i0, position p, channel c) sits at
#     c + 1024*p + 1024*P*i0
# and the computation is
#     y = (in_ptr0[..] - in_ptr1[c]) * (1 / sqrt(in_ptr2[c] + 1e-05)) * 1
#           * in_ptr3[c] + in_ptr4[c]
#     in_out_ptr0[..] = clamp(y + in_out_ptr0[..]) at zero
# NOTE(review): presumably batch norm + residual add + ReLU, with in_ptr1..in_ptr4
# being mean, variance, weight and bias -- confirm against the calling graph.
#
# Iteration: `omp for` over i0 < ks0 inside `parallel num_threads(56)`, then 64
# channel groups of 16. Full groups of 16 positions are processed as 16x16
# tiles: both inputs are transposed into scratch buffers, each tile row is
# combined with a broadcast per-channel scalar, and the result tile is
# transposed back into `in_out_ptr0`. The remaining P % 16 positions use a
# scalar loop. The final `i1=1024; i1<1024` loop never executes.
# NOTE(review): the tile path uses at::vec::clamp_min while the scalar tail
# uses x * (x > 0). For x = -inf the product is inf * 0 = NaN, whereas a clamp
# would presumably give 0 -- confirm whether this divergence can matter here.
kernel_cpp_31 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<64; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<((1 + (((((-1) + ks1) / 16))*((((-1) + ks1) / 16))) + (2*((((-1) + ks1) / 16)))) / 16); i2+=1)
{
float tmp0[16*16] __attribute__ ((aligned (16)));
at::vec::transpose_mxn<float,16,16>(in_ptr0 + (16*i1) + (1024*i0) + (16384*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16))), 1024, tmp0, 16);
float tmp16[16*16] __attribute__ ((aligned (16)));
at::vec::transpose_mxn<float,16,16>(in_out_ptr0 + (16*i1) + (1024*i0) + (16384*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16))), 1024, tmp16, 16);
float tmp20[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp1 = at::vec::Vectorized<float>::loadu(tmp0 + 16*i1_inner);
auto tmp2 = at::vec::Vectorized<float>(in_ptr1[i1_inner + (16*i1)]);
auto tmp4 = at::vec::Vectorized<float>(in_ptr2[i1_inner + (16*i1)]);
auto tmp12 = at::vec::Vectorized<float>(in_ptr3[i1_inner + (16*i1)]);
auto tmp14 = at::vec::Vectorized<float>(in_ptr4[i1_inner + (16*i1)]);
auto tmp17 = at::vec::Vectorized<float>::loadu(tmp16 + 16*i1_inner);
auto tmp3 = tmp1 - tmp2;
auto tmp5 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp6 = tmp4 + tmp5;
auto tmp7 = tmp6.sqrt();
auto tmp8 = tmp7.reciprocal();
auto tmp9 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp10 = tmp8 * tmp9;
auto tmp11 = tmp3 * tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp15 = tmp13 + tmp14;
auto tmp18 = tmp15 + tmp17;
auto tmp19 = at::vec::clamp_min(tmp18, decltype(tmp18)(0));
tmp19.store(tmp20 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp20, 16, in_out_ptr0 + (16*i1) + (1024*i0) + (16384*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16))), 1024);
}
#pragma GCC ivdep
for(long i2=16*(((1 + (((((-1) + ks1) / 16))*((((-1) + ks1) / 16))) + (2*((((-1) + ks1) / 16)))) / 16)); i2<1 + (((((-1) + ks1) / 16))*((((-1) + ks1) / 16))) + (2*((((-1) + ks1) / 16))); i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr0[i1_inner + (16*i1) + (1024*i0) + (1024*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16)))];
auto tmp1 = in_ptr1[i1_inner + (16*i1)];
auto tmp3 = in_ptr2[i1_inner + (16*i1)];
auto tmp11 = in_ptr3[i1_inner + (16*i1)];
auto tmp13 = in_ptr4[i1_inner + (16*i1)];
auto tmp15 = in_out_ptr0[i1_inner + (16*i1) + (1024*i0) + (1024*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16)))];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = tmp16 * (tmp16>0);
in_out_ptr0[i1_inner + (16*i1) + (1024*i0) + (1024*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16)))] = tmp17;
}
}
}
#pragma GCC ivdep
for(long i1=1024; i1<1024; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<1 + (((((-1) + ks1) / 16))*((((-1) + ks1) / 16))) + (2*((((-1) + ks1) / 16))); i2+=1)
{
auto tmp0 = in_ptr0[i1 + (1024*i0) + (1024*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16)))];
auto tmp1 = in_ptr1[i1];
auto tmp3 = in_ptr2[i1];
auto tmp11 = in_ptr3[i1];
auto tmp13 = in_ptr4[i1];
auto tmp15 = in_out_ptr0[i1 + (1024*i0) + (1024*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16)))];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = tmp16 * (tmp16>0);
in_out_ptr0[i1 + (1024*i0) + (1024*i2) + (1024*i0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2048*i0*((((-1) + ks1) / 16)))] = tmp17;
}
}
}
}
}
}
''')
# kernel_cpp_32: two independent `omp for` loops inside one
# `parallel num_threads(56)` region.
#
# Loop 1 -- in-place per-channel normalization + clamp at zero on `in_out_ptr0`,
# viewed as rows of 256 contiguous floats:
#     y = (x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1 * in_ptr2[c] + in_ptr3[c]
#     in_out_ptr0[c + 256*i0] = clamp_min(y, 0)
# Row count is ks0 * (s + 1)**2 with s = (ks1 - 1) / 16 (integer division). Each
# row takes 16 Vectorized<float> steps; the scalar loop from 256 to 256 never
# executes.
# NOTE(review): presumably batch norm + ReLU (mean, variance, weight, bias in
# in_ptr0..in_ptr3) -- confirm against the calling graph.
#
# Loop 2 -- element-wise copy from `in_ptr4` to `out_ptr0` that swaps the two
# innermost index roles. With c = i1_inner + 16*i1 (0..255) and i2 in 0..8:
#     out_ptr0[c + 256*i2 + 2304*i0] = in_ptr4[i2 + 9*c + 2304*i0]
# for i0 in 0..255, i.e. each 256x9 slab is written out as 9x256.
# NOTE(review): looks like a weight re-layout for a 3x3 convolution -- confirm.
# The vectorized transpose branch (`i2<0`) and the trailing `i1=256; i1<256`
# loop both have zero iterations; only the scalar copy runs.
kernel_cpp_32 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 16))*((((-1) + ks1) / 16)))) + (2*ks0*((((-1) + ks1) / 16))); i0+=1)
{
for(long i1=0; i1<16; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
}
#pragma omp simd simdlen(8)
for(long i1=256; i1<256; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (256*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (256*i0)] = tmp15;
}
}
}
{
#pragma omp for
for(long i0=0; i0<256; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<16; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (2304*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (2304*i0) + (4096*i2), 256);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (2304*i0)];
out_ptr0[i1_inner + (16*i1) + (256*i2) + (2304*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=256; i1<256; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (2304*i0)];
out_ptr0[i1 + (256*i2) + (2304*i0)] = tmp0;
}
}
}
}
}
}
''')
# kernel_cpp_33: in-place, per-row update of in_out_ptr0 (256 floats per row):
#     y = max((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
# NOTE(review): looks like inference batch-norm + ReLU (in_ptr0..3 presumably
# mean / variance / weight / bias) - confirm at the call site.
# The generated scalar tail (i1 from 256 to 256) never ran and has been removed.
kernel_cpp_33 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const long ks0,
                       const long ks1)
{
    // Loop-invariant extent: rows == ks0 * (q + 1) * (q + 1), kept in the generated form.
    const long q = (((-1) + ks1) / 16);
    const long rows = ks0 + (ks0*(q*q)) + (2*ks0*q);
    #pragma omp parallel num_threads(56)
    {
        {
            #pragma omp for
            for(long i0=0; i0<rows; i0+=1)
            {
                // 16 steps of 16 floats cover the 256 values of one row.
                for(long i1=0; i1<16; i1+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
                    auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
                    auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
                    auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
                    auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
                    auto tmp2 = tmp0 - tmp1;
                    auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = tmp5.sqrt();
                    auto tmp7 = tmp6.reciprocal();
                    auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
                    auto tmp9 = tmp7 * tmp8;
                    auto tmp10 = tmp2 * tmp9;
                    auto tmp12 = tmp10 * tmp11;
                    auto tmp14 = tmp12 + tmp13;
                    auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
                    tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
                }
            }
        }
    }
}
''')
# kernel_cpp_34: per row of 1024 floats, reads in_ptr0 and accumulates into in_out_ptr0:
#     y = max((in_ptr0[r, c] - in_ptr1[c]) * (1 / sqrt(in_ptr2[c] + 1e-05)) * in_ptr3[c]
#             + in_ptr4[c] + in_out_ptr0[r, c], 0)
# NOTE(review): looks like inference batch-norm + residual add + ReLU (in_ptr1..4
# presumably mean / variance / weight / bias) - confirm at the call site.
# The generated scalar tail (i1 from 1024 to 1024) never ran and has been removed.
kernel_cpp_34 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const float* __restrict__ in_ptr4,
                       const long ks0,
                       const long ks1)
{
    // Loop-invariant extent: rows == ks0 * (q + 1) * (q + 1), kept in the generated form.
    const long q = (((-1) + ks1) / 16);
    const long rows = ks0 + (ks0*(q*q)) + (2*ks0*q);
    #pragma omp parallel num_threads(56)
    {
        {
            #pragma omp for
            for(long i0=0; i0<rows; i0+=1)
            {
                // 64 steps of 16 floats cover the 1024 values of one row.
                for(long i1=0; i1<64; i1+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (16*i1) + (1024*i0));
                    auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
                    auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
                    auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
                    auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr4 + 16*i1);
                    auto tmp15 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (1024*i0));
                    auto tmp2 = tmp0 - tmp1;
                    auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = tmp5.sqrt();
                    auto tmp7 = tmp6.reciprocal();
                    auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
                    auto tmp9 = tmp7 * tmp8;
                    auto tmp10 = tmp2 * tmp9;
                    auto tmp12 = tmp10 * tmp11;
                    auto tmp14 = tmp12 + tmp13;
                    auto tmp16 = tmp14 + tmp15;
                    auto tmp17 = at::vec::clamp_min(tmp16, decltype(tmp16)(0));
                    tmp17.store(in_out_ptr0 + (16*i1) + (1024*i0));
                }
            }
        }
    }
}
''')
# kernel_cpp_35: two work-shared stages inside one OpenMP parallel region.
#   Stage 1 (in place on in_out_ptr0, 256 floats per row):
#       y = max((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
#     NOTE(review): this has the shape of an inference batch-norm followed by ReLU
#     (in_ptr0..3 presumably mean / variance / weight / bias) - confirm at the call site.
#   Stage 2: copies in_ptr4 to out_ptr0, turning each of 256 [256][9] slabs into [9][256].
# The generator also emitted zero-trip loops (i2<0 and i1 from 256 to 256); they can
# never execute and have been removed. The ks1-derived row count is named once.
kernel_cpp_35 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const float* __restrict__ in_ptr4,
                       float* __restrict__ out_ptr0,
                       const long ks0,
                       const long ks1)
{
    // Loop-invariant extent: rows == ks0 * (q + 1) * (q + 1), kept in the generated form.
    const long q = (((-1) + ks1) / 16);
    const long rows = ks0 + (ks0*(q*q)) + (2*ks0*q);
    #pragma omp parallel num_threads(56)
    {
        {
            // Stage 1: normalise, scale, shift and clamp at zero, 16 floats per step.
            #pragma omp for
            for(long i0=0; i0<rows; i0+=1)
            {
                for(long i1=0; i1<16; i1+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
                    auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
                    auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
                    auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
                    auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
                    auto tmp2 = tmp0 - tmp1;
                    auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = tmp5.sqrt();
                    auto tmp7 = tmp6.reciprocal();
                    auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
                    auto tmp9 = tmp7 * tmp8;
                    auto tmp10 = tmp2 * tmp9;
                    auto tmp12 = tmp10 * tmp11;
                    auto tmp14 = tmp12 + tmp13;
                    auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
                    tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
                }
            }
        }
        {
            // Stage 2: element (c, i2) of slab i0 moves from c*9 + i2 to i2*256 + c,
            // where c = i1_inner + 16*i1.
            #pragma omp for
            for(long i0=0; i0<256; i0+=1)
            {
                #pragma GCC ivdep
                for(long i1=0; i1<16; i1+=1)
                {
                    #pragma GCC ivdep
                    for(long i2=0; i2<9; i2+=1)
                    {
                        for (long i1_inner = 0; i1_inner < 16; i1_inner++)
                        {
                            auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (2304*i0)];
                            out_ptr0[i1_inner + (16*i1) + (256*i2) + (2304*i0)] = tmp0;
                        }
                    }
                }
            }
        }
    }
}
''')
# kernel_cpp_36: in-place, per-row update of in_out_ptr0 (256 floats per row):
#     y = max((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
# NOTE(review): looks like inference batch-norm + ReLU (in_ptr0..3 presumably
# mean / variance / weight / bias) - confirm at the call site.
# The generated scalar tail (i1 from 256 to 256) never ran and has been removed.
kernel_cpp_36 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const long ks0,
                       const long ks1)
{
    // Loop-invariant extent: rows == ks0 * (q + 1) * (q + 1), kept in the generated form.
    const long q = (((-1) + ks1) / 16);
    const long rows = ks0 + (ks0*(q*q)) + (2*ks0*q);
    #pragma omp parallel num_threads(56)
    {
        {
            #pragma omp for
            for(long i0=0; i0<rows; i0+=1)
            {
                // 16 steps of 16 floats cover the 256 values of one row.
                for(long i1=0; i1<16; i1+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
                    auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
                    auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
                    auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
                    auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
                    auto tmp2 = tmp0 - tmp1;
                    auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = tmp5.sqrt();
                    auto tmp7 = tmp6.reciprocal();
                    auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
                    auto tmp9 = tmp7 * tmp8;
                    auto tmp10 = tmp2 * tmp9;
                    auto tmp12 = tmp10 * tmp11;
                    auto tmp14 = tmp12 + tmp13;
                    auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
                    tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
                }
            }
        }
    }
}
''')
# kernel_cpp_37: for each of ks0 outer items, walks hw rows of 1024 floats and writes
#     y = max((in_ptr0[...] - in_ptr1[c]) * (1 / sqrt(in_ptr2[c] + 1e-05)) * in_ptr3[c]
#             + in_ptr4[c] + in_out_ptr0[...], 0)
# back into in_out_ptr0. Rows are processed 16 at a time through 16x16 transposed
# tiles; the rows left over after the last full group of 16 go through a scalar tail.
# NOTE(review): looks like inference batch-norm + residual add + ReLU (in_ptr1..4
# presumably mean / variance / weight / bias) - confirm at the call site.
# Changes relative to the generated text:
#   * The scalar tail computed ReLU as x * (x > 0). For x == -inf that is
#     -inf * 0 == NaN, whereas the tiled path clamps with at::vec::clamp_min and
#     yields 0. The tail now uses std::max(x, 0) so both paths agree; NaN inputs
#     still propagate.
#   * The zero-trip loop (i1 from 1024 to 1024) was removed.
#   * The repeated ks1-derived sub-expressions are named once (q, hw, base).
kernel_cpp_37 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
#include <algorithm>
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const float* __restrict__ in_ptr4,
                       const long ks0,
                       const long ks1)
{
    // Loop-invariant extents, kept in the generated form: hw == (q + 1) * (q + 1).
    const long q = (((-1) + ks1) / 16);
    const long hw = 1 + (q*q) + (2*q);
    #pragma omp parallel num_threads(56)
    {
        {
            #pragma omp for
            for(long i0=0; i0<ks0; i0+=1)
            {
                // Offset of the first row belonging to outer item i0 (== 1024 * i0 * hw).
                const long base = (1024*i0) + (1024*i0*(q*q)) + (2048*i0*q);
                #pragma GCC ivdep
                for(long i1=0; i1<64; i1+=1)
                {
                    // Full groups of 16 rows: transpose in, compute per column, transpose out.
                    #pragma GCC ivdep
                    for(long i2=0; i2<(hw / 16); i2+=1)
                    {
                        const long tile = (16*i1) + (16384*i2) + base;
                        float tmp0[16*16] __attribute__ ((aligned (16)));
                        at::vec::transpose_mxn<float,16,16>(in_ptr0 + tile, 1024, tmp0, 16);
                        float tmp16[16*16] __attribute__ ((aligned (16)));
                        at::vec::transpose_mxn<float,16,16>(in_out_ptr0 + tile, 1024, tmp16, 16);
                        float tmp20[16*16] __attribute__ ((aligned (16)));
                        for (long i1_inner = 0; i1_inner < 16; i1_inner++)
                        {
                            auto tmp1 = at::vec::Vectorized<float>::loadu(tmp0 + 16*i1_inner);
                            auto tmp2 = at::vec::Vectorized<float>(in_ptr1[i1_inner + (16*i1)]);
                            auto tmp4 = at::vec::Vectorized<float>(in_ptr2[i1_inner + (16*i1)]);
                            auto tmp12 = at::vec::Vectorized<float>(in_ptr3[i1_inner + (16*i1)]);
                            auto tmp14 = at::vec::Vectorized<float>(in_ptr4[i1_inner + (16*i1)]);
                            auto tmp17 = at::vec::Vectorized<float>::loadu(tmp16 + 16*i1_inner);
                            auto tmp3 = tmp1 - tmp2;
                            auto tmp5 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
                            auto tmp6 = tmp4 + tmp5;
                            auto tmp7 = tmp6.sqrt();
                            auto tmp8 = tmp7.reciprocal();
                            auto tmp9 = at::vec::Vectorized<float>(static_cast<float>(1));
                            auto tmp10 = tmp8 * tmp9;
                            auto tmp11 = tmp3 * tmp10;
                            auto tmp13 = tmp11 * tmp12;
                            auto tmp15 = tmp13 + tmp14;
                            auto tmp18 = tmp15 + tmp17;
                            auto tmp19 = at::vec::clamp_min(tmp18, decltype(tmp18)(0));
                            tmp19.store(tmp20 + 16*i1_inner);
                        }
                        at::vec::transpose_mxn<float,16,16>(tmp20, 16, in_out_ptr0 + tile, 1024);
                    }
                    // Scalar tail: the hw % 16 rows that do not fill a 16-row tile.
                    #pragma GCC ivdep
                    for(long i2=16*(hw / 16); i2<hw; i2+=1)
                    {
                        for (long i1_inner = 0; i1_inner < 16; i1_inner++)
                        {
                            const long col = i1_inner + (16*i1);
                            const long idx = col + (1024*i2) + base;
                            auto tmp0 = in_ptr0[idx];
                            auto tmp1 = in_ptr1[col];
                            auto tmp3 = in_ptr2[col];
                            auto tmp11 = in_ptr3[col];
                            auto tmp13 = in_ptr4[col];
                            auto tmp15 = in_out_ptr0[idx];
                            auto tmp2 = tmp0 - tmp1;
                            auto tmp4 = static_cast<float>(1e-05);
                            auto tmp5 = tmp3 + tmp4;
                            auto tmp6 = std::sqrt(tmp5);
                            auto tmp7 = 1 / tmp6;
                            auto tmp8 = static_cast<float>(1);
                            auto tmp9 = tmp7 * tmp8;
                            auto tmp10 = tmp2 * tmp9;
                            auto tmp12 = tmp10 * tmp11;
                            auto tmp14 = tmp12 + tmp13;
                            auto tmp16 = tmp14 + tmp15;
                            // std::max keeps NaN (first argument) and maps -inf to 0.
                            auto tmp17 = std::max(tmp16, decltype(tmp16)(0));
                            in_out_ptr0[idx] = tmp17;
                        }
                    }
                }
            }
        }
    }
}
''')
# kernel_cpp_38: two work-shared stages inside one OpenMP parallel region.
#   Stage 1 (in place on in_out_ptr0, 256 floats per row):
#       y = max((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
#     NOTE(review): this has the shape of an inference batch-norm followed by ReLU
#     (in_ptr0..3 presumably mean / variance / weight / bias) - confirm at the call site.
#   Stage 2: copies in_ptr4 to out_ptr0, turning each of 256 [256][9] slabs into [9][256].
# The generator also emitted zero-trip loops (i2<0 and i1 from 256 to 256); they can
# never execute and have been removed. The ks1-derived row count is named once.
kernel_cpp_38 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const float* __restrict__ in_ptr4,
                       float* __restrict__ out_ptr0,
                       const long ks0,
                       const long ks1)
{
    // Loop-invariant extent: rows == ks0 * (q + 1) * (q + 1), kept in the generated form.
    const long q = (((-1) + ks1) / 16);
    const long rows = ks0 + (ks0*(q*q)) + (2*ks0*q);
    #pragma omp parallel num_threads(56)
    {
        {
            // Stage 1: normalise, scale, shift and clamp at zero, 16 floats per step.
            #pragma omp for
            for(long i0=0; i0<rows; i0+=1)
            {
                for(long i1=0; i1<16; i1+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
                    auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
                    auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
                    auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
                    auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
                    auto tmp2 = tmp0 - tmp1;
                    auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = tmp5.sqrt();
                    auto tmp7 = tmp6.reciprocal();
                    auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
                    auto tmp9 = tmp7 * tmp8;
                    auto tmp10 = tmp2 * tmp9;
                    auto tmp12 = tmp10 * tmp11;
                    auto tmp14 = tmp12 + tmp13;
                    auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
                    tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
                }
            }
        }
        {
            // Stage 2: element (c, i2) of slab i0 moves from c*9 + i2 to i2*256 + c,
            // where c = i1_inner + 16*i1.
            #pragma omp for
            for(long i0=0; i0<256; i0+=1)
            {
                #pragma GCC ivdep
                for(long i1=0; i1<16; i1+=1)
                {
                    #pragma GCC ivdep
                    for(long i2=0; i2<9; i2+=1)
                    {
                        for (long i1_inner = 0; i1_inner < 16; i1_inner++)
                        {
                            auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (2304*i0)];
                            out_ptr0[i1_inner + (16*i1) + (256*i2) + (2304*i0)] = tmp0;
                        }
                    }
                }
            }
        }
    }
}
''')
# kernel_cpp_39: in-place, per-row update of in_out_ptr0 (256 floats per row):
#     y = max((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
# NOTE(review): looks like inference batch-norm + ReLU (in_ptr0..3 presumably
# mean / variance / weight / bias) - confirm at the call site.
# The generated scalar tail (i1 from 256 to 256) never ran and has been removed.
kernel_cpp_39 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const long ks0,
                       const long ks1)
{
    // Loop-invariant extent: rows == ks0 * (q + 1) * (q + 1), kept in the generated form.
    const long q = (((-1) + ks1) / 16);
    const long rows = ks0 + (ks0*(q*q)) + (2*ks0*q);
    #pragma omp parallel num_threads(56)
    {
        {
            #pragma omp for
            for(long i0=0; i0<rows; i0+=1)
            {
                // 16 steps of 16 floats cover the 256 values of one row.
                for(long i1=0; i1<16; i1+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (256*i0));
                    auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
                    auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
                    auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
                    auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
                    auto tmp2 = tmp0 - tmp1;
                    auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = tmp5.sqrt();
                    auto tmp7 = tmp6.reciprocal();
                    auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
                    auto tmp9 = tmp7 * tmp8;
                    auto tmp10 = tmp2 * tmp9;
                    auto tmp12 = tmp10 * tmp11;
                    auto tmp14 = tmp12 + tmp13;
                    auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
                    tmp15.store(in_out_ptr0 + (16*i1) + (256*i0));
                }
            }
        }
    }
}
''')
# kernel_cpp_40: per row of 1024 floats, reads in_ptr0 and accumulates into in_out_ptr0:
#     y = max((in_ptr0[r, c] - in_ptr1[c]) * (1 / sqrt(in_ptr2[c] + 1e-05)) * in_ptr3[c]
#             + in_ptr4[c] + in_out_ptr0[r, c], 0)
# NOTE(review): looks like inference batch-norm + residual add + ReLU (in_ptr1..4
# presumably mean / variance / weight / bias) - confirm at the call site.
# The generated scalar tail (i1 from 1024 to 1024) never ran and has been removed.
kernel_cpp_40 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const float* __restrict__ in_ptr4,
                       const long ks0,
                       const long ks1)
{
    // Loop-invariant extent: rows == ks0 * (q + 1) * (q + 1), kept in the generated form.
    const long q = (((-1) + ks1) / 16);
    const long rows = ks0 + (ks0*(q*q)) + (2*ks0*q);
    #pragma omp parallel num_threads(56)
    {
        {
            #pragma omp for
            for(long i0=0; i0<rows; i0+=1)
            {
                // 64 steps of 16 floats cover the 1024 values of one row.
                for(long i1=0; i1<64; i1+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (16*i1) + (1024*i0));
                    auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
                    auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
                    auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
                    auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr4 + 16*i1);
                    auto tmp15 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (1024*i0));
                    auto tmp2 = tmp0 - tmp1;
                    auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = tmp5.sqrt();
                    auto tmp7 = tmp6.reciprocal();
                    auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
                    auto tmp9 = tmp7 * tmp8;
                    auto tmp10 = tmp2 * tmp9;
                    auto tmp12 = tmp10 * tmp11;
                    auto tmp14 = tmp12 + tmp13;
                    auto tmp16 = tmp14 + tmp15;
                    auto tmp17 = at::vec::clamp_min(tmp16, decltype(tmp16)(0));
                    tmp17.store(in_out_ptr0 + (16*i1) + (1024*i0));
                }
            }
        }
    }
}
''')
# kernel_cpp_41: two work-shared stages inside one OpenMP parallel region.
#   Stage 1 (in place on in_out_ptr0, 512 floats per row):
#       y = max((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
#     NOTE(review): this has the shape of an inference batch-norm followed by ReLU
#     (in_ptr0..3 presumably mean / variance / weight / bias) - confirm at the call site.
#   Stage 2: copies in_ptr4 to out_ptr0, turning each of 512 [512][9] slabs into [9][512].
# The generator also emitted zero-trip loops (i2<0 and i1 from 512 to 512); they can
# never execute and have been removed. The ks1-derived row count is named once.
kernel_cpp_41 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const float* __restrict__ in_ptr4,
                       float* __restrict__ out_ptr0,
                       const long ks0,
                       const long ks1)
{
    // Loop-invariant extent: rows == ks0 * (q + 1) * (q + 1), kept in the generated form.
    const long q = (((-1) + ks1) / 16);
    const long rows = ks0 + (ks0*(q*q)) + (2*ks0*q);
    #pragma omp parallel num_threads(56)
    {
        {
            // Stage 1: normalise, scale, shift and clamp at zero, 16 floats per step.
            #pragma omp for
            for(long i0=0; i0<rows; i0+=1)
            {
                for(long i1=0; i1<32; i1+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (512*i0));
                    auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
                    auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
                    auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
                    auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
                    auto tmp2 = tmp0 - tmp1;
                    auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = tmp5.sqrt();
                    auto tmp7 = tmp6.reciprocal();
                    auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
                    auto tmp9 = tmp7 * tmp8;
                    auto tmp10 = tmp2 * tmp9;
                    auto tmp12 = tmp10 * tmp11;
                    auto tmp14 = tmp12 + tmp13;
                    auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
                    tmp15.store(in_out_ptr0 + (16*i1) + (512*i0));
                }
            }
        }
        {
            // Stage 2: element (c, i2) of slab i0 moves from c*9 + i2 to i2*512 + c,
            // where c = i1_inner + 16*i1.
            #pragma omp for
            for(long i0=0; i0<512; i0+=1)
            {
                #pragma GCC ivdep
                for(long i1=0; i1<32; i1+=1)
                {
                    #pragma GCC ivdep
                    for(long i2=0; i2<9; i2+=1)
                    {
                        for (long i1_inner = 0; i1_inner < 16; i1_inner++)
                        {
                            auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (4608*i0)];
                            out_ptr0[i1_inner + (16*i1) + (512*i2) + (4608*i0)] = tmp0;
                        }
                    }
                }
            }
        }
    }
}
''')
# kernel_cpp_42: in-place, per-row update of in_out_ptr0 (512 floats per row):
#     y = max((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c], 0)
# NOTE(review): looks like inference batch-norm + ReLU (in_ptr0..3 presumably
# mean / variance / weight / bias) - confirm at the call site.
# The row count here divides (ks1 - 1) by 32.
# The generated scalar tail (i1 from 512 to 512) never ran and has been removed.
kernel_cpp_42 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const long ks0,
                       const long ks1)
{
    // Loop-invariant extent: rows == ks0 * (q + 1) * (q + 1), kept in the generated form.
    const long q = (((-1) + ks1) / 32);
    const long rows = ks0 + (ks0*(q*q)) + (2*ks0*q);
    #pragma omp parallel num_threads(56)
    {
        {
            #pragma omp for
            for(long i0=0; i0<rows; i0+=1)
            {
                // 32 steps of 16 floats cover the 512 values of one row.
                for(long i1=0; i1<32; i1+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (512*i0));
                    auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
                    auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
                    auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
                    auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
                    auto tmp2 = tmp0 - tmp1;
                    auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = tmp5.sqrt();
                    auto tmp7 = tmp6.reciprocal();
                    auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
                    auto tmp9 = tmp7 * tmp8;
                    auto tmp10 = tmp2 * tmp9;
                    auto tmp12 = tmp10 * tmp11;
                    auto tmp14 = tmp12 + tmp13;
                    auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
                    tmp15.store(in_out_ptr0 + (16*i1) + (512*i0));
                }
            }
        }
    }
}
''')
# kernel_cpp_43: two work-shared stages inside one OpenMP parallel region, both in
# place on in_out_ptr0 (2048 floats per row).
#   Stage 1: sums two normalise/scale/shift results and stores the sum:
#       a = (in_out_ptr0[r, c] - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * in_ptr2[c] + in_ptr3[c]
#       b = (in_ptr4[r, c]     - in_ptr5[c]) * (1 / sqrt(in_ptr6[c] + 1e-05)) * in_ptr7[c] + in_ptr8[c]
#       in_out_ptr0[r, c] = a + b
#     NOTE(review): looks like two inference batch-norms (main branch plus shortcut
#     branch) being added - confirm at the call site.
#   Stage 2: clamps every stored value at zero (ReLU), 16 rows at a time through
#     16x16 transposed tiles, with a scalar tail for the rows left over.
# Changes relative to the generated text:
#   * The stage-2 scalar tail computed ReLU as x * (x > 0). For x == -inf that is
#     -inf * 0 == NaN, whereas the tiled path clamps with at::vec::clamp_min and
#     yields 0. The tail now uses std::max(x, 0) so both paths agree; NaN inputs
#     still propagate.
#   * Both zero-trip loops (i1 from 2048 to 2048) were removed.
#   * The repeated ks1-derived sub-expressions are named once (q, hw, rows, base).
kernel_cpp_43 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
#include <algorithm>
extern "C" void kernel(float* __restrict__ in_out_ptr0,
                       const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       const float* __restrict__ in_ptr2,
                       const float* __restrict__ in_ptr3,
                       const float* __restrict__ in_ptr4,
                       const float* __restrict__ in_ptr5,
                       const float* __restrict__ in_ptr6,
                       const float* __restrict__ in_ptr7,
                       const float* __restrict__ in_ptr8,
                       const long ks0,
                       const long ks1)
{
    // Loop-invariant extents, kept in the generated form:
    // hw == (q + 1) * (q + 1) and rows == ks0 * hw.
    const long q = (((-1) + ks1) / 32);
    const long hw = 1 + (q*q) + (2*q);
    const long rows = ks0 + (ks0*(q*q)) + (2*ks0*q);
    #pragma omp parallel num_threads(56)
    {
        {
            // Stage 1: 128 steps of 16 floats cover the 2048 values of one row.
            #pragma omp for
            for(long i0=0; i0<rows; i0+=1)
            {
                for(long i1=0; i1<128; i1+=1)
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (2048*i0));
                    auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
                    auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
                    auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
                    auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
                    auto tmp15 = at::vec::Vectorized<float>::loadu(in_ptr4 + (16*i1) + (2048*i0));
                    auto tmp16 = at::vec::Vectorized<float>::loadu(in_ptr5 + 16*i1);
                    auto tmp18 = at::vec::Vectorized<float>::loadu(in_ptr6 + 16*i1);
                    auto tmp24 = at::vec::Vectorized<float>::loadu(in_ptr7 + 16*i1);
                    auto tmp26 = at::vec::Vectorized<float>::loadu(in_ptr8 + 16*i1);
                    auto tmp2 = tmp0 - tmp1;
                    auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = tmp5.sqrt();
                    auto tmp7 = tmp6.reciprocal();
                    auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
                    auto tmp9 = tmp7 * tmp8;
                    auto tmp10 = tmp2 * tmp9;
                    auto tmp12 = tmp10 * tmp11;
                    auto tmp14 = tmp12 + tmp13;
                    auto tmp17 = tmp15 - tmp16;
                    auto tmp19 = tmp18 + tmp4;
                    auto tmp20 = tmp19.sqrt();
                    auto tmp21 = tmp20.reciprocal();
                    auto tmp22 = tmp21 * tmp8;
                    auto tmp23 = tmp17 * tmp22;
                    auto tmp25 = tmp23 * tmp24;
                    auto tmp27 = tmp25 + tmp26;
                    auto tmp28 = tmp14 + tmp27;
                    tmp28.store(in_out_ptr0 + (16*i1) + (2048*i0));
                }
            }
        }
        {
            // Stage 2: ReLU over the values written by stage 1.
            #pragma omp for
            for(long i0=0; i0<ks0; i0+=1)
            {
                // Offset of the first row belonging to outer item i0 (== 2048 * i0 * hw).
                const long base = (2048*i0) + (2048*i0*(q*q)) + (4096*i0*q);
                #pragma GCC ivdep
                for(long i1=0; i1<128; i1+=1)
                {
                    // Full groups of 16 rows: transpose in, clamp, transpose out.
                    #pragma GCC ivdep
                    for(long i2=0; i2<(hw / 16); i2+=1)
                    {
                        const long tile = (16*i1) + (32768*i2) + base;
                        float tmp0[16*16] __attribute__ ((aligned (16)));
                        at::vec::transpose_mxn<float,16,16>(in_out_ptr0 + tile, 2048, tmp0, 16);
                        float tmp3[16*16] __attribute__ ((aligned (16)));
                        for (long i1_inner = 0; i1_inner < 16; i1_inner++)
                        {
                            auto tmp1 = at::vec::Vectorized<float>::loadu(tmp0 + 16*i1_inner);
                            auto tmp2 = at::vec::clamp_min(tmp1, decltype(tmp1)(0));
                            tmp2.store(tmp3 + 16*i1_inner);
                        }
                        at::vec::transpose_mxn<float,16,16>(tmp3, 16, in_out_ptr0 + tile, 2048);
                    }
                    // Scalar tail: the hw % 16 rows that do not fill a 16-row tile.
                    #pragma GCC ivdep
                    for(long i2=16*(hw / 16); i2<hw; i2+=1)
                    {
                        for (long i1_inner = 0; i1_inner < 16; i1_inner++)
                        {
                            const long idx = i1_inner + (16*i1) + (2048*i2) + base;
                            auto tmp0 = in_out_ptr0[idx];
                            // std::max keeps NaN (first argument) and maps -inf to 0.
                            auto tmp1 = std::max(tmp0, decltype(tmp0)(0));
                            in_out_ptr0[idx] = tmp1;
                        }
                    }
                }
            }
        }
    }
}
''')
# kernel_cpp_44: two work-shared loops inside one 56-thread OpenMP parallel region.
#
# Loop 1 (in place on in_out_ptr0): iterates ks0 * (q + 1)^2 rows, where
# q = (ks1 - 1) / 32 with C++ integer division (the bound
# ks0 + ks0*q*q + 2*ks0*q is that product expanded). Each row holds 512 floats,
# processed as 32 steps of 16 floats (the stride assumes at::vec::Vectorized<float>
# has 16 lanes). For column c the update is
#     x = clamp_min(((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1) * in_ptr2[c] + in_ptr3[c], 0)
# NOTE(review): this looks like inference-mode batch norm (mean, variance, weight,
# bias) followed by ReLU -- confirm against the call site, which is outside this chunk.
#
# Loop 2: copies in_ptr4 into out_ptr0, moving the element at offset
# i2 + 9*c + 4608*i0 to offset c + 512*i2 + 4608*i0 for i0 in [0, 512),
# c = i1_inner + 16*i1 in [0, 512) and i2 in [0, 9); i.e. every 512x9 slab is
# written out transposed as 9x512.
# NOTE(review): presumably a convolution-weight layout conversion -- confirm.
#
# The `i2<0` transpose loop and both `i1=512; i1<512` tail loops have empty
# iteration ranges, so their bodies never execute.
kernel_cpp_44 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 32))*((((-1) + ks1) / 32)))) + (2*ks0*((((-1) + ks1) / 32))); i0+=1)
{
for(long i1=0; i1<32; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (512*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (512*i0));
}
#pragma omp simd simdlen(8)
for(long i1=512; i1<512; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (512*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (512*i0)] = tmp15;
}
}
}
{
#pragma omp for
for(long i0=0; i0<512; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<32; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (4608*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (4608*i0) + (8192*i2), 512);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (4608*i0)];
out_ptr0[i1_inner + (16*i1) + (512*i2) + (4608*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=512; i1<512; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (4608*i0)];
out_ptr0[i1 + (512*i2) + (4608*i0)] = tmp0;
}
}
}
}
}
}
''')
# kernel_cpp_45: one work-shared loop inside a 56-thread OpenMP parallel region,
# updating in_out_ptr0 in place.
#
# Iterates ks0 * (q + 1)^2 rows, where q = (ks1 - 1) / 32 with C++ integer
# division (the loop bound is that product expanded). Each row holds 512 floats,
# processed as 32 steps of 16 floats (the stride assumes
# at::vec::Vectorized<float> has 16 lanes). For column c:
#     x = clamp_min(((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1) * in_ptr2[c] + in_ptr3[c], 0)
# NOTE(review): this looks like inference-mode batch norm (mean, variance, weight,
# bias) followed by ReLU -- confirm against the call site, which is outside this chunk.
#
# The scalar `i1=512; i1<512` tail loop has an empty iteration range and never runs.
kernel_cpp_45 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 32))*((((-1) + ks1) / 32)))) + (2*ks0*((((-1) + ks1) / 32))); i0+=1)
{
for(long i1=0; i1<32; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (512*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (512*i0));
}
#pragma omp simd simdlen(8)
for(long i1=512; i1<512; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (512*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (512*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_46: one work-shared loop inside a 56-thread OpenMP parallel region.
# Reads in_ptr0 and in_out_ptr0, writes the result back to in_out_ptr0.
#
# Iterates ks0 * (q + 1)^2 rows, where q = (ks1 - 1) / 32 with C++ integer
# division (the loop bound is that product expanded). Each row holds 2048
# floats, processed as 128 steps of 16 floats (the stride assumes
# at::vec::Vectorized<float> has 16 lanes). For column c, with a = in_ptr0[row, c]
# and r = in_out_ptr0[row, c]:
#     in_out_ptr0[row, c] = clamp_min(
#         ((a - in_ptr1[c]) * (1 / sqrt(in_ptr2[c] + 1e-05)) * 1) * in_ptr3[c] + in_ptr4[c] + r, 0)
# NOTE(review): this looks like inference-mode batch norm of in_ptr0, a residual
# add with in_out_ptr0, then ReLU -- confirm against the call site, which is
# outside this chunk.
#
# The scalar `i1=2048; i1<2048` tail loop has an empty iteration range and never runs.
kernel_cpp_46 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 32))*((((-1) + ks1) / 32)))) + (2*ks0*((((-1) + ks1) / 32))); i0+=1)
{
for(long i1=0; i1<128; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (16*i1) + (2048*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr4 + 16*i1);
auto tmp15 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (2048*i0));
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = at::vec::clamp_min(tmp16, decltype(tmp16)(0));
tmp17.store(in_out_ptr0 + (16*i1) + (2048*i0));
}
#pragma omp simd simdlen(8)
for(long i1=2048; i1<2048; i1+=1)
{
auto tmp0 = in_ptr0[i1 + (2048*i0)];
auto tmp1 = in_ptr1[i1];
auto tmp3 = in_ptr2[i1];
auto tmp11 = in_ptr3[i1];
auto tmp13 = in_ptr4[i1];
auto tmp15 = in_out_ptr0[i1 + (2048*i0)];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
auto tmp17 = tmp16 * (tmp16>0);
in_out_ptr0[i1 + (2048*i0)] = tmp17;
}
}
}
}
}
''')
# kernel_cpp_47: two work-shared loops inside one 56-thread OpenMP parallel region.
# The kernel source is textually the same as kernel_cpp_44.
#
# Loop 1 (in place on in_out_ptr0): iterates ks0 * (q + 1)^2 rows, where
# q = (ks1 - 1) / 32 with C++ integer division (the loop bound is that product
# expanded). Each row holds 512 floats, processed as 32 steps of 16 floats (the
# stride assumes at::vec::Vectorized<float> has 16 lanes). For column c:
#     x = clamp_min(((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1) * in_ptr2[c] + in_ptr3[c], 0)
# NOTE(review): this looks like inference-mode batch norm (mean, variance, weight,
# bias) followed by ReLU -- confirm against the call site, which is outside this chunk.
#
# Loop 2: copies in_ptr4 into out_ptr0, moving the element at offset
# i2 + 9*c + 4608*i0 to offset c + 512*i2 + 4608*i0 for i0 in [0, 512),
# c = i1_inner + 16*i1 in [0, 512) and i2 in [0, 9); i.e. every 512x9 slab is
# written out transposed as 9x512.
# NOTE(review): presumably a convolution-weight layout conversion -- confirm.
#
# The `i2<0` transpose loop and both `i1=512; i1<512` tail loops have empty
# iteration ranges, so their bodies never execute.
kernel_cpp_47 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 32))*((((-1) + ks1) / 32)))) + (2*ks0*((((-1) + ks1) / 32))); i0+=1)
{
for(long i1=0; i1<32; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (512*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (512*i0));
}
#pragma omp simd simdlen(8)
for(long i1=512; i1<512; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (512*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (512*i0)] = tmp15;
}
}
}
{
#pragma omp for
for(long i0=0; i0<512; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<32; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<0; i2+=1)
{
float tmp1[16*16] __attribute__ ((aligned (16)));
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr4 + (9*i1_inner) + (16*i2) + (144*i1) + (4608*i0));
tmp0.store(tmp1 + 16*i1_inner);
}
at::vec::transpose_mxn<float,16,16>(tmp1, 16, out_ptr0 + (16*i1) + (4608*i0) + (8192*i2), 512);
}
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
for (long i1_inner = 0; i1_inner < 16; i1_inner++)
{
auto tmp0 = in_ptr4[i2 + (9*i1_inner) + (144*i1) + (4608*i0)];
out_ptr0[i1_inner + (16*i1) + (512*i2) + (4608*i0)] = tmp0;
}
}
}
#pragma GCC ivdep
for(long i1=512; i1<512; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<9; i2+=1)
{
auto tmp0 = in_ptr4[i2 + (9*i1) + (4608*i0)];
out_ptr0[i1 + (512*i2) + (4608*i0)] = tmp0;
}
}
}
}
}
}
''')
# kernel_cpp_48: one work-shared loop inside a 56-thread OpenMP parallel region,
# updating in_out_ptr0 in place. The kernel source is textually the same as
# kernel_cpp_45.
#
# Iterates ks0 * (q + 1)^2 rows, where q = (ks1 - 1) / 32 with C++ integer
# division (the loop bound is that product expanded). Each row holds 512 floats,
# processed as 32 steps of 16 floats (the stride assumes
# at::vec::Vectorized<float> has 16 lanes). For column c:
#     x = clamp_min(((x - in_ptr0[c]) * (1 / sqrt(in_ptr1[c] + 1e-05)) * 1) * in_ptr2[c] + in_ptr3[c], 0)
# NOTE(review): this looks like inference-mode batch norm (mean, variance, weight,
# bias) followed by ReLU -- confirm against the call site, which is outside this chunk.
#
# The scalar `i1=512; i1<512` tail loop has an empty iteration range and never runs.
kernel_cpp_48 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long ks0,
const long ks1)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0 + (ks0*(((((-1) + ks1) / 32))*((((-1) + ks1) / 32)))) + (2*ks0*((((-1) + ks1) / 32))); i0+=1)
{
for(long i1=0; i1<32; i1+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + (16*i1) + (512*i0));
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + 16*i1);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr1 + 16*i1);
auto tmp11 = at::vec::Vectorized<float>::loadu(in_ptr2 + 16*i1);
auto tmp13 = at::vec::Vectorized<float>::loadu(in_ptr3 + 16*i1);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1e-05));
auto tmp5 = tmp3 + tmp4;
auto tmp6 = tmp5.sqrt();
auto tmp7 = tmp6.reciprocal();
auto tmp8 = at::vec::Vectorized<float>(static_cast<float>(1));
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = at::vec::clamp_min(tmp14, decltype(tmp14)(0));
tmp15.store(in_out_ptr0 + (16*i1) + (512*i0));
}
#pragma omp simd simdlen(8)
for(long i1=512; i1<512; i1+=1)
{
auto tmp0 = in_out_ptr0[i1 + (512*i0)];
auto tmp1 = in_ptr0[i1];
auto tmp3 = in_ptr1[i1];
auto tmp11 = in_ptr2[i1];
auto tmp13 = in_ptr3[i1];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp15 = tmp14 * (tmp14>0);
in_out_ptr0[i1 + (512*i0)] = tmp15;
}
}
}
}
}
''')
# kernel_cpp_49: scalar reduction followed by a normalisation, both work-shared
# inside one 56-thread OpenMP parallel region. out_ptr0 is an alias of
# in_out_ptr1; in_out_ptr0 is only read here.
#
# With q = (ks1 - 1) / 32 (C++ integer division) and count = (q + 1)^2
# (the bound 1 + q*q + 2*q expanded):
#
# Loop 1: for every i0 in [0, ks0) and column i1 in [0, 2048), sums over
# i2 in [0, count) the value
#     relu(((a - in_ptr1[i1]) * (1 / sqrt(in_ptr2[i1] + 1e-05)) * 1) * in_ptr3[i1] + in_ptr4[i1] + r)
# where a = in_ptr0[off], r = in_out_ptr0[off] and
# off = i1 + 2048 * (i2 + i0 * count). The sum is stored at out_ptr0[i1 + 2048*i0].
#
# Loop 2: divides each of the 2048 * ks0 sums by count, i.e. turns them into means.
#
# NOTE(review): this looks like batch norm + residual add + ReLU fused with a
# global average pool -- confirm against the call site, which is outside this chunk.
#
# Fix: the ReLU used to be `tmp16 * (tmp16>0)`. For tmp16 == -inf that evaluates
# -inf * 0 == NaN, which then contaminates the whole sum and the mean. The select
# below yields 0 for every negative input (including -inf) and still propagates
# NaN, because `NaN < 0` is false.
kernel_cpp_49 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dl/cdljpywww2h2ag4o35mwbvm45hhasxnxkhqgbupxnk3y7olula65.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
float* __restrict__ in_out_ptr1,
const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const float* __restrict__ in_ptr4,
const long ks0,
const long ks1)
{
auto out_ptr0 = in_out_ptr1;
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=0; i0<ks0; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<2048; i1+=1)
{
{
float tmp18 = 0;
for(long i2=0; i2<1 + (((((-1) + ks1) / 32))*((((-1) + ks1) / 32))) + (2*((((-1) + ks1) / 32))); i2+=1)
{
auto tmp0 = in_ptr0[i1 + (2048*i0) + (2048*i2) + (2048*i0*(((((-1) + ks1) / 32))*((((-1) + ks1) / 32)))) + (4096*i0*((((-1) + ks1) / 32)))];
auto tmp1 = in_ptr1[i1];
auto tmp3 = in_ptr2[i1];
auto tmp11 = in_ptr3[i1];
auto tmp13 = in_ptr4[i1];
auto tmp15 = in_out_ptr0[i1 + (2048*i0) + (2048*i2) + (2048*i0*(((((-1) + ks1) / 32))*((((-1) + ks1) / 32)))) + (4096*i0*((((-1) + ks1) / 32)))];
auto tmp2 = tmp0 - tmp1;
auto tmp4 = static_cast<float>(1e-05);
auto tmp5 = tmp3 + tmp4;
auto tmp6 = std::sqrt(tmp5);
auto tmp7 = 1 / tmp6;
auto tmp8 = static_cast<float>(1);
auto tmp9 = tmp7 * tmp8;
auto tmp10 = tmp2 * tmp9;
auto tmp12 = tmp10 * tmp11;
auto tmp14 = tmp12 + tmp13;
auto tmp16 = tmp14 + tmp15;
// ReLU as a select: -inf maps to 0 (a multiply by 0 would give NaN); NaN still propagates.
auto tmp17 = (tmp16 < 0) ? static_cast<float>(0) : tmp16;
tmp18 += tmp17;
}
out_ptr0[i1 + (2048*i0)] = tmp18;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<2048*ks0; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp1 = static_cast<float>(1 + (((((-1) + ks1) / 32))*((((-1) + ks1) / 32))) + (2*((((-1) + ks1) / 32))));
auto tmp2 = tmp0 / tmp1;
in_out_ptr1[i0] = tmp2;
}
}
}
}
''')
# Wait on the kernels submitted through async_compile.cpp(...) above.
# NOTE(review): passing globals() presumably lets AsyncCompile replace the
# kernel_cpp_* module-level placeholders with the loaded callables -- confirm
# against torch._inductor.codecache.AsyncCompile.wait.
async_compile.wait(globals())
# Drop the module-level reference to the compile helper once the wait has returned.
del async_compile
def call(args):
arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1, arg23_1, arg24_1, arg25_1, arg26_1, arg27_1, arg28_1, arg29_1, arg30_1, arg31_1, arg32_1, arg33_1, arg34_1, arg35_1, arg36_1, arg37_1, arg38_1, arg39_1, arg40_1, arg41_1, arg42_1, arg43_1, arg44_1, arg45_1, arg46_1, arg47_1, arg48_1, arg49_1, arg50_1, arg51_1, arg52_1, arg53_1, arg54_1, arg55_1, arg56_1, arg57_1, arg58_1, arg59_1, arg60_1, arg61_1, arg62_1, arg63_1, arg64_1, arg65_1, arg66_1, arg67_1, arg68_1, arg69_1, arg70_1, arg71_1, arg72_1, arg73_1, arg74_1, arg75_1, arg76_1, arg77_1, arg78_1, arg79_1, arg80_1, arg81_1, arg82_1, arg83_1, arg84_1, arg85_1, arg86_1, arg87_1, arg88_1, arg89_1, arg90_1, arg91_1, arg92_1, arg93_1, arg94_1, arg95_1, arg96_1, arg97_1, arg98_1, arg99_1, arg100_1, arg101_1, arg102_1, arg103_1, arg104_1, arg105_1, arg106_1, arg107_1, arg108_1, arg109_1, arg110_1, arg111_1, arg112_1, arg113_1, arg114_1, arg115_1, arg116_1, arg117_1, arg118_1, arg119_1, arg120_1, arg121_1, arg122_1, arg123_1, arg124_1, arg125_1, arg126_1, arg127_1, arg128_1, arg129_1, arg130_1, arg131_1, arg132_1, arg133_1, arg134_1, arg135_1, arg136_1, arg137_1, arg138_1, arg139_1, arg140_1, arg141_1, arg142_1, arg143_1, arg144_1, arg145_1, arg146_1, arg147_1, arg148_1, arg149_1, arg150_1, arg151_1, arg152_1, arg153_1, arg154_1, arg155_1, arg156_1, arg157_1, arg158_1, arg159_1, arg160_1, arg161_1, arg162_1, arg163_1, arg164_1, arg165_1, arg166_1, arg167_1, arg168_1, arg169_1, arg170_1, arg171_1, arg172_1, arg173_1, arg174_1, arg175_1, arg176_1, arg177_1, arg178_1, arg179_1, arg180_1, arg181_1, arg182_1, arg183_1, arg184_1, arg185_1, arg186_1, arg187_1, arg188_1, arg189_1, arg190_1, arg191_1, arg192_1, arg193_1, arg194_1, arg195_1, arg196_1, arg197_1, arg198_1, arg199_1, arg200_1, arg201_1, arg202_1, arg203_1, arg204_1, arg205_1, arg206_1, arg207_1, arg208_1, arg209_1, arg210_1, 
arg211_1, arg212_1, arg213_1, arg214_1, arg215_1, arg216_1, arg217_1, arg218_1, arg219_1, arg220_1, arg221_1, arg222_1, arg223_1, arg224_1, arg225_1, arg226_1, arg227_1, arg228_1, arg229_1, arg230_1, arg231_1, arg232_1, arg233_1, arg234_1, arg235_1, arg236_1, arg237_1, arg238_1, arg239_1, arg240_1, arg241_1, arg242_1, arg243_1, arg244_1, arg245_1, arg246_1, arg247_1, arg248_1, arg249_1, arg250_1, arg251_1, arg252_1, arg253_1, arg254_1, arg255_1, arg256_1, arg257_1, arg258_1, arg259_1, arg260_1, arg261_1, arg262_1, arg263_1, arg264_1, arg265_1, arg266_1, arg267_1 = args
args.clear()
arg267_1_size = arg267_1.size()
s0 = arg267_1_size[0]
s2 = arg267_1_size[2]
buf0 = empty_strided((64, 3, 7, 7), (147, 1, 21, 3), device='cpu', dtype=torch.float32)
kernel_cpp_0(c_void_p(arg0_1.data_ptr()), c_void_p(buf0.data_ptr()))
del arg0_1
buf1 = aten.convolution(arg267_1, buf0, None, (2, 2), (3, 3), (1, 1), False, (0, 0), 1)
assert_size_stride(buf1, (s0, 64, 1 + ((((-1) + s2) // 2)), 1 + ((((-1) + s2) // 2))), (64 + (64*(((((-1) + s2) // 2))*((((-1) + s2) // 2)))) + (128*((((-1) + s2) // 2))), 1, 64 + (64*((((-1) + s2) // 2))), 64))
del arg267_1
del buf0
buf3 = as_strided(buf1, (s0, 64, 1 + ((((-1) + s2) // 2)), 1 + ((((-1) + s2) // 2))), (64 + (64*(((((-1) + s2) // 2))*((((-1) + s2) // 2)))) + (128*((((-1) + s2) // 2))), 1, 64 + (64*((((-1) + s2) // 2))), 64)); del buf1 # reuse
buf4 = empty_strided((s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64), device='cpu', dtype=torch.float32)
kernel_cpp_1(c_void_p(buf3.data_ptr()), c_void_p(arg161_1.data_ptr()), c_void_p(arg162_1.data_ptr()), c_void_p(arg1_1.data_ptr()), c_void_p(arg2_1.data_ptr()), c_void_p(buf4.data_ptr()), c_long(s0), c_long(s2))
del arg161_1
del arg162_1
del arg1_1
del arg2_1
del buf3
buf6 = aten.convolution(buf4, arg3_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf6, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64))
del arg3_1
buf8 = as_strided(buf6, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64)); del buf6 # reuse
buf9 = empty_strided((64, 64, 3, 3), (576, 1, 192, 64), device='cpu', dtype=torch.float32)
kernel_cpp_2(c_void_p(buf8.data_ptr()), c_void_p(arg163_1.data_ptr()), c_void_p(arg164_1.data_ptr()), c_void_p(arg4_1.data_ptr()), c_void_p(arg5_1.data_ptr()), c_void_p(arg6_1.data_ptr()), c_void_p(buf9.data_ptr()), c_long(s0), c_long(s2))
del arg163_1
del arg164_1
del arg4_1
del arg5_1
del arg6_1
buf10 = aten.convolution(buf8, buf9, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf10, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64))
del buf8
buf12 = as_strided(buf10, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64)); del buf10 # reuse
kernel_cpp_3(c_void_p(buf12.data_ptr()), c_void_p(arg165_1.data_ptr()), c_void_p(arg166_1.data_ptr()), c_void_p(arg7_1.data_ptr()), c_void_p(arg8_1.data_ptr()), c_long(s0), c_long(s2))
del arg165_1
del arg166_1
del arg7_1
del arg8_1
buf13 = aten.convolution(buf12, arg9_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf13, (s0, 256, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (256 + (256*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (512*((((-1) + s2) // 4))), 1, 256 + (256*((((-1) + s2) // 4))), 256))
del arg9_1
del buf12
buf15 = aten.convolution(buf4, arg12_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf15, (s0, 256, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (256 + (256*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (512*((((-1) + s2) // 4))), 1, 256 + (256*((((-1) + s2) // 4))), 256))
del arg12_1
del buf4
buf17 = as_strided(buf13, (s0, 256, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (256 + (256*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (512*((((-1) + s2) // 4))), 1, 256 + (256*((((-1) + s2) // 4))), 256)); del buf13 # reuse
buf18 = buf17; del buf17 # reuse
kernel_cpp_4(c_void_p(buf18.data_ptr()), c_void_p(arg167_1.data_ptr()), c_void_p(arg168_1.data_ptr()), c_void_p(arg10_1.data_ptr()), c_void_p(arg11_1.data_ptr()), c_void_p(buf15.data_ptr()), c_void_p(arg169_1.data_ptr()), c_void_p(arg170_1.data_ptr()), c_void_p(arg13_1.data_ptr()), c_void_p(arg14_1.data_ptr()), c_long(s0), c_long(s2))
del arg10_1
del arg11_1
del arg13_1
del arg14_1
del arg167_1
del arg168_1
del arg169_1
del arg170_1
del buf15
buf19 = aten.convolution(buf18, arg15_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf19, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64))
del arg15_1
buf21 = as_strided(buf19, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64)); del buf19 # reuse
buf22 = buf9; del buf9 # reuse
kernel_cpp_5(c_void_p(buf21.data_ptr()), c_void_p(arg171_1.data_ptr()), c_void_p(arg172_1.data_ptr()), c_void_p(arg16_1.data_ptr()), c_void_p(arg17_1.data_ptr()), c_void_p(arg18_1.data_ptr()), c_void_p(buf22.data_ptr()), c_long(s0), c_long(s2))
del arg16_1
del arg171_1
del arg172_1
del arg17_1
del arg18_1
buf23 = aten.convolution(buf21, buf22, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf23, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64))
del buf21
buf25 = as_strided(buf23, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64)); del buf23 # reuse
kernel_cpp_6(c_void_p(buf25.data_ptr()), c_void_p(arg173_1.data_ptr()), c_void_p(arg174_1.data_ptr()), c_void_p(arg19_1.data_ptr()), c_void_p(arg20_1.data_ptr()), c_long(s0), c_long(s2))
del arg173_1
del arg174_1
del arg19_1
del arg20_1
buf26 = aten.convolution(buf25, arg21_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf26, (s0, 256, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (256 + (256*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (512*((((-1) + s2) // 4))), 1, 256 + (256*((((-1) + s2) // 4))), 256))
del arg21_1
del buf25
buf28 = buf18; del buf18 # reuse
kernel_cpp_7(c_void_p(buf28.data_ptr()), c_void_p(buf26.data_ptr()), c_void_p(arg175_1.data_ptr()), c_void_p(arg176_1.data_ptr()), c_void_p(arg22_1.data_ptr()), c_void_p(arg23_1.data_ptr()), c_long(s0), c_long(s2))
del arg175_1
del arg176_1
del arg22_1
del arg23_1
del buf26
buf29 = aten.convolution(buf28, arg24_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf29, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64))
del arg24_1
buf31 = as_strided(buf29, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64)); del buf29 # reuse
buf32 = buf22; del buf22 # reuse
kernel_cpp_8(c_void_p(buf31.data_ptr()), c_void_p(arg177_1.data_ptr()), c_void_p(arg178_1.data_ptr()), c_void_p(arg25_1.data_ptr()), c_void_p(arg26_1.data_ptr()), c_void_p(arg27_1.data_ptr()), c_void_p(buf32.data_ptr()), c_long(s0), c_long(s2))
del arg177_1
del arg178_1
del arg25_1
del arg26_1
del arg27_1
buf33 = aten.convolution(buf31, buf32, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf33, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64))
del buf31
del buf32
buf35 = as_strided(buf33, (s0, 64, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (64 + (64*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (128*((((-1) + s2) // 4))), 1, 64 + (64*((((-1) + s2) // 4))), 64)); del buf33 # reuse
kernel_cpp_9(c_void_p(buf35.data_ptr()), c_void_p(arg179_1.data_ptr()), c_void_p(arg180_1.data_ptr()), c_void_p(arg28_1.data_ptr()), c_void_p(arg29_1.data_ptr()), c_long(s0), c_long(s2))
del arg179_1
del arg180_1
del arg28_1
del arg29_1
buf36 = aten.convolution(buf35, arg30_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf36, (s0, 256, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (256 + (256*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (512*((((-1) + s2) // 4))), 1, 256 + (256*((((-1) + s2) // 4))), 256))
del arg30_1
del buf35
buf38 = buf28; del buf28 # reuse
kernel_cpp_10(c_void_p(buf38.data_ptr()), c_void_p(buf36.data_ptr()), c_void_p(arg181_1.data_ptr()), c_void_p(arg182_1.data_ptr()), c_void_p(arg31_1.data_ptr()), c_void_p(arg32_1.data_ptr()), c_long(s0), c_long(s2))
del arg181_1
del arg182_1
del arg31_1
del arg32_1
del buf36
buf39 = aten.convolution(buf38, arg33_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf39, (s0, 128, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (128 + (128*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (256*((((-1) + s2) // 4))), 1, 128 + (128*((((-1) + s2) // 4))), 128))
del arg33_1
buf41 = as_strided(buf39, (s0, 128, 1 + ((((-1) + s2) // 4)), 1 + ((((-1) + s2) // 4))), (128 + (128*(((((-1) + s2) // 4))*((((-1) + s2) // 4)))) + (256*((((-1) + s2) // 4))), 1, 128 + (128*((((-1) + s2) // 4))), 128)); del buf39 # reuse
buf42 = empty_strided((128, 128, 3, 3), (1152, 1, 384, 128), device='cpu', dtype=torch.float32)
kernel_cpp_11(c_void_p(buf41.data_ptr()), c_void_p(arg183_1.data_ptr()), c_void_p(arg184_1.data_ptr()), c_void_p(arg34_1.data_ptr()), c_void_p(arg35_1.data_ptr()), c_void_p(arg36_1.data_ptr()), c_void_p(buf42.data_ptr()), c_long(s0), c_long(s2))
del arg183_1
del arg184_1
del arg34_1
del arg35_1
del arg36_1
buf43 = aten.convolution(buf41, buf42, None, (2, 2), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf43, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128))
del buf41
buf45 = as_strided(buf43, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128)); del buf43 # reuse
kernel_cpp_12(c_void_p(buf45.data_ptr()), c_void_p(arg185_1.data_ptr()), c_void_p(arg186_1.data_ptr()), c_void_p(arg37_1.data_ptr()), c_void_p(arg38_1.data_ptr()), c_long(s0), c_long(s2))
del arg185_1
del arg186_1
del arg37_1
del arg38_1
buf46 = aten.convolution(buf45, arg39_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf46, (s0, 512, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (512 + (512*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (1024*((((-1) + s2) // 8))), 1, 512 + (512*((((-1) + s2) // 8))), 512))
del arg39_1
del buf45
buf48 = aten.convolution(buf38, arg42_1, None, (2, 2), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf48, (s0, 512, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (512 + (512*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (1024*((((-1) + s2) // 8))), 1, 512 + (512*((((-1) + s2) // 8))), 512))
del arg42_1
del buf38
buf50 = as_strided(buf46, (s0, 512, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (512 + (512*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (1024*((((-1) + s2) // 8))), 1, 512 + (512*((((-1) + s2) // 8))), 512)); del buf46 # reuse
buf51 = buf50; del buf50 # reuse
kernel_cpp_13(c_void_p(buf51.data_ptr()), c_void_p(arg187_1.data_ptr()), c_void_p(arg188_1.data_ptr()), c_void_p(arg40_1.data_ptr()), c_void_p(arg41_1.data_ptr()), c_void_p(buf48.data_ptr()), c_void_p(arg189_1.data_ptr()), c_void_p(arg190_1.data_ptr()), c_void_p(arg43_1.data_ptr()), c_void_p(arg44_1.data_ptr()), c_long(s0), c_long(s2))
del arg187_1
del arg188_1
del arg189_1
del arg190_1
del arg40_1
del arg41_1
del arg43_1
del arg44_1
del buf48
buf52 = aten.convolution(buf51, arg45_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf52, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128))
del arg45_1
buf54 = as_strided(buf52, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128)); del buf52 # reuse
buf55 = buf42; del buf42 # reuse
kernel_cpp_14(c_void_p(buf54.data_ptr()), c_void_p(arg191_1.data_ptr()), c_void_p(arg192_1.data_ptr()), c_void_p(arg46_1.data_ptr()), c_void_p(arg47_1.data_ptr()), c_void_p(arg48_1.data_ptr()), c_void_p(buf55.data_ptr()), c_long(s0), c_long(s2))
del arg191_1
del arg192_1
del arg46_1
del arg47_1
del arg48_1
buf56 = aten.convolution(buf54, buf55, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf56, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128))
del buf54
buf58 = as_strided(buf56, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128)); del buf56 # reuse
kernel_cpp_15(c_void_p(buf58.data_ptr()), c_void_p(arg193_1.data_ptr()), c_void_p(arg194_1.data_ptr()), c_void_p(arg49_1.data_ptr()), c_void_p(arg50_1.data_ptr()), c_long(s0), c_long(s2))
del arg193_1
del arg194_1
del arg49_1
del arg50_1
buf59 = aten.convolution(buf58, arg51_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf59, (s0, 512, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (512 + (512*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (1024*((((-1) + s2) // 8))), 1, 512 + (512*((((-1) + s2) // 8))), 512))
del arg51_1
del buf58
buf61 = buf51; del buf51 # reuse
kernel_cpp_16(c_void_p(buf61.data_ptr()), c_void_p(buf59.data_ptr()), c_void_p(arg195_1.data_ptr()), c_void_p(arg196_1.data_ptr()), c_void_p(arg52_1.data_ptr()), c_void_p(arg53_1.data_ptr()), c_long(s0), c_long(s2))
del arg195_1
del arg196_1
del arg52_1
del arg53_1
del buf59
buf62 = aten.convolution(buf61, arg54_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf62, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128))
del arg54_1
buf64 = as_strided(buf62, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128)); del buf62 # reuse
buf65 = buf55; del buf55 # reuse
kernel_cpp_17(c_void_p(buf64.data_ptr()), c_void_p(arg197_1.data_ptr()), c_void_p(arg198_1.data_ptr()), c_void_p(arg55_1.data_ptr()), c_void_p(arg56_1.data_ptr()), c_void_p(arg57_1.data_ptr()), c_void_p(buf65.data_ptr()), c_long(s0), c_long(s2))
del arg197_1
del arg198_1
del arg55_1
del arg56_1
del arg57_1
buf66 = aten.convolution(buf64, buf65, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf66, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128))
del buf64
buf68 = as_strided(buf66, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128)); del buf66 # reuse
kernel_cpp_18(c_void_p(buf68.data_ptr()), c_void_p(arg199_1.data_ptr()), c_void_p(arg200_1.data_ptr()), c_void_p(arg58_1.data_ptr()), c_void_p(arg59_1.data_ptr()), c_long(s0), c_long(s2))
del arg199_1
del arg200_1
del arg58_1
del arg59_1
buf69 = aten.convolution(buf68, arg60_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf69, (s0, 512, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (512 + (512*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (1024*((((-1) + s2) // 8))), 1, 512 + (512*((((-1) + s2) // 8))), 512))
del arg60_1
del buf68
buf71 = buf61; del buf61 # reuse
kernel_cpp_19(c_void_p(buf71.data_ptr()), c_void_p(buf69.data_ptr()), c_void_p(arg201_1.data_ptr()), c_void_p(arg202_1.data_ptr()), c_void_p(arg61_1.data_ptr()), c_void_p(arg62_1.data_ptr()), c_long(s0), c_long(s2))
del arg201_1
del arg202_1
del arg61_1
del arg62_1
del buf69
buf72 = aten.convolution(buf71, arg63_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf72, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128))
del arg63_1
buf74 = as_strided(buf72, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128)); del buf72 # reuse
buf75 = buf65; del buf65 # reuse
kernel_cpp_20(c_void_p(buf74.data_ptr()), c_void_p(arg203_1.data_ptr()), c_void_p(arg204_1.data_ptr()), c_void_p(arg64_1.data_ptr()), c_void_p(arg65_1.data_ptr()), c_void_p(arg66_1.data_ptr()), c_void_p(buf75.data_ptr()), c_long(s0), c_long(s2))
del arg203_1
del arg204_1
del arg64_1
del arg65_1
del arg66_1
buf76 = aten.convolution(buf74, buf75, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf76, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128))
del buf74
del buf75
buf78 = as_strided(buf76, (s0, 128, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (128 + (128*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (256*((((-1) + s2) // 8))), 1, 128 + (128*((((-1) + s2) // 8))), 128)); del buf76 # reuse
kernel_cpp_21(c_void_p(buf78.data_ptr()), c_void_p(arg205_1.data_ptr()), c_void_p(arg206_1.data_ptr()), c_void_p(arg67_1.data_ptr()), c_void_p(arg68_1.data_ptr()), c_long(s0), c_long(s2))
del arg205_1
del arg206_1
del arg67_1
del arg68_1
buf79 = aten.convolution(buf78, arg69_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf79, (s0, 512, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (512 + (512*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (1024*((((-1) + s2) // 8))), 1, 512 + (512*((((-1) + s2) // 8))), 512))
del arg69_1
del buf78
buf81 = buf71; del buf71 # reuse
kernel_cpp_22(c_void_p(buf81.data_ptr()), c_void_p(buf79.data_ptr()), c_void_p(arg207_1.data_ptr()), c_void_p(arg208_1.data_ptr()), c_void_p(arg70_1.data_ptr()), c_void_p(arg71_1.data_ptr()), c_long(s0), c_long(s2))
del arg207_1
del arg208_1
del arg70_1
del arg71_1
del buf79
buf82 = aten.convolution(buf81, arg72_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf82, (s0, 256, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (256 + (256*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (512*((((-1) + s2) // 8))), 1, 256 + (256*((((-1) + s2) // 8))), 256))
del arg72_1
buf84 = as_strided(buf82, (s0, 256, 1 + ((((-1) + s2) // 8)), 1 + ((((-1) + s2) // 8))), (256 + (256*(((((-1) + s2) // 8))*((((-1) + s2) // 8)))) + (512*((((-1) + s2) // 8))), 1, 256 + (256*((((-1) + s2) // 8))), 256)); del buf82 # reuse
buf85 = empty_strided((256, 256, 3, 3), (2304, 1, 768, 256), device='cpu', dtype=torch.float32)
kernel_cpp_23(c_void_p(buf84.data_ptr()), c_void_p(arg209_1.data_ptr()), c_void_p(arg210_1.data_ptr()), c_void_p(arg73_1.data_ptr()), c_void_p(arg74_1.data_ptr()), c_void_p(arg75_1.data_ptr()), c_void_p(buf85.data_ptr()), c_long(s0), c_long(s2))
del arg209_1
del arg210_1
del arg73_1
del arg74_1
del arg75_1
buf86 = aten.convolution(buf84, buf85, None, (2, 2), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf86, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256))
del buf84
buf88 = as_strided(buf86, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256)); del buf86 # reuse
kernel_cpp_24(c_void_p(buf88.data_ptr()), c_void_p(arg211_1.data_ptr()), c_void_p(arg212_1.data_ptr()), c_void_p(arg76_1.data_ptr()), c_void_p(arg77_1.data_ptr()), c_long(s0), c_long(s2))
del arg211_1
del arg212_1
del arg76_1
del arg77_1
buf89 = aten.convolution(buf88, arg78_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf89, (s0, 1024, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (1024 + (1024*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (2048*((((-1) + s2) // 16))), 1, 1024 + (1024*((((-1) + s2) // 16))), 1024))
del arg78_1
del buf88
buf91 = aten.convolution(buf81, arg81_1, None, (2, 2), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf91, (s0, 1024, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (1024 + (1024*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (2048*((((-1) + s2) // 16))), 1, 1024 + (1024*((((-1) + s2) // 16))), 1024))
del arg81_1
del buf81
buf93 = as_strided(buf89, (s0, 1024, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (1024 + (1024*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (2048*((((-1) + s2) // 16))), 1, 1024 + (1024*((((-1) + s2) // 16))), 1024)); del buf89 # reuse
buf94 = buf93; del buf93 # reuse
kernel_cpp_25(c_void_p(buf94.data_ptr()), c_void_p(arg213_1.data_ptr()), c_void_p(arg214_1.data_ptr()), c_void_p(arg79_1.data_ptr()), c_void_p(arg80_1.data_ptr()), c_void_p(buf91.data_ptr()), c_void_p(arg215_1.data_ptr()), c_void_p(arg216_1.data_ptr()), c_void_p(arg82_1.data_ptr()), c_void_p(arg83_1.data_ptr()), c_long(s0), c_long(s2))
del arg213_1
del arg214_1
del arg215_1
del arg216_1
del arg79_1
del arg80_1
del arg82_1
del arg83_1
del buf91
buf95 = aten.convolution(buf94, arg84_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf95, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256))
del arg84_1
buf97 = as_strided(buf95, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256)); del buf95 # reuse
buf98 = buf85; del buf85 # reuse
kernel_cpp_26(c_void_p(buf97.data_ptr()), c_void_p(arg217_1.data_ptr()), c_void_p(arg218_1.data_ptr()), c_void_p(arg85_1.data_ptr()), c_void_p(arg86_1.data_ptr()), c_void_p(arg87_1.data_ptr()), c_void_p(buf98.data_ptr()), c_long(s0), c_long(s2))
del arg217_1
del arg218_1
del arg85_1
del arg86_1
del arg87_1
buf99 = aten.convolution(buf97, buf98, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf99, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256))
del buf97
buf101 = as_strided(buf99, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256)); del buf99 # reuse
kernel_cpp_27(c_void_p(buf101.data_ptr()), c_void_p(arg219_1.data_ptr()), c_void_p(arg220_1.data_ptr()), c_void_p(arg88_1.data_ptr()), c_void_p(arg89_1.data_ptr()), c_long(s0), c_long(s2))
del arg219_1
del arg220_1
del arg88_1
del arg89_1
buf102 = aten.convolution(buf101, arg90_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf102, (s0, 1024, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (1024 + (1024*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (2048*((((-1) + s2) // 16))), 1, 1024 + (1024*((((-1) + s2) // 16))), 1024))
del arg90_1
del buf101
buf104 = as_strided(buf102, (s0, 1024, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (1024 + (1024*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (2048*((((-1) + s2) // 16))), 1, 1024 + (1024*((((-1) + s2) // 16))), 1024)); del buf102 # reuse
kernel_cpp_28(c_void_p(buf104.data_ptr()), c_void_p(arg221_1.data_ptr()), c_void_p(arg222_1.data_ptr()), c_void_p(arg91_1.data_ptr()), c_void_p(arg92_1.data_ptr()), c_void_p(buf94.data_ptr()), c_long(s0), c_long(s2))
del arg221_1
del arg222_1
del arg91_1
del arg92_1
del buf94
buf105 = aten.convolution(buf104, arg93_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf105, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256))
del arg93_1
buf107 = as_strided(buf105, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256)); del buf105 # reuse
buf108 = buf98; del buf98 # reuse
kernel_cpp_29(c_void_p(buf107.data_ptr()), c_void_p(arg223_1.data_ptr()), c_void_p(arg224_1.data_ptr()), c_void_p(arg94_1.data_ptr()), c_void_p(arg95_1.data_ptr()), c_void_p(arg96_1.data_ptr()), c_void_p(buf108.data_ptr()), c_long(s0), c_long(s2))
del arg223_1
del arg224_1
del arg94_1
del arg95_1
del arg96_1
buf109 = aten.convolution(buf107, buf108, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf109, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256))
del buf107
buf111 = as_strided(buf109, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256)); del buf109 # reuse
kernel_cpp_30(c_void_p(buf111.data_ptr()), c_void_p(arg225_1.data_ptr()), c_void_p(arg226_1.data_ptr()), c_void_p(arg97_1.data_ptr()), c_void_p(arg98_1.data_ptr()), c_long(s0), c_long(s2))
del arg225_1
del arg226_1
del arg97_1
del arg98_1
buf112 = aten.convolution(buf111, arg99_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf112, (s0, 1024, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (1024 + (1024*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (2048*((((-1) + s2) // 16))), 1, 1024 + (1024*((((-1) + s2) // 16))), 1024))
del arg99_1
del buf111
buf114 = buf104; del buf104 # reuse
kernel_cpp_31(c_void_p(buf114.data_ptr()), c_void_p(buf112.data_ptr()), c_void_p(arg227_1.data_ptr()), c_void_p(arg228_1.data_ptr()), c_void_p(arg100_1.data_ptr()), c_void_p(arg101_1.data_ptr()), c_long(s0), c_long(s2))
del arg100_1
del arg101_1
del arg227_1
del arg228_1
del buf112
buf115 = aten.convolution(buf114, arg102_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf115, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256))
del arg102_1
buf117 = as_strided(buf115, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256)); del buf115 # reuse
buf118 = buf108; del buf108 # reuse
kernel_cpp_32(c_void_p(buf117.data_ptr()), c_void_p(arg229_1.data_ptr()), c_void_p(arg230_1.data_ptr()), c_void_p(arg103_1.data_ptr()), c_void_p(arg104_1.data_ptr()), c_void_p(arg105_1.data_ptr()), c_void_p(buf118.data_ptr()), c_long(s0), c_long(s2))
del arg103_1
del arg104_1
del arg105_1
del arg229_1
del arg230_1
buf119 = aten.convolution(buf117, buf118, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf119, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256))
del buf117
buf121 = as_strided(buf119, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256)); del buf119 # reuse
kernel_cpp_33(c_void_p(buf121.data_ptr()), c_void_p(arg231_1.data_ptr()), c_void_p(arg232_1.data_ptr()), c_void_p(arg106_1.data_ptr()), c_void_p(arg107_1.data_ptr()), c_long(s0), c_long(s2))
del arg106_1
del arg107_1
del arg231_1
del arg232_1
buf122 = aten.convolution(buf121, arg108_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf122, (s0, 1024, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (1024 + (1024*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (2048*((((-1) + s2) // 16))), 1, 1024 + (1024*((((-1) + s2) // 16))), 1024))
del arg108_1
del buf121
buf124 = buf114; del buf114 # reuse
kernel_cpp_34(c_void_p(buf124.data_ptr()), c_void_p(buf122.data_ptr()), c_void_p(arg233_1.data_ptr()), c_void_p(arg234_1.data_ptr()), c_void_p(arg109_1.data_ptr()), c_void_p(arg110_1.data_ptr()), c_long(s0), c_long(s2))
del arg109_1
del arg110_1
del arg233_1
del arg234_1
del buf122
buf125 = aten.convolution(buf124, arg111_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf125, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256))
del arg111_1
buf127 = as_strided(buf125, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256)); del buf125 # reuse
buf128 = buf118; del buf118 # reuse
kernel_cpp_35(c_void_p(buf127.data_ptr()), c_void_p(arg235_1.data_ptr()), c_void_p(arg236_1.data_ptr()), c_void_p(arg112_1.data_ptr()), c_void_p(arg113_1.data_ptr()), c_void_p(arg114_1.data_ptr()), c_void_p(buf128.data_ptr()), c_long(s0), c_long(s2))
del arg112_1
del arg113_1
del arg114_1
del arg235_1
del arg236_1
buf129 = aten.convolution(buf127, buf128, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf129, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256))
del buf127
buf131 = as_strided(buf129, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256)); del buf129 # reuse
kernel_cpp_36(c_void_p(buf131.data_ptr()), c_void_p(arg237_1.data_ptr()), c_void_p(arg238_1.data_ptr()), c_void_p(arg115_1.data_ptr()), c_void_p(arg116_1.data_ptr()), c_long(s0), c_long(s2))
del arg115_1
del arg116_1
del arg237_1
del arg238_1
buf132 = aten.convolution(buf131, arg117_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf132, (s0, 1024, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (1024 + (1024*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (2048*((((-1) + s2) // 16))), 1, 1024 + (1024*((((-1) + s2) // 16))), 1024))
del arg117_1
del buf131
buf134 = buf124; del buf124 # reuse
kernel_cpp_37(c_void_p(buf134.data_ptr()), c_void_p(buf132.data_ptr()), c_void_p(arg239_1.data_ptr()), c_void_p(arg240_1.data_ptr()), c_void_p(arg118_1.data_ptr()), c_void_p(arg119_1.data_ptr()), c_long(s0), c_long(s2))
del arg118_1
del arg119_1
del arg239_1
del arg240_1
del buf132
buf135 = aten.convolution(buf134, arg120_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf135, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256))
del arg120_1
buf137 = as_strided(buf135, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256)); del buf135 # reuse
buf138 = buf128; del buf128 # reuse
kernel_cpp_38(c_void_p(buf137.data_ptr()), c_void_p(arg241_1.data_ptr()), c_void_p(arg242_1.data_ptr()), c_void_p(arg121_1.data_ptr()), c_void_p(arg122_1.data_ptr()), c_void_p(arg123_1.data_ptr()), c_void_p(buf138.data_ptr()), c_long(s0), c_long(s2))
del arg121_1
del arg122_1
del arg123_1
del arg241_1
del arg242_1
buf139 = aten.convolution(buf137, buf138, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf139, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256))
del buf137
del buf138
buf141 = as_strided(buf139, (s0, 256, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (256 + (256*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (512*((((-1) + s2) // 16))), 1, 256 + (256*((((-1) + s2) // 16))), 256)); del buf139 # reuse
kernel_cpp_39(c_void_p(buf141.data_ptr()), c_void_p(arg243_1.data_ptr()), c_void_p(arg244_1.data_ptr()), c_void_p(arg124_1.data_ptr()), c_void_p(arg125_1.data_ptr()), c_long(s0), c_long(s2))
del arg124_1
del arg125_1
del arg243_1
del arg244_1
buf142 = aten.convolution(buf141, arg126_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf142, (s0, 1024, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (1024 + (1024*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (2048*((((-1) + s2) // 16))), 1, 1024 + (1024*((((-1) + s2) // 16))), 1024))
del arg126_1
del buf141
buf144 = buf134; del buf134 # reuse
kernel_cpp_40(c_void_p(buf144.data_ptr()), c_void_p(buf142.data_ptr()), c_void_p(arg245_1.data_ptr()), c_void_p(arg246_1.data_ptr()), c_void_p(arg127_1.data_ptr()), c_void_p(arg128_1.data_ptr()), c_long(s0), c_long(s2))
del arg127_1
del arg128_1
del arg245_1
del arg246_1
del buf142
buf145 = aten.convolution(buf144, arg129_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf145, (s0, 512, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (512 + (512*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (1024*((((-1) + s2) // 16))), 1, 512 + (512*((((-1) + s2) // 16))), 512))
del arg129_1
buf147 = as_strided(buf145, (s0, 512, 1 + ((((-1) + s2) // 16)), 1 + ((((-1) + s2) // 16))), (512 + (512*(((((-1) + s2) // 16))*((((-1) + s2) // 16)))) + (1024*((((-1) + s2) // 16))), 1, 512 + (512*((((-1) + s2) // 16))), 512)); del buf145 # reuse
buf148 = empty_strided((512, 512, 3, 3), (4608, 1, 1536, 512), device='cpu', dtype=torch.float32)
kernel_cpp_41(c_void_p(buf147.data_ptr()), c_void_p(arg247_1.data_ptr()), c_void_p(arg248_1.data_ptr()), c_void_p(arg130_1.data_ptr()), c_void_p(arg131_1.data_ptr()), c_void_p(arg132_1.data_ptr()), c_void_p(buf148.data_ptr()), c_long(s0), c_long(s2))
del arg130_1
del arg131_1
del arg132_1
del arg247_1
del arg248_1
buf149 = aten.convolution(buf147, buf148, None, (2, 2), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf149, (s0, 512, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (512 + (512*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (1024*((((-1) + s2) // 32))), 1, 512 + (512*((((-1) + s2) // 32))), 512))
del buf147
buf151 = as_strided(buf149, (s0, 512, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (512 + (512*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (1024*((((-1) + s2) // 32))), 1, 512 + (512*((((-1) + s2) // 32))), 512)); del buf149 # reuse
kernel_cpp_42(c_void_p(buf151.data_ptr()), c_void_p(arg249_1.data_ptr()), c_void_p(arg250_1.data_ptr()), c_void_p(arg133_1.data_ptr()), c_void_p(arg134_1.data_ptr()), c_long(s0), c_long(s2))
del arg133_1
del arg134_1
del arg249_1
del arg250_1
buf152 = aten.convolution(buf151, arg135_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf152, (s0, 2048, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (2048 + (2048*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (4096*((((-1) + s2) // 32))), 1, 2048 + (2048*((((-1) + s2) // 32))), 2048))
del arg135_1
del buf151
buf154 = aten.convolution(buf144, arg138_1, None, (2, 2), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf154, (s0, 2048, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (2048 + (2048*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (4096*((((-1) + s2) // 32))), 1, 2048 + (2048*((((-1) + s2) // 32))), 2048))
del arg138_1
del buf144
buf156 = as_strided(buf152, (s0, 2048, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (2048 + (2048*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (4096*((((-1) + s2) // 32))), 1, 2048 + (2048*((((-1) + s2) // 32))), 2048)); del buf152 # reuse
buf157 = buf156; del buf156 # reuse
kernel_cpp_43(c_void_p(buf157.data_ptr()), c_void_p(arg251_1.data_ptr()), c_void_p(arg252_1.data_ptr()), c_void_p(arg136_1.data_ptr()), c_void_p(arg137_1.data_ptr()), c_void_p(buf154.data_ptr()), c_void_p(arg253_1.data_ptr()), c_void_p(arg254_1.data_ptr()), c_void_p(arg139_1.data_ptr()), c_void_p(arg140_1.data_ptr()), c_long(s0), c_long(s2))
del arg136_1
del arg137_1
del arg139_1
del arg140_1
del arg251_1
del arg252_1
del arg253_1
del arg254_1
del buf154
buf158 = aten.convolution(buf157, arg141_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf158, (s0, 512, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (512 + (512*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (1024*((((-1) + s2) // 32))), 1, 512 + (512*((((-1) + s2) // 32))), 512))
del arg141_1
buf160 = as_strided(buf158, (s0, 512, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (512 + (512*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (1024*((((-1) + s2) // 32))), 1, 512 + (512*((((-1) + s2) // 32))), 512)); del buf158 # reuse
buf161 = buf148; del buf148 # reuse
kernel_cpp_44(c_void_p(buf160.data_ptr()), c_void_p(arg255_1.data_ptr()), c_void_p(arg256_1.data_ptr()), c_void_p(arg142_1.data_ptr()), c_void_p(arg143_1.data_ptr()), c_void_p(arg144_1.data_ptr()), c_void_p(buf161.data_ptr()), c_long(s0), c_long(s2))
del arg142_1
del arg143_1
del arg144_1
del arg255_1
del arg256_1
buf162 = aten.convolution(buf160, buf161, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf162, (s0, 512, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (512 + (512*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (1024*((((-1) + s2) // 32))), 1, 512 + (512*((((-1) + s2) // 32))), 512))
del buf160
buf164 = as_strided(buf162, (s0, 512, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (512 + (512*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (1024*((((-1) + s2) // 32))), 1, 512 + (512*((((-1) + s2) // 32))), 512)); del buf162 # reuse
kernel_cpp_45(c_void_p(buf164.data_ptr()), c_void_p(arg257_1.data_ptr()), c_void_p(arg258_1.data_ptr()), c_void_p(arg145_1.data_ptr()), c_void_p(arg146_1.data_ptr()), c_long(s0), c_long(s2))
del arg145_1
del arg146_1
del arg257_1
del arg258_1
buf165 = aten.convolution(buf164, arg147_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf165, (s0, 2048, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (2048 + (2048*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (4096*((((-1) + s2) // 32))), 1, 2048 + (2048*((((-1) + s2) // 32))), 2048))
del arg147_1
del buf164
buf167 = buf157; del buf157 # reuse
kernel_cpp_46(c_void_p(buf167.data_ptr()), c_void_p(buf165.data_ptr()), c_void_p(arg259_1.data_ptr()), c_void_p(arg260_1.data_ptr()), c_void_p(arg148_1.data_ptr()), c_void_p(arg149_1.data_ptr()), c_long(s0), c_long(s2))
del arg148_1
del arg149_1
del arg259_1
del arg260_1
del buf165
buf168 = aten.convolution(buf167, arg150_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf168, (s0, 512, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (512 + (512*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (1024*((((-1) + s2) // 32))), 1, 512 + (512*((((-1) + s2) // 32))), 512))
del arg150_1
buf170 = as_strided(buf168, (s0, 512, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (512 + (512*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (1024*((((-1) + s2) // 32))), 1, 512 + (512*((((-1) + s2) // 32))), 512)); del buf168 # reuse
buf171 = buf161; del buf161 # reuse
kernel_cpp_47(c_void_p(buf170.data_ptr()), c_void_p(arg261_1.data_ptr()), c_void_p(arg262_1.data_ptr()), c_void_p(arg151_1.data_ptr()), c_void_p(arg152_1.data_ptr()), c_void_p(arg153_1.data_ptr()), c_void_p(buf171.data_ptr()), c_long(s0), c_long(s2))
del arg151_1
del arg152_1
del arg153_1
del arg261_1
del arg262_1
buf172 = aten.convolution(buf170, buf171, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 1)
assert_size_stride(buf172, (s0, 512, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (512 + (512*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (1024*((((-1) + s2) // 32))), 1, 512 + (512*((((-1) + s2) // 32))), 512))
del buf170
del buf171
buf174 = as_strided(buf172, (s0, 512, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (512 + (512*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (1024*((((-1) + s2) // 32))), 1, 512 + (512*((((-1) + s2) // 32))), 512)); del buf172 # reuse
kernel_cpp_48(c_void_p(buf174.data_ptr()), c_void_p(arg263_1.data_ptr()), c_void_p(arg264_1.data_ptr()), c_void_p(arg154_1.data_ptr()), c_void_p(arg155_1.data_ptr()), c_long(s0), c_long(s2))
del arg154_1
del arg155_1
del arg263_1
del arg264_1
buf175 = aten.convolution(buf174, arg156_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1)
assert_size_stride(buf175, (s0, 2048, 1 + ((((-1) + s2) // 32)), 1 + ((((-1) + s2) // 32))), (2048 + (2048*(((((-1) + s2) // 32))*((((-1) + s2) // 32)))) + (4096*((((-1) + s2) // 32))), 1, 2048 + (2048*((((-1) + s2) // 32))), 2048))
del arg156_1
del buf174
buf177 = buf167; del buf167 # reuse
buf178 = empty_strided((s0, 2048, 1, 1), (2048, 1, 2048*s0, 2048*s0), device='cpu', dtype=torch.float32)
buf179 = as_strided(buf178, (s0, 2048, 1, 1), (2048, 1, 1, 1)); del buf178 # reuse
kernel_cpp_49(c_void_p(buf177.data_ptr()), c_void_p(buf179.data_ptr()), c_void_p(buf175.data_ptr()), c_void_p(arg265_1.data_ptr()), c_void_p(arg266_1.data_ptr()), c_void_p(arg157_1.data_ptr()), c_void_p(arg158_1.data_ptr()), c_long(s0), c_long(s2))
del arg157_1
del arg158_1
del arg265_1
del arg266_1
del buf175
del buf177
# Final fully-connected layer (addmm of the pooled features with the FC weight/bias).
# The output buffer and the 2-D view of buf179 are sized by the runtime batch
# size s0 -- the same symbol buf179 was created with, (s0, 2048, 1, 1) -- rather
# than a hard-coded 116, so the view never reads past (or truncates) buf179 and
# the returned tensor has the right shape for any batch size.
buf180 = empty_strided((s0, 1000), (1000, 1), device='cpu', dtype=torch.float32)
extern_kernels.addmm(arg160_1, as_strided(buf179, (s0, 2048), (2048, 1)), as_strided(arg159_1, (2048, 1000), (1, 2048)), alpha=1, beta=1, out=buf180)
del arg159_1
del arg160_1
return (buf180, )
# Stand-alone benchmark entry point: builds one random tensor for every
# argument that call() consumes (arg0_1 .. arg267_1) and hands a closure over
# call() to print_performance.
# NOTE(review): this paste has lost its leading indentation; the statements
# below are presumably the body of this `if` -- confirm against the original.
if __name__ == "__main__":
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
# arg0_1 .. arg158_1 repeat a fixed pattern: one 4-D tensor followed by two
# 1-D tensors whose length equals the 4-D tensor's first dimension.
# Presumably these are convolution weights and per-channel affine
# (batch-norm weight/bias) parameters -- TODO confirm; the visible tail of
# call() does pass arg156_1 as the weight operand of aten.convolution.
# arg159_1 (1000, 2048) and arg160_1 (1000,) are the operands of the final
# extern_kernels.addmm in call().
arg0_1 = rand_strided((64, 3, 7, 7), (147, 49, 7, 1), device='cpu', dtype=torch.float32)
arg1_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg2_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg3_1 = rand_strided((64, 64, 1, 1), (64, 1, 1, 1), device='cpu', dtype=torch.float32)
arg4_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg5_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg6_1 = rand_strided((64, 64, 3, 3), (576, 9, 3, 1), device='cpu', dtype=torch.float32)
arg7_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg8_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg9_1 = rand_strided((256, 64, 1, 1), (64, 1, 1, 1), device='cpu', dtype=torch.float32)
arg10_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg11_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg12_1 = rand_strided((256, 64, 1, 1), (64, 1, 1, 1), device='cpu', dtype=torch.float32)
arg13_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg14_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg15_1 = rand_strided((64, 256, 1, 1), (256, 1, 1, 1), device='cpu', dtype=torch.float32)
arg16_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg17_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg18_1 = rand_strided((64, 64, 3, 3), (576, 9, 3, 1), device='cpu', dtype=torch.float32)
arg19_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg20_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg21_1 = rand_strided((256, 64, 1, 1), (64, 1, 1, 1), device='cpu', dtype=torch.float32)
arg22_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg23_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg24_1 = rand_strided((64, 256, 1, 1), (256, 1, 1, 1), device='cpu', dtype=torch.float32)
arg25_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg26_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg27_1 = rand_strided((64, 64, 3, 3), (576, 9, 3, 1), device='cpu', dtype=torch.float32)
arg28_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg29_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg30_1 = rand_strided((256, 64, 1, 1), (64, 1, 1, 1), device='cpu', dtype=torch.float32)
arg31_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg32_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg33_1 = rand_strided((128, 256, 1, 1), (256, 1, 1, 1), device='cpu', dtype=torch.float32)
arg34_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg35_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg36_1 = rand_strided((128, 128, 3, 3), (1152, 9, 3, 1), device='cpu', dtype=torch.float32)
arg37_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg38_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg39_1 = rand_strided((512, 128, 1, 1), (128, 1, 1, 1), device='cpu', dtype=torch.float32)
arg40_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg41_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg42_1 = rand_strided((512, 256, 1, 1), (256, 1, 1, 1), device='cpu', dtype=torch.float32)
arg43_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg44_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg45_1 = rand_strided((128, 512, 1, 1), (512, 1, 1, 1), device='cpu', dtype=torch.float32)
arg46_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg47_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg48_1 = rand_strided((128, 128, 3, 3), (1152, 9, 3, 1), device='cpu', dtype=torch.float32)
arg49_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg50_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg51_1 = rand_strided((512, 128, 1, 1), (128, 1, 1, 1), device='cpu', dtype=torch.float32)
arg52_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg53_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg54_1 = rand_strided((128, 512, 1, 1), (512, 1, 1, 1), device='cpu', dtype=torch.float32)
arg55_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg56_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg57_1 = rand_strided((128, 128, 3, 3), (1152, 9, 3, 1), device='cpu', dtype=torch.float32)
arg58_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg59_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg60_1 = rand_strided((512, 128, 1, 1), (128, 1, 1, 1), device='cpu', dtype=torch.float32)
arg61_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg62_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg63_1 = rand_strided((128, 512, 1, 1), (512, 1, 1, 1), device='cpu', dtype=torch.float32)
arg64_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg65_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg66_1 = rand_strided((128, 128, 3, 3), (1152, 9, 3, 1), device='cpu', dtype=torch.float32)
arg67_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg68_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg69_1 = rand_strided((512, 128, 1, 1), (128, 1, 1, 1), device='cpu', dtype=torch.float32)
arg70_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg71_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg72_1 = rand_strided((256, 512, 1, 1), (512, 1, 1, 1), device='cpu', dtype=torch.float32)
arg73_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg74_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg75_1 = rand_strided((256, 256, 3, 3), (2304, 9, 3, 1), device='cpu', dtype=torch.float32)
arg76_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg77_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg78_1 = rand_strided((1024, 256, 1, 1), (256, 1, 1, 1), device='cpu', dtype=torch.float32)
arg79_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg80_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg81_1 = rand_strided((1024, 512, 1, 1), (512, 1, 1, 1), device='cpu', dtype=torch.float32)
arg82_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg83_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg84_1 = rand_strided((256, 1024, 1, 1), (1024, 1, 1, 1), device='cpu', dtype=torch.float32)
arg85_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg86_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg87_1 = rand_strided((256, 256, 3, 3), (2304, 9, 3, 1), device='cpu', dtype=torch.float32)
arg88_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg89_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg90_1 = rand_strided((1024, 256, 1, 1), (256, 1, 1, 1), device='cpu', dtype=torch.float32)
arg91_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg92_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg93_1 = rand_strided((256, 1024, 1, 1), (1024, 1, 1, 1), device='cpu', dtype=torch.float32)
arg94_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg95_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg96_1 = rand_strided((256, 256, 3, 3), (2304, 9, 3, 1), device='cpu', dtype=torch.float32)
arg97_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg98_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg99_1 = rand_strided((1024, 256, 1, 1), (256, 1, 1, 1), device='cpu', dtype=torch.float32)
arg100_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg101_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg102_1 = rand_strided((256, 1024, 1, 1), (1024, 1, 1, 1), device='cpu', dtype=torch.float32)
arg103_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg104_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg105_1 = rand_strided((256, 256, 3, 3), (2304, 9, 3, 1), device='cpu', dtype=torch.float32)
arg106_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg107_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg108_1 = rand_strided((1024, 256, 1, 1), (256, 1, 1, 1), device='cpu', dtype=torch.float32)
arg109_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg110_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg111_1 = rand_strided((256, 1024, 1, 1), (1024, 1, 1, 1), device='cpu', dtype=torch.float32)
arg112_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg113_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg114_1 = rand_strided((256, 256, 3, 3), (2304, 9, 3, 1), device='cpu', dtype=torch.float32)
arg115_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg116_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg117_1 = rand_strided((1024, 256, 1, 1), (256, 1, 1, 1), device='cpu', dtype=torch.float32)
arg118_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg119_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg120_1 = rand_strided((256, 1024, 1, 1), (1024, 1, 1, 1), device='cpu', dtype=torch.float32)
arg121_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg122_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg123_1 = rand_strided((256, 256, 3, 3), (2304, 9, 3, 1), device='cpu', dtype=torch.float32)
arg124_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg125_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg126_1 = rand_strided((1024, 256, 1, 1), (256, 1, 1, 1), device='cpu', dtype=torch.float32)
arg127_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg128_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg129_1 = rand_strided((512, 1024, 1, 1), (1024, 1, 1, 1), device='cpu', dtype=torch.float32)
arg130_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg131_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg132_1 = rand_strided((512, 512, 3, 3), (4608, 9, 3, 1), device='cpu', dtype=torch.float32)
arg133_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg134_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg135_1 = rand_strided((2048, 512, 1, 1), (512, 1, 1, 1), device='cpu', dtype=torch.float32)
arg136_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg137_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg138_1 = rand_strided((2048, 1024, 1, 1), (1024, 1, 1, 1), device='cpu', dtype=torch.float32)
arg139_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg140_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg141_1 = rand_strided((512, 2048, 1, 1), (2048, 1, 1, 1), device='cpu', dtype=torch.float32)
arg142_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg143_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg144_1 = rand_strided((512, 512, 3, 3), (4608, 9, 3, 1), device='cpu', dtype=torch.float32)
arg145_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg146_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg147_1 = rand_strided((2048, 512, 1, 1), (512, 1, 1, 1), device='cpu', dtype=torch.float32)
arg148_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg149_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg150_1 = rand_strided((512, 2048, 1, 1), (2048, 1, 1, 1), device='cpu', dtype=torch.float32)
arg151_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg152_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg153_1 = rand_strided((512, 512, 3, 3), (4608, 9, 3, 1), device='cpu', dtype=torch.float32)
arg154_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg155_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg156_1 = rand_strided((2048, 512, 1, 1), (512, 1, 1, 1), device='cpu', dtype=torch.float32)
arg157_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg158_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg159_1 = rand_strided((1000, 2048), (2048, 1), device='cpu', dtype=torch.float32)
arg160_1 = rand_strided((1000, ), (1, ), device='cpu', dtype=torch.float32)
# arg161_1 .. arg266_1 are all 1-D and come in same-length pairs. The visible
# tail of call() feeds arg263_1/arg264_1 and arg265_1/arg266_1 to the fused
# C++ kernels alongside the 1-D parameters above (arg154_1/arg155_1 and
# arg157_1/arg158_1). Presumably these are batch-norm running mean/variance
# buffers -- TODO confirm against the traced model.
arg161_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg162_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg163_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg164_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg165_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg166_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg167_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg168_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg169_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg170_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg171_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg172_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg173_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg174_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg175_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg176_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg177_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg178_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg179_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg180_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg181_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg182_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg183_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg184_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg185_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg186_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg187_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg188_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg189_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg190_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg191_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg192_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg193_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg194_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg195_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg196_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg197_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg198_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg199_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg200_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg201_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg202_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg203_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg204_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg205_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg206_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg207_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg208_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg209_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg210_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg211_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg212_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg213_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg214_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg215_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg216_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg217_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg218_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg219_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg220_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg221_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg222_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg223_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg224_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg225_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg226_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg227_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg228_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg229_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg230_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg231_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg232_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg233_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg234_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg235_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg236_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg237_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg238_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg239_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg240_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg241_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg242_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg243_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg244_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg245_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg246_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg247_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg248_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg249_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg250_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg251_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg252_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg253_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg254_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg255_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg256_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg257_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg258_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg259_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg260_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg261_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg262_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg263_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg264_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg265_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg266_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
# arg267_1 is the only 4-D tensor whose size-3 dimension has stride 1, i.e.
# dimension 1 is the fastest-varying one in memory. Presumably this is the
# image batch in channels-last layout -- TODO confirm.
# NOTE(review): the tail of call() hard-codes 116 in the shapes of buf180 and
# the as_strided view of buf179, matching the leading dimension used here,
# while other buffers are sized by s0. Changing the 116 below presumably
# requires regenerating this file -- verify before editing.
arg267_1 = rand_strided((116, 3, 224, 224), (150528, 1, 672, 3), device='cpu', dtype=torch.float32)
# The lambda builds a fresh list of all 268 tensors on every invocation and
# passes it to call(). print_performance presumably times repeated
# invocations and prints the result -- see torch._inductor.utils.
print_performance(lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1, arg23_1, arg24_1, arg25_1, arg26_1, arg27_1, arg28_1, arg29_1, arg30_1, arg31_1, arg32_1, arg33_1, arg34_1, arg35_1, arg36_1, arg37_1, arg38_1, arg39_1, arg40_1, arg41_1, arg42_1, arg43_1, arg44_1, arg45_1, arg46_1, arg47_1, arg48_1, arg49_1, arg50_1, arg51_1, arg52_1, arg53_1, arg54_1, arg55_1, arg56_1, arg57_1, arg58_1, arg59_1, arg60_1, arg61_1, arg62_1, arg63_1, arg64_1, arg65_1, arg66_1, arg67_1, arg68_1, arg69_1, arg70_1, arg71_1, arg72_1, arg73_1, arg74_1, arg75_1, arg76_1, arg77_1, arg78_1, arg79_1, arg80_1, arg81_1, arg82_1, arg83_1, arg84_1, arg85_1, arg86_1, arg87_1, arg88_1, arg89_1, arg90_1, arg91_1, arg92_1, arg93_1, arg94_1, arg95_1, arg96_1, arg97_1, arg98_1, arg99_1, arg100_1, arg101_1, arg102_1, arg103_1, arg104_1, arg105_1, arg106_1, arg107_1, arg108_1, arg109_1, arg110_1, arg111_1, arg112_1, arg113_1, arg114_1, arg115_1, arg116_1, arg117_1, arg118_1, arg119_1, arg120_1, arg121_1, arg122_1, arg123_1, arg124_1, arg125_1, arg126_1, arg127_1, arg128_1, arg129_1, arg130_1, arg131_1, arg132_1, arg133_1, arg134_1, arg135_1, arg136_1, arg137_1, arg138_1, arg139_1, arg140_1, arg141_1, arg142_1, arg143_1, arg144_1, arg145_1, arg146_1, arg147_1, arg148_1, arg149_1, arg150_1, arg151_1, arg152_1, arg153_1, arg154_1, arg155_1, arg156_1, arg157_1, arg158_1, arg159_1, arg160_1, arg161_1, arg162_1, arg163_1, arg164_1, arg165_1, arg166_1, arg167_1, arg168_1, arg169_1, arg170_1, arg171_1, arg172_1, arg173_1, arg174_1, arg175_1, arg176_1, arg177_1, arg178_1, arg179_1, arg180_1, arg181_1, arg182_1, arg183_1, arg184_1, arg185_1, arg186_1, arg187_1, arg188_1, arg189_1, arg190_1, arg191_1, arg192_1, arg193_1, arg194_1, arg195_1, arg196_1, arg197_1, arg198_1, arg199_1, arg200_1, arg201_1, arg202_1, arg203_1, arg204_1, arg205_1, arg206_1, 
arg207_1, arg208_1, arg209_1, arg210_1, arg211_1, arg212_1, arg213_1, arg214_1, arg215_1, arg216_1, arg217_1, arg218_1, arg219_1, arg220_1, arg221_1, arg222_1, arg223_1, arg224_1, arg225_1, arg226_1, arg227_1, arg228_1, arg229_1, arg230_1, arg231_1, arg232_1, arg233_1, arg234_1, arg235_1, arg236_1, arg237_1, arg238_1, arg239_1, arg240_1, arg241_1, arg242_1, arg243_1, arg244_1, arg245_1, arg246_1, arg247_1, arg248_1, arg249_1, arg250_1, arg251_1, arg252_1, arg253_1, arg254_1, arg255_1, arg256_1, arg257_1, arg258_1, arg259_1, arg260_1, arg261_1, arg262_1, arg263_1, arg264_1, arg265_1, arg266_1, arg267_1]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment