Skip to content

Instantly share code, notes, and snippets.

@leslie-fang-intel
Created February 21, 2023 02:54
Show Gist options
  • Select an option

  • Save leslie-fang-intel/2948a9f58cac3c42f1459269f6973463 to your computer and use it in GitHub Desktop.

Select an option

Save leslie-fang-intel/2948a9f58cac3c42f1459269f6973463 to your computer and use it in GitHub Desktop.
from ctypes import c_void_p, c_long
import torch
import random
from torch import empty_strided, as_strided, device
from torch._inductor.codecache import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
# Shorthand for the torch.ops.aten operator namespace.
aten = torch.ops.aten
# Local alias for torch._C._dynamo.guards.assert_size_stride (presumably a tensor
# size/stride check -- it is not called in the part of the file visible here).
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
# Compiler handle: every kernel_cpp_N below is obtained by passing C++ source text
# to its .cpp() method.
async_compile = AsyncCompile()
# kernel_cpp_0: element-wise float -> uint8 conversion over 17,461,248 values, split
# across a 28-thread OpenMP parallel region.
# For each i:
#   out_ptr0[i] = uint8(clamp(nearbyint(in_ptr0[i] / in_ptr1[0]) + float(in_ptr2[0]), 0, 127))
# This has the shape of a per-tensor affine quantization; in_ptr1[0] and in_ptr2[0] are
# presumably the scale and zero point -- confirm against the call site.
# The clamp is written with `x != x` tests so that a NaN passes through unchanged.
# NOTE(review): such a NaN then reaches static_cast<unsigned char>, which C++ leaves
# undefined for values that cannot be represented.
# NOTE(review): the #include is an absolute path under /tmp/torchinductor_root and the
# thread count is fixed at 28; both look specific to the machine that produced this file.
kernel_cpp_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(const float* __restrict__ in_ptr0,
const float* __restrict__ in_ptr1,
const long* __restrict__ in_ptr2,
unsigned char* __restrict__ out_ptr0)
{
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<17461248; i0+=1)
{
auto tmp2 = in_ptr0[i0];
auto tmp3 = in_ptr1[0];
auto tmp6 = in_ptr2[0];
auto tmp0 = static_cast<float>(0);
auto tmp1 = static_cast<float>(127);
auto tmp4 = tmp2 / tmp3;
auto tmp5 = std::nearbyint(tmp4);
auto tmp7 = static_cast<float>(tmp6);
auto tmp8 = tmp5 + tmp7;
auto tmp9 = (tmp8 != tmp8) ? tmp8 : std::min(tmp1, tmp8);
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::max(tmp0, tmp9);
auto tmp11 = static_cast<unsigned char>(tmp10);
out_ptr0[i0] = tmp11;
}
}
}
}
''')
# kernel_cpp_1: three loop nests inside one 28-thread OpenMP parallel region. None of the
# `omp for` directives carries `nowait`, so each nest completes before the next one starts.
#   1. out_ptr0[i] = (float(in_ptr0[i]) - float(in_ptr1[0])) * in_ptr2[0] for 93,126,656
#      uint8 inputs (dequantization-shaped; in_ptr1 / in_ptr2 are presumably zero point
#      and scale -- confirm at the call site).
#   2. For i0 < 116, i1 < 64, i2 < 56, i3 < 56: maximum over a 3x3 neighbourhood of
#      out_ptr0 at row offsets 2*i2 + {-1, 0, 1} and column offsets 2*i3 + {-1, 0, 1}.
#      Taps whose row or column falls outside [0, 112) contribute -infinity, and a NaN
#      running maximum is kept as is. Each load sits inside a lambda that is only called
#      when its bounds test passed. The result is stored to
#      out_ptr1[i1 + 64*i3 + 3584*i2 + 200704*i0].
#   3. Each out_ptr1 value is divided by in_ptr3[0], rounded with nearbyint, offset by
#      in_ptr4[0], clamped to [0, 127] (NaN passes through) and stored as uint8 in
#      out_ptr2. The read index is i1 + 64*i2 + 200704*i0 and the write index is
#      i2 + 3136*i1 + 200704*i0, so the two inner dimensions (64 and 3136) are swapped
#      on the way out.
# NOTE(review): the #include is an absolute path under /tmp/torchinductor_root; this
# looks specific to the machine that produced the file.
kernel_cpp_1 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const float* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
float* __restrict__ out_ptr0,
float* __restrict__ out_ptr1,
unsigned char* __restrict__ out_ptr2)
{
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<93126656; i0+=1)
{
auto tmp0 = in_ptr0[i0];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
out_ptr0[i0] = tmp6;
}
}
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<64; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<56; i2+=1)
{
#pragma GCC ivdep
for(long i3=0; i3<56; i3+=1)
{
auto tmp0 = static_cast<long>((-1) + (2*i2));
auto tmp1 = static_cast<long>(0);
auto tmp2 = tmp0 >= tmp1;
auto tmp3 = static_cast<long>(112);
auto tmp4 = tmp0 < tmp3;
auto tmp5 = tmp2 & tmp4;
auto tmp6 = static_cast<long>((-1) + (2*i3));
auto tmp7 = tmp6 >= tmp1;
auto tmp8 = tmp6 < tmp3;
auto tmp9 = tmp7 & tmp8;
auto tmp10 = tmp5 & tmp9;
auto tmp11 = [&]
{
auto tmp12 = out_ptr0[(-7232) + i1 + (128*i3) + (14336*i2) + (802816*i0)];
return tmp12;
}
;
auto tmp13 = tmp10 ? tmp11() : -std::numeric_limits<decltype(tmp11())>::infinity();
auto tmp14 = static_cast<long>(2*i3);
auto tmp15 = tmp14 >= tmp1;
auto tmp16 = tmp14 < tmp3;
auto tmp17 = tmp15 & tmp16;
auto tmp18 = tmp5 & tmp17;
auto tmp19 = [&]
{
auto tmp20 = out_ptr0[(-7168) + i1 + (128*i3) + (14336*i2) + (802816*i0)];
return tmp20;
}
;
auto tmp21 = tmp18 ? tmp19() : -std::numeric_limits<decltype(tmp19())>::infinity();
auto tmp22 = (tmp13 != tmp13) ? tmp13 : std::max(tmp21, tmp13);
auto tmp23 = static_cast<long>(1 + (2*i3));
auto tmp24 = tmp23 >= tmp1;
auto tmp25 = tmp23 < tmp3;
auto tmp26 = tmp24 & tmp25;
auto tmp27 = tmp5 & tmp26;
auto tmp28 = [&]
{
auto tmp29 = out_ptr0[(-7104) + i1 + (128*i3) + (14336*i2) + (802816*i0)];
return tmp29;
}
;
auto tmp30 = tmp27 ? tmp28() : -std::numeric_limits<decltype(tmp28())>::infinity();
auto tmp31 = (tmp22 != tmp22) ? tmp22 : std::max(tmp30, tmp22);
auto tmp32 = static_cast<long>(2*i2);
auto tmp33 = tmp32 >= tmp1;
auto tmp34 = tmp32 < tmp3;
auto tmp35 = tmp33 & tmp34;
auto tmp36 = tmp35 & tmp9;
auto tmp37 = [&]
{
auto tmp38 = out_ptr0[(-64) + i1 + (128*i3) + (14336*i2) + (802816*i0)];
return tmp38;
}
;
auto tmp39 = tmp36 ? tmp37() : -std::numeric_limits<decltype(tmp37())>::infinity();
auto tmp40 = (tmp31 != tmp31) ? tmp31 : std::max(tmp39, tmp31);
auto tmp41 = tmp35 & tmp17;
auto tmp42 = [&]
{
auto tmp43 = out_ptr0[i1 + (128*i3) + (14336*i2) + (802816*i0)];
return tmp43;
}
;
auto tmp44 = tmp41 ? tmp42() : -std::numeric_limits<decltype(tmp42())>::infinity();
auto tmp45 = (tmp40 != tmp40) ? tmp40 : std::max(tmp44, tmp40);
auto tmp46 = tmp35 & tmp26;
auto tmp47 = [&]
{
auto tmp48 = out_ptr0[64 + i1 + (128*i3) + (14336*i2) + (802816*i0)];
return tmp48;
}
;
auto tmp49 = tmp46 ? tmp47() : -std::numeric_limits<decltype(tmp47())>::infinity();
auto tmp50 = (tmp45 != tmp45) ? tmp45 : std::max(tmp49, tmp45);
auto tmp51 = static_cast<long>(1 + (2*i2));
auto tmp52 = tmp51 >= tmp1;
auto tmp53 = tmp51 < tmp3;
auto tmp54 = tmp52 & tmp53;
auto tmp55 = tmp54 & tmp9;
auto tmp56 = [&]
{
auto tmp57 = out_ptr0[7104 + i1 + (128*i3) + (14336*i2) + (802816*i0)];
return tmp57;
}
;
auto tmp58 = tmp55 ? tmp56() : -std::numeric_limits<decltype(tmp56())>::infinity();
auto tmp59 = (tmp50 != tmp50) ? tmp50 : std::max(tmp58, tmp50);
auto tmp60 = tmp54 & tmp17;
auto tmp61 = [&]
{
auto tmp62 = out_ptr0[7168 + i1 + (128*i3) + (14336*i2) + (802816*i0)];
return tmp62;
}
;
auto tmp63 = tmp60 ? tmp61() : -std::numeric_limits<decltype(tmp61())>::infinity();
auto tmp64 = (tmp59 != tmp59) ? tmp59 : std::max(tmp63, tmp59);
auto tmp65 = tmp54 & tmp26;
auto tmp66 = [&]
{
auto tmp67 = out_ptr0[7232 + i1 + (128*i3) + (14336*i2) + (802816*i0)];
return tmp67;
}
;
auto tmp68 = tmp65 ? tmp66() : -std::numeric_limits<decltype(tmp66())>::infinity();
auto tmp69 = (tmp64 != tmp64) ? tmp64 : std::max(tmp68, tmp64);
out_ptr1[i1 + (64*i3) + (3584*i2) + (200704*i0)] = tmp69;
}
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<64; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<3136; i2+=1)
{
auto tmp2 = out_ptr1[i1 + (64*i2) + (200704*i0)];
auto tmp3 = in_ptr3[0];
auto tmp6 = in_ptr4[0];
auto tmp0 = static_cast<float>(0);
auto tmp1 = static_cast<float>(127);
auto tmp4 = tmp2 / tmp3;
auto tmp5 = std::nearbyint(tmp4);
auto tmp7 = static_cast<float>(tmp6);
auto tmp8 = tmp5 + tmp7;
auto tmp9 = (tmp8 != tmp8) ? tmp8 : std::min(tmp1, tmp8);
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::max(tmp0, tmp9);
auto tmp11 = static_cast<unsigned char>(tmp10);
out_ptr2[i2 + (3136*i1) + (200704*i0)] = tmp11;
}
}
}
}
}
}
''')
# kernel_cpp_2: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i < 93,126,656:
#        in_out_ptr0[i] = (float(in_ptr0[i]) - float(in_ptr1[0])) * in_ptr2[0]
#                       + (float(in_ptr3[i]) - float(in_ptr4[0])) * in_ptr5[0]
#      Reads as "dequantize two uint8 buffers and add them"; the zero-point / scale roles
#      of the scalar pointers are an assumption -- confirm at the call site.
#   2. For i0 < 116, i1 < 256, i2 < 3136: x = in_out_ptr0[i1 + 256*i2 + 802816*i0] is
#      multiplied by (x > 0) (ReLU-shaped), divided by in_ptr6[0], rounded with nearbyint,
#      offset by in_ptr7[0], clamped to [0, 127] (NaN passes through) and written as uint8
#      to out_ptr1[i2 + 3136*i1 + 802816*i0], i.e. with the two inner dimensions swapped.
kernel_cpp_2 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<93126656; i0+=1)
{
auto tmp0 = in_ptr0[i0];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i0];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i0] = tmp14;
}
}
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<256; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<3136; i2+=1)
{
auto tmp0 = out_ptr0[i1 + (256*i2) + (802816*i0)];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i2 + (3136*i1) + (802816*i0)] = tmp12;
}
}
}
}
}
}
''')
# kernel_cpp_3: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i0 < 116, i1 < 256, i2 < 3136: the sum of two dequantization-shaped terms,
#        (float(in_ptr0[i1 + 256*i2 + 802816*i0]) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(in_ptr3[i2 + 3136*i1 + 802816*i0]) - float(in_ptr4[0])) * in_ptr5[0],
#      is stored to in_out_ptr0[i2 + 3136*i1 + 802816*i0]. The two uint8 inputs are
#      therefore read with different index orders.
#   2. Flat pass over all 93,126,656 values of in_out_ptr0: multiply by (x > 0)
#      (ReLU-shaped), divide by in_ptr6[0], round with nearbyint, add in_ptr7[0], clamp to
#      [0, 127] (NaN passes through), store as uint8 in out_ptr1[i].
# The scale / zero-point reading of the scalar pointers is an assumption -- confirm at
# the call site.
kernel_cpp_3 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<256; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<3136; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (256*i2) + (802816*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (3136*i1) + (802816*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (3136*i1) + (802816*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<93126656; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i0] = tmp12;
}
}
}
}
''')
# kernel_cpp_4: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i0 < 116, i1 < 256, i2 < 3136: the sum of two dequantization-shaped terms,
#        (float(in_ptr0[i1 + 256*i2 + 802816*i0]) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(in_ptr3[i2 + 3136*i1 + 802816*i0]) - float(in_ptr4[0])) * in_ptr5[0],
#      is stored to in_out_ptr0[i2 + 3136*i1 + 802816*i0]. The two uint8 inputs are
#      therefore read with different index orders.
#   2. Flat pass over all 93,126,656 values of in_out_ptr0: multiply by (x > 0)
#      (ReLU-shaped), divide by in_ptr6[0], round with nearbyint, add in_ptr7[0], clamp to
#      [0, 127] (NaN passes through), store as uint8 in out_ptr1[i].
# The scale / zero-point reading of the scalar pointers is an assumption -- confirm at
# the call site.
kernel_cpp_4 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<256; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<3136; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (256*i2) + (802816*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (3136*i1) + (802816*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (3136*i1) + (802816*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<93126656; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i0] = tmp12;
}
}
}
}
''')
# kernel_cpp_5: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i < 46,563,328:
#        in_out_ptr0[i] = (float(in_ptr0[i]) - float(in_ptr1[0])) * in_ptr2[0]
#                       + (float(in_ptr3[i]) - float(in_ptr4[0])) * in_ptr5[0]
#      Reads as "dequantize two uint8 buffers and add them"; the zero-point / scale roles
#      of the scalar pointers are an assumption -- confirm at the call site.
#   2. For i0 < 116, i1 < 512, i2 < 784: x = in_out_ptr0[i1 + 512*i2 + 401408*i0] is
#      multiplied by (x > 0) (ReLU-shaped), divided by in_ptr6[0], rounded with nearbyint,
#      offset by in_ptr7[0], clamped to [0, 127] (NaN passes through) and written as uint8
#      to out_ptr1[i2 + 784*i1 + 401408*i0], i.e. with the two inner dimensions swapped.
kernel_cpp_5 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<46563328; i0+=1)
{
auto tmp0 = in_ptr0[i0];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i0];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i0] = tmp14;
}
}
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<512; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<784; i2+=1)
{
auto tmp0 = out_ptr0[i1 + (512*i2) + (401408*i0)];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i2 + (784*i1) + (401408*i0)] = tmp12;
}
}
}
}
}
}
''')
# kernel_cpp_6: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i0 < 116, i1 < 512, i2 < 784: the sum of two dequantization-shaped terms,
#        (float(in_ptr0[i1 + 512*i2 + 401408*i0]) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(in_ptr3[i2 + 784*i1 + 401408*i0]) - float(in_ptr4[0])) * in_ptr5[0],
#      is stored to in_out_ptr0[i2 + 784*i1 + 401408*i0]. The two uint8 inputs are
#      therefore read with different index orders.
#   2. Flat pass over all 46,563,328 values of in_out_ptr0: multiply by (x > 0)
#      (ReLU-shaped), divide by in_ptr6[0], round with nearbyint, add in_ptr7[0], clamp to
#      [0, 127] (NaN passes through), store as uint8 in out_ptr1[i].
# The scale / zero-point reading of the scalar pointers is an assumption -- confirm at
# the call site.
kernel_cpp_6 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<512; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<784; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (512*i2) + (401408*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (784*i1) + (401408*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (784*i1) + (401408*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<46563328; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i0] = tmp12;
}
}
}
}
''')
# kernel_cpp_7: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i0 < 116, i1 < 512, i2 < 784: the sum of two dequantization-shaped terms,
#        (float(in_ptr0[i1 + 512*i2 + 401408*i0]) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(in_ptr3[i2 + 784*i1 + 401408*i0]) - float(in_ptr4[0])) * in_ptr5[0],
#      is stored to in_out_ptr0[i2 + 784*i1 + 401408*i0]. The two uint8 inputs are
#      therefore read with different index orders.
#   2. Flat pass over all 46,563,328 values of in_out_ptr0: multiply by (x > 0)
#      (ReLU-shaped), divide by in_ptr6[0], round with nearbyint, add in_ptr7[0], clamp to
#      [0, 127] (NaN passes through), store as uint8 in out_ptr1[i].
# The scale / zero-point reading of the scalar pointers is an assumption -- confirm at
# the call site.
kernel_cpp_7 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<512; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<784; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (512*i2) + (401408*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (784*i1) + (401408*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (784*i1) + (401408*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<46563328; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i0] = tmp12;
}
}
}
}
''')
# kernel_cpp_8: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i0 < 116, i1 < 512, i2 < 784: the sum of two dequantization-shaped terms,
#        (float(in_ptr0[i1 + 512*i2 + 401408*i0]) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(in_ptr3[i2 + 784*i1 + 401408*i0]) - float(in_ptr4[0])) * in_ptr5[0],
#      is stored to in_out_ptr0[i2 + 784*i1 + 401408*i0]. The two uint8 inputs are
#      therefore read with different index orders.
#   2. Flat pass over all 46,563,328 values of in_out_ptr0: multiply by (x > 0)
#      (ReLU-shaped), divide by in_ptr6[0], round with nearbyint, add in_ptr7[0], clamp to
#      [0, 127] (NaN passes through), store as uint8 in out_ptr1[i].
# The scale / zero-point reading of the scalar pointers is an assumption -- confirm at
# the call site.
kernel_cpp_8 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<512; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<784; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (512*i2) + (401408*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (784*i1) + (401408*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (784*i1) + (401408*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<46563328; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i0] = tmp12;
}
}
}
}
''')
# kernel_cpp_9: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i < 23,281,664:
#        in_out_ptr0[i] = (float(in_ptr0[i]) - float(in_ptr1[0])) * in_ptr2[0]
#                       + (float(in_ptr3[i]) - float(in_ptr4[0])) * in_ptr5[0]
#      Reads as "dequantize two uint8 buffers and add them"; the zero-point / scale roles
#      of the scalar pointers are an assumption -- confirm at the call site.
#   2. For i0 < 116, i1 < 1024, i2 < 196: x = in_out_ptr0[i1 + 1024*i2 + 200704*i0] is
#      multiplied by (x > 0) (ReLU-shaped), divided by in_ptr6[0], rounded with nearbyint,
#      offset by in_ptr7[0], clamped to [0, 127] (NaN passes through) and written as uint8
#      to out_ptr1[i2 + 196*i1 + 200704*i0], i.e. with the two inner dimensions swapped.
kernel_cpp_9 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<23281664; i0+=1)
{
auto tmp0 = in_ptr0[i0];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i0];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i0] = tmp14;
}
}
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<1024; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<196; i2+=1)
{
auto tmp0 = out_ptr0[i1 + (1024*i2) + (200704*i0)];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i2 + (196*i1) + (200704*i0)] = tmp12;
}
}
}
}
}
}
''')
# kernel_cpp_10: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i0 < 116, i1 < 1024, i2 < 196: the sum of two dequantization-shaped terms,
#        (float(in_ptr0[i1 + 1024*i2 + 200704*i0]) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(in_ptr3[i2 + 196*i1 + 200704*i0]) - float(in_ptr4[0])) * in_ptr5[0],
#      is stored to in_out_ptr0[i2 + 196*i1 + 200704*i0]. The two uint8 inputs are
#      therefore read with different index orders.
#   2. Flat pass over all 23,281,664 values of in_out_ptr0: multiply by (x > 0)
#      (ReLU-shaped), divide by in_ptr6[0], round with nearbyint, add in_ptr7[0], clamp to
#      [0, 127] (NaN passes through), store as uint8 in out_ptr1[i].
# The scale / zero-point reading of the scalar pointers is an assumption -- confirm at
# the call site.
kernel_cpp_10 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<1024; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<196; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (1024*i2) + (200704*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (196*i1) + (200704*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (196*i1) + (200704*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<23281664; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i0] = tmp12;
}
}
}
}
''')
# kernel_cpp_11: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i0 < 116, i1 < 1024, i2 < 196: the sum of two dequantization-shaped terms,
#        (float(in_ptr0[i1 + 1024*i2 + 200704*i0]) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(in_ptr3[i2 + 196*i1 + 200704*i0]) - float(in_ptr4[0])) * in_ptr5[0],
#      is stored to in_out_ptr0[i2 + 196*i1 + 200704*i0]. The two uint8 inputs are
#      therefore read with different index orders.
#   2. Flat pass over all 23,281,664 values of in_out_ptr0: multiply by (x > 0)
#      (ReLU-shaped), divide by in_ptr6[0], round with nearbyint, add in_ptr7[0], clamp to
#      [0, 127] (NaN passes through), store as uint8 in out_ptr1[i].
# The scale / zero-point reading of the scalar pointers is an assumption -- confirm at
# the call site.
kernel_cpp_11 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<1024; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<196; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (1024*i2) + (200704*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (196*i1) + (200704*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (196*i1) + (200704*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<23281664; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i0] = tmp12;
}
}
}
}
''')
# kernel_cpp_12: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i0 < 116, i1 < 1024, i2 < 196: the sum of two dequantization-shaped terms,
#        (float(in_ptr0[i1 + 1024*i2 + 200704*i0]) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(in_ptr3[i2 + 196*i1 + 200704*i0]) - float(in_ptr4[0])) * in_ptr5[0],
#      is stored to in_out_ptr0[i2 + 196*i1 + 200704*i0]. The two uint8 inputs are
#      therefore read with different index orders.
#   2. Flat pass over all 23,281,664 values of in_out_ptr0: multiply by (x > 0)
#      (ReLU-shaped), divide by in_ptr6[0], round with nearbyint, add in_ptr7[0], clamp to
#      [0, 127] (NaN passes through), store as uint8 in out_ptr1[i].
# The scale / zero-point reading of the scalar pointers is an assumption -- confirm at
# the call site.
kernel_cpp_12 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<1024; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<196; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (1024*i2) + (200704*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (196*i1) + (200704*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (196*i1) + (200704*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<23281664; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i0] = tmp12;
}
}
}
}
''')
# kernel_cpp_13: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i0 < 116, i1 < 1024, i2 < 196: the sum of two dequantization-shaped terms,
#        (float(in_ptr0[i1 + 1024*i2 + 200704*i0]) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(in_ptr3[i2 + 196*i1 + 200704*i0]) - float(in_ptr4[0])) * in_ptr5[0],
#      is stored to in_out_ptr0[i2 + 196*i1 + 200704*i0]. The two uint8 inputs are
#      therefore read with different index orders.
#   2. Flat pass over all 23,281,664 values of in_out_ptr0: multiply by (x > 0)
#      (ReLU-shaped), divide by in_ptr6[0], round with nearbyint, add in_ptr7[0], clamp to
#      [0, 127] (NaN passes through), store as uint8 in out_ptr1[i].
# The scale / zero-point reading of the scalar pointers is an assumption -- confirm at
# the call site.
kernel_cpp_13 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<1024; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<196; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (1024*i2) + (200704*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (196*i1) + (200704*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (196*i1) + (200704*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<23281664; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i0] = tmp12;
}
}
}
}
''')
# kernel_cpp_14: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i0 < 116, i1 < 1024, i2 < 196: the sum of two dequantization-shaped terms,
#        (float(in_ptr0[i1 + 1024*i2 + 200704*i0]) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(in_ptr3[i2 + 196*i1 + 200704*i0]) - float(in_ptr4[0])) * in_ptr5[0],
#      is stored to in_out_ptr0[i2 + 196*i1 + 200704*i0]. The two uint8 inputs are
#      therefore read with different index orders.
#   2. Flat pass over all 23,281,664 values of in_out_ptr0: multiply by (x > 0)
#      (ReLU-shaped), divide by in_ptr6[0], round with nearbyint, add in_ptr7[0], clamp to
#      [0, 127] (NaN passes through), store as uint8 in out_ptr1[i].
# The scale / zero-point reading of the scalar pointers is an assumption -- confirm at
# the call site.
kernel_cpp_14 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<1024; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<196; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (1024*i2) + (200704*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (196*i1) + (200704*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (196*i1) + (200704*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<23281664; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i0] = tmp12;
}
}
}
}
''')
# kernel_cpp_15: two loop nests in one 28-thread OpenMP parallel region (no `nowait`, so
# the second nest runs after the first has completed). out_ptr0 is an alias of in_out_ptr0.
#   1. For i < 11,640,832:
#        in_out_ptr0[i] = (float(in_ptr0[i]) - float(in_ptr1[0])) * in_ptr2[0]
#                       + (float(in_ptr3[i]) - float(in_ptr4[0])) * in_ptr5[0]
#      Reads as "dequantize two uint8 buffers and add them"; the zero-point / scale roles
#      of the scalar pointers are an assumption -- confirm at the call site.
#   2. For i0 < 116, i1 < 2048, i2 < 49: x = in_out_ptr0[i1 + 2048*i2 + 100352*i0] is
#      multiplied by (x > 0) (ReLU-shaped), divided by in_ptr6[0], rounded with nearbyint,
#      offset by in_ptr7[0], clamped to [0, 127] (NaN passes through) and written as uint8
#      to out_ptr1[i2 + 49*i1 + 100352*i0], i.e. with the two inner dimensions swapped.
kernel_cpp_15 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<11640832; i0+=1)
{
auto tmp0 = in_ptr0[i0];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i0];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i0] = tmp14;
}
}
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<2048; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<49; i2+=1)
{
auto tmp0 = out_ptr0[i1 + (2048*i2) + (100352*i0)];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i2 + (49*i1) + (100352*i0)] = tmp12;
}
}
}
}
}
}
''')
# kernel_cpp_16: C++/OpenMP source handed to async_compile.cpp(). Same
# signature and arithmetic as kernel_cpp_15, but the strided indexing sits in
# the first loop instead of the second.
#
# Both loops run inside one `omp parallel num_threads(28)` region:
#   1. Nested loop 116 x 2048 x 49:
#        in_ptr0 is read at  i1 + 2048*i2 + 100352*i0
#        in_ptr3 is read at  i2 + 49*i1   + 100352*i0
#      and the sum
#        (float(in_ptr0[..]) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(in_ptr3[..]) - float(in_ptr4[0])) * in_ptr5[0]
#      is stored to in_out_ptr0 at i2 + 49*i1 + 100352*i0, so the two uint8
#      inputs are walked with different strides (presumably two memory
#      layouts of the same logical tensor shape -- TODO confirm).
#   2. Flat loop over 11640832 elements: x * (x > 0), divide by in_ptr6[0],
#      std::nearbyint, add float(in_ptr7[0]), clamp to [0, 127], cast to
#      unsigned char, store to out_ptr1[i].
#
# `out_ptr0` is only a local alias of `in_out_ptr0`.
#
# NOTE(review): as in the sibling kernels, x * (x > 0) gives NaN for
# x == -inf and the clamp passes NaN through to the unsigned char cast.
kernel_cpp_16 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
unsigned char* __restrict__ out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<2048; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<49; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (2048*i2) + (100352*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (49*i1) + (100352*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (49*i1) + (100352*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<11640832; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp4 = in_ptr6[0];
auto tmp7 = in_ptr7[0];
auto tmp1 = tmp0 * (tmp0>0);
auto tmp2 = static_cast<float>(0);
auto tmp3 = static_cast<float>(127);
auto tmp5 = tmp1 / tmp4;
auto tmp6 = std::nearbyint(tmp5);
auto tmp8 = static_cast<float>(tmp7);
auto tmp9 = tmp6 + tmp8;
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9);
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10);
auto tmp12 = static_cast<unsigned char>(tmp11);
out_ptr1[i0] = tmp12;
}
}
}
}
''')
# kernel_cpp_17: C++/OpenMP source handed to async_compile.cpp(). Four stages
# inside one `omp parallel num_threads(28)` region:
#   1. Nested loop 116 x 2048 x 49 (same body as the first loop of
#      kernel_cpp_16): combines in_ptr0 (read at i1 + 2048*i2 + 100352*i0)
#      and in_ptr3 (read at i2 + 49*i1 + 100352*i0) as
#        (float(a) - float(in_ptr1[0])) * in_ptr2[0]
#      + (float(b) - float(in_ptr4[0])) * in_ptr5[0]
#      and stores the float result into in_out_ptr0.
#   2. In-place clamp of in_out_ptr0 to >= 0: 727552 iterations using
#      at::vec::Vectorized<float>, each advancing 16 floats
#      (727552 * 16 == 11640832), followed by a scalar tail loop.
#      NOTE(review): the tail loop runs over [11640832, 11640832) and
#      therefore executes zero iterations.
#   3. For each of 237568 rows of 49 consecutive floats: every element is
#      divided by in_ptr6[0], rounded with std::nearbyint, offset by
#      float(in_ptr7[0]), clamped to [0, 127], cast to unsigned char, then
#      converted back ((float(q) - offset) * in_ptr6[0]); the 49 values are
#      summed into out_ptr1[row].
#   4. For each of the 237568 sums: divide by 49, divide by in_ptr8[0],
#      std::nearbyint, add float(in_ptr9[0]), clamp to [0, 127], cast to
#      unsigned char, store to out_ptr2[row].
#
# Stages 3 and 4 together look like a mean over the 49 spatial positions
# followed by a requantization (presumably an average pool over a 7x7 map --
# TODO confirm against the model graph).
#
# `out_ptr0` is only a local alias of `in_out_ptr0`, so stage 2 loads and
# stores through the same buffer.
kernel_cpp_17 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(float* __restrict__ in_out_ptr0,
const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
const unsigned char* __restrict__ in_ptr3,
const long* __restrict__ in_ptr4,
const float* __restrict__ in_ptr5,
const float* __restrict__ in_ptr6,
const long* __restrict__ in_ptr7,
const float* __restrict__ in_ptr8,
const long* __restrict__ in_ptr9,
float* __restrict__ out_ptr1,
unsigned char* __restrict__ out_ptr2)
{
auto out_ptr0 = in_out_ptr0;
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<2048; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<49; i2+=1)
{
auto tmp0 = in_ptr0[i1 + (2048*i2) + (100352*i0)];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp7 = in_ptr3[i2 + (49*i1) + (100352*i0)];
auto tmp9 = in_ptr4[0];
auto tmp12 = in_ptr5[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = static_cast<float>(tmp7);
auto tmp10 = static_cast<float>(tmp9);
auto tmp11 = tmp8 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp14 = tmp6 + tmp13;
out_ptr0[i2 + (49*i1) + (100352*i0)] = tmp14;
}
}
}
}
{
#pragma omp for
for(long i0=0; i0<727552; i0+=1)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + 16*i0);
auto tmp1 = at::vec::clamp_min(tmp0, decltype(tmp0)(0));
tmp1.store(in_out_ptr0 + 16*i0);
}
#pragma omp for simd simdlen(8)
for(long i0=11640832; i0<11640832; i0+=1)
{
auto tmp0 = out_ptr0[i0];
auto tmp1 = tmp0 * (tmp0>0);
in_out_ptr0[i0] = tmp1;
}
}
{
#pragma omp for
for(long i0=0; i0<237568; i0+=1)
{
{
float tmp15 = 0;
for(long i1=0; i1<49; i1+=1)
{
auto tmp2 = in_out_ptr0[i1 + (49*i0)];
auto tmp3 = in_ptr6[0];
auto tmp6 = in_ptr7[0];
auto tmp0 = static_cast<float>(0);
auto tmp1 = static_cast<float>(127);
auto tmp4 = tmp2 / tmp3;
auto tmp5 = std::nearbyint(tmp4);
auto tmp7 = static_cast<float>(tmp6);
auto tmp8 = tmp5 + tmp7;
auto tmp9 = (tmp8 != tmp8) ? tmp8 : std::min(tmp1, tmp8);
auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::max(tmp0, tmp9);
auto tmp11 = static_cast<unsigned char>(tmp10);
auto tmp12 = static_cast<float>(tmp11);
auto tmp13 = tmp12 - tmp7;
auto tmp14 = tmp13 * tmp3;
tmp15 += tmp14;
}
out_ptr1[i0] = tmp15;
}
}
}
{
#pragma omp for
for(long i0=0; i0<237568; i0+=1)
{
auto tmp2 = out_ptr1[i0];
auto tmp5 = in_ptr8[0];
auto tmp8 = in_ptr9[0];
auto tmp0 = static_cast<float>(0);
auto tmp1 = static_cast<float>(127);
auto tmp3 = static_cast<float>(49);
auto tmp4 = tmp2 / tmp3;
auto tmp6 = tmp4 / tmp5;
auto tmp7 = std::nearbyint(tmp6);
auto tmp9 = static_cast<float>(tmp8);
auto tmp10 = tmp7 + tmp9;
auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::min(tmp1, tmp10);
auto tmp12 = (tmp11 != tmp11) ? tmp11 : std::max(tmp0, tmp11);
auto tmp13 = static_cast<unsigned char>(tmp12);
out_ptr2[i0] = tmp13;
}
}
}
}
''')
# kernel_cpp_18: C++/OpenMP source handed to async_compile.cpp(). A single
# flat loop over 116000 elements inside `omp parallel num_threads(28)`:
#     out_ptr0[i] = (float(in_ptr0[i]) - float(in_ptr1[0])) * in_ptr2[0]
# i.e. each unsigned char input is converted to float, shifted by the scalar
# in_ptr1[0] and scaled by the scalar in_ptr2[0] (presumably a uint8 -> float
# dequantize with zero point and scale -- TODO confirm against the call site).
kernel_cpp_18 = async_compile.cpp('''
#include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h"
extern "C" void kernel(const unsigned char* __restrict__ in_ptr0,
const long* __restrict__ in_ptr1,
const float* __restrict__ in_ptr2,
float* __restrict__ out_ptr0)
{
#pragma omp parallel num_threads(28)
{
{
#pragma omp for
for(long i0=0; i0<116000; i0+=1)
{
auto tmp0 = in_ptr0[i0];
auto tmp2 = in_ptr1[0];
auto tmp5 = in_ptr2[0];
auto tmp1 = static_cast<float>(tmp0);
auto tmp3 = static_cast<float>(tmp2);
auto tmp4 = tmp1 - tmp3;
auto tmp6 = tmp4 * tmp5;
out_ptr0[i0] = tmp6;
}
}
}
}
''')
# Hand the module globals to AsyncCompile.wait(); presumably this blocks until
# every kernel_cpp_* compilation submitted above has finished and rebinds
# those names to callables (call() below invokes them directly) -- confirm
# against torch._inductor.codecache.AsyncCompile.
async_compile.wait(globals())
# The compile helper is not referenced again after this point.
del async_compile
def call(args):
arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1, arg23_1, arg24_1, arg25_1, arg26_1, arg27_1, arg28_1, arg29_1, arg30_1, arg31_1, arg32_1, arg33_1, arg34_1, arg35_1, arg36_1, arg37_1, arg38_1, arg39_1, arg40_1, arg41_1, arg42_1, arg43_1, arg44_1, arg45_1, arg46_1, arg47_1, arg48_1, arg49_1, arg50_1, arg51_1, arg52_1, arg53_1, arg54_1, arg55_1, arg56_1, arg57_1, arg58_1, arg59_1, arg60_1, arg61_1, arg62_1, arg63_1, arg64_1, arg65_1, arg66_1, arg67_1, arg68_1, arg69_1, arg70_1, arg71_1, arg72_1, arg73_1, arg74_1, arg75_1, arg76_1, arg77_1, arg78_1, arg79_1, arg80_1, arg81_1, arg82_1, arg83_1, arg84_1, arg85_1, arg86_1, arg87_1, arg88_1, arg89_1, arg90_1, arg91_1, arg92_1, arg93_1, arg94_1, arg95_1, arg96_1, arg97_1, arg98_1, arg99_1, arg100_1, arg101_1, arg102_1, arg103_1, arg104_1, arg105_1, arg106_1, arg107_1, arg108_1, arg109_1, arg110_1, arg111_1, arg112_1, arg113_1, arg114_1, arg115_1, arg116_1, arg117_1, arg118_1, arg119_1, arg120_1, arg121_1, arg122_1, arg123_1, arg124_1, arg125_1, arg126_1, arg127_1, arg128_1, arg129_1, arg130_1, arg131_1, arg132_1, arg133_1, arg134_1, arg135_1, arg136_1, arg137_1, arg138_1, arg139_1, arg140_1, arg141_1, arg142_1, arg143_1, arg144_1, arg145_1, arg146_1, arg147_1, arg148_1, arg149_1, arg150_1, arg151_1, arg152_1, arg153_1, arg154_1, arg155_1, arg156_1, arg157_1, arg158_1, arg159_1, arg160_1, arg161_1, arg162_1, arg163_1, arg164_1, arg165_1, arg166_1, arg167_1, arg168_1, arg169_1, arg170_1, arg171_1, arg172_1, arg173_1, arg174_1, arg175_1, arg176_1, arg177_1, arg178_1, arg179_1, arg180_1, arg181_1, arg182_1, arg183_1, arg184_1, arg185_1, arg186_1, arg187_1, arg188_1, arg189_1, arg190_1, arg191_1, arg192_1, arg193_1, arg194_1, arg195_1, arg196_1, arg197_1, arg198_1, arg199_1, arg200_1, arg201_1, arg202_1, arg203_1, arg204_1, arg205_1, arg206_1, arg207_1, arg208_1, arg209_1, arg210_1, 
arg211_1, arg212_1, arg213_1, arg214_1, arg215_1, arg216_1, arg217_1, arg218_1, arg219_1, arg220_1, arg221_1, arg222_1, arg223_1, arg224_1, arg225_1, arg226_1, arg227_1, arg228_1, arg229_1, arg230_1, arg231_1, arg232_1, arg233_1, arg234_1, arg235_1, arg236_1, arg237_1, arg238_1, arg239_1, arg240_1, arg241_1, arg242_1, arg243_1, arg244_1, arg245_1, arg246_1, arg247_1, arg248_1, arg249_1, arg250_1, arg251_1, arg252_1, arg253_1, arg254_1, arg255_1, arg256_1, arg257_1, arg258_1, arg259_1, arg260_1, arg261_1, arg262_1, arg263_1, arg264_1, arg265_1, arg266_1, arg267_1, arg268_1, arg269_1, arg270_1, arg271_1, arg272_1, arg273_1, arg274_1, arg275_1, arg276_1, arg277_1, arg278_1, arg279_1, arg280_1, arg281_1, arg282_1, arg283_1, arg284_1, arg285_1, arg286_1, arg287_1, arg288_1, arg289_1, arg290_1, arg291_1, arg292_1, arg293_1, arg294_1, arg295_1, arg296_1, arg297_1, arg298_1, arg299_1, arg300_1, arg301_1, arg302_1, arg303_1, arg304_1, arg305_1, arg306_1, arg307_1, arg308_1, arg309_1, arg310_1, arg311_1, arg312_1, arg313_1, arg314_1, arg315_1, arg316_1, arg317_1, arg318_1, arg319_1, arg320_1, arg321_1, arg322_1, arg323_1, arg324_1, arg325_1, arg326_1, arg327_1, arg328_1, arg329_1, arg330_1, arg331_1, arg332_1, arg333_1, arg334_1, arg335_1, arg336_1, arg337_1, arg338_1, arg339_1, arg340_1, arg341_1, arg342_1, arg343_1, arg344_1, arg345_1, arg346_1, arg347_1, arg348_1, arg349_1, arg350_1, arg351_1, arg352_1, arg353_1, arg354_1, arg355_1, arg356_1, arg357_1, arg358_1, arg359_1, arg360_1, arg361_1, arg362_1 = args
args.clear()
buf0 = empty_strided((116, 3, 224, 224), (150528, 50176, 224, 1), device='cpu', dtype=torch.uint8)
kernel_cpp_0(c_void_p(arg362_1.data_ptr()), c_void_p(arg0_1.data_ptr()), c_void_p(arg1_1.data_ptr()), c_void_p(buf0.data_ptr()))
del arg362_1
buf1 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf0, arg0_1, arg1_1, arg254_1, arg2_1, arg3_1, 0, arg255_1, [2, 2], [3, 3], [1, 1], 1, arg4_1, arg5_1, 'relu_')
del arg0_1
del arg1_1
del arg254_1
del arg255_1
del arg2_1
del arg3_1
del buf0
buf2 = buf1
assert_size_stride(buf2, (116, 64, 112, 112), (802816, 1, 7168, 64))
del buf1
buf3 = empty_strided((116, 64, 112, 112), (802816, 1, 7168, 64), device='cpu', dtype=torch.float32)
buf4 = empty_strided((116, 64, 56, 56), (200704, 1, 3584, 64), device='cpu', dtype=torch.float32)
buf6 = empty_strided((116, 64, 56, 56), (200704, 3136, 56, 1), device='cpu', dtype=torch.uint8)
kernel_cpp_1(c_void_p(buf2.data_ptr()), c_void_p(arg5_1.data_ptr()), c_void_p(arg4_1.data_ptr()), c_void_p(arg6_1.data_ptr()), c_void_p(arg7_1.data_ptr()), c_void_p(buf3.data_ptr()), c_void_p(buf4.data_ptr()), c_void_p(buf6.data_ptr()))
del arg4_1
del arg5_1
buf7 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf6, arg6_1, arg7_1, arg256_1, arg8_1, arg9_1, 0, arg257_1, [1, 1], [0, 0], [1, 1], 1, arg10_1, arg11_1, 'relu_')
del arg256_1
del arg257_1
del arg8_1
del arg9_1
buf8 = buf7
assert_size_stride(buf8, (116, 64, 56, 56), (200704, 1, 3584, 64))
del buf7
buf9 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf8, arg10_1, arg11_1, arg258_1, arg12_1, arg13_1, 0, arg259_1, [1, 1], [1, 1], [1, 1], 1, arg14_1, arg15_1, 'relu_')
del arg10_1
del arg11_1
del arg12_1
del arg13_1
del arg258_1
del arg259_1
del buf8
buf10 = buf9
assert_size_stride(buf10, (116, 64, 56, 56), (200704, 1, 3584, 64))
del buf9
buf11 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf10, arg14_1, arg15_1, arg260_1, arg16_1, arg17_1, 0, arg261_1, [1, 1], [0, 0], [1, 1], 1, arg18_1, arg19_1, 'none')
del arg14_1
del arg15_1
del arg16_1
del arg17_1
del arg260_1
del arg261_1
del buf10
buf12 = buf11
assert_size_stride(buf12, (116, 256, 56, 56), (802816, 1, 14336, 256))
del buf11
buf13 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf6, arg6_1, arg7_1, arg262_1, arg20_1, arg21_1, 0, arg263_1, [1, 1], [0, 0], [1, 1], 1, arg22_1, arg23_1, 'none')
del arg20_1
del arg21_1
del arg262_1
del arg263_1
del arg6_1
del arg7_1
del buf6
buf14 = buf13
assert_size_stride(buf14, (116, 256, 56, 56), (802816, 1, 14336, 256))
del buf13
buf15 = as_strided(buf3, (116, 256, 56, 56), (802816, 1, 14336, 256)); del buf3 # reuse
buf16 = buf15; del buf15 # reuse
buf17 = as_strided(buf2, (116, 256, 56, 56), (802816, 3136, 56, 1)); del buf2 # reuse
kernel_cpp_2(c_void_p(buf16.data_ptr()), c_void_p(buf12.data_ptr()), c_void_p(arg19_1.data_ptr()), c_void_p(arg18_1.data_ptr()), c_void_p(buf14.data_ptr()), c_void_p(arg23_1.data_ptr()), c_void_p(arg22_1.data_ptr()), c_void_p(arg24_1.data_ptr()), c_void_p(arg25_1.data_ptr()), c_void_p(buf17.data_ptr()))
del arg18_1
del arg19_1
del arg22_1
del arg23_1
del buf12
buf18 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf17, arg24_1, arg25_1, arg264_1, arg26_1, arg27_1, 0, arg265_1, [1, 1], [0, 0], [1, 1], 1, arg28_1, arg29_1, 'relu_')
del arg264_1
del arg265_1
del arg26_1
del arg27_1
buf19 = buf18
assert_size_stride(buf19, (116, 64, 56, 56), (200704, 1, 3584, 64))
del buf18
buf20 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf19, arg28_1, arg29_1, arg266_1, arg30_1, arg31_1, 0, arg267_1, [1, 1], [1, 1], [1, 1], 1, arg32_1, arg33_1, 'relu_')
del arg266_1
del arg267_1
del arg28_1
del arg29_1
del arg30_1
del arg31_1
del buf19
buf21 = buf20
assert_size_stride(buf21, (116, 64, 56, 56), (200704, 1, 3584, 64))
del buf20
buf22 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf21, arg32_1, arg33_1, arg268_1, arg34_1, arg35_1, 0, arg269_1, [1, 1], [0, 0], [1, 1], 1, arg36_1, arg37_1, 'none')
del arg268_1
del arg269_1
del arg32_1
del arg33_1
del arg34_1
del arg35_1
del buf21
buf23 = buf22
assert_size_stride(buf23, (116, 256, 56, 56), (802816, 1, 14336, 256))
del buf22
buf24 = as_strided(buf16, (116, 256, 56, 56), (802816, 3136, 56, 1)); del buf16 # reuse
buf25 = buf24; del buf24 # reuse
buf26 = as_strided(buf14, (116, 256, 56, 56), (802816, 3136, 56, 1)); del buf14 # reuse
kernel_cpp_3(c_void_p(buf25.data_ptr()), c_void_p(buf23.data_ptr()), c_void_p(arg37_1.data_ptr()), c_void_p(arg36_1.data_ptr()), c_void_p(buf17.data_ptr()), c_void_p(arg25_1.data_ptr()), c_void_p(arg24_1.data_ptr()), c_void_p(arg38_1.data_ptr()), c_void_p(arg39_1.data_ptr()), c_void_p(buf26.data_ptr()))
del arg24_1
del arg25_1
del arg36_1
del arg37_1
del buf17
buf27 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf26, arg38_1, arg39_1, arg270_1, arg40_1, arg41_1, 0, arg271_1, [1, 1], [0, 0], [1, 1], 1, arg42_1, arg43_1, 'relu_')
del arg270_1
del arg271_1
del arg40_1
del arg41_1
buf28 = buf27
assert_size_stride(buf28, (116, 64, 56, 56), (200704, 1, 3584, 64))
del buf27
buf29 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf28, arg42_1, arg43_1, arg272_1, arg44_1, arg45_1, 0, arg273_1, [1, 1], [1, 1], [1, 1], 1, arg46_1, arg47_1, 'relu_')
del arg272_1
del arg273_1
del arg42_1
del arg43_1
del arg44_1
del arg45_1
del buf28
buf30 = buf29
assert_size_stride(buf30, (116, 64, 56, 56), (200704, 1, 3584, 64))
del buf29
buf31 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf30, arg46_1, arg47_1, arg274_1, arg48_1, arg49_1, 0, arg275_1, [1, 1], [0, 0], [1, 1], 1, arg50_1, arg51_1, 'none')
del arg274_1
del arg275_1
del arg46_1
del arg47_1
del arg48_1
del arg49_1
del buf30
buf32 = buf31
assert_size_stride(buf32, (116, 256, 56, 56), (802816, 1, 14336, 256))
del buf31
buf33 = buf25; del buf25 # reuse
buf34 = buf33; del buf33 # reuse
buf35 = as_strided(buf23, (116, 256, 56, 56), (802816, 3136, 56, 1)); del buf23 # reuse
kernel_cpp_4(c_void_p(buf34.data_ptr()), c_void_p(buf32.data_ptr()), c_void_p(arg51_1.data_ptr()), c_void_p(arg50_1.data_ptr()), c_void_p(buf26.data_ptr()), c_void_p(arg39_1.data_ptr()), c_void_p(arg38_1.data_ptr()), c_void_p(arg52_1.data_ptr()), c_void_p(arg53_1.data_ptr()), c_void_p(buf35.data_ptr()))
del arg38_1
del arg39_1
del arg50_1
del arg51_1
del buf26
del buf32
del buf34
buf36 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf35, arg52_1, arg53_1, arg276_1, arg54_1, arg55_1, 0, arg277_1, [1, 1], [0, 0], [1, 1], 1, arg56_1, arg57_1, 'relu_')
del arg276_1
del arg277_1
del arg54_1
del arg55_1
buf37 = buf36
assert_size_stride(buf37, (116, 128, 56, 56), (401408, 1, 7168, 128))
del buf36
buf38 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf37, arg56_1, arg57_1, arg278_1, arg58_1, arg59_1, 0, arg279_1, [2, 2], [1, 1], [1, 1], 1, arg60_1, arg61_1, 'relu_')
del arg278_1
del arg279_1
del arg56_1
del arg57_1
del arg58_1
del arg59_1
buf39 = buf38
assert_size_stride(buf39, (116, 128, 28, 28), (100352, 1, 3584, 128))
del buf38
buf40 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf39, arg60_1, arg61_1, arg280_1, arg62_1, arg63_1, 0, arg281_1, [1, 1], [0, 0], [1, 1], 1, arg64_1, arg65_1, 'none')
del arg280_1
del arg281_1
del arg60_1
del arg61_1
del arg62_1
del arg63_1
del buf39
buf41 = buf40
assert_size_stride(buf41, (116, 512, 28, 28), (401408, 1, 14336, 512))
del buf40
buf42 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf35, arg52_1, arg53_1, arg282_1, arg66_1, arg67_1, 0, arg283_1, [2, 2], [0, 0], [1, 1], 1, arg68_1, arg69_1, 'none')
del arg282_1
del arg283_1
del arg52_1
del arg53_1
del arg66_1
del arg67_1
del buf35
buf43 = buf42
assert_size_stride(buf43, (116, 512, 28, 28), (401408, 1, 14336, 512))
del buf42
buf44 = empty_strided((116, 512, 28, 28), (401408, 1, 14336, 512), device='cpu', dtype=torch.float32)
buf45 = buf44; del buf44 # reuse
buf46 = as_strided(buf37, (116, 512, 28, 28), (401408, 784, 28, 1)); del buf37 # reuse
kernel_cpp_5(c_void_p(buf45.data_ptr()), c_void_p(buf41.data_ptr()), c_void_p(arg65_1.data_ptr()), c_void_p(arg64_1.data_ptr()), c_void_p(buf43.data_ptr()), c_void_p(arg69_1.data_ptr()), c_void_p(arg68_1.data_ptr()), c_void_p(arg70_1.data_ptr()), c_void_p(arg71_1.data_ptr()), c_void_p(buf46.data_ptr()))
del arg64_1
del arg65_1
del arg68_1
del arg69_1
del buf41
buf47 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf46, arg70_1, arg71_1, arg284_1, arg72_1, arg73_1, 0, arg285_1, [1, 1], [0, 0], [1, 1], 1, arg74_1, arg75_1, 'relu_')
del arg284_1
del arg285_1
del arg72_1
del arg73_1
buf48 = buf47
assert_size_stride(buf48, (116, 128, 28, 28), (100352, 1, 3584, 128))
del buf47
buf49 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf48, arg74_1, arg75_1, arg286_1, arg76_1, arg77_1, 0, arg287_1, [1, 1], [1, 1], [1, 1], 1, arg78_1, arg79_1, 'relu_')
del arg286_1
del arg287_1
del arg74_1
del arg75_1
del arg76_1
del arg77_1
del buf48
buf50 = buf49
assert_size_stride(buf50, (116, 128, 28, 28), (100352, 1, 3584, 128))
del buf49
buf51 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf50, arg78_1, arg79_1, arg288_1, arg80_1, arg81_1, 0, arg289_1, [1, 1], [0, 0], [1, 1], 1, arg82_1, arg83_1, 'none')
del arg288_1
del arg289_1
del arg78_1
del arg79_1
del arg80_1
del arg81_1
del buf50
buf52 = buf51
assert_size_stride(buf52, (116, 512, 28, 28), (401408, 1, 14336, 512))
del buf51
buf53 = as_strided(buf45, (116, 512, 28, 28), (401408, 784, 28, 1)); del buf45 # reuse
buf54 = buf53; del buf53 # reuse
buf55 = as_strided(buf43, (116, 512, 28, 28), (401408, 784, 28, 1)); del buf43 # reuse
kernel_cpp_6(c_void_p(buf54.data_ptr()), c_void_p(buf52.data_ptr()), c_void_p(arg83_1.data_ptr()), c_void_p(arg82_1.data_ptr()), c_void_p(buf46.data_ptr()), c_void_p(arg71_1.data_ptr()), c_void_p(arg70_1.data_ptr()), c_void_p(arg84_1.data_ptr()), c_void_p(arg85_1.data_ptr()), c_void_p(buf55.data_ptr()))
del arg70_1
del arg71_1
del arg82_1
del arg83_1
del buf46
buf56 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf55, arg84_1, arg85_1, arg290_1, arg86_1, arg87_1, 0, arg291_1, [1, 1], [0, 0], [1, 1], 1, arg88_1, arg89_1, 'relu_')
del arg290_1
del arg291_1
del arg86_1
del arg87_1
buf57 = buf56
assert_size_stride(buf57, (116, 128, 28, 28), (100352, 1, 3584, 128))
del buf56
buf58 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf57, arg88_1, arg89_1, arg292_1, arg90_1, arg91_1, 0, arg293_1, [1, 1], [1, 1], [1, 1], 1, arg92_1, arg93_1, 'relu_')
del arg292_1
del arg293_1
del arg88_1
del arg89_1
del arg90_1
del arg91_1
del buf57
buf59 = buf58
assert_size_stride(buf59, (116, 128, 28, 28), (100352, 1, 3584, 128))
del buf58
buf60 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf59, arg92_1, arg93_1, arg294_1, arg94_1, arg95_1, 0, arg295_1, [1, 1], [0, 0], [1, 1], 1, arg96_1, arg97_1, 'none')
del arg294_1
del arg295_1
del arg92_1
del arg93_1
del arg94_1
del arg95_1
del buf59
buf61 = buf60
assert_size_stride(buf61, (116, 512, 28, 28), (401408, 1, 14336, 512))
del buf60
buf62 = buf54; del buf54 # reuse
buf63 = buf62; del buf62 # reuse
buf64 = as_strided(buf52, (116, 512, 28, 28), (401408, 784, 28, 1)); del buf52 # reuse
kernel_cpp_7(c_void_p(buf63.data_ptr()), c_void_p(buf61.data_ptr()), c_void_p(arg97_1.data_ptr()), c_void_p(arg96_1.data_ptr()), c_void_p(buf55.data_ptr()), c_void_p(arg85_1.data_ptr()), c_void_p(arg84_1.data_ptr()), c_void_p(arg98_1.data_ptr()), c_void_p(arg99_1.data_ptr()), c_void_p(buf64.data_ptr()))
del arg84_1
del arg85_1
del arg96_1
del arg97_1
del buf55
buf65 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf64, arg98_1, arg99_1, arg296_1, arg100_1, arg101_1, 0, arg297_1, [1, 1], [0, 0], [1, 1], 1, arg102_1, arg103_1, 'relu_')
del arg100_1
del arg101_1
del arg296_1
del arg297_1
buf66 = buf65
assert_size_stride(buf66, (116, 128, 28, 28), (100352, 1, 3584, 128))
del buf65
buf67 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf66, arg102_1, arg103_1, arg298_1, arg104_1, arg105_1, 0, arg299_1, [1, 1], [1, 1], [1, 1], 1, arg106_1, arg107_1, 'relu_')
del arg102_1
del arg103_1
del arg104_1
del arg105_1
del arg298_1
del arg299_1
del buf66
buf68 = buf67
assert_size_stride(buf68, (116, 128, 28, 28), (100352, 1, 3584, 128))
del buf67
buf69 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf68, arg106_1, arg107_1, arg300_1, arg108_1, arg109_1, 0, arg301_1, [1, 1], [0, 0], [1, 1], 1, arg110_1, arg111_1, 'none')
del arg106_1
del arg107_1
del arg108_1
del arg109_1
del arg300_1
del arg301_1
del buf68
buf70 = buf69
assert_size_stride(buf70, (116, 512, 28, 28), (401408, 1, 14336, 512))
del buf69
buf71 = buf63; del buf63 # reuse
buf72 = buf71; del buf71 # reuse
buf73 = as_strided(buf61, (116, 512, 28, 28), (401408, 784, 28, 1)); del buf61 # reuse
kernel_cpp_8(c_void_p(buf72.data_ptr()), c_void_p(buf70.data_ptr()), c_void_p(arg111_1.data_ptr()), c_void_p(arg110_1.data_ptr()), c_void_p(buf64.data_ptr()), c_void_p(arg99_1.data_ptr()), c_void_p(arg98_1.data_ptr()), c_void_p(arg112_1.data_ptr()), c_void_p(arg113_1.data_ptr()), c_void_p(buf73.data_ptr()))
del arg110_1
del arg111_1
del arg98_1
del arg99_1
del buf64
del buf70
del buf72
buf74 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf73, arg112_1, arg113_1, arg302_1, arg114_1, arg115_1, 0, arg303_1, [1, 1], [0, 0], [1, 1], 1, arg116_1, arg117_1, 'relu_')
del arg114_1
del arg115_1
del arg302_1
del arg303_1
buf75 = buf74
assert_size_stride(buf75, (116, 256, 28, 28), (200704, 1, 7168, 256))
del buf74
buf76 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf75, arg116_1, arg117_1, arg304_1, arg118_1, arg119_1, 0, arg305_1, [2, 2], [1, 1], [1, 1], 1, arg120_1, arg121_1, 'relu_')
del arg116_1
del arg117_1
del arg118_1
del arg119_1
del arg304_1
del arg305_1
buf77 = buf76
assert_size_stride(buf77, (116, 256, 14, 14), (50176, 1, 3584, 256))
del buf76
buf78 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf77, arg120_1, arg121_1, arg306_1, arg122_1, arg123_1, 0, arg307_1, [1, 1], [0, 0], [1, 1], 1, arg124_1, arg125_1, 'none')
del arg120_1
del arg121_1
del arg122_1
del arg123_1
del arg306_1
del arg307_1
del buf77
buf79 = buf78
assert_size_stride(buf79, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
del buf78
buf80 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf73, arg112_1, arg113_1, arg308_1, arg126_1, arg127_1, 0, arg309_1, [2, 2], [0, 0], [1, 1], 1, arg128_1, arg129_1, 'none')
del arg112_1
del arg113_1
del arg126_1
del arg127_1
del arg308_1
del arg309_1
del buf73
buf81 = buf80
assert_size_stride(buf81, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
del buf80
buf82 = as_strided(buf4, (116, 1024, 14, 14), (200704, 1, 14336, 1024)); del buf4 # reuse
buf83 = buf82; del buf82 # reuse
buf84 = as_strided(buf75, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf75 # reuse
kernel_cpp_9(c_void_p(buf83.data_ptr()), c_void_p(buf79.data_ptr()), c_void_p(arg125_1.data_ptr()), c_void_p(arg124_1.data_ptr()), c_void_p(buf81.data_ptr()), c_void_p(arg129_1.data_ptr()), c_void_p(arg128_1.data_ptr()), c_void_p(arg130_1.data_ptr()), c_void_p(arg131_1.data_ptr()), c_void_p(buf84.data_ptr()))
del arg124_1
del arg125_1
del arg128_1
del arg129_1
del buf79
buf85 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf84, arg130_1, arg131_1, arg310_1, arg132_1, arg133_1, 0, arg311_1, [1, 1], [0, 0], [1, 1], 1, arg134_1, arg135_1, 'relu_')
del arg132_1
del arg133_1
del arg310_1
del arg311_1
buf86 = buf85
assert_size_stride(buf86, (116, 256, 14, 14), (50176, 1, 3584, 256))
del buf85
buf87 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf86, arg134_1, arg135_1, arg312_1, arg136_1, arg137_1, 0, arg313_1, [1, 1], [1, 1], [1, 1], 1, arg138_1, arg139_1, 'relu_')
del arg134_1
del arg135_1
del arg136_1
del arg137_1
del arg312_1
del arg313_1
del buf86
buf88 = buf87
assert_size_stride(buf88, (116, 256, 14, 14), (50176, 1, 3584, 256))
del buf87
buf89 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf88, arg138_1, arg139_1, arg314_1, arg140_1, arg141_1, 0, arg315_1, [1, 1], [0, 0], [1, 1], 1, arg142_1, arg143_1, 'none')
del arg138_1
del arg139_1
del arg140_1
del arg141_1
del arg314_1
del arg315_1
del buf88
buf90 = buf89
assert_size_stride(buf90, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
del buf89
buf91 = as_strided(buf83, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf83 # reuse
buf92 = buf91; del buf91 # reuse
buf93 = as_strided(buf81, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf81 # reuse
kernel_cpp_10(c_void_p(buf92.data_ptr()), c_void_p(buf90.data_ptr()), c_void_p(arg143_1.data_ptr()), c_void_p(arg142_1.data_ptr()), c_void_p(buf84.data_ptr()), c_void_p(arg131_1.data_ptr()), c_void_p(arg130_1.data_ptr()), c_void_p(arg144_1.data_ptr()), c_void_p(arg145_1.data_ptr()), c_void_p(buf93.data_ptr()))
del arg130_1
del arg131_1
del arg142_1
del arg143_1
del buf84
buf94 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf93, arg144_1, arg145_1, arg316_1, arg146_1, arg147_1, 0, arg317_1, [1, 1], [0, 0], [1, 1], 1, arg148_1, arg149_1, 'relu_')
del arg146_1
del arg147_1
del arg316_1
del arg317_1
buf95 = buf94
assert_size_stride(buf95, (116, 256, 14, 14), (50176, 1, 3584, 256))
del buf94
buf96 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf95, arg148_1, arg149_1, arg318_1, arg150_1, arg151_1, 0, arg319_1, [1, 1], [1, 1], [1, 1], 1, arg152_1, arg153_1, 'relu_')
del arg148_1
del arg149_1
del arg150_1
del arg151_1
del arg318_1
del arg319_1
del buf95
buf97 = buf96
assert_size_stride(buf97, (116, 256, 14, 14), (50176, 1, 3584, 256))
del buf96
buf98 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf97, arg152_1, arg153_1, arg320_1, arg154_1, arg155_1, 0, arg321_1, [1, 1], [0, 0], [1, 1], 1, arg156_1, arg157_1, 'none')
del arg152_1
del arg153_1
del arg154_1
del arg155_1
del arg320_1
del arg321_1
del buf97
buf99 = buf98
assert_size_stride(buf99, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
del buf98
buf100 = buf92; del buf92 # reuse
buf101 = buf100; del buf100 # reuse
buf102 = as_strided(buf90, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf90 # reuse
kernel_cpp_11(c_void_p(buf101.data_ptr()), c_void_p(buf99.data_ptr()), c_void_p(arg157_1.data_ptr()), c_void_p(arg156_1.data_ptr()), c_void_p(buf93.data_ptr()), c_void_p(arg145_1.data_ptr()), c_void_p(arg144_1.data_ptr()), c_void_p(arg158_1.data_ptr()), c_void_p(arg159_1.data_ptr()), c_void_p(buf102.data_ptr()))
del arg144_1
del arg145_1
del arg156_1
del arg157_1
del buf93
buf103 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf102, arg158_1, arg159_1, arg322_1, arg160_1, arg161_1, 0, arg323_1, [1, 1], [0, 0], [1, 1], 1, arg162_1, arg163_1, 'relu_')
del arg160_1
del arg161_1
del arg322_1
del arg323_1
buf104 = buf103
assert_size_stride(buf104, (116, 256, 14, 14), (50176, 1, 3584, 256))
del buf103
buf105 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf104, arg162_1, arg163_1, arg324_1, arg164_1, arg165_1, 0, arg325_1, [1, 1], [1, 1], [1, 1], 1, arg166_1, arg167_1, 'relu_')
del arg162_1
del arg163_1
del arg164_1
del arg165_1
del arg324_1
del arg325_1
del buf104
buf106 = buf105
assert_size_stride(buf106, (116, 256, 14, 14), (50176, 1, 3584, 256))
del buf105
buf107 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf106, arg166_1, arg167_1, arg326_1, arg168_1, arg169_1, 0, arg327_1, [1, 1], [0, 0], [1, 1], 1, arg170_1, arg171_1, 'none')
del arg166_1
del arg167_1
del arg168_1
del arg169_1
del arg326_1
del arg327_1
del buf106
buf108 = buf107
assert_size_stride(buf108, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
del buf107
buf109 = buf101; del buf101 # reuse
buf110 = buf109; del buf109 # reuse
buf111 = as_strided(buf99, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf99 # reuse
kernel_cpp_12(c_void_p(buf110.data_ptr()), c_void_p(buf108.data_ptr()), c_void_p(arg171_1.data_ptr()), c_void_p(arg170_1.data_ptr()), c_void_p(buf102.data_ptr()), c_void_p(arg159_1.data_ptr()), c_void_p(arg158_1.data_ptr()), c_void_p(arg172_1.data_ptr()), c_void_p(arg173_1.data_ptr()), c_void_p(buf111.data_ptr()))
del arg158_1
del arg159_1
del arg170_1
del arg171_1
del buf102
buf112 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf111, arg172_1, arg173_1, arg328_1, arg174_1, arg175_1, 0, arg329_1, [1, 1], [0, 0], [1, 1], 1, arg176_1, arg177_1, 'relu_')
del arg174_1
del arg175_1
del arg328_1
del arg329_1
buf113 = buf112
assert_size_stride(buf113, (116, 256, 14, 14), (50176, 1, 3584, 256))
del buf112
buf114 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf113, arg176_1, arg177_1, arg330_1, arg178_1, arg179_1, 0, arg331_1, [1, 1], [1, 1], [1, 1], 1, arg180_1, arg181_1, 'relu_')
del arg176_1
del arg177_1
del arg178_1
del arg179_1
del arg330_1
del arg331_1
del buf113
buf115 = buf114
assert_size_stride(buf115, (116, 256, 14, 14), (50176, 1, 3584, 256))
del buf114
buf116 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf115, arg180_1, arg181_1, arg332_1, arg182_1, arg183_1, 0, arg333_1, [1, 1], [0, 0], [1, 1], 1, arg184_1, arg185_1, 'none')
del arg180_1
del arg181_1
del arg182_1
del arg183_1
del arg332_1
del arg333_1
del buf115
buf117 = buf116
assert_size_stride(buf117, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
del buf116
buf118 = buf110; del buf110 # reuse
buf119 = buf118; del buf118 # reuse
buf120 = as_strided(buf108, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf108 # reuse
kernel_cpp_13(c_void_p(buf119.data_ptr()), c_void_p(buf117.data_ptr()), c_void_p(arg185_1.data_ptr()), c_void_p(arg184_1.data_ptr()), c_void_p(buf111.data_ptr()), c_void_p(arg173_1.data_ptr()), c_void_p(arg172_1.data_ptr()), c_void_p(arg186_1.data_ptr()), c_void_p(arg187_1.data_ptr()), c_void_p(buf120.data_ptr()))
del arg172_1
del arg173_1
del arg184_1
del arg185_1
del buf111
buf121 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf120, arg186_1, arg187_1, arg334_1, arg188_1, arg189_1, 0, arg335_1, [1, 1], [0, 0], [1, 1], 1, arg190_1, arg191_1, 'relu_')
del arg188_1
del arg189_1
del arg334_1
del arg335_1
buf122 = buf121
assert_size_stride(buf122, (116, 256, 14, 14), (50176, 1, 3584, 256))
del buf121
buf123 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf122, arg190_1, arg191_1, arg336_1, arg192_1, arg193_1, 0, arg337_1, [1, 1], [1, 1], [1, 1], 1, arg194_1, arg195_1, 'relu_')
del arg190_1
del arg191_1
del arg192_1
del arg193_1
del arg336_1
del arg337_1
del buf122
buf124 = buf123
assert_size_stride(buf124, (116, 256, 14, 14), (50176, 1, 3584, 256))
del buf123
buf125 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf124, arg194_1, arg195_1, arg338_1, arg196_1, arg197_1, 0, arg339_1, [1, 1], [0, 0], [1, 1], 1, arg198_1, arg199_1, 'none')
del arg194_1
del arg195_1
del arg196_1
del arg197_1
del arg338_1
del arg339_1
del buf124
buf126 = buf125
assert_size_stride(buf126, (116, 1024, 14, 14), (200704, 1, 14336, 1024))
del buf125
buf127 = buf119; del buf119 # reuse
buf128 = buf127; del buf127 # reuse
buf129 = as_strided(buf117, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf117 # reuse
kernel_cpp_14(c_void_p(buf128.data_ptr()), c_void_p(buf126.data_ptr()), c_void_p(arg199_1.data_ptr()), c_void_p(arg198_1.data_ptr()), c_void_p(buf120.data_ptr()), c_void_p(arg187_1.data_ptr()), c_void_p(arg186_1.data_ptr()), c_void_p(arg200_1.data_ptr()), c_void_p(arg201_1.data_ptr()), c_void_p(buf129.data_ptr()))
del arg186_1
del arg187_1
del arg198_1
del arg199_1
del buf120
del buf126
del buf128
buf130 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf129, arg200_1, arg201_1, arg340_1, arg202_1, arg203_1, 0, arg341_1, [1, 1], [0, 0], [1, 1], 1, arg204_1, arg205_1, 'relu_')
del arg202_1
del arg203_1
del arg340_1
del arg341_1
buf131 = buf130
assert_size_stride(buf131, (116, 512, 14, 14), (100352, 1, 7168, 512))
del buf130
buf132 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf131, arg204_1, arg205_1, arg342_1, arg206_1, arg207_1, 0, arg343_1, [2, 2], [1, 1], [1, 1], 1, arg208_1, arg209_1, 'relu_')
del arg204_1
del arg205_1
del arg206_1
del arg207_1
del arg342_1
del arg343_1
buf133 = buf132
assert_size_stride(buf133, (116, 512, 7, 7), (25088, 1, 3584, 512))
del buf132
buf134 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf133, arg208_1, arg209_1, arg344_1, arg210_1, arg211_1, 0, arg345_1, [1, 1], [0, 0], [1, 1], 1, arg212_1, arg213_1, 'none')
del arg208_1
del arg209_1
del arg210_1
del arg211_1
del arg344_1
del arg345_1
del buf133
buf135 = buf134
assert_size_stride(buf135, (116, 2048, 7, 7), (100352, 1, 14336, 2048))
del buf134
buf136 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf129, arg200_1, arg201_1, arg346_1, arg214_1, arg215_1, 0, arg347_1, [2, 2], [0, 0], [1, 1], 1, arg216_1, arg217_1, 'none')
del arg200_1
del arg201_1
del arg214_1
del arg215_1
del arg346_1
del arg347_1
del buf129
buf137 = buf136
assert_size_stride(buf137, (116, 2048, 7, 7), (100352, 1, 14336, 2048))
del buf136
buf138 = empty_strided((116, 2048, 7, 7), (100352, 1, 14336, 2048), device='cpu', dtype=torch.float32)
buf139 = buf138; del buf138 # reuse
buf140 = as_strided(buf131, (116, 2048, 7, 7), (100352, 49, 7, 1)); del buf131 # reuse
kernel_cpp_15(c_void_p(buf139.data_ptr()), c_void_p(buf135.data_ptr()), c_void_p(arg213_1.data_ptr()), c_void_p(arg212_1.data_ptr()), c_void_p(buf137.data_ptr()), c_void_p(arg217_1.data_ptr()), c_void_p(arg216_1.data_ptr()), c_void_p(arg218_1.data_ptr()), c_void_p(arg219_1.data_ptr()), c_void_p(buf140.data_ptr()))
del arg212_1
del arg213_1
del arg216_1
del arg217_1
del buf135
buf141 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf140, arg218_1, arg219_1, arg348_1, arg220_1, arg221_1, 0, arg349_1, [1, 1], [0, 0], [1, 1], 1, arg222_1, arg223_1, 'relu_')
del arg220_1
del arg221_1
del arg348_1
del arg349_1
buf142 = buf141
assert_size_stride(buf142, (116, 512, 7, 7), (25088, 1, 3584, 512))
del buf141
buf143 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf142, arg222_1, arg223_1, arg350_1, arg224_1, arg225_1, 0, arg351_1, [1, 1], [1, 1], [1, 1], 1, arg226_1, arg227_1, 'relu_')
del arg222_1
del arg223_1
del arg224_1
del arg225_1
del arg350_1
del arg351_1
del buf142
buf144 = buf143
assert_size_stride(buf144, (116, 512, 7, 7), (25088, 1, 3584, 512))
del buf143
buf145 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf144, arg226_1, arg227_1, arg352_1, arg228_1, arg229_1, 0, arg353_1, [1, 1], [0, 0], [1, 1], 1, arg230_1, arg231_1, 'none')
del arg226_1
del arg227_1
del arg228_1
del arg229_1
del arg352_1
del arg353_1
del buf144
buf146 = buf145
assert_size_stride(buf146, (116, 2048, 7, 7), (100352, 1, 14336, 2048))
del buf145
buf147 = as_strided(buf139, (116, 2048, 7, 7), (100352, 49, 7, 1)); del buf139 # reuse
buf148 = buf147; del buf147 # reuse
buf149 = as_strided(buf137, (116, 2048, 7, 7), (100352, 49, 7, 1)); del buf137 # reuse
kernel_cpp_16(c_void_p(buf148.data_ptr()), c_void_p(buf146.data_ptr()), c_void_p(arg231_1.data_ptr()), c_void_p(arg230_1.data_ptr()), c_void_p(buf140.data_ptr()), c_void_p(arg219_1.data_ptr()), c_void_p(arg218_1.data_ptr()), c_void_p(arg232_1.data_ptr()), c_void_p(arg233_1.data_ptr()), c_void_p(buf149.data_ptr()))
del arg218_1
del arg219_1
del arg230_1
del arg231_1
del buf140
del buf146
buf150 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf149, arg232_1, arg233_1, arg354_1, arg234_1, arg235_1, 0, arg355_1, [1, 1], [0, 0], [1, 1], 1, arg236_1, arg237_1, 'relu_')
del arg234_1
del arg235_1
del arg354_1
del arg355_1
buf151 = buf150
assert_size_stride(buf151, (116, 512, 7, 7), (25088, 1, 3584, 512))
del buf150
buf152 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf151, arg236_1, arg237_1, arg356_1, arg238_1, arg239_1, 0, arg357_1, [1, 1], [1, 1], [1, 1], 1, arg240_1, arg241_1, 'relu_')
del arg236_1
del arg237_1
del arg238_1
del arg239_1
del arg356_1
del arg357_1
del buf151
buf153 = buf152
assert_size_stride(buf153, (116, 512, 7, 7), (25088, 1, 3584, 512))
del buf152
buf154 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf153, arg240_1, arg241_1, arg358_1, arg242_1, arg243_1, 0, arg359_1, [1, 1], [0, 0], [1, 1], 1, arg244_1, arg245_1, 'none')
del arg240_1
del arg241_1
del arg242_1
del arg243_1
del arg358_1
del arg359_1
del buf153
buf155 = buf154
assert_size_stride(buf155, (116, 2048, 7, 7), (100352, 1, 14336, 2048))
del buf154
buf156 = buf148; del buf148 # reuse
buf157 = buf156; del buf156 # reuse
buf158 = empty_strided((116, 2048, 1, 1), (2048, 1, 237568, 237568), device='cpu', dtype=torch.float32)
buf159 = empty_strided((116, 2048), (2048, 1), device='cpu', dtype=torch.uint8)
kernel_cpp_17(c_void_p(buf157.data_ptr()), c_void_p(buf155.data_ptr()), c_void_p(arg245_1.data_ptr()), c_void_p(arg244_1.data_ptr()), c_void_p(buf149.data_ptr()), c_void_p(arg233_1.data_ptr()), c_void_p(arg232_1.data_ptr()), c_void_p(arg246_1.data_ptr()), c_void_p(arg247_1.data_ptr()), c_void_p(arg248_1.data_ptr()), c_void_p(arg249_1.data_ptr()), c_void_p(buf158.data_ptr()), c_void_p(buf159.data_ptr()))
del arg232_1
del arg233_1
del arg244_1
del arg245_1
del arg246_1
del arg247_1
del buf149
del buf155
del buf157
del buf158
buf160 = torch.ops.quantized_decomposed.linear_unary_inductor.tensor(buf159, arg248_1, arg249_1, arg360_1, arg250_1, arg251_1, 0, arg361_1, arg252_1, arg253_1, 'none')
del arg248_1
del arg249_1
del arg250_1
del arg251_1
del arg360_1
del arg361_1
del buf159
buf161 = buf160
assert_size_stride(buf161, (116, 1000), (1000, 1))
del buf160
buf162 = empty_strided((116, 1000), (1000, 1), device='cpu', dtype=torch.float32)
kernel_cpp_18(c_void_p(buf161.data_ptr()), c_void_p(arg253_1.data_ptr()), c_void_p(arg252_1.data_ptr()), c_void_p(buf162.data_ptr()))
del arg252_1
del arg253_1
return (buf162, )
# Stand-alone benchmark entry point for this generated module.
# When the file is executed directly, the statements below create one tensor
# per argN_1 name via rand_strided and then pass a lambda that invokes
# call([...]) on those tensors to print_performance.
# NOTE(review): the first two positional arguments of rand_strided look like
# (size, stride), mirroring empty_strided used in call() - confirm.
if __name__ == "__main__":
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
arg0_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg1_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg2_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg3_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
arg4_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg5_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg6_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg7_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg8_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg9_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
arg10_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg11_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg12_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg13_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
arg14_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg15_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg16_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg17_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg18_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg19_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg20_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg21_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg22_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg23_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg24_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg25_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg26_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg27_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
arg28_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg29_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg30_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg31_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
arg32_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg33_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg34_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg35_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg36_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg37_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg38_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg39_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg40_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg41_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
arg42_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg43_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg44_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg45_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64)
arg46_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg47_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg48_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg49_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg50_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg51_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg52_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg53_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg54_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg55_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
arg56_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg57_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg58_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg59_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
arg60_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg61_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg62_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg63_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
arg64_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg65_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg66_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg67_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
arg68_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg69_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg70_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg71_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg72_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg73_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
arg74_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg75_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg76_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg77_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
arg78_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg79_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg80_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg81_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
arg82_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg83_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg84_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg85_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg86_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg87_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
arg88_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg89_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg90_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg91_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
arg92_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg93_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg94_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg95_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
arg96_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg97_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg98_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg99_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg100_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg101_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
arg102_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg103_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg104_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg105_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64)
arg106_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg107_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg108_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg109_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
arg110_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg111_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg112_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg113_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg114_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg115_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg116_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg117_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg118_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg119_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg120_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg121_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg122_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg123_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
arg124_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg125_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg126_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg127_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
arg128_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg129_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg130_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg131_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg132_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg133_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg134_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg135_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg136_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg137_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg138_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg139_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg140_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg141_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
arg142_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg143_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg144_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg145_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg146_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg147_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg148_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg149_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg150_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg151_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg152_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg153_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg154_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg155_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
arg156_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg157_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg158_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg159_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg160_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg161_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg162_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg163_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg164_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg165_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg166_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg167_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg168_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg169_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
arg170_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg171_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg172_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg173_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg174_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg175_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg176_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg177_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg178_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg179_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg180_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg181_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg182_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg183_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
arg184_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg185_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg186_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg187_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg188_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg189_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg190_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg191_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg192_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg193_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64)
arg194_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg195_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg196_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg197_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64)
arg198_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg199_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg200_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg201_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg202_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg203_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
arg204_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg205_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg206_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg207_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
arg208_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg209_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg210_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg211_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64)
arg212_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg213_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg214_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg215_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64)
arg216_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg217_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg218_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg219_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg220_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg221_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
arg222_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg223_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg224_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg225_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
arg226_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg227_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg228_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg229_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64)
arg230_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg231_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg232_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg233_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg234_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg235_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
arg236_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg237_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg238_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg239_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64)
arg240_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg241_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg242_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg243_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64)
arg244_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg245_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg246_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg247_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg248_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg249_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
arg250_1 = rand_strided((1000, ), (1, ), device='cpu', dtype=torch.float32)
arg251_1 = rand_strided((1000, ), (1, ), device='cpu', dtype=torch.int64)
arg252_1 = rand_strided((), (), device='cpu', dtype=torch.float32)
arg253_1 = rand_strided((), (), device='cpu', dtype=torch.int64)
# From arg254_1 through arg361_1 the inputs come in pairs: an int8 tensor
# followed by a float32 tensor. The int8 tensors are requested with a stride
# of 1 on the first dimension and 0 on all remaining dimensions.
# NOTE(review): presumably these are quantized weights plus a per-output-channel
# float32 companion for each conv/linear op in call() - verify against the
# conv_unary_inductor / linear_unary_inductor signatures.
arg254_1 = rand_strided((64, 3, 7, 7), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg255_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg256_1 = rand_strided((64, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg257_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg258_1 = rand_strided((64, 64, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg259_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg260_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg261_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg262_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg263_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg264_1 = rand_strided((64, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg265_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg266_1 = rand_strided((64, 64, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg267_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg268_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg269_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg270_1 = rand_strided((64, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg271_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg272_1 = rand_strided((64, 64, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg273_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
arg274_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg275_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg276_1 = rand_strided((128, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg277_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg278_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg279_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg280_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg281_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg282_1 = rand_strided((512, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg283_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg284_1 = rand_strided((128, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg285_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg286_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg287_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg288_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg289_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg290_1 = rand_strided((128, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg291_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg292_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg293_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg294_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg295_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg296_1 = rand_strided((128, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg297_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg298_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg299_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
arg300_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg301_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg302_1 = rand_strided((256, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg303_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg304_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg305_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg306_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg307_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg308_1 = rand_strided((1024, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg309_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg310_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg311_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg312_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg313_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg314_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg315_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg316_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg317_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg318_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg319_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg320_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg321_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
# Benchmark inputs arg322_1 .. arg362_1: each is built by rand_strided(size, stride,
# device='cpu', dtype=...) and later handed, in index order, to the call(...) benchmark
# invocation that follows these assignments.
# rand_strided is not defined in this part of the file (presumably the helper from
# torch._dynamo.testing - TODO confirm against the import near the start of the harness).
# The int8 tensors alternate with 1-D float32 tensors whose length equals the int8
# tensor's first dimension; presumably these are quantized weights and their per-output
# channel companions (bias or scale) - verify against the graph that produced this file.
# NOTE(review): every int8 tensor is declared with stride (1, 0, 0, 0) (or (1, 0) for the
# 2-D case), i.e. zero stride on dimensions larger than 1. That looks like a placeholder
# layout emitted for prepacked/opaque weights rather than a real dense layout - confirm
# before relying on the contents of these tensors.
arg322_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg323_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg324_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg325_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg326_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg327_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg328_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg329_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg330_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg331_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg332_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg333_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg334_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg335_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg336_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg337_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
arg338_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg339_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
arg340_1 = rand_strided((512, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg341_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg342_1 = rand_strided((512, 512, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg343_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg344_1 = rand_strided((2048, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg345_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg346_1 = rand_strided((2048, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg347_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg348_1 = rand_strided((512, 2048, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg349_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg350_1 = rand_strided((512, 512, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg351_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg352_1 = rand_strided((2048, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg353_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
arg354_1 = rand_strided((512, 2048, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg355_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg356_1 = rand_strided((512, 512, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg357_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
arg358_1 = rand_strided((2048, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8)
arg359_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32)
# The only 2-D pair in this run: a (2048, 1000) int8 tensor and a (1, 1000) float32
# tensor (presumably the final linear layer's weight and bias - TODO confirm).
arg360_1 = rand_strided((2048, 1000), (1, 0), device='cpu', dtype=torch.int8)
arg361_1 = rand_strided((1, 1000), (1, 0), device='cpu', dtype=torch.float32)
# The only dense float32 4-D tensor here: size (116, 3, 224, 224) with contiguous strides
# (presumably the image batch fed to the model, batch size 116 - verify against caller).
arg362_1 = rand_strided((116, 3, 224, 224), (150528, 50176, 224, 1), device='cpu', dtype=torch.float32)
# Hand print_performance a zero-argument lambda that invokes call(...) with a single list
# holding arg0_1 .. arg362_1 in index order (363 tensors). The lambda defers the
# invocation, so print_performance decides when and how often call runs.
# Both call and print_performance are defined outside this part of the file
# (print_performance is presumably the timing helper from torch._inductor.utils -
# TODO confirm). The list literal is wrapped across two physical lines inside the
# brackets, which Python treats as one statement.
print_performance(lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1, arg23_1, arg24_1, arg25_1, arg26_1, arg27_1, arg28_1, arg29_1, arg30_1, arg31_1, arg32_1, arg33_1, arg34_1, arg35_1, arg36_1, arg37_1, arg38_1, arg39_1, arg40_1, arg41_1, arg42_1, arg43_1, arg44_1, arg45_1, arg46_1, arg47_1, arg48_1, arg49_1, arg50_1, arg51_1, arg52_1, arg53_1, arg54_1, arg55_1, arg56_1, arg57_1, arg58_1, arg59_1, arg60_1, arg61_1, arg62_1, arg63_1, arg64_1, arg65_1, arg66_1, arg67_1, arg68_1, arg69_1, arg70_1, arg71_1, arg72_1, arg73_1, arg74_1, arg75_1, arg76_1, arg77_1, arg78_1, arg79_1, arg80_1, arg81_1, arg82_1, arg83_1, arg84_1, arg85_1, arg86_1, arg87_1, arg88_1, arg89_1, arg90_1, arg91_1, arg92_1, arg93_1, arg94_1, arg95_1, arg96_1, arg97_1, arg98_1, arg99_1, arg100_1, arg101_1, arg102_1, arg103_1, arg104_1, arg105_1, arg106_1, arg107_1, arg108_1, arg109_1, arg110_1, arg111_1, arg112_1, arg113_1, arg114_1, arg115_1, arg116_1, arg117_1, arg118_1, arg119_1, arg120_1, arg121_1, arg122_1, arg123_1, arg124_1, arg125_1, arg126_1, arg127_1, arg128_1, arg129_1, arg130_1, arg131_1, arg132_1, arg133_1, arg134_1, arg135_1, arg136_1, arg137_1, arg138_1, arg139_1, arg140_1, arg141_1, arg142_1, arg143_1, arg144_1, arg145_1, arg146_1, arg147_1, arg148_1, arg149_1, arg150_1, arg151_1, arg152_1, arg153_1, arg154_1, arg155_1, arg156_1, arg157_1, arg158_1, arg159_1, arg160_1, arg161_1, arg162_1, arg163_1, arg164_1, arg165_1, arg166_1, arg167_1, arg168_1, arg169_1, arg170_1, arg171_1, arg172_1, arg173_1, arg174_1, arg175_1, arg176_1, arg177_1, arg178_1, arg179_1, arg180_1, arg181_1, arg182_1, arg183_1, arg184_1, arg185_1, arg186_1, arg187_1, arg188_1, arg189_1, arg190_1, arg191_1, arg192_1, arg193_1, arg194_1, arg195_1, arg196_1, arg197_1, arg198_1, arg199_1, arg200_1, arg201_1, arg202_1, arg203_1, arg204_1, arg205_1, arg206_1, 
arg207_1, arg208_1, arg209_1, arg210_1, arg211_1, arg212_1, arg213_1, arg214_1, arg215_1, arg216_1, arg217_1, arg218_1, arg219_1, arg220_1, arg221_1, arg222_1, arg223_1, arg224_1, arg225_1, arg226_1, arg227_1, arg228_1, arg229_1, arg230_1, arg231_1, arg232_1, arg233_1, arg234_1, arg235_1, arg236_1, arg237_1, arg238_1, arg239_1, arg240_1, arg241_1, arg242_1, arg243_1, arg244_1, arg245_1, arg246_1, arg247_1, arg248_1, arg249_1, arg250_1, arg251_1, arg252_1, arg253_1, arg254_1, arg255_1, arg256_1, arg257_1, arg258_1, arg259_1, arg260_1, arg261_1, arg262_1, arg263_1, arg264_1, arg265_1, arg266_1, arg267_1, arg268_1, arg269_1, arg270_1, arg271_1, arg272_1, arg273_1, arg274_1, arg275_1, arg276_1, arg277_1, arg278_1, arg279_1, arg280_1, arg281_1, arg282_1, arg283_1, arg284_1, arg285_1, arg286_1, arg287_1, arg288_1, arg289_1, arg290_1, arg291_1, arg292_1, arg293_1, arg294_1, arg295_1, arg296_1, arg297_1, arg298_1, arg299_1, arg300_1, arg301_1, arg302_1, arg303_1, arg304_1, arg305_1, arg306_1, arg307_1, arg308_1, arg309_1, arg310_1, arg311_1, arg312_1, arg313_1, arg314_1, arg315_1, arg316_1, arg317_1, arg318_1, arg319_1, arg320_1, arg321_1, arg322_1, arg323_1, arg324_1, arg325_1, arg326_1, arg327_1, arg328_1, arg329_1, arg330_1, arg331_1, arg332_1, arg333_1, arg334_1, arg335_1, arg336_1, arg337_1, arg338_1, arg339_1, arg340_1, arg341_1, arg342_1, arg343_1, arg344_1, arg345_1, arg346_1, arg347_1, arg348_1, arg349_1, arg350_1, arg351_1, arg352_1, arg353_1, arg354_1, arg355_1, arg356_1, arg357_1, arg358_1, arg359_1, arg360_1, arg361_1, arg362_1]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment