Created
February 21, 2023 02:54
-
-
Save leslie-fang-intel/2948a9f58cac3c42f1459269f6973463 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from ctypes import c_void_p, c_long | |
| import torch | |
| import random | |
| from torch import empty_strided, as_strided, device | |
| from torch._inductor.codecache import AsyncCompile | |
| from torch._inductor.select_algorithm import extern_kernels | |
# Short local alias for the torch.ops.aten operator namespace.
| aten = torch.ops.aten | |
# Local alias for torch._C._dynamo.guards.assert_size_stride.
| assert_size_stride = torch._C._dynamo.guards.assert_size_stride | |
# Single AsyncCompile instance; every kernel_cpp_N below is created through its cpp() method.
| async_compile = AsyncCompile() | |
# kernel_cpp_0: element-wise float -> unsigned char quantization over 17461248 elements.
#   out_ptr0[i] = (unsigned char) max(0, min(127, nearbyint(in_ptr0[i] / in_ptr1[0]) + (float) in_ptr2[0]))
# in_ptr1[0] (float) and in_ptr2[0] (long) are read as scalars -- presumably the quantization
# scale and zero point; confirm against the caller.
# The (x != x) guards pass NaN through both clamp steps unchanged, so a NaN reaches the final cast.
# NOTE(review): converting NaN to unsigned char is undefined behaviour in C++ -- confirm inputs are NaN-free.
# The loop is shared across a fixed team of 28 OpenMP threads.
| kernel_cpp_0 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(const float* __restrict__ in_ptr0, | |
| const float* __restrict__ in_ptr1, | |
| const long* __restrict__ in_ptr2, | |
| unsigned char* __restrict__ out_ptr0) | |
| { | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<17461248; i0+=1) | |
| { | |
| auto tmp2 = in_ptr0[i0]; | |
| auto tmp3 = in_ptr1[0]; | |
| auto tmp6 = in_ptr2[0]; | |
| auto tmp0 = static_cast<float>(0); | |
| auto tmp1 = static_cast<float>(127); | |
| auto tmp4 = tmp2 / tmp3; | |
| auto tmp5 = std::nearbyint(tmp4); | |
| auto tmp7 = static_cast<float>(tmp6); | |
| auto tmp8 = tmp5 + tmp7; | |
| auto tmp9 = (tmp8 != tmp8) ? tmp8 : std::min(tmp1, tmp8); | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::max(tmp0, tmp9); | |
| auto tmp11 = static_cast<unsigned char>(tmp10); | |
| out_ptr0[i0] = tmp11; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_1: three stages inside one 28-thread OpenMP parallel region. No `nowait` clause is
# used, so each `omp for` finishes (implicit barrier) before the next stage reads its output.
#   Stage 1 (dequantize, 93126656 elements):
#       out_ptr0[i] = ((float) in_ptr0[i] - (float) in_ptr1[0]) * in_ptr2[0]
#   Stage 2 (3x3 window max, stride 2, window origin shifted by -1 in both directions):
#       the index arithmetic treats out_ptr0 as [116][112][112][64] (64 innermost) and reads the nine
#       taps at rows 2*i2-1..2*i2+1 and columns 2*i3-1..2*i3+1. A tap whose row or column falls
#       outside [0, 112) is replaced by -infinity instead of being loaded. The running maximum uses
#       (x != x) guards around std::max. The result is stored in out_ptr1 as [116][56][56][64].
#   Stage 3 (quantize with transpose):
#       reads out_ptr1[i1 + 64*i2 + 200704*i0], computes
#       max(0, min(127, nearbyint(x / in_ptr3[0]) + (float) in_ptr4[0])) and stores it as unsigned
#       char at out_ptr2[i2 + 3136*i1 + 200704*i0], i.e. the 64 and 3136 dimensions are swapped.
# in_ptr1/in_ptr2 and in_ptr3/in_ptr4 are presumably (zero point, scale) and (scale, zero point)
# of the input and output quantization -- confirm against the caller.
# NOTE(review): as in stage 3 above, a NaN survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++ -- confirm NaN cannot occur here.
| kernel_cpp_1 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const float* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| float* __restrict__ out_ptr0, | |
| float* __restrict__ out_ptr1, | |
| unsigned char* __restrict__ out_ptr2) | |
| { | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<93126656; i0+=1) | |
| { | |
| auto tmp0 = in_ptr0[i0]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| out_ptr0[i0] = tmp6; | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<64; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<56; i2+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i3=0; i3<56; i3+=1) | |
| { | |
| auto tmp0 = static_cast<long>((-1) + (2*i2)); | |
| auto tmp1 = static_cast<long>(0); | |
| auto tmp2 = tmp0 >= tmp1; | |
| auto tmp3 = static_cast<long>(112); | |
| auto tmp4 = tmp0 < tmp3; | |
| auto tmp5 = tmp2 & tmp4; | |
| auto tmp6 = static_cast<long>((-1) + (2*i3)); | |
| auto tmp7 = tmp6 >= tmp1; | |
| auto tmp8 = tmp6 < tmp3; | |
| auto tmp9 = tmp7 & tmp8; | |
| auto tmp10 = tmp5 & tmp9; | |
| auto tmp11 = [&] | |
| { | |
| auto tmp12 = out_ptr0[(-7232) + i1 + (128*i3) + (14336*i2) + (802816*i0)]; | |
| return tmp12; | |
| } | |
| ; | |
| auto tmp13 = tmp10 ? tmp11() : -std::numeric_limits<decltype(tmp11())>::infinity(); | |
| auto tmp14 = static_cast<long>(2*i3); | |
| auto tmp15 = tmp14 >= tmp1; | |
| auto tmp16 = tmp14 < tmp3; | |
| auto tmp17 = tmp15 & tmp16; | |
| auto tmp18 = tmp5 & tmp17; | |
| auto tmp19 = [&] | |
| { | |
| auto tmp20 = out_ptr0[(-7168) + i1 + (128*i3) + (14336*i2) + (802816*i0)]; | |
| return tmp20; | |
| } | |
| ; | |
| auto tmp21 = tmp18 ? tmp19() : -std::numeric_limits<decltype(tmp19())>::infinity(); | |
| auto tmp22 = (tmp13 != tmp13) ? tmp13 : std::max(tmp21, tmp13); | |
| auto tmp23 = static_cast<long>(1 + (2*i3)); | |
| auto tmp24 = tmp23 >= tmp1; | |
| auto tmp25 = tmp23 < tmp3; | |
| auto tmp26 = tmp24 & tmp25; | |
| auto tmp27 = tmp5 & tmp26; | |
| auto tmp28 = [&] | |
| { | |
| auto tmp29 = out_ptr0[(-7104) + i1 + (128*i3) + (14336*i2) + (802816*i0)]; | |
| return tmp29; | |
| } | |
| ; | |
| auto tmp30 = tmp27 ? tmp28() : -std::numeric_limits<decltype(tmp28())>::infinity(); | |
| auto tmp31 = (tmp22 != tmp22) ? tmp22 : std::max(tmp30, tmp22); | |
| auto tmp32 = static_cast<long>(2*i2); | |
| auto tmp33 = tmp32 >= tmp1; | |
| auto tmp34 = tmp32 < tmp3; | |
| auto tmp35 = tmp33 & tmp34; | |
| auto tmp36 = tmp35 & tmp9; | |
| auto tmp37 = [&] | |
| { | |
| auto tmp38 = out_ptr0[(-64) + i1 + (128*i3) + (14336*i2) + (802816*i0)]; | |
| return tmp38; | |
| } | |
| ; | |
| auto tmp39 = tmp36 ? tmp37() : -std::numeric_limits<decltype(tmp37())>::infinity(); | |
| auto tmp40 = (tmp31 != tmp31) ? tmp31 : std::max(tmp39, tmp31); | |
| auto tmp41 = tmp35 & tmp17; | |
| auto tmp42 = [&] | |
| { | |
| auto tmp43 = out_ptr0[i1 + (128*i3) + (14336*i2) + (802816*i0)]; | |
| return tmp43; | |
| } | |
| ; | |
| auto tmp44 = tmp41 ? tmp42() : -std::numeric_limits<decltype(tmp42())>::infinity(); | |
| auto tmp45 = (tmp40 != tmp40) ? tmp40 : std::max(tmp44, tmp40); | |
| auto tmp46 = tmp35 & tmp26; | |
| auto tmp47 = [&] | |
| { | |
| auto tmp48 = out_ptr0[64 + i1 + (128*i3) + (14336*i2) + (802816*i0)]; | |
| return tmp48; | |
| } | |
| ; | |
| auto tmp49 = tmp46 ? tmp47() : -std::numeric_limits<decltype(tmp47())>::infinity(); | |
| auto tmp50 = (tmp45 != tmp45) ? tmp45 : std::max(tmp49, tmp45); | |
| auto tmp51 = static_cast<long>(1 + (2*i2)); | |
| auto tmp52 = tmp51 >= tmp1; | |
| auto tmp53 = tmp51 < tmp3; | |
| auto tmp54 = tmp52 & tmp53; | |
| auto tmp55 = tmp54 & tmp9; | |
| auto tmp56 = [&] | |
| { | |
| auto tmp57 = out_ptr0[7104 + i1 + (128*i3) + (14336*i2) + (802816*i0)]; | |
| return tmp57; | |
| } | |
| ; | |
| auto tmp58 = tmp55 ? tmp56() : -std::numeric_limits<decltype(tmp56())>::infinity(); | |
| auto tmp59 = (tmp50 != tmp50) ? tmp50 : std::max(tmp58, tmp50); | |
| auto tmp60 = tmp54 & tmp17; | |
| auto tmp61 = [&] | |
| { | |
| auto tmp62 = out_ptr0[7168 + i1 + (128*i3) + (14336*i2) + (802816*i0)]; | |
| return tmp62; | |
| } | |
| ; | |
| auto tmp63 = tmp60 ? tmp61() : -std::numeric_limits<decltype(tmp61())>::infinity(); | |
| auto tmp64 = (tmp59 != tmp59) ? tmp59 : std::max(tmp63, tmp59); | |
| auto tmp65 = tmp54 & tmp26; | |
| auto tmp66 = [&] | |
| { | |
| auto tmp67 = out_ptr0[7232 + i1 + (128*i3) + (14336*i2) + (802816*i0)]; | |
| return tmp67; | |
| } | |
| ; | |
| auto tmp68 = tmp65 ? tmp66() : -std::numeric_limits<decltype(tmp66())>::infinity(); | |
| auto tmp69 = (tmp64 != tmp64) ? tmp64 : std::max(tmp68, tmp64); | |
| out_ptr1[i1 + (64*i3) + (3584*i2) + (200704*i0)] = tmp69; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<64; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<3136; i2+=1) | |
| { | |
| auto tmp2 = out_ptr1[i1 + (64*i2) + (200704*i0)]; | |
| auto tmp3 = in_ptr3[0]; | |
| auto tmp6 = in_ptr4[0]; | |
| auto tmp0 = static_cast<float>(0); | |
| auto tmp1 = static_cast<float>(127); | |
| auto tmp4 = tmp2 / tmp3; | |
| auto tmp5 = std::nearbyint(tmp4); | |
| auto tmp7 = static_cast<float>(tmp6); | |
| auto tmp8 = tmp5 + tmp7; | |
| auto tmp9 = (tmp8 != tmp8) ? tmp8 : std::min(tmp1, tmp8); | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::max(tmp0, tmp9); | |
| auto tmp11 = static_cast<unsigned char>(tmp10); | |
| out_ptr2[i2 + (3136*i1) + (200704*i0)] = tmp11; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_2: two stages in one 28-thread OpenMP parallel region; in_out_ptr0 (aliased as
# out_ptr0) is written by stage 1 and re-read by stage 2 after the implicit `omp for` barrier.
#   Stage 1 (linear, 93126656 elements): sum of two dequantized unsigned char inputs,
#       out_ptr0[i] = ((float) in_ptr0[i] - in_ptr1[0]) * in_ptr2[0]
#                   + ((float) in_ptr3[i] - in_ptr4[0]) * in_ptr5[0]
#   Stage 2 (116 x 256 x 3136): reads out_ptr0[i1 + 256*i2 + 802816*i0], multiplies the value by the
#       0/1 result of (x > 0), then stores
#       (unsigned char) max(0, min(127, nearbyint(x / in_ptr6[0]) + (float) in_ptr7[0]))
#       at out_ptr1[i2 + 3136*i1 + 802816*i0], i.e. the 256 and 3136 dimensions are swapped.
# The scalar pairs are presumably quantization zero points / scales -- confirm against the caller.
# NOTE(review): x * (x > 0) evaluates to NaN (not 0) when x is -infinity; presumably unreachable
# with finite scales -- confirm. A NaN also survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++.
| kernel_cpp_2 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<93126656; i0+=1) | |
| { | |
| auto tmp0 = in_ptr0[i0]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i0]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i0] = tmp14; | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<256; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<3136; i2+=1) | |
| { | |
| auto tmp0 = out_ptr0[i1 + (256*i2) + (802816*i0)]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i2 + (3136*i1) + (802816*i0)] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_3: two stages in one 28-thread OpenMP parallel region; in_out_ptr0 (aliased as
# out_ptr0) is written by stage 1 and re-read by stage 2 after the implicit `omp for` barrier.
#   Stage 1 (116 x 256 x 3136): sum of two dequantized unsigned char inputs, where in_ptr0 is read
#       at i1 + 256*i2 + 802816*i0 and in_ptr3 at i2 + 3136*i1 + 802816*i0 (the two inputs use
#       swapped inner-dimension orders); the sum
#       ((float) a - in_ptr1[0]) * in_ptr2[0] + ((float) b - in_ptr4[0]) * in_ptr5[0]
#       is stored at out_ptr0[i2 + 3136*i1 + 802816*i0].
#   Stage 2 (linear, 93126656 elements): multiplies out_ptr0[i] by the 0/1 result of (x > 0), then
#       stores (unsigned char) max(0, min(127, nearbyint(x / in_ptr6[0]) + (float) in_ptr7[0]))
#       at out_ptr1[i].
# The scalar pairs are presumably quantization zero points / scales -- confirm against the caller.
# NOTE(review): x * (x > 0) evaluates to NaN (not 0) when x is -infinity; presumably unreachable
# with finite scales -- confirm. A NaN also survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++.
| kernel_cpp_3 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<256; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<3136; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (256*i2) + (802816*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (3136*i1) + (802816*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (3136*i1) + (802816*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<93126656; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i0] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_4: two stages in one 28-thread OpenMP parallel region; in_out_ptr0 (aliased as
# out_ptr0) is written by stage 1 and re-read by stage 2 after the implicit `omp for` barrier.
# Loop bounds and index arithmetic are the same as in kernel_cpp_3.
#   Stage 1 (116 x 256 x 3136): sum of two dequantized unsigned char inputs, where in_ptr0 is read
#       at i1 + 256*i2 + 802816*i0 and in_ptr3 at i2 + 3136*i1 + 802816*i0; the sum
#       ((float) a - in_ptr1[0]) * in_ptr2[0] + ((float) b - in_ptr4[0]) * in_ptr5[0]
#       is stored at out_ptr0[i2 + 3136*i1 + 802816*i0].
#   Stage 2 (linear, 93126656 elements): multiplies out_ptr0[i] by the 0/1 result of (x > 0), then
#       stores (unsigned char) max(0, min(127, nearbyint(x / in_ptr6[0]) + (float) in_ptr7[0]))
#       at out_ptr1[i].
# The scalar pairs are presumably quantization zero points / scales -- confirm against the caller.
# NOTE(review): x * (x > 0) evaluates to NaN (not 0) when x is -infinity; presumably unreachable
# with finite scales -- confirm. A NaN also survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++.
| kernel_cpp_4 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<256; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<3136; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (256*i2) + (802816*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (3136*i1) + (802816*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (3136*i1) + (802816*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<93126656; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i0] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_5: two stages in one 28-thread OpenMP parallel region; in_out_ptr0 (aliased as
# out_ptr0) is written by stage 1 and re-read by stage 2 after the implicit `omp for` barrier.
#   Stage 1 (linear, 46563328 elements): sum of two dequantized unsigned char inputs,
#       out_ptr0[i] = ((float) in_ptr0[i] - in_ptr1[0]) * in_ptr2[0]
#                   + ((float) in_ptr3[i] - in_ptr4[0]) * in_ptr5[0]
#   Stage 2 (116 x 512 x 784): reads out_ptr0[i1 + 512*i2 + 401408*i0], multiplies the value by the
#       0/1 result of (x > 0), then stores
#       (unsigned char) max(0, min(127, nearbyint(x / in_ptr6[0]) + (float) in_ptr7[0]))
#       at out_ptr1[i2 + 784*i1 + 401408*i0], i.e. the 512 and 784 dimensions are swapped.
# The scalar pairs are presumably quantization zero points / scales -- confirm against the caller.
# NOTE(review): x * (x > 0) evaluates to NaN (not 0) when x is -infinity; presumably unreachable
# with finite scales -- confirm. A NaN also survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++.
| kernel_cpp_5 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<46563328; i0+=1) | |
| { | |
| auto tmp0 = in_ptr0[i0]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i0]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i0] = tmp14; | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<512; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<784; i2+=1) | |
| { | |
| auto tmp0 = out_ptr0[i1 + (512*i2) + (401408*i0)]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i2 + (784*i1) + (401408*i0)] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_6: two stages in one 28-thread OpenMP parallel region; in_out_ptr0 (aliased as
# out_ptr0) is written by stage 1 and re-read by stage 2 after the implicit `omp for` barrier.
#   Stage 1 (116 x 512 x 784): sum of two dequantized unsigned char inputs, where in_ptr0 is read
#       at i1 + 512*i2 + 401408*i0 and in_ptr3 at i2 + 784*i1 + 401408*i0 (the two inputs use
#       swapped inner-dimension orders); the sum
#       ((float) a - in_ptr1[0]) * in_ptr2[0] + ((float) b - in_ptr4[0]) * in_ptr5[0]
#       is stored at out_ptr0[i2 + 784*i1 + 401408*i0].
#   Stage 2 (linear, 46563328 elements): multiplies out_ptr0[i] by the 0/1 result of (x > 0), then
#       stores (unsigned char) max(0, min(127, nearbyint(x / in_ptr6[0]) + (float) in_ptr7[0]))
#       at out_ptr1[i].
# The scalar pairs are presumably quantization zero points / scales -- confirm against the caller.
# NOTE(review): x * (x > 0) evaluates to NaN (not 0) when x is -infinity; presumably unreachable
# with finite scales -- confirm. A NaN also survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++.
| kernel_cpp_6 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<512; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<784; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (512*i2) + (401408*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (784*i1) + (401408*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (784*i1) + (401408*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<46563328; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i0] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_7: two stages in one 28-thread OpenMP parallel region; in_out_ptr0 (aliased as
# out_ptr0) is written by stage 1 and re-read by stage 2 after the implicit `omp for` barrier.
# Loop bounds and index arithmetic are the same as in kernel_cpp_6.
#   Stage 1 (116 x 512 x 784): sum of two dequantized unsigned char inputs, where in_ptr0 is read
#       at i1 + 512*i2 + 401408*i0 and in_ptr3 at i2 + 784*i1 + 401408*i0; the sum
#       ((float) a - in_ptr1[0]) * in_ptr2[0] + ((float) b - in_ptr4[0]) * in_ptr5[0]
#       is stored at out_ptr0[i2 + 784*i1 + 401408*i0].
#   Stage 2 (linear, 46563328 elements): multiplies out_ptr0[i] by the 0/1 result of (x > 0), then
#       stores (unsigned char) max(0, min(127, nearbyint(x / in_ptr6[0]) + (float) in_ptr7[0]))
#       at out_ptr1[i].
# The scalar pairs are presumably quantization zero points / scales -- confirm against the caller.
# NOTE(review): x * (x > 0) evaluates to NaN (not 0) when x is -infinity; presumably unreachable
# with finite scales -- confirm. A NaN also survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++.
| kernel_cpp_7 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<512; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<784; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (512*i2) + (401408*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (784*i1) + (401408*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (784*i1) + (401408*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<46563328; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i0] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_8: two stages in one 28-thread OpenMP parallel region; in_out_ptr0 (aliased as
# out_ptr0) is written by stage 1 and re-read by stage 2 after the implicit `omp for` barrier.
# Loop bounds and index arithmetic are the same as in kernel_cpp_6 and kernel_cpp_7.
#   Stage 1 (116 x 512 x 784): sum of two dequantized unsigned char inputs, where in_ptr0 is read
#       at i1 + 512*i2 + 401408*i0 and in_ptr3 at i2 + 784*i1 + 401408*i0; the sum
#       ((float) a - in_ptr1[0]) * in_ptr2[0] + ((float) b - in_ptr4[0]) * in_ptr5[0]
#       is stored at out_ptr0[i2 + 784*i1 + 401408*i0].
#   Stage 2 (linear, 46563328 elements): multiplies out_ptr0[i] by the 0/1 result of (x > 0), then
#       stores (unsigned char) max(0, min(127, nearbyint(x / in_ptr6[0]) + (float) in_ptr7[0]))
#       at out_ptr1[i].
# The scalar pairs are presumably quantization zero points / scales -- confirm against the caller.
# NOTE(review): x * (x > 0) evaluates to NaN (not 0) when x is -infinity; presumably unreachable
# with finite scales -- confirm. A NaN also survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++.
| kernel_cpp_8 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<512; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<784; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (512*i2) + (401408*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (784*i1) + (401408*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (784*i1) + (401408*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<46563328; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i0] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_9: two stages in one 28-thread OpenMP parallel region; in_out_ptr0 (aliased as
# out_ptr0) is written by stage 1 and re-read by stage 2 after the implicit `omp for` barrier.
#   Stage 1 (linear, 23281664 elements): sum of two dequantized unsigned char inputs,
#       out_ptr0[i] = ((float) in_ptr0[i] - in_ptr1[0]) * in_ptr2[0]
#                   + ((float) in_ptr3[i] - in_ptr4[0]) * in_ptr5[0]
#   Stage 2 (116 x 1024 x 196): reads out_ptr0[i1 + 1024*i2 + 200704*i0], multiplies the value by
#       the 0/1 result of (x > 0), then stores
#       (unsigned char) max(0, min(127, nearbyint(x / in_ptr6[0]) + (float) in_ptr7[0]))
#       at out_ptr1[i2 + 196*i1 + 200704*i0], i.e. the 1024 and 196 dimensions are swapped.
# The scalar pairs are presumably quantization zero points / scales -- confirm against the caller.
# NOTE(review): x * (x > 0) evaluates to NaN (not 0) when x is -infinity; presumably unreachable
# with finite scales -- confirm. A NaN also survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++.
| kernel_cpp_9 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<23281664; i0+=1) | |
| { | |
| auto tmp0 = in_ptr0[i0]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i0]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i0] = tmp14; | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<1024; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<196; i2+=1) | |
| { | |
| auto tmp0 = out_ptr0[i1 + (1024*i2) + (200704*i0)]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i2 + (196*i1) + (200704*i0)] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_10: two stages in one 28-thread OpenMP parallel region; in_out_ptr0 (aliased as
# out_ptr0) is written by stage 1 and re-read by stage 2 after the implicit `omp for` barrier.
#   Stage 1 (116 x 1024 x 196): sum of two dequantized unsigned char inputs, where in_ptr0 is read
#       at i1 + 1024*i2 + 200704*i0 and in_ptr3 at i2 + 196*i1 + 200704*i0 (the two inputs use
#       swapped inner-dimension orders); the sum
#       ((float) a - in_ptr1[0]) * in_ptr2[0] + ((float) b - in_ptr4[0]) * in_ptr5[0]
#       is stored at out_ptr0[i2 + 196*i1 + 200704*i0].
#   Stage 2 (linear, 23281664 elements): multiplies out_ptr0[i] by the 0/1 result of (x > 0), then
#       stores (unsigned char) max(0, min(127, nearbyint(x / in_ptr6[0]) + (float) in_ptr7[0]))
#       at out_ptr1[i].
# The scalar pairs are presumably quantization zero points / scales -- confirm against the caller.
# NOTE(review): x * (x > 0) evaluates to NaN (not 0) when x is -infinity; presumably unreachable
# with finite scales -- confirm. A NaN also survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++.
| kernel_cpp_10 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<1024; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<196; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (1024*i2) + (200704*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (196*i1) + (200704*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (196*i1) + (200704*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<23281664; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i0] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_11: two stages in one 28-thread OpenMP parallel region; in_out_ptr0 (aliased as
# out_ptr0) is written by stage 1 and re-read by stage 2 after the implicit `omp for` barrier.
# Loop bounds and index arithmetic are the same as in kernel_cpp_10.
#   Stage 1 (116 x 1024 x 196): sum of two dequantized unsigned char inputs, where in_ptr0 is read
#       at i1 + 1024*i2 + 200704*i0 and in_ptr3 at i2 + 196*i1 + 200704*i0; the sum
#       ((float) a - in_ptr1[0]) * in_ptr2[0] + ((float) b - in_ptr4[0]) * in_ptr5[0]
#       is stored at out_ptr0[i2 + 196*i1 + 200704*i0].
#   Stage 2 (linear, 23281664 elements): multiplies out_ptr0[i] by the 0/1 result of (x > 0), then
#       stores (unsigned char) max(0, min(127, nearbyint(x / in_ptr6[0]) + (float) in_ptr7[0]))
#       at out_ptr1[i].
# The scalar pairs are presumably quantization zero points / scales -- confirm against the caller.
# NOTE(review): x * (x > 0) evaluates to NaN (not 0) when x is -infinity; presumably unreachable
# with finite scales -- confirm. A NaN also survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++.
| kernel_cpp_11 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<1024; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<196; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (1024*i2) + (200704*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (196*i1) + (200704*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (196*i1) + (200704*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<23281664; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i0] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_12: two stages in one 28-thread OpenMP parallel region; in_out_ptr0 (aliased as
# out_ptr0) is written by stage 1 and re-read by stage 2 after the implicit `omp for` barrier.
# Loop bounds and index arithmetic are the same as in kernel_cpp_10 and kernel_cpp_11.
#   Stage 1 (116 x 1024 x 196): sum of two dequantized unsigned char inputs, where in_ptr0 is read
#       at i1 + 1024*i2 + 200704*i0 and in_ptr3 at i2 + 196*i1 + 200704*i0; the sum
#       ((float) a - in_ptr1[0]) * in_ptr2[0] + ((float) b - in_ptr4[0]) * in_ptr5[0]
#       is stored at out_ptr0[i2 + 196*i1 + 200704*i0].
#   Stage 2 (linear, 23281664 elements): multiplies out_ptr0[i] by the 0/1 result of (x > 0), then
#       stores (unsigned char) max(0, min(127, nearbyint(x / in_ptr6[0]) + (float) in_ptr7[0]))
#       at out_ptr1[i].
# The scalar pairs are presumably quantization zero points / scales -- confirm against the caller.
# NOTE(review): x * (x > 0) evaluates to NaN (not 0) when x is -infinity; presumably unreachable
# with finite scales -- confirm. A NaN also survives the clamp and is then cast to unsigned char,
# which is undefined behaviour in C++.
| kernel_cpp_12 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<1024; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<196; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (1024*i2) + (200704*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (196*i1) + (200704*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (196*i1) + (200704*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<23281664; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i0] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_13: C++ source handed to async_compile.cpp(). Two OpenMP
# worksharing loops run back to back inside one 28-thread parallel region.
#   Loop 1 walks a 116 x 1024 x 196 index space. Each iteration subtracts
#     in_ptr1[0] from an in_ptr0 byte and multiplies by in_ptr2[0], does the
#     same for an in_ptr3 byte with in_ptr4[0] / in_ptr5[0], adds the two
#     products and writes the float to out_ptr0 (an alias of in_out_ptr0).
#     in_ptr0 is addressed with i1 as the fastest-moving index
#     (i1 + 1024*i2 + 200704*i0); in_ptr3 and the output use i2 as the
#     fastest-moving index (i2 + 196*i1 + 200704*i0).
#   Loop 2 is a flat pass over 23281664 (= 116*1024*196) floats of out_ptr0:
#     value * (value > 0), divided by in_ptr6[0], rounded by std::nearbyint,
#     offset by float(in_ptr7[0]), limited to the range [0, 127] with NaN
#     left untouched by the min/max guards, then narrowed to unsigned char
#     and written to out_ptr1.
# There is no `nowait` on the first `omp for`, so the second loop reads the
# intermediate buffer only after every thread has finished the first loop.
# NOTE(review): in_ptr1/4/7 look like zero points and in_ptr2/5/6 like scales
# of a quantized residual-add + ReLU -- an inference from the math; verify.
| kernel_cpp_13 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<1024; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<196; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (1024*i2) + (200704*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (196*i1) + (200704*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (196*i1) + (200704*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<23281664; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i0] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_14: C++ source handed to async_compile.cpp(). One parallel region
# (num_threads(28)) containing two `omp for` loops executed in sequence.
#   First loop, 116 x 1024 x 196:
#     out_ptr0[i2 + 196*i1 + 200704*i0] =
#         (in_ptr0[i1 + 1024*i2 + 200704*i0] - in_ptr1[0]) * in_ptr2[0]
#       + (in_ptr3[i2 + 196*i1 + 200704*i0]  - in_ptr4[0]) * in_ptr5[0]
#     with every operand converted to float first. out_ptr0 is the same
#     pointer as in_out_ptr0, so the sum lands in the in/out buffer.
#   Second loop, flat over 23281664 (= 116*1024*196) elements:
#     out_ptr1[i] = (unsigned char) clamp(nearbyint(x * (x > 0) / in_ptr6[0])
#                                         + float(in_ptr7[0]), 0, 127)
#     where x = out_ptr0[i]; the clamp is written so that a NaN input is
#     forwarded instead of being replaced by a bound.
# The first loop has no `nowait`, so its writes complete before the second
# loop starts reading them.
# NOTE(review): looks like a fused dequantize/add/ReLU/requantize step with
# per-tensor scale and zero-point scalars -- inferred, not shown here; confirm.
| kernel_cpp_14 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<1024; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<196; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (1024*i2) + (200704*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (196*i1) + (200704*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (196*i1) + (200704*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<23281664; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i0] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_15: C++ source handed to async_compile.cpp(). Two OpenMP
# worksharing loops in one 28-thread parallel region. Here the flat loop
# comes first and the index-permuting loop second.
#   Loop 1 (flat, 11640832 = 116*2048*49 elements): both byte inputs are read
#     at the same linear index i0 and combined as
#         (float(in_ptr0[i0]) - float(in_ptr1[0])) * in_ptr2[0]
#       + (float(in_ptr3[i0]) - float(in_ptr4[0])) * in_ptr5[0]
#     into out_ptr0[i0] (out_ptr0 aliases in_out_ptr0).
#   Loop 2 (116 x 2048 x 49): reads out_ptr0 at i1 + 2048*i2 + 100352*i0,
#     multiplies by (value > 0), divides by in_ptr6[0], rounds with
#     std::nearbyint, adds float(in_ptr7[0]), clamps to [0, 127] (NaN passes
#     through the clamp unchanged), casts to unsigned char and writes
#     out_ptr1 at i2 + 49*i1 + 100352*i0 -- i.e. the read and write positions
#     swap which of i1/i2 is the fastest-moving index.
# No `nowait` on loop 1, so loop 2 runs only after the float buffer is complete.
# NOTE(review): presumably dequantize + add + ReLU + requantize with a layout
# change on the quantized output; scale/zero-point roles are inferred -- confirm.
| kernel_cpp_15 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<11640832; i0+=1) | |
| { | |
| auto tmp0 = in_ptr0[i0]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i0]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i0] = tmp14; | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<2048; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<49; i2+=1) | |
| { | |
| auto tmp0 = out_ptr0[i1 + (2048*i2) + (100352*i0)]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i2 + (49*i1) + (100352*i0)] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_16: C++ source handed to async_compile.cpp(). Two OpenMP
# worksharing loops inside one parallel region of 28 threads.
#   Loop 1 (116 x 2048 x 49): converts one byte from in_ptr0 (index
#     i1 + 2048*i2 + 100352*i0) and one byte from in_ptr3 (index
#     i2 + 49*i1 + 100352*i0) to float, subtracts in_ptr1[0] / in_ptr4[0]
#     respectively, multiplies by in_ptr2[0] / in_ptr5[0], adds the two
#     results and stores the sum at out_ptr0[i2 + 49*i1 + 100352*i0].
#     out_ptr0 is initialised from in_out_ptr0, so this fills the in/out buffer.
#   Loop 2 (flat, 11640832 = 116*2048*49 elements): takes each float x from
#     out_ptr0, forms x * (x > 0), divides by in_ptr6[0], applies
#     std::nearbyint, adds float(in_ptr7[0]), clamps the result to [0, 127]
#     (a NaN is forwarded rather than clamped) and stores it as unsigned char
#     in out_ptr1 at the same linear index.
# The loops are separated by the implicit end-of-loop barrier (no `nowait`).
# NOTE(review): scalar pointers appear to be quantization zero points
# (in_ptr1/4/7) and scales (in_ptr2/5/6) -- inferred from usage; confirm.
| kernel_cpp_16 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| unsigned char* __restrict__ out_ptr1) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<2048; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<49; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (2048*i2) + (100352*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (49*i1) + (100352*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (49*i1) + (100352*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<11640832; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp4 = in_ptr6[0]; | |
| auto tmp7 = in_ptr7[0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| auto tmp2 = static_cast<float>(0); | |
| auto tmp3 = static_cast<float>(127); | |
| auto tmp5 = tmp1 / tmp4; | |
| auto tmp6 = std::nearbyint(tmp5); | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp9 = tmp6 + tmp8; | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::min(tmp3, tmp9); | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::max(tmp2, tmp10); | |
| auto tmp12 = static_cast<unsigned char>(tmp11); | |
| out_ptr1[i0] = tmp12; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_17: C++ source handed to async_compile.cpp(). Four stages run in
# sequence inside one 28-thread OpenMP parallel region; none of the `omp for`
# loops uses `nowait`, so each stage sees the previous stage's completed output.
#   Stage 1 (116 x 2048 x 49): same two-input combine as a weighted sum,
#         (float(in_ptr0[...]) - float(in_ptr1[0])) * in_ptr2[0]
#       + (float(in_ptr3[...]) - float(in_ptr4[0])) * in_ptr5[0],
#     reading in_ptr0 at i1 + 2048*i2 + 100352*i0 and in_ptr3 at
#     i2 + 49*i1 + 100352*i0, writing out_ptr0 (alias of in_out_ptr0) at
#     i2 + 49*i1 + 100352*i0.
#   Stage 2: in-place clamp_min(x, 0) over in_out_ptr0 using
#     at::vec::Vectorized<float>, 727552 chunks of 16 floats
#     (727552*16 = 11640832 = 116*2048*49). The scalar loop that follows has
#     bounds 11640832..11640832, so it executes zero iterations.
#   Stage 3 (237568 = 116*2048 outputs): for each output, loops over 49
#     consecutive floats of in_out_ptr0; each float is divided by in_ptr6[0],
#     rounded with std::nearbyint, offset by float(in_ptr7[0]), clamped to
#     [0, 127], cast to unsigned char, then converted back via
#     (float(q) - float(in_ptr7[0])) * in_ptr6[0]; the 49 results are summed
#     into out_ptr1[i0].
#   Stage 4 (237568 elements): out_ptr1[i0] is divided by 49 and by
#     in_ptr8[0], rounded with std::nearbyint, offset by float(in_ptr9[0]),
#     clamped to [0, 127] and stored as unsigned char in out_ptr2[i0].
# In both clamps a NaN is passed through unchanged by the (x != x) guards.
# NOTE(review): at::vec::Vectorized / at::vec::clamp_min are presumably
# provided by the included header. Stages 3-4 read like a quantize/dequantize
# round trip followed by a mean over 49 elements and a requantize -- inferred
# from the arithmetic; confirm against the graph that produced this kernel.
| kernel_cpp_17 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(float* __restrict__ in_out_ptr0, | |
| const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| const unsigned char* __restrict__ in_ptr3, | |
| const long* __restrict__ in_ptr4, | |
| const float* __restrict__ in_ptr5, | |
| const float* __restrict__ in_ptr6, | |
| const long* __restrict__ in_ptr7, | |
| const float* __restrict__ in_ptr8, | |
| const long* __restrict__ in_ptr9, | |
| float* __restrict__ out_ptr1, | |
| unsigned char* __restrict__ out_ptr2) | |
| { | |
| auto out_ptr0 = in_out_ptr0; | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116; i0+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i1=0; i1<2048; i1+=1) | |
| { | |
| #pragma GCC ivdep | |
| for(long i2=0; i2<49; i2+=1) | |
| { | |
| auto tmp0 = in_ptr0[i1 + (2048*i2) + (100352*i0)]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp7 = in_ptr3[i2 + (49*i1) + (100352*i0)]; | |
| auto tmp9 = in_ptr4[0]; | |
| auto tmp12 = in_ptr5[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| auto tmp8 = static_cast<float>(tmp7); | |
| auto tmp10 = static_cast<float>(tmp9); | |
| auto tmp11 = tmp8 - tmp10; | |
| auto tmp13 = tmp11 * tmp12; | |
| auto tmp14 = tmp6 + tmp13; | |
| out_ptr0[i2 + (49*i1) + (100352*i0)] = tmp14; | |
| } | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<727552; i0+=1) | |
| { | |
| auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + 16*i0); | |
| auto tmp1 = at::vec::clamp_min(tmp0, decltype(tmp0)(0)); | |
| tmp1.store(in_out_ptr0 + 16*i0); | |
| } | |
| #pragma omp for simd simdlen(8) | |
| for(long i0=11640832; i0<11640832; i0+=1) | |
| { | |
| auto tmp0 = out_ptr0[i0]; | |
| auto tmp1 = tmp0 * (tmp0>0); | |
| in_out_ptr0[i0] = tmp1; | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<237568; i0+=1) | |
| { | |
| { | |
| float tmp15 = 0; | |
| for(long i1=0; i1<49; i1+=1) | |
| { | |
| auto tmp2 = in_out_ptr0[i1 + (49*i0)]; | |
| auto tmp3 = in_ptr6[0]; | |
| auto tmp6 = in_ptr7[0]; | |
| auto tmp0 = static_cast<float>(0); | |
| auto tmp1 = static_cast<float>(127); | |
| auto tmp4 = tmp2 / tmp3; | |
| auto tmp5 = std::nearbyint(tmp4); | |
| auto tmp7 = static_cast<float>(tmp6); | |
| auto tmp8 = tmp5 + tmp7; | |
| auto tmp9 = (tmp8 != tmp8) ? tmp8 : std::min(tmp1, tmp8); | |
| auto tmp10 = (tmp9 != tmp9) ? tmp9 : std::max(tmp0, tmp9); | |
| auto tmp11 = static_cast<unsigned char>(tmp10); | |
| auto tmp12 = static_cast<float>(tmp11); | |
| auto tmp13 = tmp12 - tmp7; | |
| auto tmp14 = tmp13 * tmp3; | |
| tmp15 += tmp14; | |
| } | |
| out_ptr1[i0] = tmp15; | |
| } | |
| } | |
| } | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<237568; i0+=1) | |
| { | |
| auto tmp2 = out_ptr1[i0]; | |
| auto tmp5 = in_ptr8[0]; | |
| auto tmp8 = in_ptr9[0]; | |
| auto tmp0 = static_cast<float>(0); | |
| auto tmp1 = static_cast<float>(127); | |
| auto tmp3 = static_cast<float>(49); | |
| auto tmp4 = tmp2 / tmp3; | |
| auto tmp6 = tmp4 / tmp5; | |
| auto tmp7 = std::nearbyint(tmp6); | |
| auto tmp9 = static_cast<float>(tmp8); | |
| auto tmp10 = tmp7 + tmp9; | |
| auto tmp11 = (tmp10 != tmp10) ? tmp10 : std::min(tmp1, tmp10); | |
| auto tmp12 = (tmp11 != tmp11) ? tmp11 : std::max(tmp0, tmp11); | |
| auto tmp13 = static_cast<unsigned char>(tmp12); | |
| out_ptr2[i0] = tmp13; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# kernel_cpp_18: C++ source handed to async_compile.cpp(). A single OpenMP
# worksharing loop (28 threads) over 116000 elements that computes
#     out_ptr0[i0] = (float(in_ptr0[i0]) - float(in_ptr1[0])) * in_ptr2[0]
# i.e. each unsigned char input is converted to float, shifted by the scalar
# in in_ptr1 and multiplied by the scalar in in_ptr2.
# NOTE(review): presumably the final dequantize of a uint8 tensor, with
# in_ptr1 as zero point and in_ptr2 as scale -- inferred from the formula;
# confirm against the call site.
| kernel_cpp_18 = async_compile.cpp(''' | |
| #include "/tmp/torchinductor_root/dm/cdmaihqxwe73zkb3he2zizktpq5uujetg2db26c3r4lgsmlx3b4c.h" | |
| extern "C" void kernel(const unsigned char* __restrict__ in_ptr0, | |
| const long* __restrict__ in_ptr1, | |
| const float* __restrict__ in_ptr2, | |
| float* __restrict__ out_ptr0) | |
| { | |
| #pragma omp parallel num_threads(28) | |
| { | |
| { | |
| #pragma omp for | |
| for(long i0=0; i0<116000; i0+=1) | |
| { | |
| auto tmp0 = in_ptr0[i0]; | |
| auto tmp2 = in_ptr1[0]; | |
| auto tmp5 = in_ptr2[0]; | |
| auto tmp1 = static_cast<float>(tmp0); | |
| auto tmp3 = static_cast<float>(tmp2); | |
| auto tmp4 = tmp1 - tmp3; | |
| auto tmp6 = tmp4 * tmp5; | |
| out_ptr0[i0] = tmp6; | |
| } | |
| } | |
| } | |
| } | |
| ''') | |
# Wait on the AsyncCompile instance before call() uses any kernel_cpp_* name,
# then drop the module-level reference to it.
# NOTE(review): wait() is given this module's globals(); presumably it swaps
# the pending kernel_cpp_* entries for their compiled callables -- confirm
# against torch._inductor.codecache.AsyncCompile.wait.
| async_compile.wait(globals()) | |
| del async_compile | |
| def call(args): | |
| arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1, arg23_1, arg24_1, arg25_1, arg26_1, arg27_1, arg28_1, arg29_1, arg30_1, arg31_1, arg32_1, arg33_1, arg34_1, arg35_1, arg36_1, arg37_1, arg38_1, arg39_1, arg40_1, arg41_1, arg42_1, arg43_1, arg44_1, arg45_1, arg46_1, arg47_1, arg48_1, arg49_1, arg50_1, arg51_1, arg52_1, arg53_1, arg54_1, arg55_1, arg56_1, arg57_1, arg58_1, arg59_1, arg60_1, arg61_1, arg62_1, arg63_1, arg64_1, arg65_1, arg66_1, arg67_1, arg68_1, arg69_1, arg70_1, arg71_1, arg72_1, arg73_1, arg74_1, arg75_1, arg76_1, arg77_1, arg78_1, arg79_1, arg80_1, arg81_1, arg82_1, arg83_1, arg84_1, arg85_1, arg86_1, arg87_1, arg88_1, arg89_1, arg90_1, arg91_1, arg92_1, arg93_1, arg94_1, arg95_1, arg96_1, arg97_1, arg98_1, arg99_1, arg100_1, arg101_1, arg102_1, arg103_1, arg104_1, arg105_1, arg106_1, arg107_1, arg108_1, arg109_1, arg110_1, arg111_1, arg112_1, arg113_1, arg114_1, arg115_1, arg116_1, arg117_1, arg118_1, arg119_1, arg120_1, arg121_1, arg122_1, arg123_1, arg124_1, arg125_1, arg126_1, arg127_1, arg128_1, arg129_1, arg130_1, arg131_1, arg132_1, arg133_1, arg134_1, arg135_1, arg136_1, arg137_1, arg138_1, arg139_1, arg140_1, arg141_1, arg142_1, arg143_1, arg144_1, arg145_1, arg146_1, arg147_1, arg148_1, arg149_1, arg150_1, arg151_1, arg152_1, arg153_1, arg154_1, arg155_1, arg156_1, arg157_1, arg158_1, arg159_1, arg160_1, arg161_1, arg162_1, arg163_1, arg164_1, arg165_1, arg166_1, arg167_1, arg168_1, arg169_1, arg170_1, arg171_1, arg172_1, arg173_1, arg174_1, arg175_1, arg176_1, arg177_1, arg178_1, arg179_1, arg180_1, arg181_1, arg182_1, arg183_1, arg184_1, arg185_1, arg186_1, arg187_1, arg188_1, arg189_1, arg190_1, arg191_1, arg192_1, arg193_1, arg194_1, arg195_1, arg196_1, arg197_1, arg198_1, arg199_1, arg200_1, arg201_1, arg202_1, arg203_1, arg204_1, arg205_1, arg206_1, arg207_1, arg208_1, arg209_1, 
arg210_1, arg211_1, arg212_1, arg213_1, arg214_1, arg215_1, arg216_1, arg217_1, arg218_1, arg219_1, arg220_1, arg221_1, arg222_1, arg223_1, arg224_1, arg225_1, arg226_1, arg227_1, arg228_1, arg229_1, arg230_1, arg231_1, arg232_1, arg233_1, arg234_1, arg235_1, arg236_1, arg237_1, arg238_1, arg239_1, arg240_1, arg241_1, arg242_1, arg243_1, arg244_1, arg245_1, arg246_1, arg247_1, arg248_1, arg249_1, arg250_1, arg251_1, arg252_1, arg253_1, arg254_1, arg255_1, arg256_1, arg257_1, arg258_1, arg259_1, arg260_1, arg261_1, arg262_1, arg263_1, arg264_1, arg265_1, arg266_1, arg267_1, arg268_1, arg269_1, arg270_1, arg271_1, arg272_1, arg273_1, arg274_1, arg275_1, arg276_1, arg277_1, arg278_1, arg279_1, arg280_1, arg281_1, arg282_1, arg283_1, arg284_1, arg285_1, arg286_1, arg287_1, arg288_1, arg289_1, arg290_1, arg291_1, arg292_1, arg293_1, arg294_1, arg295_1, arg296_1, arg297_1, arg298_1, arg299_1, arg300_1, arg301_1, arg302_1, arg303_1, arg304_1, arg305_1, arg306_1, arg307_1, arg308_1, arg309_1, arg310_1, arg311_1, arg312_1, arg313_1, arg314_1, arg315_1, arg316_1, arg317_1, arg318_1, arg319_1, arg320_1, arg321_1, arg322_1, arg323_1, arg324_1, arg325_1, arg326_1, arg327_1, arg328_1, arg329_1, arg330_1, arg331_1, arg332_1, arg333_1, arg334_1, arg335_1, arg336_1, arg337_1, arg338_1, arg339_1, arg340_1, arg341_1, arg342_1, arg343_1, arg344_1, arg345_1, arg346_1, arg347_1, arg348_1, arg349_1, arg350_1, arg351_1, arg352_1, arg353_1, arg354_1, arg355_1, arg356_1, arg357_1, arg358_1, arg359_1, arg360_1, arg361_1, arg362_1 = args | |
| args.clear() | |
| buf0 = empty_strided((116, 3, 224, 224), (150528, 50176, 224, 1), device='cpu', dtype=torch.uint8) | |
| kernel_cpp_0(c_void_p(arg362_1.data_ptr()), c_void_p(arg0_1.data_ptr()), c_void_p(arg1_1.data_ptr()), c_void_p(buf0.data_ptr())) | |
| del arg362_1 | |
| buf1 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf0, arg0_1, arg1_1, arg254_1, arg2_1, arg3_1, 0, arg255_1, [2, 2], [3, 3], [1, 1], 1, arg4_1, arg5_1, 'relu_') | |
| del arg0_1 | |
| del arg1_1 | |
| del arg254_1 | |
| del arg255_1 | |
| del arg2_1 | |
| del arg3_1 | |
| del buf0 | |
| buf2 = buf1 | |
| assert_size_stride(buf2, (116, 64, 112, 112), (802816, 1, 7168, 64)) | |
| del buf1 | |
| buf3 = empty_strided((116, 64, 112, 112), (802816, 1, 7168, 64), device='cpu', dtype=torch.float32) | |
| buf4 = empty_strided((116, 64, 56, 56), (200704, 1, 3584, 64), device='cpu', dtype=torch.float32) | |
| buf6 = empty_strided((116, 64, 56, 56), (200704, 3136, 56, 1), device='cpu', dtype=torch.uint8) | |
| kernel_cpp_1(c_void_p(buf2.data_ptr()), c_void_p(arg5_1.data_ptr()), c_void_p(arg4_1.data_ptr()), c_void_p(arg6_1.data_ptr()), c_void_p(arg7_1.data_ptr()), c_void_p(buf3.data_ptr()), c_void_p(buf4.data_ptr()), c_void_p(buf6.data_ptr())) | |
| del arg4_1 | |
| del arg5_1 | |
| buf7 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf6, arg6_1, arg7_1, arg256_1, arg8_1, arg9_1, 0, arg257_1, [1, 1], [0, 0], [1, 1], 1, arg10_1, arg11_1, 'relu_') | |
| del arg256_1 | |
| del arg257_1 | |
| del arg8_1 | |
| del arg9_1 | |
| buf8 = buf7 | |
| assert_size_stride(buf8, (116, 64, 56, 56), (200704, 1, 3584, 64)) | |
| del buf7 | |
| buf9 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf8, arg10_1, arg11_1, arg258_1, arg12_1, arg13_1, 0, arg259_1, [1, 1], [1, 1], [1, 1], 1, arg14_1, arg15_1, 'relu_') | |
| del arg10_1 | |
| del arg11_1 | |
| del arg12_1 | |
| del arg13_1 | |
| del arg258_1 | |
| del arg259_1 | |
| del buf8 | |
| buf10 = buf9 | |
| assert_size_stride(buf10, (116, 64, 56, 56), (200704, 1, 3584, 64)) | |
| del buf9 | |
| buf11 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf10, arg14_1, arg15_1, arg260_1, arg16_1, arg17_1, 0, arg261_1, [1, 1], [0, 0], [1, 1], 1, arg18_1, arg19_1, 'none') | |
| del arg14_1 | |
| del arg15_1 | |
| del arg16_1 | |
| del arg17_1 | |
| del arg260_1 | |
| del arg261_1 | |
| del buf10 | |
| buf12 = buf11 | |
| assert_size_stride(buf12, (116, 256, 56, 56), (802816, 1, 14336, 256)) | |
| del buf11 | |
| buf13 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf6, arg6_1, arg7_1, arg262_1, arg20_1, arg21_1, 0, arg263_1, [1, 1], [0, 0], [1, 1], 1, arg22_1, arg23_1, 'none') | |
| del arg20_1 | |
| del arg21_1 | |
| del arg262_1 | |
| del arg263_1 | |
| del arg6_1 | |
| del arg7_1 | |
| del buf6 | |
| buf14 = buf13 | |
| assert_size_stride(buf14, (116, 256, 56, 56), (802816, 1, 14336, 256)) | |
| del buf13 | |
| buf15 = as_strided(buf3, (116, 256, 56, 56), (802816, 1, 14336, 256)); del buf3 # reuse | |
| buf16 = buf15; del buf15 # reuse | |
| buf17 = as_strided(buf2, (116, 256, 56, 56), (802816, 3136, 56, 1)); del buf2 # reuse | |
| kernel_cpp_2(c_void_p(buf16.data_ptr()), c_void_p(buf12.data_ptr()), c_void_p(arg19_1.data_ptr()), c_void_p(arg18_1.data_ptr()), c_void_p(buf14.data_ptr()), c_void_p(arg23_1.data_ptr()), c_void_p(arg22_1.data_ptr()), c_void_p(arg24_1.data_ptr()), c_void_p(arg25_1.data_ptr()), c_void_p(buf17.data_ptr())) | |
| del arg18_1 | |
| del arg19_1 | |
| del arg22_1 | |
| del arg23_1 | |
| del buf12 | |
| buf18 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf17, arg24_1, arg25_1, arg264_1, arg26_1, arg27_1, 0, arg265_1, [1, 1], [0, 0], [1, 1], 1, arg28_1, arg29_1, 'relu_') | |
| del arg264_1 | |
| del arg265_1 | |
| del arg26_1 | |
| del arg27_1 | |
| buf19 = buf18 | |
| assert_size_stride(buf19, (116, 64, 56, 56), (200704, 1, 3584, 64)) | |
| del buf18 | |
| buf20 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf19, arg28_1, arg29_1, arg266_1, arg30_1, arg31_1, 0, arg267_1, [1, 1], [1, 1], [1, 1], 1, arg32_1, arg33_1, 'relu_') | |
| del arg266_1 | |
| del arg267_1 | |
| del arg28_1 | |
| del arg29_1 | |
| del arg30_1 | |
| del arg31_1 | |
| del buf19 | |
| buf21 = buf20 | |
| assert_size_stride(buf21, (116, 64, 56, 56), (200704, 1, 3584, 64)) | |
| del buf20 | |
| buf22 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf21, arg32_1, arg33_1, arg268_1, arg34_1, arg35_1, 0, arg269_1, [1, 1], [0, 0], [1, 1], 1, arg36_1, arg37_1, 'none') | |
| del arg268_1 | |
| del arg269_1 | |
| del arg32_1 | |
| del arg33_1 | |
| del arg34_1 | |
| del arg35_1 | |
| del buf21 | |
| buf23 = buf22 | |
| assert_size_stride(buf23, (116, 256, 56, 56), (802816, 1, 14336, 256)) | |
| del buf22 | |
| buf24 = as_strided(buf16, (116, 256, 56, 56), (802816, 3136, 56, 1)); del buf16 # reuse | |
| buf25 = buf24; del buf24 # reuse | |
| buf26 = as_strided(buf14, (116, 256, 56, 56), (802816, 3136, 56, 1)); del buf14 # reuse | |
| kernel_cpp_3(c_void_p(buf25.data_ptr()), c_void_p(buf23.data_ptr()), c_void_p(arg37_1.data_ptr()), c_void_p(arg36_1.data_ptr()), c_void_p(buf17.data_ptr()), c_void_p(arg25_1.data_ptr()), c_void_p(arg24_1.data_ptr()), c_void_p(arg38_1.data_ptr()), c_void_p(arg39_1.data_ptr()), c_void_p(buf26.data_ptr())) | |
| del arg24_1 | |
| del arg25_1 | |
| del arg36_1 | |
| del arg37_1 | |
| del buf17 | |
| buf27 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf26, arg38_1, arg39_1, arg270_1, arg40_1, arg41_1, 0, arg271_1, [1, 1], [0, 0], [1, 1], 1, arg42_1, arg43_1, 'relu_') | |
| del arg270_1 | |
| del arg271_1 | |
| del arg40_1 | |
| del arg41_1 | |
| buf28 = buf27 | |
| assert_size_stride(buf28, (116, 64, 56, 56), (200704, 1, 3584, 64)) | |
| del buf27 | |
| buf29 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf28, arg42_1, arg43_1, arg272_1, arg44_1, arg45_1, 0, arg273_1, [1, 1], [1, 1], [1, 1], 1, arg46_1, arg47_1, 'relu_') | |
| del arg272_1 | |
| del arg273_1 | |
| del arg42_1 | |
| del arg43_1 | |
| del arg44_1 | |
| del arg45_1 | |
| del buf28 | |
| buf30 = buf29 | |
| assert_size_stride(buf30, (116, 64, 56, 56), (200704, 1, 3584, 64)) | |
| del buf29 | |
| buf31 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf30, arg46_1, arg47_1, arg274_1, arg48_1, arg49_1, 0, arg275_1, [1, 1], [0, 0], [1, 1], 1, arg50_1, arg51_1, 'none') | |
| del arg274_1 | |
| del arg275_1 | |
| del arg46_1 | |
| del arg47_1 | |
| del arg48_1 | |
| del arg49_1 | |
| del buf30 | |
| buf32 = buf31 | |
| assert_size_stride(buf32, (116, 256, 56, 56), (802816, 1, 14336, 256)) | |
| del buf31 | |
| buf33 = buf25; del buf25 # reuse | |
| buf34 = buf33; del buf33 # reuse | |
| buf35 = as_strided(buf23, (116, 256, 56, 56), (802816, 3136, 56, 1)); del buf23 # reuse | |
| kernel_cpp_4(c_void_p(buf34.data_ptr()), c_void_p(buf32.data_ptr()), c_void_p(arg51_1.data_ptr()), c_void_p(arg50_1.data_ptr()), c_void_p(buf26.data_ptr()), c_void_p(arg39_1.data_ptr()), c_void_p(arg38_1.data_ptr()), c_void_p(arg52_1.data_ptr()), c_void_p(arg53_1.data_ptr()), c_void_p(buf35.data_ptr())) | |
| del arg38_1 | |
| del arg39_1 | |
| del arg50_1 | |
| del arg51_1 | |
| del buf26 | |
| del buf32 | |
| del buf34 | |
| buf36 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf35, arg52_1, arg53_1, arg276_1, arg54_1, arg55_1, 0, arg277_1, [1, 1], [0, 0], [1, 1], 1, arg56_1, arg57_1, 'relu_') | |
| del arg276_1 | |
| del arg277_1 | |
| del arg54_1 | |
| del arg55_1 | |
| buf37 = buf36 | |
| assert_size_stride(buf37, (116, 128, 56, 56), (401408, 1, 7168, 128)) | |
| del buf36 | |
| buf38 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf37, arg56_1, arg57_1, arg278_1, arg58_1, arg59_1, 0, arg279_1, [2, 2], [1, 1], [1, 1], 1, arg60_1, arg61_1, 'relu_') | |
| del arg278_1 | |
| del arg279_1 | |
| del arg56_1 | |
| del arg57_1 | |
| del arg58_1 | |
| del arg59_1 | |
| buf39 = buf38 | |
| assert_size_stride(buf39, (116, 128, 28, 28), (100352, 1, 3584, 128)) | |
| del buf38 | |
| buf40 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf39, arg60_1, arg61_1, arg280_1, arg62_1, arg63_1, 0, arg281_1, [1, 1], [0, 0], [1, 1], 1, arg64_1, arg65_1, 'none') | |
| del arg280_1 | |
| del arg281_1 | |
| del arg60_1 | |
| del arg61_1 | |
| del arg62_1 | |
| del arg63_1 | |
| del buf39 | |
| buf41 = buf40 | |
| assert_size_stride(buf41, (116, 512, 28, 28), (401408, 1, 14336, 512)) | |
| del buf40 | |
| buf42 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf35, arg52_1, arg53_1, arg282_1, arg66_1, arg67_1, 0, arg283_1, [2, 2], [0, 0], [1, 1], 1, arg68_1, arg69_1, 'none') | |
| del arg282_1 | |
| del arg283_1 | |
| del arg52_1 | |
| del arg53_1 | |
| del arg66_1 | |
| del arg67_1 | |
| del buf35 | |
| buf43 = buf42 | |
| assert_size_stride(buf43, (116, 512, 28, 28), (401408, 1, 14336, 512)) | |
| del buf42 | |
| buf44 = empty_strided((116, 512, 28, 28), (401408, 1, 14336, 512), device='cpu', dtype=torch.float32) | |
| buf45 = buf44; del buf44 # reuse | |
| buf46 = as_strided(buf37, (116, 512, 28, 28), (401408, 784, 28, 1)); del buf37 # reuse | |
| kernel_cpp_5(c_void_p(buf45.data_ptr()), c_void_p(buf41.data_ptr()), c_void_p(arg65_1.data_ptr()), c_void_p(arg64_1.data_ptr()), c_void_p(buf43.data_ptr()), c_void_p(arg69_1.data_ptr()), c_void_p(arg68_1.data_ptr()), c_void_p(arg70_1.data_ptr()), c_void_p(arg71_1.data_ptr()), c_void_p(buf46.data_ptr())) | |
| del arg64_1 | |
| del arg65_1 | |
| del arg68_1 | |
| del arg69_1 | |
| del buf41 | |
| buf47 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf46, arg70_1, arg71_1, arg284_1, arg72_1, arg73_1, 0, arg285_1, [1, 1], [0, 0], [1, 1], 1, arg74_1, arg75_1, 'relu_') | |
| del arg284_1 | |
| del arg285_1 | |
| del arg72_1 | |
| del arg73_1 | |
| buf48 = buf47 | |
| assert_size_stride(buf48, (116, 128, 28, 28), (100352, 1, 3584, 128)) | |
| del buf47 | |
| buf49 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf48, arg74_1, arg75_1, arg286_1, arg76_1, arg77_1, 0, arg287_1, [1, 1], [1, 1], [1, 1], 1, arg78_1, arg79_1, 'relu_') | |
| del arg286_1 | |
| del arg287_1 | |
| del arg74_1 | |
| del arg75_1 | |
| del arg76_1 | |
| del arg77_1 | |
| del buf48 | |
| buf50 = buf49 | |
| assert_size_stride(buf50, (116, 128, 28, 28), (100352, 1, 3584, 128)) | |
| del buf49 | |
| buf51 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf50, arg78_1, arg79_1, arg288_1, arg80_1, arg81_1, 0, arg289_1, [1, 1], [0, 0], [1, 1], 1, arg82_1, arg83_1, 'none') | |
| del arg288_1 | |
| del arg289_1 | |
| del arg78_1 | |
| del arg79_1 | |
| del arg80_1 | |
| del arg81_1 | |
| del buf50 | |
| buf52 = buf51 | |
| assert_size_stride(buf52, (116, 512, 28, 28), (401408, 1, 14336, 512)) | |
| del buf51 | |
| buf53 = as_strided(buf45, (116, 512, 28, 28), (401408, 784, 28, 1)); del buf45 # reuse | |
| buf54 = buf53; del buf53 # reuse | |
| buf55 = as_strided(buf43, (116, 512, 28, 28), (401408, 784, 28, 1)); del buf43 # reuse | |
| kernel_cpp_6(c_void_p(buf54.data_ptr()), c_void_p(buf52.data_ptr()), c_void_p(arg83_1.data_ptr()), c_void_p(arg82_1.data_ptr()), c_void_p(buf46.data_ptr()), c_void_p(arg71_1.data_ptr()), c_void_p(arg70_1.data_ptr()), c_void_p(arg84_1.data_ptr()), c_void_p(arg85_1.data_ptr()), c_void_p(buf55.data_ptr())) | |
| del arg70_1 | |
| del arg71_1 | |
| del arg82_1 | |
| del arg83_1 | |
| del buf46 | |
| buf56 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf55, arg84_1, arg85_1, arg290_1, arg86_1, arg87_1, 0, arg291_1, [1, 1], [0, 0], [1, 1], 1, arg88_1, arg89_1, 'relu_') | |
| del arg290_1 | |
| del arg291_1 | |
| del arg86_1 | |
| del arg87_1 | |
| buf57 = buf56 | |
| assert_size_stride(buf57, (116, 128, 28, 28), (100352, 1, 3584, 128)) | |
| del buf56 | |
| buf58 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf57, arg88_1, arg89_1, arg292_1, arg90_1, arg91_1, 0, arg293_1, [1, 1], [1, 1], [1, 1], 1, arg92_1, arg93_1, 'relu_') | |
| del arg292_1 | |
| del arg293_1 | |
| del arg88_1 | |
| del arg89_1 | |
| del arg90_1 | |
| del arg91_1 | |
| del buf57 | |
| buf59 = buf58 | |
| assert_size_stride(buf59, (116, 128, 28, 28), (100352, 1, 3584, 128)) | |
| del buf58 | |
| buf60 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf59, arg92_1, arg93_1, arg294_1, arg94_1, arg95_1, 0, arg295_1, [1, 1], [0, 0], [1, 1], 1, arg96_1, arg97_1, 'none') | |
| del arg294_1 | |
| del arg295_1 | |
| del arg92_1 | |
| del arg93_1 | |
| del arg94_1 | |
| del arg95_1 | |
| del buf59 | |
| buf61 = buf60 | |
| assert_size_stride(buf61, (116, 512, 28, 28), (401408, 1, 14336, 512)) | |
| del buf60 | |
| buf62 = buf54; del buf54 # reuse | |
| buf63 = buf62; del buf62 # reuse | |
| buf64 = as_strided(buf52, (116, 512, 28, 28), (401408, 784, 28, 1)); del buf52 # reuse | |
| kernel_cpp_7(c_void_p(buf63.data_ptr()), c_void_p(buf61.data_ptr()), c_void_p(arg97_1.data_ptr()), c_void_p(arg96_1.data_ptr()), c_void_p(buf55.data_ptr()), c_void_p(arg85_1.data_ptr()), c_void_p(arg84_1.data_ptr()), c_void_p(arg98_1.data_ptr()), c_void_p(arg99_1.data_ptr()), c_void_p(buf64.data_ptr())) | |
| del arg84_1 | |
| del arg85_1 | |
| del arg96_1 | |
| del arg97_1 | |
| del buf55 | |
| buf65 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf64, arg98_1, arg99_1, arg296_1, arg100_1, arg101_1, 0, arg297_1, [1, 1], [0, 0], [1, 1], 1, arg102_1, arg103_1, 'relu_') | |
| del arg100_1 | |
| del arg101_1 | |
| del arg296_1 | |
| del arg297_1 | |
| buf66 = buf65 | |
| assert_size_stride(buf66, (116, 128, 28, 28), (100352, 1, 3584, 128)) | |
| del buf65 | |
| buf67 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf66, arg102_1, arg103_1, arg298_1, arg104_1, arg105_1, 0, arg299_1, [1, 1], [1, 1], [1, 1], 1, arg106_1, arg107_1, 'relu_') | |
| del arg102_1 | |
| del arg103_1 | |
| del arg104_1 | |
| del arg105_1 | |
| del arg298_1 | |
| del arg299_1 | |
| del buf66 | |
| buf68 = buf67 | |
| assert_size_stride(buf68, (116, 128, 28, 28), (100352, 1, 3584, 128)) | |
| del buf67 | |
| buf69 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf68, arg106_1, arg107_1, arg300_1, arg108_1, arg109_1, 0, arg301_1, [1, 1], [0, 0], [1, 1], 1, arg110_1, arg111_1, 'none') | |
| del arg106_1 | |
| del arg107_1 | |
| del arg108_1 | |
| del arg109_1 | |
| del arg300_1 | |
| del arg301_1 | |
| del buf68 | |
| buf70 = buf69 | |
| assert_size_stride(buf70, (116, 512, 28, 28), (401408, 1, 14336, 512)) | |
| del buf69 | |
| buf71 = buf63; del buf63 # reuse | |
| buf72 = buf71; del buf71 # reuse | |
| buf73 = as_strided(buf61, (116, 512, 28, 28), (401408, 784, 28, 1)); del buf61 # reuse | |
| kernel_cpp_8(c_void_p(buf72.data_ptr()), c_void_p(buf70.data_ptr()), c_void_p(arg111_1.data_ptr()), c_void_p(arg110_1.data_ptr()), c_void_p(buf64.data_ptr()), c_void_p(arg99_1.data_ptr()), c_void_p(arg98_1.data_ptr()), c_void_p(arg112_1.data_ptr()), c_void_p(arg113_1.data_ptr()), c_void_p(buf73.data_ptr())) | |
| del arg110_1 | |
| del arg111_1 | |
| del arg98_1 | |
| del arg99_1 | |
| del buf64 | |
| del buf70 | |
| del buf72 | |
| buf74 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf73, arg112_1, arg113_1, arg302_1, arg114_1, arg115_1, 0, arg303_1, [1, 1], [0, 0], [1, 1], 1, arg116_1, arg117_1, 'relu_') | |
| del arg114_1 | |
| del arg115_1 | |
| del arg302_1 | |
| del arg303_1 | |
| buf75 = buf74 | |
| assert_size_stride(buf75, (116, 256, 28, 28), (200704, 1, 7168, 256)) | |
| del buf74 | |
| buf76 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf75, arg116_1, arg117_1, arg304_1, arg118_1, arg119_1, 0, arg305_1, [2, 2], [1, 1], [1, 1], 1, arg120_1, arg121_1, 'relu_') | |
| del arg116_1 | |
| del arg117_1 | |
| del arg118_1 | |
| del arg119_1 | |
| del arg304_1 | |
| del arg305_1 | |
| buf77 = buf76 | |
| assert_size_stride(buf77, (116, 256, 14, 14), (50176, 1, 3584, 256)) | |
| del buf76 | |
| buf78 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf77, arg120_1, arg121_1, arg306_1, arg122_1, arg123_1, 0, arg307_1, [1, 1], [0, 0], [1, 1], 1, arg124_1, arg125_1, 'none') | |
| del arg120_1 | |
| del arg121_1 | |
| del arg122_1 | |
| del arg123_1 | |
| del arg306_1 | |
| del arg307_1 | |
| del buf77 | |
| buf79 = buf78 | |
| assert_size_stride(buf79, (116, 1024, 14, 14), (200704, 1, 14336, 1024)) | |
| del buf78 | |
| buf80 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf73, arg112_1, arg113_1, arg308_1, arg126_1, arg127_1, 0, arg309_1, [2, 2], [0, 0], [1, 1], 1, arg128_1, arg129_1, 'none') | |
| del arg112_1 | |
| del arg113_1 | |
| del arg126_1 | |
| del arg127_1 | |
| del arg308_1 | |
| del arg309_1 | |
| del buf73 | |
| buf81 = buf80 | |
| assert_size_stride(buf81, (116, 1024, 14, 14), (200704, 1, 14336, 1024)) | |
| del buf80 | |
| buf82 = as_strided(buf4, (116, 1024, 14, 14), (200704, 1, 14336, 1024)); del buf4 # reuse | |
| buf83 = buf82; del buf82 # reuse | |
| buf84 = as_strided(buf75, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf75 # reuse | |
| kernel_cpp_9(c_void_p(buf83.data_ptr()), c_void_p(buf79.data_ptr()), c_void_p(arg125_1.data_ptr()), c_void_p(arg124_1.data_ptr()), c_void_p(buf81.data_ptr()), c_void_p(arg129_1.data_ptr()), c_void_p(arg128_1.data_ptr()), c_void_p(arg130_1.data_ptr()), c_void_p(arg131_1.data_ptr()), c_void_p(buf84.data_ptr())) | |
| del arg124_1 | |
| del arg125_1 | |
| del arg128_1 | |
| del arg129_1 | |
| del buf79 | |
| buf85 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf84, arg130_1, arg131_1, arg310_1, arg132_1, arg133_1, 0, arg311_1, [1, 1], [0, 0], [1, 1], 1, arg134_1, arg135_1, 'relu_') | |
| del arg132_1 | |
| del arg133_1 | |
| del arg310_1 | |
| del arg311_1 | |
| buf86 = buf85 | |
| assert_size_stride(buf86, (116, 256, 14, 14), (50176, 1, 3584, 256)) | |
| del buf85 | |
| buf87 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf86, arg134_1, arg135_1, arg312_1, arg136_1, arg137_1, 0, arg313_1, [1, 1], [1, 1], [1, 1], 1, arg138_1, arg139_1, 'relu_') | |
| del arg134_1 | |
| del arg135_1 | |
| del arg136_1 | |
| del arg137_1 | |
| del arg312_1 | |
| del arg313_1 | |
| del buf86 | |
| buf88 = buf87 | |
| assert_size_stride(buf88, (116, 256, 14, 14), (50176, 1, 3584, 256)) | |
| del buf87 | |
| buf89 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf88, arg138_1, arg139_1, arg314_1, arg140_1, arg141_1, 0, arg315_1, [1, 1], [0, 0], [1, 1], 1, arg142_1, arg143_1, 'none') | |
| del arg138_1 | |
| del arg139_1 | |
| del arg140_1 | |
| del arg141_1 | |
| del arg314_1 | |
| del arg315_1 | |
| del buf88 | |
| buf90 = buf89 | |
| assert_size_stride(buf90, (116, 1024, 14, 14), (200704, 1, 14336, 1024)) | |
| del buf89 | |
| buf91 = as_strided(buf83, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf83 # reuse | |
| buf92 = buf91; del buf91 # reuse | |
| buf93 = as_strided(buf81, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf81 # reuse | |
| kernel_cpp_10(c_void_p(buf92.data_ptr()), c_void_p(buf90.data_ptr()), c_void_p(arg143_1.data_ptr()), c_void_p(arg142_1.data_ptr()), c_void_p(buf84.data_ptr()), c_void_p(arg131_1.data_ptr()), c_void_p(arg130_1.data_ptr()), c_void_p(arg144_1.data_ptr()), c_void_p(arg145_1.data_ptr()), c_void_p(buf93.data_ptr())) | |
| del arg130_1 | |
| del arg131_1 | |
| del arg142_1 | |
| del arg143_1 | |
| del buf84 | |
| buf94 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf93, arg144_1, arg145_1, arg316_1, arg146_1, arg147_1, 0, arg317_1, [1, 1], [0, 0], [1, 1], 1, arg148_1, arg149_1, 'relu_') | |
| del arg146_1 | |
| del arg147_1 | |
| del arg316_1 | |
| del arg317_1 | |
| buf95 = buf94 | |
| assert_size_stride(buf95, (116, 256, 14, 14), (50176, 1, 3584, 256)) | |
| del buf94 | |
| buf96 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf95, arg148_1, arg149_1, arg318_1, arg150_1, arg151_1, 0, arg319_1, [1, 1], [1, 1], [1, 1], 1, arg152_1, arg153_1, 'relu_') | |
| del arg148_1 | |
| del arg149_1 | |
| del arg150_1 | |
| del arg151_1 | |
| del arg318_1 | |
| del arg319_1 | |
| del buf95 | |
| buf97 = buf96 | |
| assert_size_stride(buf97, (116, 256, 14, 14), (50176, 1, 3584, 256)) | |
| del buf96 | |
| buf98 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf97, arg152_1, arg153_1, arg320_1, arg154_1, arg155_1, 0, arg321_1, [1, 1], [0, 0], [1, 1], 1, arg156_1, arg157_1, 'none') | |
| del arg152_1 | |
| del arg153_1 | |
| del arg154_1 | |
| del arg155_1 | |
| del arg320_1 | |
| del arg321_1 | |
| del buf97 | |
| buf99 = buf98 | |
| assert_size_stride(buf99, (116, 1024, 14, 14), (200704, 1, 14336, 1024)) | |
| del buf98 | |
| buf100 = buf92; del buf92 # reuse | |
| buf101 = buf100; del buf100 # reuse | |
| buf102 = as_strided(buf90, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf90 # reuse | |
| kernel_cpp_11(c_void_p(buf101.data_ptr()), c_void_p(buf99.data_ptr()), c_void_p(arg157_1.data_ptr()), c_void_p(arg156_1.data_ptr()), c_void_p(buf93.data_ptr()), c_void_p(arg145_1.data_ptr()), c_void_p(arg144_1.data_ptr()), c_void_p(arg158_1.data_ptr()), c_void_p(arg159_1.data_ptr()), c_void_p(buf102.data_ptr())) | |
| del arg144_1 | |
| del arg145_1 | |
| del arg156_1 | |
| del arg157_1 | |
| del buf93 | |
| buf103 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf102, arg158_1, arg159_1, arg322_1, arg160_1, arg161_1, 0, arg323_1, [1, 1], [0, 0], [1, 1], 1, arg162_1, arg163_1, 'relu_') | |
| del arg160_1 | |
| del arg161_1 | |
| del arg322_1 | |
| del arg323_1 | |
| buf104 = buf103 | |
| assert_size_stride(buf104, (116, 256, 14, 14), (50176, 1, 3584, 256)) | |
| del buf103 | |
| buf105 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf104, arg162_1, arg163_1, arg324_1, arg164_1, arg165_1, 0, arg325_1, [1, 1], [1, 1], [1, 1], 1, arg166_1, arg167_1, 'relu_') | |
| del arg162_1 | |
| del arg163_1 | |
| del arg164_1 | |
| del arg165_1 | |
| del arg324_1 | |
| del arg325_1 | |
| del buf104 | |
| buf106 = buf105 | |
| assert_size_stride(buf106, (116, 256, 14, 14), (50176, 1, 3584, 256)) | |
| del buf105 | |
| buf107 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf106, arg166_1, arg167_1, arg326_1, arg168_1, arg169_1, 0, arg327_1, [1, 1], [0, 0], [1, 1], 1, arg170_1, arg171_1, 'none') | |
| del arg166_1 | |
| del arg167_1 | |
| del arg168_1 | |
| del arg169_1 | |
| del arg326_1 | |
| del arg327_1 | |
| del buf106 | |
| buf108 = buf107 | |
| assert_size_stride(buf108, (116, 1024, 14, 14), (200704, 1, 14336, 1024)) | |
| del buf107 | |
| buf109 = buf101; del buf101 # reuse | |
| buf110 = buf109; del buf109 # reuse | |
| buf111 = as_strided(buf99, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf99 # reuse | |
| kernel_cpp_12(c_void_p(buf110.data_ptr()), c_void_p(buf108.data_ptr()), c_void_p(arg171_1.data_ptr()), c_void_p(arg170_1.data_ptr()), c_void_p(buf102.data_ptr()), c_void_p(arg159_1.data_ptr()), c_void_p(arg158_1.data_ptr()), c_void_p(arg172_1.data_ptr()), c_void_p(arg173_1.data_ptr()), c_void_p(buf111.data_ptr())) | |
| del arg158_1 | |
| del arg159_1 | |
| del arg170_1 | |
| del arg171_1 | |
| del buf102 | |
| buf112 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf111, arg172_1, arg173_1, arg328_1, arg174_1, arg175_1, 0, arg329_1, [1, 1], [0, 0], [1, 1], 1, arg176_1, arg177_1, 'relu_') | |
| del arg174_1 | |
| del arg175_1 | |
| del arg328_1 | |
| del arg329_1 | |
| buf113 = buf112 | |
| assert_size_stride(buf113, (116, 256, 14, 14), (50176, 1, 3584, 256)) | |
| del buf112 | |
| buf114 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf113, arg176_1, arg177_1, arg330_1, arg178_1, arg179_1, 0, arg331_1, [1, 1], [1, 1], [1, 1], 1, arg180_1, arg181_1, 'relu_') | |
| del arg176_1 | |
| del arg177_1 | |
| del arg178_1 | |
| del arg179_1 | |
| del arg330_1 | |
| del arg331_1 | |
| del buf113 | |
| buf115 = buf114 | |
| assert_size_stride(buf115, (116, 256, 14, 14), (50176, 1, 3584, 256)) | |
| del buf114 | |
| buf116 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf115, arg180_1, arg181_1, arg332_1, arg182_1, arg183_1, 0, arg333_1, [1, 1], [0, 0], [1, 1], 1, arg184_1, arg185_1, 'none') | |
| del arg180_1 | |
| del arg181_1 | |
| del arg182_1 | |
| del arg183_1 | |
| del arg332_1 | |
| del arg333_1 | |
| del buf115 | |
| buf117 = buf116 | |
| assert_size_stride(buf117, (116, 1024, 14, 14), (200704, 1, 14336, 1024)) | |
| del buf116 | |
| buf118 = buf110; del buf110 # reuse | |
| buf119 = buf118; del buf118 # reuse | |
| buf120 = as_strided(buf108, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf108 # reuse | |
| kernel_cpp_13(c_void_p(buf119.data_ptr()), c_void_p(buf117.data_ptr()), c_void_p(arg185_1.data_ptr()), c_void_p(arg184_1.data_ptr()), c_void_p(buf111.data_ptr()), c_void_p(arg173_1.data_ptr()), c_void_p(arg172_1.data_ptr()), c_void_p(arg186_1.data_ptr()), c_void_p(arg187_1.data_ptr()), c_void_p(buf120.data_ptr())) | |
| del arg172_1 | |
| del arg173_1 | |
| del arg184_1 | |
| del arg185_1 | |
| del buf111 | |
| buf121 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf120, arg186_1, arg187_1, arg334_1, arg188_1, arg189_1, 0, arg335_1, [1, 1], [0, 0], [1, 1], 1, arg190_1, arg191_1, 'relu_') | |
| del arg188_1 | |
| del arg189_1 | |
| del arg334_1 | |
| del arg335_1 | |
| buf122 = buf121 | |
| assert_size_stride(buf122, (116, 256, 14, 14), (50176, 1, 3584, 256)) | |
| del buf121 | |
| buf123 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf122, arg190_1, arg191_1, arg336_1, arg192_1, arg193_1, 0, arg337_1, [1, 1], [1, 1], [1, 1], 1, arg194_1, arg195_1, 'relu_') | |
| del arg190_1 | |
| del arg191_1 | |
| del arg192_1 | |
| del arg193_1 | |
| del arg336_1 | |
| del arg337_1 | |
| del buf122 | |
| buf124 = buf123 | |
| assert_size_stride(buf124, (116, 256, 14, 14), (50176, 1, 3584, 256)) | |
| del buf123 | |
| buf125 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf124, arg194_1, arg195_1, arg338_1, arg196_1, arg197_1, 0, arg339_1, [1, 1], [0, 0], [1, 1], 1, arg198_1, arg199_1, 'none') | |
| del arg194_1 | |
| del arg195_1 | |
| del arg196_1 | |
| del arg197_1 | |
| del arg338_1 | |
| del arg339_1 | |
| del buf124 | |
| buf126 = buf125 | |
| assert_size_stride(buf126, (116, 1024, 14, 14), (200704, 1, 14336, 1024)) | |
| del buf125 | |
| buf127 = buf119; del buf119 # reuse | |
| buf128 = buf127; del buf127 # reuse | |
| buf129 = as_strided(buf117, (116, 1024, 14, 14), (200704, 196, 14, 1)); del buf117 # reuse | |
| kernel_cpp_14(c_void_p(buf128.data_ptr()), c_void_p(buf126.data_ptr()), c_void_p(arg199_1.data_ptr()), c_void_p(arg198_1.data_ptr()), c_void_p(buf120.data_ptr()), c_void_p(arg187_1.data_ptr()), c_void_p(arg186_1.data_ptr()), c_void_p(arg200_1.data_ptr()), c_void_p(arg201_1.data_ptr()), c_void_p(buf129.data_ptr())) | |
| del arg186_1 | |
| del arg187_1 | |
| del arg198_1 | |
| del arg199_1 | |
| del buf120 | |
| del buf126 | |
| del buf128 | |
| buf130 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf129, arg200_1, arg201_1, arg340_1, arg202_1, arg203_1, 0, arg341_1, [1, 1], [0, 0], [1, 1], 1, arg204_1, arg205_1, 'relu_') | |
| del arg202_1 | |
| del arg203_1 | |
| del arg340_1 | |
| del arg341_1 | |
| buf131 = buf130 | |
| assert_size_stride(buf131, (116, 512, 14, 14), (100352, 1, 7168, 512)) | |
| del buf130 | |
| buf132 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf131, arg204_1, arg205_1, arg342_1, arg206_1, arg207_1, 0, arg343_1, [2, 2], [1, 1], [1, 1], 1, arg208_1, arg209_1, 'relu_') | |
| del arg204_1 | |
| del arg205_1 | |
| del arg206_1 | |
| del arg207_1 | |
| del arg342_1 | |
| del arg343_1 | |
| buf133 = buf132 | |
| assert_size_stride(buf133, (116, 512, 7, 7), (25088, 1, 3584, 512)) | |
| del buf132 | |
| buf134 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf133, arg208_1, arg209_1, arg344_1, arg210_1, arg211_1, 0, arg345_1, [1, 1], [0, 0], [1, 1], 1, arg212_1, arg213_1, 'none') | |
| del arg208_1 | |
| del arg209_1 | |
| del arg210_1 | |
| del arg211_1 | |
| del arg344_1 | |
| del arg345_1 | |
| del buf133 | |
| buf135 = buf134 | |
| assert_size_stride(buf135, (116, 2048, 7, 7), (100352, 1, 14336, 2048)) | |
| del buf134 | |
| buf136 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf129, arg200_1, arg201_1, arg346_1, arg214_1, arg215_1, 0, arg347_1, [2, 2], [0, 0], [1, 1], 1, arg216_1, arg217_1, 'none') | |
| del arg200_1 | |
| del arg201_1 | |
| del arg214_1 | |
| del arg215_1 | |
| del arg346_1 | |
| del arg347_1 | |
| del buf129 | |
| buf137 = buf136 | |
| assert_size_stride(buf137, (116, 2048, 7, 7), (100352, 1, 14336, 2048)) | |
| del buf136 | |
| buf138 = empty_strided((116, 2048, 7, 7), (100352, 1, 14336, 2048), device='cpu', dtype=torch.float32) | |
| buf139 = buf138; del buf138 # reuse | |
| buf140 = as_strided(buf131, (116, 2048, 7, 7), (100352, 49, 7, 1)); del buf131 # reuse | |
| kernel_cpp_15(c_void_p(buf139.data_ptr()), c_void_p(buf135.data_ptr()), c_void_p(arg213_1.data_ptr()), c_void_p(arg212_1.data_ptr()), c_void_p(buf137.data_ptr()), c_void_p(arg217_1.data_ptr()), c_void_p(arg216_1.data_ptr()), c_void_p(arg218_1.data_ptr()), c_void_p(arg219_1.data_ptr()), c_void_p(buf140.data_ptr())) | |
| del arg212_1 | |
| del arg213_1 | |
| del arg216_1 | |
| del arg217_1 | |
| del buf135 | |
| buf141 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf140, arg218_1, arg219_1, arg348_1, arg220_1, arg221_1, 0, arg349_1, [1, 1], [0, 0], [1, 1], 1, arg222_1, arg223_1, 'relu_') | |
| del arg220_1 | |
| del arg221_1 | |
| del arg348_1 | |
| del arg349_1 | |
| buf142 = buf141 | |
| assert_size_stride(buf142, (116, 512, 7, 7), (25088, 1, 3584, 512)) | |
| del buf141 | |
| buf143 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf142, arg222_1, arg223_1, arg350_1, arg224_1, arg225_1, 0, arg351_1, [1, 1], [1, 1], [1, 1], 1, arg226_1, arg227_1, 'relu_') | |
| del arg222_1 | |
| del arg223_1 | |
| del arg224_1 | |
| del arg225_1 | |
| del arg350_1 | |
| del arg351_1 | |
| del buf142 | |
| buf144 = buf143 | |
| assert_size_stride(buf144, (116, 512, 7, 7), (25088, 1, 3584, 512)) | |
| del buf143 | |
| buf145 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf144, arg226_1, arg227_1, arg352_1, arg228_1, arg229_1, 0, arg353_1, [1, 1], [0, 0], [1, 1], 1, arg230_1, arg231_1, 'none') | |
| del arg226_1 | |
| del arg227_1 | |
| del arg228_1 | |
| del arg229_1 | |
| del arg352_1 | |
| del arg353_1 | |
| del buf144 | |
| buf146 = buf145 | |
| assert_size_stride(buf146, (116, 2048, 7, 7), (100352, 1, 14336, 2048)) | |
| del buf145 | |
| buf147 = as_strided(buf139, (116, 2048, 7, 7), (100352, 49, 7, 1)); del buf139 # reuse | |
| buf148 = buf147; del buf147 # reuse | |
| buf149 = as_strided(buf137, (116, 2048, 7, 7), (100352, 49, 7, 1)); del buf137 # reuse | |
| kernel_cpp_16(c_void_p(buf148.data_ptr()), c_void_p(buf146.data_ptr()), c_void_p(arg231_1.data_ptr()), c_void_p(arg230_1.data_ptr()), c_void_p(buf140.data_ptr()), c_void_p(arg219_1.data_ptr()), c_void_p(arg218_1.data_ptr()), c_void_p(arg232_1.data_ptr()), c_void_p(arg233_1.data_ptr()), c_void_p(buf149.data_ptr())) | |
| del arg218_1 | |
| del arg219_1 | |
| del arg230_1 | |
| del arg231_1 | |
| del buf140 | |
| del buf146 | |
| buf150 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf149, arg232_1, arg233_1, arg354_1, arg234_1, arg235_1, 0, arg355_1, [1, 1], [0, 0], [1, 1], 1, arg236_1, arg237_1, 'relu_') | |
| del arg234_1 | |
| del arg235_1 | |
| del arg354_1 | |
| del arg355_1 | |
| buf151 = buf150 | |
| assert_size_stride(buf151, (116, 512, 7, 7), (25088, 1, 3584, 512)) | |
| del buf150 | |
| buf152 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf151, arg236_1, arg237_1, arg356_1, arg238_1, arg239_1, 0, arg357_1, [1, 1], [1, 1], [1, 1], 1, arg240_1, arg241_1, 'relu_') | |
| del arg236_1 | |
| del arg237_1 | |
| del arg238_1 | |
| del arg239_1 | |
| del arg356_1 | |
| del arg357_1 | |
| del buf151 | |
| buf153 = buf152 | |
| assert_size_stride(buf153, (116, 512, 7, 7), (25088, 1, 3584, 512)) | |
| del buf152 | |
| buf154 = torch.ops.quantized_decomposed.conv_unary_inductor.tensor(buf153, arg240_1, arg241_1, arg358_1, arg242_1, arg243_1, 0, arg359_1, [1, 1], [0, 0], [1, 1], 1, arg244_1, arg245_1, 'none') | |
| del arg240_1 | |
| del arg241_1 | |
| del arg242_1 | |
| del arg243_1 | |
| del arg358_1 | |
| del arg359_1 | |
| del buf153 | |
| buf155 = buf154 | |
| assert_size_stride(buf155, (116, 2048, 7, 7), (100352, 1, 14336, 2048)) | |
| del buf154 | |
| buf156 = buf148; del buf148 # reuse | |
| buf157 = buf156; del buf156 # reuse | |
| buf158 = empty_strided((116, 2048, 1, 1), (2048, 1, 237568, 237568), device='cpu', dtype=torch.float32) | |
| buf159 = empty_strided((116, 2048), (2048, 1), device='cpu', dtype=torch.uint8) | |
| kernel_cpp_17(c_void_p(buf157.data_ptr()), c_void_p(buf155.data_ptr()), c_void_p(arg245_1.data_ptr()), c_void_p(arg244_1.data_ptr()), c_void_p(buf149.data_ptr()), c_void_p(arg233_1.data_ptr()), c_void_p(arg232_1.data_ptr()), c_void_p(arg246_1.data_ptr()), c_void_p(arg247_1.data_ptr()), c_void_p(arg248_1.data_ptr()), c_void_p(arg249_1.data_ptr()), c_void_p(buf158.data_ptr()), c_void_p(buf159.data_ptr())) | |
| del arg232_1 | |
| del arg233_1 | |
| del arg244_1 | |
| del arg245_1 | |
| del arg246_1 | |
| del arg247_1 | |
| del buf149 | |
| del buf155 | |
| del buf157 | |
| del buf158 | |
| buf160 = torch.ops.quantized_decomposed.linear_unary_inductor.tensor(buf159, arg248_1, arg249_1, arg360_1, arg250_1, arg251_1, 0, arg361_1, arg252_1, arg253_1, 'none') | |
| del arg248_1 | |
| del arg249_1 | |
| del arg250_1 | |
| del arg251_1 | |
| del arg360_1 | |
| del arg361_1 | |
| del buf159 | |
| buf161 = buf160 | |
| assert_size_stride(buf161, (116, 1000), (1000, 1)) | |
| del buf160 | |
| buf162 = empty_strided((116, 1000), (1000, 1), device='cpu', dtype=torch.float32) | |
| kernel_cpp_18(c_void_p(buf161.data_ptr()), c_void_p(arg253_1.data_ptr()), c_void_p(arg252_1.data_ptr()), c_void_p(buf162.data_ptr())) | |
| del arg252_1 | |
| del arg253_1 | |
| return (buf162, ) | |
| if __name__ == "__main__": | |
| from torch._dynamo.testing import rand_strided | |
| from torch._inductor.utils import print_performance | |
| arg0_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg1_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg2_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg3_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg4_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg5_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg6_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg7_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg8_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg9_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg10_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg11_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg12_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg13_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg14_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg15_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg16_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg17_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg18_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg19_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg20_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg21_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg22_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg23_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg24_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg25_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg26_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg27_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg28_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg29_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg30_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg31_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg32_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg33_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg34_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg35_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg36_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg37_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg38_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg39_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg40_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg41_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg42_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg43_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg44_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg45_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg46_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg47_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg48_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg49_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg50_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg51_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg52_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg53_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg54_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg55_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg56_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg57_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg58_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg59_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg60_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg61_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg62_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg63_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg64_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg65_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg66_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg67_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg68_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg69_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg70_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg71_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg72_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg73_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg74_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg75_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg76_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg77_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg78_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg79_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg80_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg81_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg82_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg83_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg84_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg85_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg86_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg87_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg88_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg89_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg90_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg91_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg92_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg93_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg94_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg95_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg96_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg97_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg98_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg99_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg100_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg101_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg102_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg103_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg104_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg105_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg106_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg107_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg108_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg109_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg110_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg111_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg112_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg113_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg114_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg115_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg116_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg117_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg118_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg119_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg120_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg121_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg122_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg123_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg124_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg125_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg126_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg127_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg128_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg129_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg130_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg131_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg132_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg133_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg134_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg135_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg136_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg137_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg138_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg139_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg140_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg141_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg142_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg143_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg144_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg145_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg146_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg147_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg148_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg149_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg150_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg151_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg152_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg153_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg154_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg155_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg156_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg157_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg158_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg159_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg160_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg161_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg162_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg163_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg164_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg165_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg166_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg167_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg168_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg169_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg170_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg171_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg172_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg173_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg174_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg175_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg176_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg177_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg178_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg179_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg180_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg181_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg182_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg183_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg184_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg185_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg186_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg187_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg188_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg189_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg190_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg191_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg192_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg193_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg194_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg195_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg196_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg197_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg198_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg199_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg200_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg201_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg202_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg203_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg204_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg205_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg206_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg207_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg208_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg209_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg210_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg211_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg212_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg213_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg214_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg215_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg216_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg217_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg218_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg219_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg220_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg221_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg222_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg223_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg224_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg225_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg226_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg227_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg228_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg229_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg230_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg231_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg232_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg233_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg234_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg235_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg236_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg237_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg238_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg239_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg240_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg241_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg242_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg243_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg244_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg245_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg246_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg247_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg248_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg249_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg250_1 = rand_strided((1000, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg251_1 = rand_strided((1000, ), (1, ), device='cpu', dtype=torch.int64) | |
| arg252_1 = rand_strided((), (), device='cpu', dtype=torch.float32) | |
| arg253_1 = rand_strided((), (), device='cpu', dtype=torch.int64) | |
| arg254_1 = rand_strided((64, 3, 7, 7), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg255_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg256_1 = rand_strided((64, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg257_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg258_1 = rand_strided((64, 64, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg259_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg260_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg261_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg262_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg263_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg264_1 = rand_strided((64, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg265_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg266_1 = rand_strided((64, 64, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg267_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg268_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg269_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg270_1 = rand_strided((64, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg271_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg272_1 = rand_strided((64, 64, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg273_1 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg274_1 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg275_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg276_1 = rand_strided((128, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg277_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg278_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg279_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg280_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg281_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg282_1 = rand_strided((512, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg283_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg284_1 = rand_strided((128, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg285_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg286_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg287_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg288_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg289_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg290_1 = rand_strided((128, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg291_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg292_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg293_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg294_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg295_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg296_1 = rand_strided((128, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg297_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg298_1 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg299_1 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg300_1 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg301_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg302_1 = rand_strided((256, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg303_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg304_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg305_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg306_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg307_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg308_1 = rand_strided((1024, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg309_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg310_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg311_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg312_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg313_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg314_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg315_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg316_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg317_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg318_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg319_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg320_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg321_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg322_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg323_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg324_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg325_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg326_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg327_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg328_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg329_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg330_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg331_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg332_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg333_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg334_1 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg335_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg336_1 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg337_1 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg338_1 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg339_1 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg340_1 = rand_strided((512, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg341_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg342_1 = rand_strided((512, 512, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg343_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg344_1 = rand_strided((2048, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg345_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg346_1 = rand_strided((2048, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg347_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg348_1 = rand_strided((512, 2048, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg349_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg350_1 = rand_strided((512, 512, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg351_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg352_1 = rand_strided((2048, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg353_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg354_1 = rand_strided((512, 2048, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg355_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg356_1 = rand_strided((512, 512, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg357_1 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg358_1 = rand_strided((2048, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.int8) | |
| arg359_1 = rand_strided((2048, ), (1, ), device='cpu', dtype=torch.float32) | |
| arg360_1 = rand_strided((2048, 1000), (1, 0), device='cpu', dtype=torch.int8) | |
| arg361_1 = rand_strided((1, 1000), (1, 0), device='cpu', dtype=torch.float32) | |
| arg362_1 = rand_strided((116, 3, 224, 224), (150528, 50176, 224, 1), device='cpu', dtype=torch.float32) | |
| print_performance(lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1, arg12_1, arg13_1, arg14_1, arg15_1, arg16_1, arg17_1, arg18_1, arg19_1, arg20_1, arg21_1, arg22_1, arg23_1, arg24_1, arg25_1, arg26_1, arg27_1, arg28_1, arg29_1, arg30_1, arg31_1, arg32_1, arg33_1, arg34_1, arg35_1, arg36_1, arg37_1, arg38_1, arg39_1, arg40_1, arg41_1, arg42_1, arg43_1, arg44_1, arg45_1, arg46_1, arg47_1, arg48_1, arg49_1, arg50_1, arg51_1, arg52_1, arg53_1, arg54_1, arg55_1, arg56_1, arg57_1, arg58_1, arg59_1, arg60_1, arg61_1, arg62_1, arg63_1, arg64_1, arg65_1, arg66_1, arg67_1, arg68_1, arg69_1, arg70_1, arg71_1, arg72_1, arg73_1, arg74_1, arg75_1, arg76_1, arg77_1, arg78_1, arg79_1, arg80_1, arg81_1, arg82_1, arg83_1, arg84_1, arg85_1, arg86_1, arg87_1, arg88_1, arg89_1, arg90_1, arg91_1, arg92_1, arg93_1, arg94_1, arg95_1, arg96_1, arg97_1, arg98_1, arg99_1, arg100_1, arg101_1, arg102_1, arg103_1, arg104_1, arg105_1, arg106_1, arg107_1, arg108_1, arg109_1, arg110_1, arg111_1, arg112_1, arg113_1, arg114_1, arg115_1, arg116_1, arg117_1, arg118_1, arg119_1, arg120_1, arg121_1, arg122_1, arg123_1, arg124_1, arg125_1, arg126_1, arg127_1, arg128_1, arg129_1, arg130_1, arg131_1, arg132_1, arg133_1, arg134_1, arg135_1, arg136_1, arg137_1, arg138_1, arg139_1, arg140_1, arg141_1, arg142_1, arg143_1, arg144_1, arg145_1, arg146_1, arg147_1, arg148_1, arg149_1, arg150_1, arg151_1, arg152_1, arg153_1, arg154_1, arg155_1, arg156_1, arg157_1, arg158_1, arg159_1, arg160_1, arg161_1, arg162_1, arg163_1, arg164_1, arg165_1, arg166_1, arg167_1, arg168_1, arg169_1, arg170_1, arg171_1, arg172_1, arg173_1, arg174_1, arg175_1, arg176_1, arg177_1, arg178_1, arg179_1, arg180_1, arg181_1, arg182_1, arg183_1, arg184_1, arg185_1, arg186_1, arg187_1, arg188_1, arg189_1, arg190_1, arg191_1, arg192_1, arg193_1, arg194_1, arg195_1, arg196_1, arg197_1, arg198_1, arg199_1, arg200_1, arg201_1, arg202_1, arg203_1, arg204_1, arg205_1, arg206_1, 
arg207_1, arg208_1, arg209_1, arg210_1, arg211_1, arg212_1, arg213_1, arg214_1, arg215_1, arg216_1, arg217_1, arg218_1, arg219_1, arg220_1, arg221_1, arg222_1, arg223_1, arg224_1, arg225_1, arg226_1, arg227_1, arg228_1, arg229_1, arg230_1, arg231_1, arg232_1, arg233_1, arg234_1, arg235_1, arg236_1, arg237_1, arg238_1, arg239_1, arg240_1, arg241_1, arg242_1, arg243_1, arg244_1, arg245_1, arg246_1, arg247_1, arg248_1, arg249_1, arg250_1, arg251_1, arg252_1, arg253_1, arg254_1, arg255_1, arg256_1, arg257_1, arg258_1, arg259_1, arg260_1, arg261_1, arg262_1, arg263_1, arg264_1, arg265_1, arg266_1, arg267_1, arg268_1, arg269_1, arg270_1, arg271_1, arg272_1, arg273_1, arg274_1, arg275_1, arg276_1, arg277_1, arg278_1, arg279_1, arg280_1, arg281_1, arg282_1, arg283_1, arg284_1, arg285_1, arg286_1, arg287_1, arg288_1, arg289_1, arg290_1, arg291_1, arg292_1, arg293_1, arg294_1, arg295_1, arg296_1, arg297_1, arg298_1, arg299_1, arg300_1, arg301_1, arg302_1, arg303_1, arg304_1, arg305_1, arg306_1, arg307_1, arg308_1, arg309_1, arg310_1, arg311_1, arg312_1, arg313_1, arg314_1, arg315_1, arg316_1, arg317_1, arg318_1, arg319_1, arg320_1, arg321_1, arg322_1, arg323_1, arg324_1, arg325_1, arg326_1, arg327_1, arg328_1, arg329_1, arg330_1, arg331_1, arg332_1, arg333_1, arg334_1, arg335_1, arg336_1, arg337_1, arg338_1, arg339_1, arg340_1, arg341_1, arg342_1, arg343_1, arg344_1, arg345_1, arg346_1, arg347_1, arg348_1, arg349_1, arg350_1, arg351_1, arg352_1, arg353_1, arg354_1, arg355_1, arg356_1, arg357_1, arg358_1, arg359_1, arg360_1, arg361_1, arg362_1])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment