Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save leslie-fang-intel/54e5c7b601e55fc44087ec9651e41ea6 to your computer and use it in GitHub Desktop.

Select an option

Save leslie-fang-intel/54e5c7b601e55fc44087ec9651e41ea6 to your computer and use it in GitHub Desktop.
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
# Short aliases for the torch operator namespaces referenced by the generated code.
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
# Guard / raw-allocation helpers re-exported from torch._C._dynamo for fast access
# inside the generated `call` function.
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
alloc_from_pool = torch.ops.inductor._alloc_from_pool
# Compiles the C++ kernel source strings below into callables (possibly in the
# background); each `async_compile.cpp_pybinding(...)` call registers one kernel.
async_compile = AsyncCompile()
# Frozen (constant-folded) parameter placeholders. They are declared as None here
# and are populated by the AOT loader before the compiled graph is invoked; the
# trailing comment on each line records the original tensor's device, dtype,
# shape, stride, and data pointer at export time (strides of 0 indicate the
# weight was stored in a packed/opaque layout).
_frozen_param18 = None # device(type='cpu') torch.float32 (64,) (1,) 7f1e30c112b0
_frozen_param34 = None # device(type='cpu') torch.float32 (64,) (1,) 7f1e30c10ea0
_frozen_param47 = None # device(type='cpu') torch.float32 (128,) (1,) 7f1e30c13dd0
_frozen_param63 = None # device(type='cpu') torch.float32 (128,) (1,) 7f1e30d13ab0
_frozen_param85 = None # device(type='cpu') torch.float32 (16,) (1,) 7f1e30d11a30
_frozen_param87 = None # device(type='cpu') torch.float32 (256,) (1,) 7f1e30d11bc0
_frozen_param101 = None # device(type='cpu') torch.float32 (16,) (1,) 7f1e30d05080
_frozen_param103 = None # device(type='cpu') torch.float32 (256,) (1,) 7f1e30d07470
_frozen_param147 = None # device(type='cpu') torch.float32 (1000,) (1,) 7f1e30d0c8b0
_frozen_param564 = None # device(type='cpu') torch.float32 (24,) (1,) 7f1e1f7b8090
_frozen_param598 = None # device(type='cpu') torch.float32 (24, 3, 3, 3) (1, 0, 0, 0) 7f1e1f7ff8d0
_frozen_param565 = None # device(type='cpu') torch.float32 (32,) (1,) 7f1e1fad4e00
_frozen_param599 = None # device(type='cpu') torch.float32 (32, 24, 3, 3) (1, 0, 0, 0) 7f1e1f6b1c60
_frozen_param566 = None # device(type='cpu') torch.float32 (64,) (1,) 7f1e1f84e660
_frozen_param600 = None # device(type='cpu') torch.float32 (64, 32, 3, 3) (1, 0, 0, 0) 7f1e1f6b1cb0
_frozen_param567 = None # device(type='cpu') torch.float32 (64,) (1,) 7f1e1fb2ebb0
_frozen_param601 = None # device(type='cpu') torch.float32 (64, 64, 1, 1) (1, 0, 0, 0) 7f1e1f6b1c10
_frozen_param568 = None # device(type='cpu') torch.float32 (64,) (1,) 7f1e1fb0c270
_frozen_param602 = None # device(type='cpu') torch.float32 (64, 64, 3, 3) (1, 0, 0, 0) 7f1e1f6b1b70
_frozen_param603 = None # device(type='cpu') torch.float32 (8, 64, 1, 1) (1, 0, 0, 0) 7f1e1f6b1d50
_frozen_param604 = None # device(type='cpu') torch.float32 (64, 8, 1, 1) (1, 0, 0, 0) 7f1e1f6b1da0
_frozen_param569 = None # device(type='cpu') torch.float32 (256,) (1,) 7f1e1f79b0b0
_frozen_param605 = None # device(type='cpu') torch.float32 (256, 64, 1, 1) (1, 0, 0, 0) 7f1e1f6b1df0
_frozen_param570 = None # device(type='cpu') torch.float32 (256,) (1,) 7f1e1fb0e3e0
_frozen_param606 = None # device(type='cpu') torch.float32 (256, 64, 1, 1) (1, 0, 0, 0) 7f1e1f6b1e40
_frozen_param571 = None # device(type='cpu') torch.float32 (64,) (1,) 7f1e1f793420
_frozen_param607 = None # device(type='cpu') torch.float32 (64, 256, 1, 1) (1, 0, 0, 0) 7f1e1f6b1e90
_frozen_param572 = None # device(type='cpu') torch.float32 (64,) (1,) 7f1e1f792980
_frozen_param608 = None # device(type='cpu') torch.float32 (64, 64, 3, 3) (1, 0, 0, 0) 7f1e1f7e3010
_frozen_param609 = None # device(type='cpu') torch.float32 (8, 64, 1, 1) (1, 0, 0, 0) 7f1e1f6b1f30
_frozen_param610 = None # device(type='cpu') torch.float32 (64, 8, 1, 1) (1, 0, 0, 0) 7f1e1fa9bdd0
_frozen_param573 = None # device(type='cpu') torch.float32 (256,) (1,) 7f1e1f92d0d0
_frozen_param611 = None # device(type='cpu') torch.float32 (256, 64, 1, 1) (1, 0, 0, 0) 7f1e1f6b1fd0
_frozen_param574 = None # device(type='cpu') torch.float32 (128,) (1,) 7f1e1f9ce930
_frozen_param612 = None # device(type='cpu') torch.float32 (128, 256, 1, 1) (1, 0, 0, 0) 7f1e1f6b2020
_frozen_param575 = None # device(type='cpu') torch.float32 (128,) (1,) 7f1e1f7fcd60
_frozen_param613 = None # device(type='cpu') torch.float32 (128, 128, 3, 3) (1, 0, 0, 0) 7f1e1f6b2070
_frozen_param614 = None # device(type='cpu') torch.float32 (8, 128, 1, 1) (1, 0, 0, 0) 7f1e1f6b1ee0
_frozen_param615 = None # device(type='cpu') torch.float32 (128, 8, 1, 1) (1, 0, 0, 0) 7f1e1f6b20c0
_frozen_param576 = None # device(type='cpu') torch.float32 (512,) (1,) 7f1e1f7fc680
_frozen_param616 = None # device(type='cpu') torch.float32 (512, 128, 1, 1) (1, 0, 0, 0) 7f1e1f6b1f80
_frozen_param577 = None # device(type='cpu') torch.float32 (512,) (1,) 7f1e1f7fdda0
_frozen_param617 = None # device(type='cpu') torch.float32 (512, 256, 1, 1) (1, 0, 0, 0) 7f1e1f6b2110
_frozen_param578 = None # device(type='cpu') torch.float32 (128,) (1,) 7f1e1f800a40
_frozen_param618 = None # device(type='cpu') torch.float32 (128, 512, 1, 1) (1, 0, 0, 0) 7f1e1f6b2160
_frozen_param579 = None # device(type='cpu') torch.float32 (128,) (1,) 7f1e1f801c60
_frozen_param619 = None # device(type='cpu') torch.float32 (128, 128, 3, 3) (1, 0, 0, 0) 7f1e1f6b21b0
_frozen_param620 = None # device(type='cpu') torch.float32 (8, 128, 1, 1) (1, 0, 0, 0) 7f1e1f6b2200
_frozen_param621 = None # device(type='cpu') torch.float32 (128, 8, 1, 1) (1, 0, 0, 0) 7f1e1f6b2250
_frozen_param580 = None # device(type='cpu') torch.float32 (512,) (1,) 7f1e1f85f600
_frozen_param622 = None # device(type='cpu') torch.float32 (512, 128, 1, 1) (1, 0, 0, 0) 7f1e1f6b22a0
_frozen_param581 = None # device(type='cpu') torch.float32 (128,) (1,) 7f1e1f85f740
_frozen_param623 = None # device(type='cpu') torch.float32 (128, 512, 1, 1) (1, 0, 0, 0) 7f1e1f6b22f0
_frozen_param624 = None # device(type='cpu') torch.float32 (384, 128, 1, 1) (1, 0, 0, 0) 7f1e1f6b2340
_frozen_param625 = None # device(type='cpu') torch.float32 (63, 32) (32, 1) 7f1e1fad6bb0
_frozen_param626 = None # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f1e1f6b23e0
_frozen_param627 = None # device(type='cpu') torch.float32 (63, 32) (32, 1) 7f1e1f7f2200
_frozen_param628 = None # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f1e1f6b2390
_frozen_param305 = None # device(type='cpu') torch.float32 (128, 1, 1) (1, 1, 1) 7f1e1f92f060
_frozen_param306 = None # device(type='cpu') torch.float32 (128, 1, 1) (1, 1, 1) 7f1e1f92ee80
_frozen_param307 = None # device(type='cpu') torch.float32 (128, 1, 1) (1, 1, 1) 7f1e1f92ef70
_frozen_param308 = None # device(type='cpu') torch.float32 (128, 1, 1) (1, 1, 1) 7f1e1f92f010
_frozen_param582 = None # device(type='cpu') torch.float32 (512,) (1,) 7f1e1fb0c9f0
_frozen_param629 = None # device(type='cpu') torch.float32 (512, 128, 1, 1) (1, 0, 0, 0) 7f1e1f6b27f0
_frozen_param583 = None # device(type='cpu') torch.float32 (256,) (1,) 7f1e1f7e7470
_frozen_param630 = None # device(type='cpu') torch.float32 (256, 512, 1, 1) (1, 0, 0, 0) 7f1e1f6b27a0
_frozen_param584 = None # device(type='cpu') torch.float32 (256,) (1,) 7f1e1f85f6f0
_frozen_param631 = None # device(type='cpu') torch.float32 (256, 256, 3, 3) (1, 0, 0, 0) 7f1e1f6b2750
_frozen_param632 = None # device(type='cpu') torch.float32 (16, 256, 1, 1) (1, 0, 0, 0) 7f1e1f6b2700
_frozen_param633 = None # device(type='cpu') torch.float32 (256, 16, 1, 1) (1, 0, 0, 0) 7f1e1f6b2660
_frozen_param585 = None # device(type='cpu') torch.float32 (1024,) (1,) 7f1e1f85f2e0
_frozen_param634 = None # device(type='cpu') torch.float32 (1024, 256, 1, 1) (1, 0, 0, 0) 7f1e1f6b2570
_frozen_param586 = None # device(type='cpu') torch.float32 (1024,) (1,) 7f1e1f85f830
_frozen_param635 = None # device(type='cpu') torch.float32 (1024, 512, 1, 1) (1, 0, 0, 0) 7f1e1f6b2840
_frozen_param587 = None # device(type='cpu') torch.float32 (256,) (1,) 7f1e1f85f880
_frozen_param636 = None # device(type='cpu') torch.float32 (256, 1024, 1, 1) (1, 0, 0, 0) 7f1e1f6b2610
_frozen_param588 = None # device(type='cpu') torch.float32 (256,) (1,) 7f1e1f85f8d0
_frozen_param637 = None # device(type='cpu') torch.float32 (256, 256, 3, 3) (1, 0, 0, 0) 7f1e1f6b25c0
_frozen_param638 = None # device(type='cpu') torch.float32 (16, 256, 1, 1) (1, 0, 0, 0) 7f1e1f6b2520
_frozen_param639 = None # device(type='cpu') torch.float32 (256, 16, 1, 1) (1, 0, 0, 0) 7f1e1f6b2890
_frozen_param589 = None # device(type='cpu') torch.float32 (1024,) (1,) 7f1e1f85f7e0
_frozen_param640 = None # device(type='cpu') torch.float32 (1024, 256, 1, 1) (1, 0, 0, 0) 7f1e1f6b28e0
_frozen_param590 = None # device(type='cpu') torch.float32 (256,) (1,) 7f1e1f85f970
_frozen_param641 = None # device(type='cpu') torch.float32 (256, 1024, 1, 1) (1, 0, 0, 0) 7f1e1f95a610
_frozen_param642 = None # device(type='cpu') torch.float32 (768, 256, 1, 1) (1, 0, 0, 0) 7f1e1f6b2980
_frozen_param643 = None # device(type='cpu') torch.float32 (31, 64) (64, 1) 7f1e1f6b2c00
_frozen_param644 = None # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f1e1f6b2a20
_frozen_param645 = None # device(type='cpu') torch.float32 (31, 64) (64, 1) 7f1e1f6b2930
_frozen_param646 = None # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f1e1f6b2ac0
_frozen_param349 = None # device(type='cpu') torch.float32 (256, 1, 1) (1, 1, 1) 7f1e1f92ff60
_frozen_param350 = None # device(type='cpu') torch.float32 (256, 1, 1) (1, 1, 1) 7f1e1f93c090
_frozen_param351 = None # device(type='cpu') torch.float32 (256, 1, 1) (1, 1, 1) 7f1e1f93c130
_frozen_param352 = None # device(type='cpu') torch.float32 (256, 1, 1) (1, 1, 1) 7f1e1f93c180
_frozen_param591 = None # device(type='cpu') torch.float32 (1024,) (1,) 7f1e1f85f9c0
_frozen_param647 = None # device(type='cpu') torch.float32 (1024, 256, 1, 1) (1, 0, 0, 0) 7f1e1f6b2de0
_frozen_param592 = None # device(type='cpu') torch.float32 (512,) (1,) 7f1e1f85fa10
_frozen_param648 = None # device(type='cpu') torch.float32 (512, 1024, 1, 1) (1, 0, 0, 0) 7f1e1f6b2d90
_frozen_param649 = None # device(type='cpu') torch.float32 (1536, 512, 1, 1) (1, 0, 0, 0) 7f1e1f6b2d40
_frozen_param650 = None # device(type='cpu') torch.float32 (31, 128) (128, 1) 7f1e1f6b2e80
_frozen_param651 = None # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f1e1f6b2ed0
_frozen_param652 = None # device(type='cpu') torch.float32 (31, 128) (128, 1) 7f1e1f6b2bb0
_frozen_param653 = None # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f1e1f6b2ca0
_frozen_param363 = None # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f1e1f93c5e0
_frozen_param364 = None # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f1e1f93c630
_frozen_param365 = None # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f1e1f93c680
_frozen_param366 = None # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f1e1f93c6d0
_frozen_param593 = None # device(type='cpu') torch.float32 (1536,) (1,) 7f1e1f85f6a0
_frozen_param654 = None # device(type='cpu') torch.float32 (1536, 512, 1, 1) (1, 0, 0, 0) 7f1e1f6b30b0
_frozen_param594 = None # device(type='cpu') torch.float32 (1536,) (1,) 7f1e1f85fab0
_frozen_param655 = None # device(type='cpu') torch.float32 (1536, 1024, 1, 1) (1, 0, 0, 0) 7f1e1f6b3060
_frozen_param595 = None # device(type='cpu') torch.float32 (512,) (1,) 7f1e1f85fb00
_frozen_param656 = None # device(type='cpu') torch.float32 (512, 1536, 1, 1) (1, 0, 0, 0) 7f1e1f6b3010
_frozen_param657 = None # device(type='cpu') torch.float32 (1536, 512, 1, 1) (1, 0, 0, 0) 7f1e1f6b2fc0
_frozen_param658 = None # device(type='cpu') torch.float32 (15, 128) (128, 1) 7f1e1f6b31a0
_frozen_param659 = None # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f1e1f6b31f0
_frozen_param660 = None # device(type='cpu') torch.float32 (15, 128) (128, 1) 7f1e1f6b2a70
_frozen_param661 = None # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f1e1f6b2c50
_frozen_param381 = None # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f1e1f93cd10
_frozen_param382 = None # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f1e1f93cc70
_frozen_param383 = None # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f1e1f93cd60
_frozen_param384 = None # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f1e1f93cdb0
_frozen_param596 = None # device(type='cpu') torch.float32 (1536,) (1,) 7f1e1f85fb50
_frozen_param662 = None # device(type='cpu') torch.float32 (1536, 512, 1, 1) (1, 0, 0, 0) 7f1e1f6b33d0
_frozen_param597 = None # device(type='cpu') torch.float32 (1280,) (1,) 7f1e1f85fba0
_frozen_param663 = None # device(type='cpu') torch.float32 (1280, 1536, 1, 1) (1, 0, 0, 0) 7f1e1f6b3380
_frozen_param664 = None # device(type='cpu') torch.float32 (1000, 1280) (1280, 1) 7f1e1f6b3150
_frozen_param665 = None # device(type='cpu') torch.float32 (3490017, 1) (1, 0) 7f1e1f6b3470
# NOTE(review): despite the "silu" in the generated name, this kernel contains no
# activation math. It loads 16-wide float vectors from a (3, 65536) input
# (index x1 + 65536*x0) and uses at::vec::transpose_mxn<float,3,16> to write a
# (65536, 3) interleaved output (index x0 + 3*x1) — i.e. a channels-first to
# channels-last repack of the network input.
cpp_fused_silu_0 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(3L); x0+=static_cast<int64_t>(3L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(65536L); x1+=static_cast<int64_t>(16L))
{
alignas(16) float tmp1[3*16];
for (long x0_inner = 0; x0_inner < 3; x0_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1 + (65536L*x0) + (65536L*x0_inner)), 16);
tmp0.store(tmp1 + static_cast<int64_t>(16L*x0_inner));
}
at::vec::transpose_mxn<float,3,16>(tmp1, 16, out_ptr0 + static_cast<int64_t>(x0 + (3L*x1)), static_cast<int64_t>(3L));
}
}
}
}
''')
# Fused reduction kernel in three stages:
#   1. Sum in_ptr0 (laid out as 4096 rows of 64 channels, index x0 + 64*x1) into
#      the 64-element accumulator in_out_ptr0, 16 lanes at a time.
#   2. Divide the accumulator by 4096.0 in place -> per-channel mean (a global
#      average pool over 4096 positions).
#   3. Fill the 8-element out_ptr1 from a constant table baked into the code as
#      a nested ternary tree over the index (a frozen 8-element tensor the
#      compiler constant-folded into the kernel).
cpp_fused_mean_1 = async_compile.cpp_pybinding(['float*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
float* out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(64L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(4096L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (64L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(64L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(4096.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
{
#pragma omp simd simdlen(8)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(8L); x0+=static_cast<int64_t>(1L))
{
auto tmp0 = x0;
auto tmp1 = c10::convert<int64_t>(tmp0);
auto tmp2 = static_cast<int64_t>(4);
auto tmp3 = tmp1 < tmp2;
auto tmp4 = static_cast<int64_t>(2);
auto tmp5 = tmp1 < tmp4;
auto tmp6 = static_cast<int64_t>(1);
auto tmp7 = tmp1 < tmp6;
auto tmp8 = static_cast<float>(-0.3303021788597107);
auto tmp9 = static_cast<float>(0.06354581564664841);
auto tmp10 = tmp7 ? tmp8 : tmp9;
auto tmp11 = static_cast<int64_t>(3);
auto tmp12 = tmp1 < tmp11;
auto tmp13 = static_cast<float>(1.774228811264038);
auto tmp14 = static_cast<float>(2.1113927364349365);
auto tmp15 = tmp12 ? tmp13 : tmp14;
auto tmp16 = tmp5 ? tmp10 : tmp15;
auto tmp17 = static_cast<int64_t>(6);
auto tmp18 = tmp1 < tmp17;
auto tmp19 = static_cast<int64_t>(5);
auto tmp20 = tmp1 < tmp19;
auto tmp21 = static_cast<float>(0.32513317465782166);
auto tmp22 = static_cast<float>(1.232210397720337);
auto tmp23 = tmp20 ? tmp21 : tmp22;
auto tmp24 = static_cast<int64_t>(7);
auto tmp25 = tmp1 < tmp24;
auto tmp26 = static_cast<float>(0.7079262137413025);
auto tmp27 = static_cast<float>(0.2353029102087021);
auto tmp28 = tmp25 ? tmp26 : tmp27;
auto tmp29 = tmp18 ? tmp23 : tmp28;
auto tmp30 = tmp3 ? tmp16 : tmp29;
out_ptr1[static_cast<int64_t>(x0)] = tmp30;
}
}
}
''')
# In-place broadcast multiply: each of the 4096 rows (64 channels each) of
# in_out_ptr0 is multiplied elementwise by the 64-element vector in_ptr0.
# NOTE(review): presumably the channel-gating step of a squeeze-excite block
# (scaling features by the pooled/gated vector) — confirm against the graph.
cpp_fused_mul_2 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4096L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (64L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (64L*x0)));
}
}
}
}
''')
# Elementwise SiLU over 1048576 contiguous floats, 16 lanes at a time:
# out = x * sigmoid(x), with sigmoid computed as 1 / (1 + exp(-x)).
cpp_fused_silu_3 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1048576L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
# Same structure as cpp_fused_mean_1 (sum 4096 rows of 64 channels, divide by
# 4096.0 for the per-channel mean, then emit a baked 8-element constant table
# into out_ptr1) — only the frozen constants in stage 3 differ.
cpp_fused_mean_4 = async_compile.cpp_pybinding(['float*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
float* out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(64L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(4096L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (64L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(64L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(4096.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
{
#pragma omp simd simdlen(8)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(8L); x0+=static_cast<int64_t>(1L))
{
auto tmp0 = x0;
auto tmp1 = c10::convert<int64_t>(tmp0);
auto tmp2 = static_cast<int64_t>(4);
auto tmp3 = tmp1 < tmp2;
auto tmp4 = static_cast<int64_t>(2);
auto tmp5 = tmp1 < tmp4;
auto tmp6 = static_cast<int64_t>(1);
auto tmp7 = tmp1 < tmp6;
auto tmp8 = static_cast<float>(0.7516645193099976);
auto tmp9 = static_cast<float>(0.5124969482421875);
auto tmp10 = tmp7 ? tmp8 : tmp9;
auto tmp11 = static_cast<int64_t>(3);
auto tmp12 = tmp1 < tmp11;
auto tmp13 = static_cast<float>(1.2062517404556274);
auto tmp14 = static_cast<float>(0.9069646596908569);
auto tmp15 = tmp12 ? tmp13 : tmp14;
auto tmp16 = tmp5 ? tmp10 : tmp15;
auto tmp17 = static_cast<int64_t>(6);
auto tmp18 = tmp1 < tmp17;
auto tmp19 = static_cast<int64_t>(5);
auto tmp20 = tmp1 < tmp19;
auto tmp21 = static_cast<float>(1.4622137546539307);
auto tmp22 = static_cast<float>(-0.11386305838823318);
auto tmp23 = tmp20 ? tmp21 : tmp22;
auto tmp24 = static_cast<int64_t>(7);
auto tmp25 = tmp1 < tmp24;
auto tmp26 = static_cast<float>(-0.2968502938747406);
auto tmp27 = static_cast<float>(0.884636402130127);
auto tmp28 = tmp25 ? tmp26 : tmp27;
auto tmp29 = tmp18 ? tmp23 : tmp28;
auto tmp30 = tmp3 ? tmp16 : tmp29;
out_ptr1[static_cast<int64_t>(x0)] = tmp30;
}
}
}
''')
# Identical shape to cpp_fused_mul_2: in-place broadcast multiply of 4096 rows
# (64 channels each) of in_out_ptr0 by the 64-element vector in_ptr0.
cpp_fused_mul_5 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4096L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (64L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (64L*x0)));
}
}
}
}
''')
# Elementwise SiLU (x * sigmoid(x)) over 1048576 contiguous floats — same body
# as cpp_fused_silu_3.
cpp_fused_silu_6 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1048576L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
# Same three-stage pattern as cpp_fused_mean_1, at the next feature resolution:
# sum 1024 rows of 128 channels into in_out_ptr0, divide by 1024.0 for the
# per-channel mean, then write a baked 8-element constant table into out_ptr1.
cpp_fused_mean_7 = async_compile.cpp_pybinding(['float*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
float* out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(128L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (128L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(128L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(1024.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
{
#pragma omp simd simdlen(8)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(8L); x0+=static_cast<int64_t>(1L))
{
auto tmp0 = x0;
auto tmp1 = c10::convert<int64_t>(tmp0);
auto tmp2 = static_cast<int64_t>(4);
auto tmp3 = tmp1 < tmp2;
auto tmp4 = static_cast<int64_t>(2);
auto tmp5 = tmp1 < tmp4;
auto tmp6 = static_cast<int64_t>(1);
auto tmp7 = tmp1 < tmp6;
auto tmp8 = static_cast<float>(0.47172150015830994);
auto tmp9 = static_cast<float>(1.4283583164215088);
auto tmp10 = tmp7 ? tmp8 : tmp9;
auto tmp11 = static_cast<int64_t>(3);
auto tmp12 = tmp1 < tmp11;
auto tmp13 = static_cast<float>(-0.04577525332570076);
auto tmp14 = static_cast<float>(2.043065309524536);
auto tmp15 = tmp12 ? tmp13 : tmp14;
auto tmp16 = tmp5 ? tmp10 : tmp15;
auto tmp17 = static_cast<int64_t>(6);
auto tmp18 = tmp1 < tmp17;
auto tmp19 = static_cast<int64_t>(5);
auto tmp20 = tmp1 < tmp19;
auto tmp21 = static_cast<float>(0.13726529479026794);
auto tmp22 = static_cast<float>(1.1331775188446045);
auto tmp23 = tmp20 ? tmp21 : tmp22;
auto tmp24 = static_cast<int64_t>(7);
auto tmp25 = tmp1 < tmp24;
auto tmp26 = static_cast<float>(-0.11772552132606506);
auto tmp27 = static_cast<float>(0.527721107006073);
auto tmp28 = tmp25 ? tmp26 : tmp27;
auto tmp29 = tmp18 ? tmp23 : tmp28;
auto tmp30 = tmp3 ? tmp16 : tmp29;
out_ptr1[static_cast<int64_t>(x0)] = tmp30;
}
}
}
''')
# In-place broadcast multiply at the 128-channel stage: each of the 1024 rows
# of in_out_ptr0 is multiplied elementwise by the 128-element vector in_ptr0
# (same pattern as cpp_fused_mul_2, larger channel count).
cpp_fused_mul_8 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(128L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (128L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (128L*x0)));
}
}
}
}
''')
# Elementwise SiLU (x * sigmoid(x)) over 524288 contiguous floats.
cpp_fused_silu_9 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(524288L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
# Same three-stage pattern as cpp_fused_mean_7 (sum 1024 rows of 128 channels,
# divide by 1024.0 for the per-channel mean, write a baked 8-element constant
# table into out_ptr1) — only the frozen constants in stage 3 differ.
cpp_fused_mean_10 = async_compile.cpp_pybinding(['float*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
float* out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(128L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (128L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(128L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(1024.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
{
#pragma omp simd simdlen(8)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(8L); x0+=static_cast<int64_t>(1L))
{
auto tmp0 = x0;
auto tmp1 = c10::convert<int64_t>(tmp0);
auto tmp2 = static_cast<int64_t>(4);
auto tmp3 = tmp1 < tmp2;
auto tmp4 = static_cast<int64_t>(2);
auto tmp5 = tmp1 < tmp4;
auto tmp6 = static_cast<int64_t>(1);
auto tmp7 = tmp1 < tmp6;
auto tmp8 = static_cast<float>(0.29536592960357666);
auto tmp9 = static_cast<float>(1.1849623918533325);
auto tmp10 = tmp7 ? tmp8 : tmp9;
auto tmp11 = static_cast<int64_t>(3);
auto tmp12 = tmp1 < tmp11;
auto tmp13 = static_cast<float>(0.3150717318058014);
auto tmp14 = static_cast<float>(0.5096337795257568);
auto tmp15 = tmp12 ? tmp13 : tmp14;
auto tmp16 = tmp5 ? tmp10 : tmp15;
auto tmp17 = static_cast<int64_t>(6);
auto tmp18 = tmp1 < tmp17;
auto tmp19 = static_cast<int64_t>(5);
auto tmp20 = tmp1 < tmp19;
auto tmp21 = static_cast<float>(-0.18543481826782227);
auto tmp22 = static_cast<float>(0.3189537227153778);
auto tmp23 = tmp20 ? tmp21 : tmp22;
auto tmp24 = static_cast<int64_t>(7);
auto tmp25 = tmp1 < tmp24;
auto tmp26 = static_cast<float>(-0.7191315293312073);
auto tmp27 = static_cast<float>(0.42770394682884216);
auto tmp28 = tmp25 ? tmp26 : tmp27;
auto tmp29 = tmp18 ? tmp23 : tmp28;
auto tmp30 = tmp3 ? tmp16 : tmp29;
out_ptr1[static_cast<int64_t>(x0)] = tmp30;
}
}
}
''')
# Identical shape to cpp_fused_mul_8: in-place broadcast multiply of 1024 rows
# (128 channels each) of in_out_ptr0 by the 128-element vector in_ptr0.
cpp_fused_mul_11 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(128L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (128L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (128L*x0)));
}
}
}
}
''')
# Elementwise SiLU (x * sigmoid(x)) over 524288 contiguous floats — same body
# as cpp_fused_silu_9.
cpp_fused_silu_12 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(524288L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
# Strided-slice + permute copy (a "clone" making a non-contiguous view
# contiguous). Reads in_ptr0[x3 + 32*x0 + 384*x1 + 12288*x2] — only offsets
# 0..127 of each 384-stride row are touched, i.e. one 128-wide chunk of a
# 384-wide fused projection — and writes it contiguously as
# out_ptr0[x3 + 32*x2 + 1024*x1 + 32768*x0], a (4, 32, 32, 32) layout.
# NOTE(review): presumably extracting one head-split q/k/v slice for the
# attention block below — confirm against the call site.
cpp_fused_clone_13 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(32L); x1+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(32L); x2+=static_cast<int64_t>(1L))
{
for(int64_t x3=static_cast<int64_t>(0L); x3<static_cast<int64_t>(32L); x3+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x3 + (32L*x0) + (384L*x1) + (12288L*x2)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x3 + (32L*x2) + (1024L*x1) + (32768L*x0)));
}
}
}
}
}
}
''')
# Companion permute copy to cpp_fused_clone_13: reads offsets 0..127 of each
# 384-stride row of in_ptr0 (index x2 + 32*x0 + 384*x1) and writes them
# contiguously as out_ptr0[x2 + 32*x1 + 32768*x0] — reshaping the slice from
# (1024, 4, 32) to (4, 1024, 32).
# NOTE(review): presumably the second q/k/v slice of the same fused projection
# — confirm against the call site.
cpp_fused_clone_14 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(32L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (32L*x0) + (384L*x1)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x2 + (32L*x1) + (32768L*x0)));
}
}
}
}
}
''')
cpp_fused__softmax_add_mul_15 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(1024L); x2+=static_cast<int64_t>(1L))
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x2 + (1024L*x1) + (1048576L*x0))];
auto tmp1 = static_cast<float>(0.1767766952966369);
auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
auto tmp3 = 31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L)));
auto tmp4 = c10::convert<int64_t>(tmp3);
auto tmp5 = static_cast<int64_t>(2048);
auto tmp6 = tmp4 < tmp5;
auto tmp7 = [&]
{
auto tmp8 = static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))) % static_cast<int64_t>(64L);
auto tmp9 = c10::convert<int64_t>(tmp8);
auto tmp10 = static_cast<int64_t>(63);
auto tmp11 = tmp9 < tmp10;
auto tmp12 = [&]
{
auto tmp13 = in_ptr1[static_cast<int64_t>((63L*(c10::div_floor_integer(static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))), static_cast<int64_t>(64L)))) + (2016L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (64512L*x0) + (static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))) % static_cast<int64_t>(64L)))];
return tmp13;
}
;
auto tmp14 = tmp11 ? tmp12() : static_cast<decltype(tmp12())>(0.0);
return tmp14;
}
;
auto tmp15 = tmp6 ? tmp7() : static_cast<decltype(tmp7())>(0.0);
auto tmp16 = 31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(32L));
auto tmp17 = c10::convert<int64_t>(tmp16);
auto tmp18 = tmp17 < tmp5;
auto tmp19 = [&]
{
auto tmp20 = static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(32L)))) % static_cast<int64_t>(64L);
auto tmp21 = c10::convert<int64_t>(tmp20);
auto tmp22 = static_cast<int64_t>(63);
auto tmp23 = tmp21 < tmp22;
auto tmp24 = [&]
{
auto tmp25 = in_ptr2[static_cast<int64_t>((63L*(c10::div_floor_integer(static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(32L)))), static_cast<int64_t>(64L)))) + (2016L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (64512L*x0) + (static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(32L)))) % static_cast<int64_t>(64L)))];
return tmp25;
}
;
auto tmp26 = tmp23 ? tmp24() : static_cast<decltype(tmp24())>(0.0);
return tmp26;
}
;
auto tmp27 = tmp18 ? tmp19() : static_cast<decltype(tmp19())>(0.0);
auto tmp28 = decltype(tmp15)(tmp15 + tmp27);
auto tmp29 = decltype(tmp2)(tmp2 + tmp28);
tmp_acc0 = max_propagate_nan(tmp_acc0, tmp29);
}
out_ptr0[static_cast<int64_t>(x1 + (1024L*x0))] = tmp_acc0;
}
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(1024L); x2+=static_cast<int64_t>(1L))
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x2 + (1024L*x1) + (1048576L*x0))];
auto tmp30 = out_ptr0[static_cast<int64_t>(x1 + (1024L*x0))];
auto tmp1 = static_cast<float>(0.1767766952966369);
auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
auto tmp3 = 31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L)));
auto tmp4 = c10::convert<int64_t>(tmp3);
auto tmp5 = static_cast<int64_t>(2048);
auto tmp6 = tmp4 < tmp5;
auto tmp7 = [&]
{
auto tmp8 = static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))) % static_cast<int64_t>(64L);
auto tmp9 = c10::convert<int64_t>(tmp8);
auto tmp10 = static_cast<int64_t>(63);
auto tmp11 = tmp9 < tmp10;
auto tmp12 = [&]
{
auto tmp13 = in_ptr1[static_cast<int64_t>((63L*(c10::div_floor_integer(static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))), static_cast<int64_t>(64L)))) + (2016L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (64512L*x0) + (static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))) % static_cast<int64_t>(64L)))];
return tmp13;
}
;
auto tmp14 = tmp11 ? tmp12() : static_cast<decltype(tmp12())>(0.0);
return tmp14;
}
;
auto tmp15 = tmp6 ? tmp7() : static_cast<decltype(tmp7())>(0.0);
auto tmp16 = 31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(32L));
auto tmp17 = c10::convert<int64_t>(tmp16);
auto tmp18 = tmp17 < tmp5;
auto tmp19 = [&]
{
auto tmp20 = static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(32L)))) % static_cast<int64_t>(64L);
auto tmp21 = c10::convert<int64_t>(tmp20);
auto tmp22 = static_cast<int64_t>(63);
auto tmp23 = tmp21 < tmp22;
auto tmp24 = [&]
{
auto tmp25 = in_ptr2[static_cast<int64_t>((63L*(c10::div_floor_integer(static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(32L)))), static_cast<int64_t>(64L)))) + (2016L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (64512L*x0) + (static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(32L)))) % static_cast<int64_t>(64L)))];
return tmp25;
}
;
auto tmp26 = tmp23 ? tmp24() : static_cast<decltype(tmp24())>(0.0);
return tmp26;
}
;
auto tmp27 = tmp18 ? tmp19() : static_cast<decltype(tmp19())>(0.0);
auto tmp28 = decltype(tmp15)(tmp15 + tmp27);
auto tmp29 = decltype(tmp2)(tmp2 + tmp28);
auto tmp31 = decltype(tmp29)(tmp29 - tmp30);
auto tmp32 = std::exp(tmp31);
out_ptr1[static_cast<int64_t>(x2 + (1024L*x1) + (1048576L*x0))] = tmp32;
}
}
}
}
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4096L); x0+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (1024L*x0)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0);
}
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (1024L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<int64_t>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(out_ptr3 + static_cast<int64_t>(x1 + (1024L*x0)));
}
}
}
}
''')
cpp_fused__native_batch_norm_legit_no_training_silu_16 = async_compile.cpp_pybinding(['float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
const float* in_ptr4)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(128L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>((32L*x0) + (32768L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (static_cast<int64_t>(x1) % static_cast<int64_t>(32L))), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1), 16);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<int64_t>(x1), 16);
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<int64_t>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = tmp2 * tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = tmp6 + tmp7;
auto tmp9 = decltype(tmp8)(1)/(decltype(tmp8)(1) + tmp8.neg().exp());
auto tmp10 = tmp8 * tmp9;
tmp10.store(in_out_ptr0 + static_cast<int64_t>(x1 + (128L*x0)));
}
}
}
}
''')
cpp_fused_silu_17 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(524288L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_mean_18 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (256L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(256.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_mul_19 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (256L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (256L*x0)));
}
}
}
}
''')
cpp_fused_silu_20 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(262144L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_mean_21 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (256L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(256.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_mul_22 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (256L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (256L*x0)));
}
}
}
}
''')
cpp_fused_silu_23 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(262144L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_clone_24 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(16L); x2+=static_cast<int64_t>(1L))
{
for(int64_t x3=static_cast<int64_t>(0L); x3<static_cast<int64_t>(64L); x3+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x3 + (64L*x0) + (768L*x1) + (12288L*x2)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x3 + (64L*x2) + (1024L*x1) + (16384L*x0)));
}
}
}
}
}
}
''')
cpp_fused_clone_25 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(64L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (64L*x0) + (768L*x1)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x2 + (64L*x1) + (16384L*x0)));
}
}
}
}
}
''')
cpp_fused__softmax_add_mul_26 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(256L); x2+=static_cast<int64_t>(1L))
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0))];
auto tmp1 = static_cast<float>(0.125);
auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
auto tmp3 = 15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L)));
auto tmp4 = c10::convert<int64_t>(tmp3);
auto tmp5 = static_cast<int64_t>(512);
auto tmp6 = tmp4 < tmp5;
auto tmp7 = [&]
{
auto tmp8 = static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L);
auto tmp9 = c10::convert<int64_t>(tmp8);
auto tmp10 = static_cast<int64_t>(31);
auto tmp11 = tmp9 < tmp10;
auto tmp12 = [&]
{
auto tmp13 = in_ptr1[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))), static_cast<int64_t>(32L)))) + (496L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L)))];
return tmp13;
}
;
auto tmp14 = tmp11 ? tmp12() : static_cast<decltype(tmp12())>(0.0);
return tmp14;
}
;
auto tmp15 = tmp6 ? tmp7() : static_cast<decltype(tmp7())>(0.0);
auto tmp16 = 15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L));
auto tmp17 = c10::convert<int64_t>(tmp16);
auto tmp18 = tmp17 < tmp5;
auto tmp19 = [&]
{
auto tmp20 = static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L);
auto tmp21 = c10::convert<int64_t>(tmp20);
auto tmp22 = static_cast<int64_t>(31);
auto tmp23 = tmp21 < tmp22;
auto tmp24 = [&]
{
auto tmp25 = in_ptr2[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))), static_cast<int64_t>(32L)))) + (496L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L)))];
return tmp25;
}
;
auto tmp26 = tmp23 ? tmp24() : static_cast<decltype(tmp24())>(0.0);
return tmp26;
}
;
auto tmp27 = tmp18 ? tmp19() : static_cast<decltype(tmp19())>(0.0);
auto tmp28 = decltype(tmp15)(tmp15 + tmp27);
auto tmp29 = decltype(tmp2)(tmp2 + tmp28);
tmp_acc0 = max_propagate_nan(tmp_acc0, tmp29);
}
out_ptr0[static_cast<int64_t>(x1 + (256L*x0))] = tmp_acc0;
}
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(256L); x2+=static_cast<int64_t>(1L))
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0))];
auto tmp30 = out_ptr0[static_cast<int64_t>(x1 + (256L*x0))];
auto tmp1 = static_cast<float>(0.125);
auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
auto tmp3 = 15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L)));
auto tmp4 = c10::convert<int64_t>(tmp3);
auto tmp5 = static_cast<int64_t>(512);
auto tmp6 = tmp4 < tmp5;
auto tmp7 = [&]
{
auto tmp8 = static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L);
auto tmp9 = c10::convert<int64_t>(tmp8);
auto tmp10 = static_cast<int64_t>(31);
auto tmp11 = tmp9 < tmp10;
auto tmp12 = [&]
{
auto tmp13 = in_ptr1[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))), static_cast<int64_t>(32L)))) + (496L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L)))];
return tmp13;
}
;
auto tmp14 = tmp11 ? tmp12() : static_cast<decltype(tmp12())>(0.0);
return tmp14;
}
;
auto tmp15 = tmp6 ? tmp7() : static_cast<decltype(tmp7())>(0.0);
auto tmp16 = 15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L));
auto tmp17 = c10::convert<int64_t>(tmp16);
auto tmp18 = tmp17 < tmp5;
auto tmp19 = [&]
{
auto tmp20 = static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L);
auto tmp21 = c10::convert<int64_t>(tmp20);
auto tmp22 = static_cast<int64_t>(31);
auto tmp23 = tmp21 < tmp22;
auto tmp24 = [&]
{
auto tmp25 = in_ptr2[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))), static_cast<int64_t>(32L)))) + (496L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L)))];
return tmp25;
}
;
auto tmp26 = tmp23 ? tmp24() : static_cast<decltype(tmp24())>(0.0);
return tmp26;
}
;
auto tmp27 = tmp18 ? tmp19() : static_cast<decltype(tmp19())>(0.0);
auto tmp28 = decltype(tmp15)(tmp15 + tmp27);
auto tmp29 = decltype(tmp2)(tmp2 + tmp28);
auto tmp31 = decltype(tmp29)(tmp29 - tmp30);
auto tmp32 = std::exp(tmp31);
out_ptr1[static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0))] = tmp32;
}
}
}
}
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (256L*x0)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0);
}
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (256L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<int64_t>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(out_ptr3 + static_cast<int64_t>(x1 + (256L*x0)));
}
}
}
}
''')
cpp_fused__native_batch_norm_legit_no_training_silu_27 = async_compile.cpp_pybinding(['float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
const float* in_ptr4)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>((64L*x0) + (16384L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(64L)))) + (static_cast<int64_t>(x1) % static_cast<int64_t>(64L))), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1), 16);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<int64_t>(x1), 16);
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<int64_t>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = tmp2 * tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = tmp6 + tmp7;
auto tmp9 = decltype(tmp8)(1)/(decltype(tmp8)(1) + tmp8.neg().exp());
auto tmp10 = tmp8 * tmp9;
tmp10.store(in_out_ptr0 + static_cast<int64_t>(x1 + (256L*x0)));
}
}
}
}
''')
cpp_fused_silu_28 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(262144L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_clone_29 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(16L); x2+=static_cast<int64_t>(1L))
{
for(int64_t x3=static_cast<int64_t>(0L); x3<static_cast<int64_t>(128L); x3+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x3 + (128L*x0) + (1536L*x1) + (24576L*x2)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x3 + (128L*x2) + (2048L*x1) + (32768L*x0)));
}
}
}
}
}
}
''')
cpp_fused_clone_30 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(128L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (128L*x0) + (1536L*x1)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x2 + (128L*x1) + (32768L*x0)));
}
}
}
}
}
''')
cpp_fused__softmax_add_mul_31 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(256L); x2+=static_cast<int64_t>(1L))
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0))];
auto tmp1 = static_cast<float>(0.08838834764831845);
auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
auto tmp3 = 15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L)));
auto tmp4 = c10::convert<int64_t>(tmp3);
auto tmp5 = static_cast<int64_t>(512);
auto tmp6 = tmp4 < tmp5;
auto tmp7 = [&]
{
auto tmp8 = static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L);
auto tmp9 = c10::convert<int64_t>(tmp8);
auto tmp10 = static_cast<int64_t>(31);
auto tmp11 = tmp9 < tmp10;
auto tmp12 = [&]
{
auto tmp13 = in_ptr1[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))), static_cast<int64_t>(32L)))) + (496L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L)))];
return tmp13;
}
;
auto tmp14 = tmp11 ? tmp12() : static_cast<decltype(tmp12())>(0.0);
return tmp14;
}
;
auto tmp15 = tmp6 ? tmp7() : static_cast<decltype(tmp7())>(0.0);
auto tmp16 = 15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L));
auto tmp17 = c10::convert<int64_t>(tmp16);
auto tmp18 = tmp17 < tmp5;
auto tmp19 = [&]
{
auto tmp20 = static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L);
auto tmp21 = c10::convert<int64_t>(tmp20);
auto tmp22 = static_cast<int64_t>(31);
auto tmp23 = tmp21 < tmp22;
auto tmp24 = [&]
{
auto tmp25 = in_ptr2[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))), static_cast<int64_t>(32L)))) + (496L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L)))];
return tmp25;
}
;
auto tmp26 = tmp23 ? tmp24() : static_cast<decltype(tmp24())>(0.0);
return tmp26;
}
;
auto tmp27 = tmp18 ? tmp19() : static_cast<decltype(tmp19())>(0.0);
auto tmp28 = decltype(tmp15)(tmp15 + tmp27);
auto tmp29 = decltype(tmp2)(tmp2 + tmp28);
tmp_acc0 = max_propagate_nan(tmp_acc0, tmp29);
}
out_ptr0[static_cast<int64_t>(x1 + (256L*x0))] = tmp_acc0;
}
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(256L); x2+=static_cast<int64_t>(1L))
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0))];
auto tmp30 = out_ptr0[static_cast<int64_t>(x1 + (256L*x0))];
auto tmp1 = static_cast<float>(0.08838834764831845);
auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
auto tmp3 = 15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L)));
auto tmp4 = c10::convert<int64_t>(tmp3);
auto tmp5 = static_cast<int64_t>(512);
auto tmp6 = tmp4 < tmp5;
auto tmp7 = [&]
{
auto tmp8 = static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L);
auto tmp9 = c10::convert<int64_t>(tmp8);
auto tmp10 = static_cast<int64_t>(31);
auto tmp11 = tmp9 < tmp10;
auto tmp12 = [&]
{
auto tmp13 = in_ptr1[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))), static_cast<int64_t>(32L)))) + (496L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L)))];
return tmp13;
}
;
auto tmp14 = tmp11 ? tmp12() : static_cast<decltype(tmp12())>(0.0);
return tmp14;
}
;
auto tmp15 = tmp6 ? tmp7() : static_cast<decltype(tmp7())>(0.0);
auto tmp16 = 15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L));
auto tmp17 = c10::convert<int64_t>(tmp16);
auto tmp18 = tmp17 < tmp5;
auto tmp19 = [&]
{
auto tmp20 = static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L);
auto tmp21 = c10::convert<int64_t>(tmp20);
auto tmp22 = static_cast<int64_t>(31);
auto tmp23 = tmp21 < tmp22;
auto tmp24 = [&]
{
auto tmp25 = in_ptr2[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))), static_cast<int64_t>(32L)))) + (496L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L)))];
return tmp25;
}
;
auto tmp26 = tmp23 ? tmp24() : static_cast<decltype(tmp24())>(0.0);
return tmp26;
}
;
auto tmp27 = tmp18 ? tmp19() : static_cast<decltype(tmp19())>(0.0);
auto tmp28 = decltype(tmp15)(tmp15 + tmp27);
auto tmp29 = decltype(tmp2)(tmp2 + tmp28);
auto tmp31 = decltype(tmp29)(tmp29 - tmp30);
auto tmp32 = std::exp(tmp31);
out_ptr1[static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0))] = tmp32;
}
}
}
}
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (256L*x0)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0);
}
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (256L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<int64_t>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(out_ptr3 + static_cast<int64_t>(x1 + (256L*x0)));
}
}
}
}
''')
# Fused CPU kernel (Inductor-generated): 2x2 average pool -> batch-norm
# (inference affine form) -> SiLU, vectorized 16 floats per iteration.
# The avg-pool sums four strided loads and scales by 0.25; the BN step
# computes ((x - in_ptr1) * in_ptr2) * in_ptr3 + in_ptr4 — presumably
# running_mean / inv_std / weight / bias per the op name, TODO confirm
# against the captured graph — and SiLU is x * sigmoid(x) written as
# x * (1 / (1 + exp(-x))). The C++ body is compiled verbatim by
# cpp_pybinding and must not be edited here.
cpp_fused__native_batch_norm_legit_no_training_avg_pool2d_silu_32 = async_compile.cpp_pybinding(['float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
const float* in_ptr4)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(8L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(8L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(512L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>((256L*x1) + (4096L*x0) + (32768L*(c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(128L)))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(128L))), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(128L + (256L*x1) + (4096L*x0) + (32768L*(c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(128L)))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(128L))), 16);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(2048L + (256L*x1) + (4096L*x0) + (32768L*(c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(128L)))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(128L))), 16);
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(2176L + (256L*x1) + (4096L*x0) + (32768L*(c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(128L)))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(128L))), 16);
auto tmp10 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x2), 16);
auto tmp12 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<int64_t>(x2), 16);
auto tmp14 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<int64_t>(x2), 16);
auto tmp16 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<int64_t>(x2), 16);
auto tmp2 = tmp1 + tmp0;
auto tmp4 = tmp3 + tmp2;
auto tmp6 = tmp5 + tmp4;
auto tmp7 = static_cast<float>(0.25);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp6 * tmp8;
auto tmp11 = tmp9 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp15 = tmp13 * tmp14;
auto tmp17 = tmp15 + tmp16;
auto tmp18 = decltype(tmp17)(1)/(decltype(tmp17)(1) + tmp17.neg().exp());
auto tmp19 = tmp17 * tmp18;
tmp19.store(in_out_ptr0 + static_cast<int64_t>(x2 + (512L*x1) + (4096L*x0)));
}
}
}
}
}
''')
# Fused CPU kernel: elementwise SiLU (x * sigmoid(x)) over a flat buffer
# of 98304 floats, 16-wide vectorized, reading in_ptr0 into out_ptr0.
# Kernel text is compiled verbatim; do not edit the C++ string.
cpp_fused_silu_33 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(98304L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
# Fused CPU kernel: strided copy ("clone") that permutes a 4x8x8x128
# float tile — source strides (128, 1536, 12288, 1) for (x0, x1, x2, x3)
# are rewritten to a contiguous destination with strides
# (8192, 1024, 128, 1), i.e. a layout change ahead of a matmul.
# 16-wide vectorized inner loop over the 128-element fastest dim.
cpp_fused_clone_34 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(8L); x1+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(8L); x2+=static_cast<int64_t>(1L))
{
for(int64_t x3=static_cast<int64_t>(0L); x3<static_cast<int64_t>(128L); x3+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x3 + (128L*x0) + (1536L*x1) + (12288L*x2)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x3 + (128L*x2) + (1024L*x1) + (8192L*x0)));
}
}
}
}
}
}
''')
# Fused CPU kernel: strided copy ("clone") gathering a 4x64x128 float
# tile — source strides (128, 1536, 1) for (x0, x1, x2) rewritten to a
# contiguous destination with strides (8192, 128, 1). Effectively slices
# one 128-wide channel group out of a 1536-wide packed buffer per x0.
cpp_fused_clone_35 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(128L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (128L*x0) + (1536L*x1)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x2 + (128L*x1) + (8192L*x0)));
}
}
}
}
}
''')
# Fused CPU kernel: attention-score softmax with two additive gathered
# bias terms (relative-position style, judging by the index arithmetic —
# TODO confirm against the captured graph). Per (x0, x1) row of the
# 4x64x64 score tensor in_ptr0:
#   score = raw * 0.08838834764831845 (presumably 1/sqrt(head_dim=128))
#           + gather(in_ptr1)  -- guarded by two bounds checks; positions
#           + gather(in_ptr2)     outside [0,128)/[0,15) contribute 0.0
# First pass computes the row max (NaN-propagating) into out_ptr0 for a
# numerically stable softmax; second pass stores exp(score - max) into
# out_ptr1; final section reduces row sums into out_ptr2 (16-wide
# vectorized + scalar tree reduce) and normalizes into out_ptr3.
cpp_fused__softmax_add_mul_36 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(64L); x2+=static_cast<int64_t>(1L))
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x2 + (64L*x1) + (4096L*x0))];
auto tmp1 = static_cast<float>(0.08838834764831845);
auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
auto tmp3 = 7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(8L)));
auto tmp4 = c10::convert<int64_t>(tmp3);
auto tmp5 = static_cast<int64_t>(128);
auto tmp6 = tmp4 < tmp5;
auto tmp7 = [&]
{
auto tmp8 = static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(8L))))) % static_cast<int64_t>(16L);
auto tmp9 = c10::convert<int64_t>(tmp8);
auto tmp10 = static_cast<int64_t>(15);
auto tmp11 = tmp9 < tmp10;
auto tmp12 = [&]
{
auto tmp13 = in_ptr1[static_cast<int64_t>((15L*(c10::div_floor_integer(static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(8L))))), static_cast<int64_t>(16L)))) + (120L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (960L*x0) + (static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(8L))))) % static_cast<int64_t>(16L)))];
return tmp13;
}
;
auto tmp14 = tmp11 ? tmp12() : static_cast<decltype(tmp12())>(0.0);
return tmp14;
}
;
auto tmp15 = tmp6 ? tmp7() : static_cast<decltype(tmp7())>(0.0);
auto tmp16 = 7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(8L));
auto tmp17 = c10::convert<int64_t>(tmp16);
auto tmp18 = tmp17 < tmp5;
auto tmp19 = [&]
{
auto tmp20 = static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(8L)))) % static_cast<int64_t>(16L);
auto tmp21 = c10::convert<int64_t>(tmp20);
auto tmp22 = static_cast<int64_t>(15);
auto tmp23 = tmp21 < tmp22;
auto tmp24 = [&]
{
auto tmp25 = in_ptr2[static_cast<int64_t>((15L*(c10::div_floor_integer(static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(8L)))), static_cast<int64_t>(16L)))) + (120L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (960L*x0) + (static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(8L)))) % static_cast<int64_t>(16L)))];
return tmp25;
}
;
auto tmp26 = tmp23 ? tmp24() : static_cast<decltype(tmp24())>(0.0);
return tmp26;
}
;
auto tmp27 = tmp18 ? tmp19() : static_cast<decltype(tmp19())>(0.0);
auto tmp28 = decltype(tmp15)(tmp15 + tmp27);
auto tmp29 = decltype(tmp2)(tmp2 + tmp28);
tmp_acc0 = max_propagate_nan(tmp_acc0, tmp29);
}
out_ptr0[static_cast<int64_t>(x1 + (64L*x0))] = tmp_acc0;
}
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(64L); x2+=static_cast<int64_t>(1L))
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x2 + (64L*x1) + (4096L*x0))];
auto tmp30 = out_ptr0[static_cast<int64_t>(x1 + (64L*x0))];
auto tmp1 = static_cast<float>(0.08838834764831845);
auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
auto tmp3 = 7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(8L)));
auto tmp4 = c10::convert<int64_t>(tmp3);
auto tmp5 = static_cast<int64_t>(128);
auto tmp6 = tmp4 < tmp5;
auto tmp7 = [&]
{
auto tmp8 = static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(8L))))) % static_cast<int64_t>(16L);
auto tmp9 = c10::convert<int64_t>(tmp8);
auto tmp10 = static_cast<int64_t>(15);
auto tmp11 = tmp9 < tmp10;
auto tmp12 = [&]
{
auto tmp13 = in_ptr1[static_cast<int64_t>((15L*(c10::div_floor_integer(static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(8L))))), static_cast<int64_t>(16L)))) + (120L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (960L*x0) + (static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(8L))))) % static_cast<int64_t>(16L)))];
return tmp13;
}
;
auto tmp14 = tmp11 ? tmp12() : static_cast<decltype(tmp12())>(0.0);
return tmp14;
}
;
auto tmp15 = tmp6 ? tmp7() : static_cast<decltype(tmp7())>(0.0);
auto tmp16 = 7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(8L));
auto tmp17 = c10::convert<int64_t>(tmp16);
auto tmp18 = tmp17 < tmp5;
auto tmp19 = [&]
{
auto tmp20 = static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(8L)))) % static_cast<int64_t>(16L);
auto tmp21 = c10::convert<int64_t>(tmp20);
auto tmp22 = static_cast<int64_t>(15);
auto tmp23 = tmp21 < tmp22;
auto tmp24 = [&]
{
auto tmp25 = in_ptr2[static_cast<int64_t>((15L*(c10::div_floor_integer(static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(8L)))), static_cast<int64_t>(16L)))) + (120L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (960L*x0) + (static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(8L)))) % static_cast<int64_t>(16L)))];
return tmp25;
}
;
auto tmp26 = tmp23 ? tmp24() : static_cast<decltype(tmp24())>(0.0);
return tmp26;
}
;
auto tmp27 = tmp18 ? tmp19() : static_cast<decltype(tmp19())>(0.0);
auto tmp28 = decltype(tmp15)(tmp15 + tmp27);
auto tmp29 = decltype(tmp2)(tmp2 + tmp28);
auto tmp31 = decltype(tmp29)(tmp29 - tmp30);
auto tmp32 = std::exp(tmp31);
out_ptr1[static_cast<int64_t>(x2 + (64L*x1) + (4096L*x0))] = tmp32;
}
}
}
}
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (64L*x0)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0);
}
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (64L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<int64_t>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(out_ptr3 + static_cast<int64_t>(x1 + (64L*x0)));
}
}
}
}
''')
# Fused CPU kernel: inference batch-norm affine transform followed by
# SiLU, with a source layout remap (128-wide groups gathered from a
# strided input into a contiguous 64x512 output). Computes
# ((x - in_ptr1) * in_ptr2) * in_ptr3 + in_ptr4 — presumably
# mean / inv_std / weight / bias per the op name, TODO confirm —
# then x * sigmoid(x), 16-wide vectorized.
cpp_fused__native_batch_norm_legit_no_training_silu_37 = async_compile.cpp_pybinding(['float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
const float* in_ptr4)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(64L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(512L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>((128L*x0) + (8192L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(128L)))) + (static_cast<int64_t>(x1) % static_cast<int64_t>(128L))), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1), 16);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<int64_t>(x1), 16);
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<int64_t>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = tmp2 * tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = tmp6 + tmp7;
auto tmp9 = decltype(tmp8)(1)/(decltype(tmp8)(1) + tmp8.neg().exp());
auto tmp10 = tmp8 * tmp9;
tmp10.store(in_out_ptr0 + static_cast<int64_t>(x1 + (512L*x0)));
}
}
}
}
''')
# Fused CPU kernel: elementwise SiLU (x * sigmoid(x)) over a flat buffer
# of 98304 floats, 16-wide vectorized. Identical body to cpp_fused_silu_33
# but Inductor emits one kernel per graph node.
cpp_fused_silu_38 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(98304L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
# Fused CPU kernel: per-channel mean over 64 positions for 1280 channels
# (global average pool over an 8x8 spatial grid, presumably — TODO
# confirm against the captured graph). Pass 1 accumulates the 64 rows of
# the (64, 1280) input 16 channels at a time; pass 2 divides the
# accumulated sums in place by 64.0.
cpp_fused_mean_39 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1280L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (1280L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1280L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(64.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
# Fused CPU kernel: in-place bias add for the final linear layer's
# 1000-element output. Main loop handles the first 992 elements 16 at a
# time; a second 8-wide loop covers the remaining 8-element tail (1000
# is not a multiple of 16).
cpp_fused_addmm_40 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(992L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp2 = tmp0 + tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
for(int64_t x0=static_cast<int64_t>(992L); x0<static_cast<int64_t>(1000L); x0+=static_cast<int64_t>(8L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x0), 8);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 8);
auto tmp2 = tmp0 + tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x0), 8);
}
}
}
''')
# Block until every cpp_pybinding kernel above has finished compiling,
# then drop the compiler object — kernels are now bound in globals().
async_compile.wait(globals())
del async_compile
def call(args):
arg224_1, = args
args.clear()
assert_size_stride(arg224_1, (1, 3, 256, 256), (196608, 65536, 256, 1))
buf0 = empty_strided_cpu((1, 3, 256, 256), (196608, 1, 768, 3), torch.float32)
cpp_fused_silu_0(arg224_1, buf0)
del arg224_1
buf1 = torch.ops.mkldnn._convolution_pointwise.default(buf0, _frozen_param598, _frozen_param564, [1, 1], [2, 2], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf1, (1, 24, 128, 128), (393216, 1, 3072, 24))
del buf0
buf2 = torch.ops.mkldnn._convolution_pointwise.default(buf1, _frozen_param599, _frozen_param565, [1, 1], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf2, (1, 32, 128, 128), (524288, 1, 4096, 32))
del buf1
buf3 = torch.ops.mkldnn._convolution_pointwise.default(buf2, _frozen_param600, _frozen_param566, [1, 1], [2, 2], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf3, (1, 64, 64, 64), (262144, 1, 4096, 64))
del buf2
buf4 = torch.ops.mkldnn._convolution_pointwise.default(buf3, _frozen_param601, _frozen_param567, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf4, (1, 64, 64, 64), (262144, 1, 4096, 64))
buf5 = torch.ops.mkldnn._convolution_pointwise.default(buf4, _frozen_param602, _frozen_param568, [1, 1], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf5, (1, 64, 64, 64), (262144, 1, 4096, 64))
buf6 = empty_strided_cpu((1, 64, 1, 1), (64, 1, 64, 64), torch.float32)
buf7 = reinterpret_tensor(buf6, (1, 64, 1, 1), (64, 1, 1, 1), 0); del buf6 # reuse
buf8 = empty_strided_cpu((8, ), (1, ), torch.float32)
cpp_fused_mean_1(buf7, buf5, buf8)
buf9 = torch.ops.mkldnn._convolution_pointwise.default(buf7, _frozen_param603, buf8, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
assert_size_stride(buf9, (1, 8, 1, 1), (8, 1, 8, 8))
del buf7
del buf8
buf10 = torch.ops.mkldnn._convolution_pointwise.default(buf9, _frozen_param604, _frozen_param18, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
assert_size_stride(buf10, (1, 64, 1, 1), (64, 1, 64, 64))
buf11 = buf5; del buf5 # reuse
cpp_fused_mul_2(buf11, buf10)
buf12 = torch.ops.mkldnn._convolution_pointwise.default(buf11, _frozen_param605, _frozen_param569, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
assert_size_stride(buf12, (1, 256, 64, 64), (1048576, 1, 16384, 256))
del buf11
buf13 = torch.ops.mkldnn._convolution_pointwise_.binary(buf12, buf3, _frozen_param606, _frozen_param570, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
del buf3
del buf4
buf16 = empty_strided_cpu((1, 256, 64, 64), (1048576, 1, 16384, 256), torch.float32)
cpp_fused_silu_3(buf12, buf16)
buf17 = torch.ops.mkldnn._convolution_pointwise.default(buf16, _frozen_param607, _frozen_param571, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf17, (1, 64, 64, 64), (262144, 1, 4096, 64))
buf18 = torch.ops.mkldnn._convolution_pointwise.default(buf17, _frozen_param608, _frozen_param572, [1, 1], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf18, (1, 64, 64, 64), (262144, 1, 4096, 64))
buf19 = buf10; del buf10 # reuse
buf20 = reinterpret_tensor(buf19, (1, 64, 1, 1), (64, 1, 1, 1), 0); del buf19 # reuse
buf21 = reinterpret_tensor(buf9, (8, ), (1, ), 0); del buf9 # reuse
cpp_fused_mean_4(buf20, buf18, buf21)
buf22 = torch.ops.mkldnn._convolution_pointwise.default(buf20, _frozen_param609, buf21, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
assert_size_stride(buf22, (1, 8, 1, 1), (8, 1, 8, 8))
del buf20
del buf21
buf23 = torch.ops.mkldnn._convolution_pointwise.default(buf22, _frozen_param610, _frozen_param34, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
assert_size_stride(buf23, (1, 64, 1, 1), (64, 1, 64, 64))
buf24 = buf18; del buf18 # reuse
cpp_fused_mul_5(buf24, buf23)
del buf23
buf25 = torch.ops.mkldnn._convolution_pointwise_.binary(buf16, buf24, _frozen_param611, _frozen_param573, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
buf28 = buf12; del buf12 # reuse
cpp_fused_silu_6(buf16, buf28)
del buf16
buf29 = torch.ops.mkldnn._convolution_pointwise.default(buf28, _frozen_param612, _frozen_param574, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf29, (1, 128, 64, 64), (524288, 1, 8192, 128))
buf30 = torch.ops.mkldnn._convolution_pointwise.default(buf29, _frozen_param613, _frozen_param575, [1, 1], [2, 2], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf30, (1, 128, 32, 32), (131072, 1, 4096, 128))
buf31 = empty_strided_cpu((1, 128, 1, 1), (128, 1, 128, 128), torch.float32)
buf32 = reinterpret_tensor(buf31, (1, 128, 1, 1), (128, 1, 1, 1), 0); del buf31 # reuse
buf33 = reinterpret_tensor(buf22, (8, ), (1, ), 0); del buf22 # reuse
cpp_fused_mean_7(buf32, buf30, buf33)
buf34 = torch.ops.mkldnn._convolution_pointwise.default(buf32, _frozen_param614, buf33, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
assert_size_stride(buf34, (1, 8, 1, 1), (8, 1, 8, 8))
del buf32
del buf33
buf35 = torch.ops.mkldnn._convolution_pointwise.default(buf34, _frozen_param615, _frozen_param47, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
assert_size_stride(buf35, (1, 128, 1, 1), (128, 1, 128, 128))
buf36 = buf30; del buf30 # reuse
cpp_fused_mul_8(buf36, buf35)
buf37 = torch.ops.mkldnn._convolution_pointwise.default(buf36, _frozen_param616, _frozen_param576, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
assert_size_stride(buf37, (1, 512, 32, 32), (524288, 1, 16384, 512))
del buf36
buf38 = torch.ops.mkldnn._convolution_pointwise_.binary(buf37, buf28, _frozen_param617, _frozen_param577, [0, 0], [2, 2], [1, 1], 1, 'add', 1.0, None, [None], None)
del buf28
buf41 = reinterpret_tensor(buf29, (1, 512, 32, 32), (524288, 1, 16384, 512), 0); del buf29 # reuse
cpp_fused_silu_9(buf37, buf41)
buf42 = torch.ops.mkldnn._convolution_pointwise.default(buf41, _frozen_param618, _frozen_param578, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf42, (1, 128, 32, 32), (131072, 1, 4096, 128))
buf43 = torch.ops.mkldnn._convolution_pointwise.default(buf42, _frozen_param619, _frozen_param579, [1, 1], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf43, (1, 128, 32, 32), (131072, 1, 4096, 128))
buf44 = buf35; del buf35 # reuse
buf45 = reinterpret_tensor(buf44, (1, 128, 1, 1), (128, 1, 1, 1), 0); del buf44 # reuse
buf46 = reinterpret_tensor(buf34, (8, ), (1, ), 0); del buf34 # reuse
cpp_fused_mean_10(buf45, buf43, buf46)
buf47 = torch.ops.mkldnn._convolution_pointwise.default(buf45, _frozen_param620, buf46, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
assert_size_stride(buf47, (1, 8, 1, 1), (8, 1, 8, 8))
del buf45
del buf46
buf48 = torch.ops.mkldnn._convolution_pointwise.default(buf47, _frozen_param621, _frozen_param63, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
assert_size_stride(buf48, (1, 128, 1, 1), (128, 1, 128, 128))
del buf47
buf49 = buf43; del buf43 # reuse
cpp_fused_mul_11(buf49, buf48)
del buf48
buf50 = torch.ops.mkldnn._convolution_pointwise_.binary(buf41, buf49, _frozen_param622, _frozen_param580, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
buf53 = buf37; del buf37 # reuse
cpp_fused_silu_12(buf41, buf53)
buf54 = torch.ops.mkldnn._convolution_pointwise.default(buf53, _frozen_param623, _frozen_param581, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf54, (1, 128, 32, 32), (131072, 1, 4096, 128))
buf55 = torch.ops.mkldnn._convolution_pointwise.default(buf54, _frozen_param624, None, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
assert_size_stride(buf55, (1, 384, 32, 32), (393216, 1, 12288, 384))
buf56 = empty_strided_cpu((4, 1024, 1024), (1048576, 1024, 1), torch.float32)
# Topologically Sorted Source Nodes: [matmul], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf55, (4, 1024, 32), (32, 384, 1), 0), reinterpret_tensor(buf55, (4, 32, 1024), (32, 1, 384), 128), out=buf56)
buf57 = reinterpret_tensor(buf49, (4, 32, 32, 32), (32768, 1024, 32, 1), 0); del buf49 # reuse
cpp_fused_clone_13(buf55, buf57)
buf58 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf57, (4096, 32), (32, 1), 0), _frozen_param628, _frozen_param627, None, 4096)
buf59 = buf57; del buf57 # reuse
cpp_fused_clone_14(buf55, buf59)
buf60 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf59, (4096, 32), (32, 1), 0), _frozen_param626, _frozen_param625, None, 4096)
buf61 = empty_strided_cpu((4, 1024, 1), (1024, 1, 4096), torch.float32)
buf62 = empty_strided_cpu((4, 1024, 1024), (1048576, 1024, 1), torch.float32)
buf63 = empty_strided_cpu((4, 1024, 1), (1024, 1, 4096), torch.float32)
buf64 = empty_strided_cpu((4, 1024, 1024), (1048576, 1024, 1), torch.float32)
cpp_fused__softmax_add_mul_15(buf56, buf58, buf60, buf61, buf62, buf63, buf64)
del buf56
del buf58
del buf60
del buf61
del buf62
del buf63
buf65 = reinterpret_tensor(buf59, (4, 1024, 32), (32768, 32, 1), 0); del buf59 # reuse
# Topologically Sorted Source Nodes: [attn_1, matmul_3], Original ATen: [aten._softmax, aten.bmm]
extern_kernels.bmm(buf64, reinterpret_tensor(buf55, (4, 1024, 32), (32, 384, 1), 256), out=buf65)
del buf55
del buf64
buf66 = buf42; del buf42 # reuse
buf67 = buf66; del buf66 # reuse
cpp_fused__native_batch_norm_legit_no_training_silu_16(buf67, buf65, _frozen_param305, _frozen_param306, _frozen_param307, _frozen_param308)
del buf65
buf68 = torch.ops.mkldnn._convolution_pointwise_.binary(buf53, buf67, _frozen_param629, _frozen_param582, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
del buf54
buf71 = buf41; del buf41 # reuse
cpp_fused_silu_17(buf53, buf71)
del buf53
buf72 = torch.ops.mkldnn._convolution_pointwise.default(buf71, _frozen_param630, _frozen_param583, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf72, (1, 256, 32, 32), (262144, 1, 8192, 256))
buf73 = torch.ops.mkldnn._convolution_pointwise.default(buf72, _frozen_param631, _frozen_param584, [1, 1], [2, 2], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf73, (1, 256, 16, 16), (65536, 1, 4096, 256))
buf74 = empty_strided_cpu((1, 256, 1, 1), (256, 1, 256, 256), torch.float32)
buf75 = reinterpret_tensor(buf74, (1, 256, 1, 1), (256, 1, 1, 1), 0); del buf74 # reuse
cpp_fused_mean_18(buf75, buf73)
buf76 = torch.ops.mkldnn._convolution_pointwise.default(buf75, _frozen_param632, _frozen_param85, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
assert_size_stride(buf76, (1, 16, 1, 1), (16, 1, 16, 16))
del buf75
buf77 = torch.ops.mkldnn._convolution_pointwise.default(buf76, _frozen_param633, _frozen_param87, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
assert_size_stride(buf77, (1, 256, 1, 1), (256, 1, 256, 256))
del buf76
buf78 = buf73; del buf73 # reuse
cpp_fused_mul_19(buf78, buf77)
buf79 = torch.ops.mkldnn._convolution_pointwise.default(buf78, _frozen_param634, _frozen_param585, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
assert_size_stride(buf79, (1, 1024, 16, 16), (262144, 1, 16384, 1024))
del buf78
buf80 = torch.ops.mkldnn._convolution_pointwise_.binary(buf79, buf71, _frozen_param635, _frozen_param586, [0, 0], [2, 2], [1, 1], 1, 'add', 1.0, None, [None], None)
del buf71
buf83 = reinterpret_tensor(buf72, (1, 1024, 16, 16), (262144, 1, 16384, 1024), 0); del buf72 # reuse
cpp_fused_silu_20(buf79, buf83)
buf84 = torch.ops.mkldnn._convolution_pointwise.default(buf83, _frozen_param636, _frozen_param587, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf84, (1, 256, 16, 16), (65536, 1, 4096, 256))
buf85 = torch.ops.mkldnn._convolution_pointwise.default(buf84, _frozen_param637, _frozen_param588, [1, 1], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf85, (1, 256, 16, 16), (65536, 1, 4096, 256))
buf86 = buf77; del buf77 # reuse
buf87 = reinterpret_tensor(buf86, (1, 256, 1, 1), (256, 1, 1, 1), 0); del buf86 # reuse
cpp_fused_mean_21(buf87, buf85)
buf88 = torch.ops.mkldnn._convolution_pointwise.default(buf87, _frozen_param638, _frozen_param101, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
assert_size_stride(buf88, (1, 16, 1, 1), (16, 1, 16, 16))
buf89 = torch.ops.mkldnn._convolution_pointwise.default(buf88, _frozen_param639, _frozen_param103, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
assert_size_stride(buf89, (1, 256, 1, 1), (256, 1, 256, 256))
del buf88
buf90 = buf85; del buf85 # reuse
cpp_fused_mul_22(buf90, buf89)
buf91 = torch.ops.mkldnn._convolution_pointwise_.binary(buf83, buf90, _frozen_param640, _frozen_param589, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
buf94 = buf79; del buf79 # reuse
cpp_fused_silu_23(buf83, buf94)
buf95 = torch.ops.mkldnn._convolution_pointwise.default(buf94, _frozen_param641, _frozen_param590, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf95, (1, 256, 16, 16), (65536, 1, 4096, 256))
buf96 = torch.ops.mkldnn._convolution_pointwise.default(buf95, _frozen_param642, None, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
assert_size_stride(buf96, (1, 768, 16, 16), (196608, 1, 12288, 768))
buf97 = reinterpret_tensor(buf83, (4, 256, 256), (65536, 256, 1), 0); del buf83 # reuse
# Topologically Sorted Source Nodes: [matmul_4], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf96, (4, 256, 64), (64, 768, 1), 0), reinterpret_tensor(buf96, (4, 64, 256), (64, 1, 768), 256), out=buf97)
buf98 = reinterpret_tensor(buf90, (4, 16, 16, 64), (16384, 1024, 64, 1), 0); del buf90 # reuse
cpp_fused_clone_24(buf96, buf98)
buf99 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf98, (1024, 64), (64, 1), 0), _frozen_param646, _frozen_param645, None, 1024)
buf100 = buf98; del buf98 # reuse
cpp_fused_clone_25(buf96, buf100)
buf101 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf100, (1024, 64), (64, 1), 0), _frozen_param644, _frozen_param643, None, 1024)
buf102 = empty_strided_cpu((4, 256, 1), (256, 1, 1024), torch.float32)
buf103 = reinterpret_tensor(buf24, (4, 256, 256), (65536, 256, 1), 0); del buf24 # reuse
buf104 = empty_strided_cpu((4, 256, 1), (256, 1, 1024), torch.float32)
buf105 = reinterpret_tensor(buf17, (4, 256, 256), (65536, 256, 1), 0); del buf17 # reuse
cpp_fused__softmax_add_mul_26(buf97, buf99, buf101, buf102, buf103, buf104, buf105)
del buf101
del buf99
buf106 = reinterpret_tensor(buf100, (4, 256, 64), (16384, 64, 1), 0); del buf100 # reuse
# Topologically Sorted Source Nodes: [attn_3, matmul_7], Original ATen: [aten._softmax, aten.bmm]
extern_kernels.bmm(buf105, reinterpret_tensor(buf96, (4, 256, 64), (64, 768, 1), 512), out=buf106)
del buf96
buf107 = buf84; del buf84 # reuse
buf108 = buf107; del buf107 # reuse
cpp_fused__native_batch_norm_legit_no_training_silu_27(buf108, buf106, _frozen_param349, _frozen_param350, _frozen_param351, _frozen_param352)
del buf106
buf109 = torch.ops.mkldnn._convolution_pointwise_.binary(buf94, buf108, _frozen_param647, _frozen_param591, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
del buf108
del buf95
buf112 = reinterpret_tensor(buf105, (1, 1024, 16, 16), (262144, 1, 16384, 1024), 0); del buf105 # reuse
cpp_fused_silu_28(buf94, buf112)
buf113 = torch.ops.mkldnn._convolution_pointwise.default(buf112, _frozen_param648, _frozen_param592, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf113, (1, 512, 16, 16), (131072, 1, 8192, 512))
buf114 = torch.ops.mkldnn._convolution_pointwise.default(buf113, _frozen_param649, None, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
assert_size_stride(buf114, (1, 1536, 16, 16), (393216, 1, 24576, 1536))
buf115 = reinterpret_tensor(buf94, (4, 256, 256), (65536, 256, 1), 0); del buf94 # reuse
# Topologically Sorted Source Nodes: [matmul_8], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf114, (4, 256, 128), (128, 1536, 1), 0), reinterpret_tensor(buf114, (4, 128, 256), (128, 1, 1536), 512), out=buf115)
buf116 = reinterpret_tensor(buf67, (4, 16, 16, 128), (32768, 2048, 128, 1), 0); del buf67 # reuse
cpp_fused_clone_29(buf114, buf116)
buf117 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf116, (1024, 128), (128, 1), 0), _frozen_param653, _frozen_param652, None, 1024)
buf118 = buf116; del buf116 # reuse
cpp_fused_clone_30(buf114, buf118)
buf119 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf118, (1024, 128), (128, 1), 0), _frozen_param651, _frozen_param650, None, 1024)
buf120 = buf104; del buf104 # reuse
buf121 = buf97; del buf97 # reuse
buf122 = buf102; del buf102 # reuse
buf123 = buf103; del buf103 # reuse
cpp_fused__softmax_add_mul_31(buf115, buf117, buf119, buf120, buf121, buf122, buf123)
del buf115
del buf117
del buf119
del buf120
del buf121
del buf122
buf124 = reinterpret_tensor(buf118, (4, 256, 128), (32768, 128, 1), 0); del buf118 # reuse
# Topologically Sorted Source Nodes: [attn_5, matmul_11], Original ATen: [aten._softmax, aten.bmm]
extern_kernels.bmm(buf123, reinterpret_tensor(buf114, (4, 256, 128), (128, 1536, 1), 1024), out=buf124)
del buf114
del buf123
buf125 = empty_strided_cpu((1, 512, 8, 8), (32768, 1, 4096, 512), torch.float32)
buf126 = buf125; del buf125 # reuse
cpp_fused__native_batch_norm_legit_no_training_avg_pool2d_silu_32(buf126, buf124, _frozen_param363, _frozen_param364, _frozen_param365, _frozen_param366)
del buf124
buf127 = torch.ops.mkldnn._convolution_pointwise.default(buf126, _frozen_param654, _frozen_param593, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
assert_size_stride(buf127, (1, 1536, 8, 8), (98304, 1, 12288, 1536))
buf128 = torch.ops.mkldnn._convolution_pointwise_.binary(buf127, buf112, _frozen_param655, _frozen_param594, [0, 0], [2, 2], [1, 1], 1, 'add', 1.0, None, [None], None)
del buf112
del buf113
buf131 = empty_strided_cpu((1, 1536, 8, 8), (98304, 1, 12288, 1536), torch.float32)
cpp_fused_silu_33(buf127, buf131)
del buf127
buf132 = torch.ops.mkldnn._convolution_pointwise.default(buf131, _frozen_param656, _frozen_param595, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf132, (1, 512, 8, 8), (32768, 1, 4096, 512))
buf133 = torch.ops.mkldnn._convolution_pointwise.default(buf132, _frozen_param657, None, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
assert_size_stride(buf133, (1, 1536, 8, 8), (98304, 1, 12288, 1536))
buf134 = empty_strided_cpu((4, 64, 64), (4096, 64, 1), torch.float32)
# Topologically Sorted Source Nodes: [matmul_12], Original ATen: [aten.bmm]
extern_kernels.bmm(reinterpret_tensor(buf133, (4, 64, 128), (128, 1536, 1), 0), reinterpret_tensor(buf133, (4, 128, 64), (128, 1, 1536), 512), out=buf134)
buf135 = reinterpret_tensor(buf126, (4, 8, 8, 128), (8192, 1024, 128, 1), 0); del buf126 # reuse
cpp_fused_clone_34(buf133, buf135)
buf136 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf135, (256, 128), (128, 1), 0), _frozen_param661, _frozen_param660, None, 256)
buf137 = buf135; del buf135 # reuse
cpp_fused_clone_35(buf133, buf137)
buf138 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf137, (256, 128), (128, 1), 0), _frozen_param659, _frozen_param658, None, 256)
buf139 = reinterpret_tensor(buf89, (4, 64, 1), (64, 1, 256), 0); del buf89 # reuse
buf140 = empty_strided_cpu((4, 64, 64), (4096, 64, 1), torch.float32)
buf141 = reinterpret_tensor(buf87, (4, 64, 1), (64, 1, 256), 0); del buf87 # reuse
buf142 = empty_strided_cpu((4, 64, 64), (4096, 64, 1), torch.float32)
cpp_fused__softmax_add_mul_36(buf134, buf136, buf138, buf139, buf140, buf141, buf142)
del buf134
del buf136
del buf138
del buf139
del buf140
del buf141
buf143 = reinterpret_tensor(buf137, (4, 64, 128), (8192, 128, 1), 0); del buf137 # reuse
# Topologically Sorted Source Nodes: [attn_7, matmul_15], Original ATen: [aten._softmax, aten.bmm]
extern_kernels.bmm(buf142, reinterpret_tensor(buf133, (4, 64, 128), (128, 1536, 1), 1024), out=buf143)
del buf142
buf144 = empty_strided_cpu((1, 512, 8, 8), (32768, 1, 4096, 512), torch.float32)
buf145 = buf144; del buf144 # reuse
cpp_fused__native_batch_norm_legit_no_training_silu_37(buf145, buf143, _frozen_param381, _frozen_param382, _frozen_param383, _frozen_param384)
del buf143
buf146 = torch.ops.mkldnn._convolution_pointwise_.binary(buf131, buf145, _frozen_param662, _frozen_param596, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
del buf132
del buf145
buf149 = buf133; del buf133 # reuse
cpp_fused_silu_38(buf131, buf149)
del buf131
buf150 = torch.ops.mkldnn._convolution_pointwise.default(buf149, _frozen_param663, _frozen_param597, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
assert_size_stride(buf150, (1, 1280, 8, 8), (81920, 1, 10240, 1280))
del buf149
buf151 = empty_strided_cpu((1, 1280, 1, 1), (1280, 1, 1280, 1280), torch.float32)
buf152 = buf151; del buf151 # reuse
cpp_fused_mean_39(buf152, buf150)
del buf150
buf153 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf152, (1, 1280), (0, 1), 0), _frozen_param665, _frozen_param664, None, 1)
del buf152
buf154 = buf153; del buf153 # reuse
cpp_fused_addmm_40(buf154, _frozen_param147)
return (buf154, )
def benchmark_compiled_module(times=10, repeat=10):
    """Populate every frozen parameter with random data and time ``call``.

    The compiled graph reads its weights from module-level
    ``_frozen_param*`` globals.  This helper fills each of them with a
    ``rand_strided`` tensor of the shape/stride recorded at compile time
    (all float32 on CPU), builds a random example input, and benchmarks
    the compiled entry point via ``print_performance``.

    Args:
        times: iterations per timing measurement.
        repeat: number of timing measurements.

    Returns:
        Whatever ``torch._inductor.utils.print_performance`` returns.
    """
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance

    # (module-global name, shape, stride) for every parameter the graph
    # needs.  The order matches the original generated code exactly so the
    # RNG stream yields identical tensors.
    _param_specs = [
        ("_frozen_param18", (64,), (1,)),
        ("_frozen_param34", (64,), (1,)),
        ("_frozen_param47", (128,), (1,)),
        ("_frozen_param63", (128,), (1,)),
        ("_frozen_param85", (16,), (1,)),
        ("_frozen_param87", (256,), (1,)),
        ("_frozen_param101", (16,), (1,)),
        ("_frozen_param103", (256,), (1,)),
        ("_frozen_param147", (1000,), (1,)),
        ("_frozen_param564", (24,), (1,)),
        ("_frozen_param598", (24, 3, 3, 3), (1, 0, 0, 0)),
        ("_frozen_param565", (32,), (1,)),
        ("_frozen_param599", (32, 24, 3, 3), (1, 0, 0, 0)),
        ("_frozen_param566", (64,), (1,)),
        ("_frozen_param600", (64, 32, 3, 3), (1, 0, 0, 0)),
        ("_frozen_param567", (64,), (1,)),
        ("_frozen_param601", (64, 64, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param568", (64,), (1,)),
        ("_frozen_param602", (64, 64, 3, 3), (1, 0, 0, 0)),
        ("_frozen_param603", (8, 64, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param604", (64, 8, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param569", (256,), (1,)),
        ("_frozen_param605", (256, 64, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param570", (256,), (1,)),
        ("_frozen_param606", (256, 64, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param571", (64,), (1,)),
        ("_frozen_param607", (64, 256, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param572", (64,), (1,)),
        ("_frozen_param608", (64, 64, 3, 3), (1, 0, 0, 0)),
        ("_frozen_param609", (8, 64, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param610", (64, 8, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param573", (256,), (1,)),
        ("_frozen_param611", (256, 64, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param574", (128,), (1,)),
        ("_frozen_param612", (128, 256, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param575", (128,), (1,)),
        ("_frozen_param613", (128, 128, 3, 3), (1, 0, 0, 0)),
        ("_frozen_param614", (8, 128, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param615", (128, 8, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param576", (512,), (1,)),
        ("_frozen_param616", (512, 128, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param577", (512,), (1,)),
        ("_frozen_param617", (512, 256, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param578", (128,), (1,)),
        ("_frozen_param618", (128, 512, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param579", (128,), (1,)),
        ("_frozen_param619", (128, 128, 3, 3), (1, 0, 0, 0)),
        ("_frozen_param620", (8, 128, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param621", (128, 8, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param580", (512,), (1,)),
        ("_frozen_param622", (512, 128, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param581", (128,), (1,)),
        ("_frozen_param623", (128, 512, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param624", (384, 128, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param625", (63, 32), (32, 1)),
        ("_frozen_param626", (1982689, 1), (1, 0)),
        ("_frozen_param627", (63, 32), (32, 1)),
        ("_frozen_param628", (1982689, 1), (1, 0)),
        ("_frozen_param305", (128, 1, 1), (1, 1, 1)),
        ("_frozen_param306", (128, 1, 1), (1, 1, 1)),
        ("_frozen_param307", (128, 1, 1), (1, 1, 1)),
        ("_frozen_param308", (128, 1, 1), (1, 1, 1)),
        ("_frozen_param582", (512,), (1,)),
        ("_frozen_param629", (512, 128, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param583", (256,), (1,)),
        ("_frozen_param630", (256, 512, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param584", (256,), (1,)),
        ("_frozen_param631", (256, 256, 3, 3), (1, 0, 0, 0)),
        ("_frozen_param632", (16, 256, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param633", (256, 16, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param585", (1024,), (1,)),
        ("_frozen_param634", (1024, 256, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param586", (1024,), (1,)),
        ("_frozen_param635", (1024, 512, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param587", (256,), (1,)),
        ("_frozen_param636", (256, 1024, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param588", (256,), (1,)),
        ("_frozen_param637", (256, 256, 3, 3), (1, 0, 0, 0)),
        ("_frozen_param638", (16, 256, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param639", (256, 16, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param589", (1024,), (1,)),
        ("_frozen_param640", (1024, 256, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param590", (256,), (1,)),
        ("_frozen_param641", (256, 1024, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param642", (768, 256, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param643", (31, 64), (64, 1)),
        ("_frozen_param644", (1982689, 1), (1, 0)),
        ("_frozen_param645", (31, 64), (64, 1)),
        ("_frozen_param646", (1982689, 1), (1, 0)),
        ("_frozen_param349", (256, 1, 1), (1, 1, 1)),
        ("_frozen_param350", (256, 1, 1), (1, 1, 1)),
        ("_frozen_param351", (256, 1, 1), (1, 1, 1)),
        ("_frozen_param352", (256, 1, 1), (1, 1, 1)),
        ("_frozen_param591", (1024,), (1,)),
        ("_frozen_param647", (1024, 256, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param592", (512,), (1,)),
        ("_frozen_param648", (512, 1024, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param649", (1536, 512, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param650", (31, 128), (128, 1)),
        ("_frozen_param651", (1982689, 1), (1, 0)),
        ("_frozen_param652", (31, 128), (128, 1)),
        ("_frozen_param653", (1982689, 1), (1, 0)),
        ("_frozen_param363", (512, 1, 1), (1, 1, 1)),
        ("_frozen_param364", (512, 1, 1), (1, 1, 1)),
        ("_frozen_param365", (512, 1, 1), (1, 1, 1)),
        ("_frozen_param366", (512, 1, 1), (1, 1, 1)),
        ("_frozen_param593", (1536,), (1,)),
        ("_frozen_param654", (1536, 512, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param594", (1536,), (1,)),
        ("_frozen_param655", (1536, 1024, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param595", (512,), (1,)),
        ("_frozen_param656", (512, 1536, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param657", (1536, 512, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param658", (15, 128), (128, 1)),
        ("_frozen_param659", (1982689, 1), (1, 0)),
        ("_frozen_param660", (15, 128), (128, 1)),
        ("_frozen_param661", (1982689, 1), (1, 0)),
        ("_frozen_param381", (512, 1, 1), (1, 1, 1)),
        ("_frozen_param382", (512, 1, 1), (1, 1, 1)),
        ("_frozen_param383", (512, 1, 1), (1, 1, 1)),
        ("_frozen_param384", (512, 1, 1), (1, 1, 1)),
        ("_frozen_param596", (1536,), (1,)),
        ("_frozen_param662", (1536, 512, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param597", (1280,), (1,)),
        ("_frozen_param663", (1280, 1536, 1, 1), (1, 0, 0, 0)),
        ("_frozen_param664", (1000, 1280), (1280, 1)),
        ("_frozen_param665", (3490017, 1), (1, 0)),
    ]
    # Writing through globals() is equivalent to the generated
    # `global <name>` + `<name> = rand_strided(...)` pairs: both rebind the
    # module-level names that `call` reads.
    for _name, _shape, _stride in _param_specs:
        globals()[_name] = rand_strided(_shape, _stride, device='cpu', dtype=torch.float32)

    # Example input matching the traced model signature (a 1x3x256x256 image).
    arg224_1 = rand_strided((1, 3, 256, 256), (196608, 65536, 256, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg224_1])
    return print_performance(fn, times=times, repeat=repeat)
# Script entry point: hand this module's benchmark helper to Inductor's
# standard benchmarking harness (only runs when executed directly).
if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('sebotnet33ts_256', benchmark_compiled_module)
# NOTE(review): a second, complete copy of the generated AOT module begins
# here — the same "AOT ID" banner, imports and guard aliases repeat, so this
# file appears to be two generator outputs pasted back to back.  Re-importing
# is harmless, but rebinding module-level names (e.g. `async_compile`)
# shadows the bindings from the first copy above.
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
# Short aliases for the ATen / Inductor op namespaces and the Dynamo guard
# helpers used by the generated call() function.
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
alloc_from_pool = torch.ops.inductor._alloc_from_pool
async_compile = AsyncCompile()
# Placeholders for parameters frozen (constant-folded) at compile time.
# They are None here and are populated later by a load_args-style function
# (the benchmark harness fills them with rand_strided tensors).  The trailing
# comment on each line records device / dtype / shape / strides and the id()
# of the original tensor at compile time.
# NOTE(review): many weight entries record strides like (1, 0, 0, 0) — only
# the first dimension advances.  Presumably these stand in for packed
# (e.g. MKLDNN) weights whose real layout is opaque — confirm before relying
# on the recorded strides.
_frozen_param18 = None  # device(type='cpu') torch.float32 (64,) (1,) 7f8740180d10
_frozen_param34 = None  # device(type='cpu') torch.float32 (64,) (1,) 7f8740183510
_frozen_param47 = None  # device(type='cpu') torch.float32 (128,) (1,) 7f874228fba0
_frozen_param63 = None  # device(type='cpu') torch.float32 (128,) (1,) 7f874228ea20
_frozen_param85 = None  # device(type='cpu') torch.float32 (16,) (1,) 7f8742285300
_frozen_param87 = None  # device(type='cpu') torch.float32 (256,) (1,) 7f8742285f80
_frozen_param101 = None  # device(type='cpu') torch.float32 (16,) (1,) 7f874228b330
_frozen_param103 = None  # device(type='cpu') torch.float32 (256,) (1,) 7f8742289f30
_frozen_param147 = None  # device(type='cpu') torch.float32 (1000,) (1,) 7f8742271df0
_frozen_param564 = None  # device(type='cpu') torch.float32 (24,) (1,) 7f872f028720
_frozen_param598 = None  # device(type='cpu') torch.float32 (24, 3, 3, 3) (1, 0, 0, 0) 7f872f0b36f0
_frozen_param565 = None  # device(type='cpu') torch.float32 (32,) (1,) 7f872ef84cc0
_frozen_param599 = None  # device(type='cpu') torch.float32 (32, 24, 3, 3) (1, 0, 0, 0) 7f872ec544f0
_frozen_param566 = None  # device(type='cpu') torch.float32 (64,) (1,) 7f872f681030
_frozen_param600 = None  # device(type='cpu') torch.float32 (64, 32, 3, 3) (1, 0, 0, 0) 7f872ec54400
_frozen_param567 = None  # device(type='cpu') torch.float32 (64,) (1,) 7f872ed47920
_frozen_param601 = None  # device(type='cpu') torch.float32 (64, 64, 1, 1) (1, 0, 0, 0) 7f872ec544a0
_frozen_param568 = None  # device(type='cpu') torch.float32 (64,) (1,) 7f872f02b420
_frozen_param602 = None  # device(type='cpu') torch.float32 (64, 64, 3, 3) (1, 0, 0, 0) 7f872ec54220
_frozen_param603 = None  # device(type='cpu') torch.float32 (8, 64, 1, 1) (1, 0, 0, 0) 7f872ec54540
_frozen_param604 = None  # device(type='cpu') torch.float32 (64, 8, 1, 1) (1, 0, 0, 0) 7f872ec54590
_frozen_param569 = None  # device(type='cpu') torch.float32 (256,) (1,) 7f872f043a10
_frozen_param605 = None  # device(type='cpu') torch.float32 (256, 64, 1, 1) (1, 0, 0, 0) 7f872ec545e0
_frozen_param570 = None  # device(type='cpu') torch.float32 (256,) (1,) 7f872eec8db0
_frozen_param606 = None  # device(type='cpu') torch.float32 (256, 64, 1, 1) (1, 0, 0, 0) 7f872ec54630
_frozen_param571 = None  # device(type='cpu') torch.float32 (64,) (1,) 7f872ee743b0
_frozen_param607 = None  # device(type='cpu') torch.float32 (64, 256, 1, 1) (1, 0, 0, 0) 7f872ec54680
_frozen_param572 = None  # device(type='cpu') torch.float32 (64,) (1,) 7f872ed90ef0
_frozen_param608 = None  # device(type='cpu') torch.float32 (64, 64, 3, 3) (1, 0, 0, 0) 7f872ec546d0
_frozen_param609 = None  # device(type='cpu') torch.float32 (8, 64, 1, 1) (1, 0, 0, 0) 7f872ec54720
_frozen_param610 = None  # device(type='cpu') torch.float32 (64, 8, 1, 1) (1, 0, 0, 0) 7f872ec54770
_frozen_param573 = None  # device(type='cpu') torch.float32 (256,) (1,) 7f872edd8ae0
_frozen_param611 = None  # device(type='cpu') torch.float32 (256, 64, 1, 1) (1, 0, 0, 0) 7f872ec547c0
_frozen_param574 = None  # device(type='cpu') torch.float32 (128,) (1,) 7f872ecfa250
_frozen_param612 = None  # device(type='cpu') torch.float32 (128, 256, 1, 1) (1, 0, 0, 0) 7f872ec54810
_frozen_param575 = None  # device(type='cpu') torch.float32 (128,) (1,) 7f872eda6700
_frozen_param613 = None  # device(type='cpu') torch.float32 (128, 128, 3, 3) (1, 0, 0, 0) 7f872ec54860
_frozen_param614 = None  # device(type='cpu') torch.float32 (8, 128, 1, 1) (1, 0, 0, 0) 7f872ec548b0
_frozen_param615 = None  # device(type='cpu') torch.float32 (128, 8, 1, 1) (1, 0, 0, 0) 7f872ec54900
_frozen_param576 = None  # device(type='cpu') torch.float32 (512,) (1,) 7f872ed32520
_frozen_param616 = None  # device(type='cpu') torch.float32 (512, 128, 1, 1) (1, 0, 0, 0) 7f872ec54950
_frozen_param577 = None  # device(type='cpu') torch.float32 (512,) (1,) 7f872ebcbe20
_frozen_param617 = None  # device(type='cpu') torch.float32 (512, 256, 1, 1) (1, 0, 0, 0) 7f872ec549a0
_frozen_param578 = None  # device(type='cpu') torch.float32 (128,) (1,) 7f872ee75030
_frozen_param618 = None  # device(type='cpu') torch.float32 (128, 512, 1, 1) (1, 0, 0, 0) 7f872ec549f0
_frozen_param579 = None  # device(type='cpu') torch.float32 (128,) (1,) 7f872ebd4ef0
_frozen_param619 = None  # device(type='cpu') torch.float32 (128, 128, 3, 3) (1, 0, 0, 0) 7f872ec54a40
_frozen_param620 = None  # device(type='cpu') torch.float32 (8, 128, 1, 1) (1, 0, 0, 0) 7f872ec54a90
_frozen_param621 = None  # device(type='cpu') torch.float32 (128, 8, 1, 1) (1, 0, 0, 0) 7f872ec54ae0
_frozen_param580 = None  # device(type='cpu') torch.float32 (512,) (1,) 7f872ed325c0
_frozen_param622 = None  # device(type='cpu') torch.float32 (512, 128, 1, 1) (1, 0, 0, 0) 7f872ec54b30
_frozen_param581 = None  # device(type='cpu') torch.float32 (128,) (1,) 7f872ebf87c0
_frozen_param623 = None  # device(type='cpu') torch.float32 (128, 512, 1, 1) (1, 0, 0, 0) 7f872ec54b80
_frozen_param624 = None  # device(type='cpu') torch.float32 (384, 128, 1, 1) (1, 0, 0, 0) 7f872ec54bd0
_frozen_param625 = None  # device(type='cpu') torch.float32 (63, 32) (32, 1) 7f872ed026b0
_frozen_param626 = None  # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f872ecf8680
_frozen_param627 = None  # device(type='cpu') torch.float32 (63, 32) (32, 1) 7f872ebca610
_frozen_param628 = None  # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f872eda1210
_frozen_param305 = None  # device(type='cpu') torch.float32 (128, 1, 1) (1, 1, 1) 7f872eccc3b0
_frozen_param306 = None  # device(type='cpu') torch.float32 (128, 1, 1) (1, 1, 1) 7f872eccc310
_frozen_param307 = None  # device(type='cpu') torch.float32 (128, 1, 1) (1, 1, 1) 7f872eccc360
_frozen_param308 = None  # device(type='cpu') torch.float32 (128, 1, 1) (1, 1, 1) 7f872eccc400
_frozen_param582 = None  # device(type='cpu') torch.float32 (512,) (1,) 7f872ebf8810
_frozen_param629 = None  # device(type='cpu') torch.float32 (512, 128, 1, 1) (1, 0, 0, 0) 7f872ec54ef0
_frozen_param583 = None  # device(type='cpu') torch.float32 (256,) (1,) 7f872ebf8860
_frozen_param630 = None  # device(type='cpu') torch.float32 (256, 512, 1, 1) (1, 0, 0, 0) 7f872ec54ea0
_frozen_param584 = None  # device(type='cpu') torch.float32 (256,) (1,) 7f872ebf88b0
_frozen_param631 = None  # device(type='cpu') torch.float32 (256, 256, 3, 3) (1, 0, 0, 0) 7f872ec54e50
_frozen_param632 = None  # device(type='cpu') torch.float32 (16, 256, 1, 1) (1, 0, 0, 0) 7f872ec54e00
_frozen_param633 = None  # device(type='cpu') torch.float32 (256, 16, 1, 1) (1, 0, 0, 0) 7f872ec54d60
_frozen_param585 = None  # device(type='cpu') torch.float32 (1024,) (1,) 7f872ebf84a0
_frozen_param634 = None  # device(type='cpu') torch.float32 (1024, 256, 1, 1) (1, 0, 0, 0) 7f872ec54c70
_frozen_param586 = None  # device(type='cpu') torch.float32 (1024,) (1,) 7f872ebf8950
_frozen_param635 = None  # device(type='cpu') torch.float32 (1024, 512, 1, 1) (1, 0, 0, 0) 7f872ec54f40
_frozen_param587 = None  # device(type='cpu') torch.float32 (256,) (1,) 7f872ed329d0
_frozen_param636 = None  # device(type='cpu') torch.float32 (256, 1024, 1, 1) (1, 0, 0, 0) 7f872ec54d10
_frozen_param588 = None  # device(type='cpu') torch.float32 (256,) (1,) 7f872ebf86d0
_frozen_param637 = None  # device(type='cpu') torch.float32 (256, 256, 3, 3) (1, 0, 0, 0) 7f872ec54cc0
_frozen_param638 = None  # device(type='cpu') torch.float32 (16, 256, 1, 1) (1, 0, 0, 0) 7f872ec54c20
_frozen_param639 = None  # device(type='cpu') torch.float32 (256, 16, 1, 1) (1, 0, 0, 0) 7f872ec54f90
_frozen_param589 = None  # device(type='cpu') torch.float32 (1024,) (1,) 7f872ebf8310
_frozen_param640 = None  # device(type='cpu') torch.float32 (1024, 256, 1, 1) (1, 0, 0, 0) 7f872ec54fe0
_frozen_param590 = None  # device(type='cpu') torch.float32 (256,) (1,) 7f872ebf8a40
_frozen_param641 = None  # device(type='cpu') torch.float32 (256, 1024, 1, 1) (1, 0, 0, 0) 7f872ec55120
_frozen_param642 = None  # device(type='cpu') torch.float32 (768, 256, 1, 1) (1, 0, 0, 0) 7f872ec55080
_frozen_param643 = None  # device(type='cpu') torch.float32 (31, 64) (64, 1) 7f872ec55300
_frozen_param644 = None  # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f872ec55350
_frozen_param645 = None  # device(type='cpu') torch.float32 (31, 64) (64, 1) 7f872ec55030
_frozen_param646 = None  # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f872ec551c0
_frozen_param349 = None  # device(type='cpu') torch.float32 (256, 1, 1) (1, 1, 1) 7f872eccd440
_frozen_param350 = None  # device(type='cpu') torch.float32 (256, 1, 1) (1, 1, 1) 7f872eccd3a0
_frozen_param351 = None  # device(type='cpu') torch.float32 (256, 1, 1) (1, 1, 1) 7f872eccd490
_frozen_param352 = None  # device(type='cpu') torch.float32 (256, 1, 1) (1, 1, 1) 7f872eccd260
_frozen_param591 = None  # device(type='cpu') torch.float32 (1024,) (1,) 7f872ebf8a90
_frozen_param647 = None  # device(type='cpu') torch.float32 (1024, 256, 1, 1) (1, 0, 0, 0) 7f872ec55530
_frozen_param592 = None  # device(type='cpu') torch.float32 (512,) (1,) 7f872ebf8ae0
_frozen_param648 = None  # device(type='cpu') torch.float32 (512, 1024, 1, 1) (1, 0, 0, 0) 7f872ec554e0
_frozen_param649 = None  # device(type='cpu') torch.float32 (1536, 512, 1, 1) (1, 0, 0, 0) 7f872ec55490
_frozen_param650 = None  # device(type='cpu') torch.float32 (31, 128) (128, 1) 7f872ec555d0
_frozen_param651 = None  # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f872ec55620
_frozen_param652 = None  # device(type='cpu') torch.float32 (31, 128) (128, 1) 7f872ec552b0
_frozen_param653 = None  # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f872ec553f0
_frozen_param363 = None  # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f872eccd940
_frozen_param364 = None  # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f872eccd990
_frozen_param365 = None  # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f872eccd9e0
_frozen_param366 = None  # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f872eccd8f0
_frozen_param593 = None  # device(type='cpu') torch.float32 (1536,) (1,) 7f872ebf8770
_frozen_param654 = None  # device(type='cpu') torch.float32 (1536, 512, 1, 1) (1, 0, 0, 0) 7f872ec55800
_frozen_param594 = None  # device(type='cpu') torch.float32 (1536,) (1,) 7f872ebf8b80
_frozen_param655 = None  # device(type='cpu') torch.float32 (1536, 1024, 1, 1) (1, 0, 0, 0) 7f872ec557b0
_frozen_param595 = None  # device(type='cpu') torch.float32 (512,) (1,) 7f872ebf8bd0
_frozen_param656 = None  # device(type='cpu') torch.float32 (512, 1536, 1, 1) (1, 0, 0, 0) 7f872ec55760
_frozen_param657 = None  # device(type='cpu') torch.float32 (1536, 512, 1, 1) (1, 0, 0, 0) 7f872ec55710
_frozen_param658 = None  # device(type='cpu') torch.float32 (15, 128) (128, 1) 7f872ec558f0
_frozen_param659 = None  # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f872ec55940
_frozen_param660 = None  # device(type='cpu') torch.float32 (15, 128) (128, 1) 7f872ec55170
_frozen_param661 = None  # device(type='cpu') torch.float32 (1982689, 1) (1, 0) 7f872ec553a0
_frozen_param381 = None  # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f872ecce070
_frozen_param382 = None  # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f872eccdfd0
_frozen_param383 = None  # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f872ecce0c0
_frozen_param384 = None  # device(type='cpu') torch.float32 (512, 1, 1) (1, 1, 1) 7f872eccde90
_frozen_param596 = None  # device(type='cpu') torch.float32 (1536,) (1,) 7f872ebf8c20
_frozen_param662 = None  # device(type='cpu') torch.float32 (1536, 512, 1, 1) (1, 0, 0, 0) 7f872ec55b20
_frozen_param597 = None  # device(type='cpu') torch.float32 (1280,) (1,) 7f872ebf8c70
_frozen_param663 = None  # device(type='cpu') torch.float32 (1280, 1536, 1, 1) (1, 0, 0, 0) 7f872ec55ad0
_frozen_param664 = None  # device(type='cpu') torch.float32 (1000, 1280) (1280, 1) 7f872ec558a0
_frozen_param665 = None  # device(type='cpu') torch.float32 (3490017, 1) (1, 0) 7f872ec55bc0
cpp_fused_silu_0 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(3L); x0+=static_cast<int64_t>(3L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(65536L); x1+=static_cast<int64_t>(16L))
{
alignas(16) float tmp1[3*16];
for (long x0_inner = 0; x0_inner < 3; x0_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1 + (65536L*x0) + (65536L*x0_inner)), 16);
tmp0.store(tmp1 + static_cast<int64_t>(16L*x0_inner));
}
at::vec::transpose_mxn<float,3,16>(tmp1, 16, out_ptr0 + static_cast<int64_t>(x0 + (3L*x1)), static_cast<int64_t>(3L));
}
}
}
}
''')
cpp_fused_mean_1 = async_compile.cpp_pybinding(['float*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
float* out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(64L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(4096L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (64L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(64L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(4096.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
{
#pragma omp simd simdlen(8)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(8L); x0+=static_cast<int64_t>(1L))
{
auto tmp0 = x0;
auto tmp1 = c10::convert<int64_t>(tmp0);
auto tmp2 = static_cast<int64_t>(4);
auto tmp3 = tmp1 < tmp2;
auto tmp4 = static_cast<int64_t>(2);
auto tmp5 = tmp1 < tmp4;
auto tmp6 = static_cast<int64_t>(1);
auto tmp7 = tmp1 < tmp6;
auto tmp8 = static_cast<float>(-0.3303021788597107);
auto tmp9 = static_cast<float>(0.06354581564664841);
auto tmp10 = tmp7 ? tmp8 : tmp9;
auto tmp11 = static_cast<int64_t>(3);
auto tmp12 = tmp1 < tmp11;
auto tmp13 = static_cast<float>(1.774228811264038);
auto tmp14 = static_cast<float>(2.1113927364349365);
auto tmp15 = tmp12 ? tmp13 : tmp14;
auto tmp16 = tmp5 ? tmp10 : tmp15;
auto tmp17 = static_cast<int64_t>(6);
auto tmp18 = tmp1 < tmp17;
auto tmp19 = static_cast<int64_t>(5);
auto tmp20 = tmp1 < tmp19;
auto tmp21 = static_cast<float>(0.32513317465782166);
auto tmp22 = static_cast<float>(1.232210397720337);
auto tmp23 = tmp20 ? tmp21 : tmp22;
auto tmp24 = static_cast<int64_t>(7);
auto tmp25 = tmp1 < tmp24;
auto tmp26 = static_cast<float>(0.7079262137413025);
auto tmp27 = static_cast<float>(0.2353029102087021);
auto tmp28 = tmp25 ? tmp26 : tmp27;
auto tmp29 = tmp18 ? tmp23 : tmp28;
auto tmp30 = tmp3 ? tmp16 : tmp29;
out_ptr1[static_cast<int64_t>(x0)] = tmp30;
}
}
}
''')
cpp_fused_mul_2 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4096L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (64L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (64L*x0)));
}
}
}
}
''')
cpp_fused_silu_3 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1048576L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_mean_4 = async_compile.cpp_pybinding(['float*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
float* out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(64L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(4096L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (64L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(64L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(4096.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
{
#pragma omp simd simdlen(8)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(8L); x0+=static_cast<int64_t>(1L))
{
auto tmp0 = x0;
auto tmp1 = c10::convert<int64_t>(tmp0);
auto tmp2 = static_cast<int64_t>(4);
auto tmp3 = tmp1 < tmp2;
auto tmp4 = static_cast<int64_t>(2);
auto tmp5 = tmp1 < tmp4;
auto tmp6 = static_cast<int64_t>(1);
auto tmp7 = tmp1 < tmp6;
auto tmp8 = static_cast<float>(0.7516645193099976);
auto tmp9 = static_cast<float>(0.5124969482421875);
auto tmp10 = tmp7 ? tmp8 : tmp9;
auto tmp11 = static_cast<int64_t>(3);
auto tmp12 = tmp1 < tmp11;
auto tmp13 = static_cast<float>(1.2062517404556274);
auto tmp14 = static_cast<float>(0.9069646596908569);
auto tmp15 = tmp12 ? tmp13 : tmp14;
auto tmp16 = tmp5 ? tmp10 : tmp15;
auto tmp17 = static_cast<int64_t>(6);
auto tmp18 = tmp1 < tmp17;
auto tmp19 = static_cast<int64_t>(5);
auto tmp20 = tmp1 < tmp19;
auto tmp21 = static_cast<float>(1.4622137546539307);
auto tmp22 = static_cast<float>(-0.11386305838823318);
auto tmp23 = tmp20 ? tmp21 : tmp22;
auto tmp24 = static_cast<int64_t>(7);
auto tmp25 = tmp1 < tmp24;
auto tmp26 = static_cast<float>(-0.2968502938747406);
auto tmp27 = static_cast<float>(0.884636402130127);
auto tmp28 = tmp25 ? tmp26 : tmp27;
auto tmp29 = tmp18 ? tmp23 : tmp28;
auto tmp30 = tmp3 ? tmp16 : tmp29;
out_ptr1[static_cast<int64_t>(x0)] = tmp30;
}
}
}
''')
cpp_fused_mul_5 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4096L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (64L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (64L*x0)));
}
}
}
}
''')
cpp_fused_silu_6 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1048576L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_mean_7 = async_compile.cpp_pybinding(['float*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
float* out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(128L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (128L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(128L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(1024.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
{
#pragma omp simd simdlen(8)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(8L); x0+=static_cast<int64_t>(1L))
{
auto tmp0 = x0;
auto tmp1 = c10::convert<int64_t>(tmp0);
auto tmp2 = static_cast<int64_t>(4);
auto tmp3 = tmp1 < tmp2;
auto tmp4 = static_cast<int64_t>(2);
auto tmp5 = tmp1 < tmp4;
auto tmp6 = static_cast<int64_t>(1);
auto tmp7 = tmp1 < tmp6;
auto tmp8 = static_cast<float>(0.47172150015830994);
auto tmp9 = static_cast<float>(1.4283583164215088);
auto tmp10 = tmp7 ? tmp8 : tmp9;
auto tmp11 = static_cast<int64_t>(3);
auto tmp12 = tmp1 < tmp11;
auto tmp13 = static_cast<float>(-0.04577525332570076);
auto tmp14 = static_cast<float>(2.043065309524536);
auto tmp15 = tmp12 ? tmp13 : tmp14;
auto tmp16 = tmp5 ? tmp10 : tmp15;
auto tmp17 = static_cast<int64_t>(6);
auto tmp18 = tmp1 < tmp17;
auto tmp19 = static_cast<int64_t>(5);
auto tmp20 = tmp1 < tmp19;
auto tmp21 = static_cast<float>(0.13726529479026794);
auto tmp22 = static_cast<float>(1.1331775188446045);
auto tmp23 = tmp20 ? tmp21 : tmp22;
auto tmp24 = static_cast<int64_t>(7);
auto tmp25 = tmp1 < tmp24;
auto tmp26 = static_cast<float>(-0.11772552132606506);
auto tmp27 = static_cast<float>(0.527721107006073);
auto tmp28 = tmp25 ? tmp26 : tmp27;
auto tmp29 = tmp18 ? tmp23 : tmp28;
auto tmp30 = tmp3 ? tmp16 : tmp29;
out_ptr1[static_cast<int64_t>(x0)] = tmp30;
}
}
}
''')
cpp_fused_mul_8 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(128L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (128L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (128L*x0)));
}
}
}
}
''')
cpp_fused_silu_9 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(524288L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_mean_10 = async_compile.cpp_pybinding(['float*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
float* out_ptr1)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(128L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (128L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(128L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(1024.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
{
#pragma omp simd simdlen(8)
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(8L); x0+=static_cast<int64_t>(1L))
{
auto tmp0 = x0;
auto tmp1 = c10::convert<int64_t>(tmp0);
auto tmp2 = static_cast<int64_t>(4);
auto tmp3 = tmp1 < tmp2;
auto tmp4 = static_cast<int64_t>(2);
auto tmp5 = tmp1 < tmp4;
auto tmp6 = static_cast<int64_t>(1);
auto tmp7 = tmp1 < tmp6;
auto tmp8 = static_cast<float>(0.29536592960357666);
auto tmp9 = static_cast<float>(1.1849623918533325);
auto tmp10 = tmp7 ? tmp8 : tmp9;
auto tmp11 = static_cast<int64_t>(3);
auto tmp12 = tmp1 < tmp11;
auto tmp13 = static_cast<float>(0.3150717318058014);
auto tmp14 = static_cast<float>(0.5096337795257568);
auto tmp15 = tmp12 ? tmp13 : tmp14;
auto tmp16 = tmp5 ? tmp10 : tmp15;
auto tmp17 = static_cast<int64_t>(6);
auto tmp18 = tmp1 < tmp17;
auto tmp19 = static_cast<int64_t>(5);
auto tmp20 = tmp1 < tmp19;
auto tmp21 = static_cast<float>(-0.18543481826782227);
auto tmp22 = static_cast<float>(0.3189537227153778);
auto tmp23 = tmp20 ? tmp21 : tmp22;
auto tmp24 = static_cast<int64_t>(7);
auto tmp25 = tmp1 < tmp24;
auto tmp26 = static_cast<float>(-0.7191315293312073);
auto tmp27 = static_cast<float>(0.42770394682884216);
auto tmp28 = tmp25 ? tmp26 : tmp27;
auto tmp29 = tmp18 ? tmp23 : tmp28;
auto tmp30 = tmp3 ? tmp16 : tmp29;
out_ptr1[static_cast<int64_t>(x0)] = tmp30;
}
}
}
''')
cpp_fused_mul_11 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(128L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (128L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (128L*x0)));
}
}
}
}
''')
cpp_fused_silu_12 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(524288L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_clone_13 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(32L); x1+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(32L); x2+=static_cast<int64_t>(1L))
{
for(int64_t x3=static_cast<int64_t>(0L); x3<static_cast<int64_t>(32L); x3+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x3 + (32L*x0) + (384L*x1) + (12288L*x2)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x3 + (32L*x2) + (1024L*x1) + (32768L*x0)));
}
}
}
}
}
}
''')
cpp_fused_clone_14 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(32L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (32L*x0) + (384L*x1)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x2 + (32L*x1) + (32768L*x0)));
}
}
}
}
}
''')
cpp_fused__softmax_add_mul_15 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(1024L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (1024L*x1) + (1048576L*x0)), 16);
auto tmp1 = static_cast<float>(0.1767766952966369);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 * tmp2;
auto tmp4 = 31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L)));
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = static_cast<int64_t>(2048);
auto tmp7 = tmp5 < tmp6;
auto tmp8 = [&]
{
auto tmp9 = static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))) % static_cast<int64_t>(64L);
auto tmp10 = c10::convert<int64_t>(tmp9);
auto tmp11 = static_cast<int64_t>(63);
auto tmp12 = tmp10 < tmp11;
auto tmp14 = tmp12 & tmp7;
auto tmp13 = [&]
{
auto tmp15 = in_ptr1[static_cast<int64_t>((63L*(c10::div_floor_integer(static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))), static_cast<int64_t>(64L)))) + (2016L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (64512L*x0) + (static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))) % static_cast<int64_t>(64L)))];
return tmp15;
}
;
auto tmp16 = tmp12 ? tmp13() : static_cast<float>(0.0);
return tmp16;
}
;
auto tmp17 = tmp7 ? tmp8() : static_cast<float>(0.0);
auto tmp18 = 31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(32L));
auto tmp19 = c10::convert<int64_t>(tmp18);
auto tmp20 = at::vec::VectorizedN<int64_t,2>::arange(tmp19, 1);
auto tmp21 = at::vec::VectorizedN<int64_t,2>(tmp6);
auto tmp22 = at::vec::VecMask<int64_t,2>(tmp20 < tmp21);
auto tmp23 = [&]
{
auto tmp24 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp22.is_masked(x2_inner))
{
tmpbuf[x2_inner] = static_cast<int64_t>(static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(32L)))) % static_cast<int64_t>(64L));
}
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp25 = static_cast<int64_t>(63);
auto tmp26 = at::vec::VectorizedN<int64_t,2>(tmp25);
auto tmp27 = at::vec::VecMask<int64_t,2>(tmp24 < tmp26);
auto tmp29 = tmp27 & tmp22;
auto tmp28 = [&]
{
auto tmp30 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp29.is_masked(x2_inner))
{
tmpbuf[x2_inner] = in_ptr2[static_cast<int64_t>((63L*(c10::div_floor_integer(static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(32L)))), static_cast<int64_t>(64L)))) + (2016L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (64512L*x0) + (static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(32L)))) % static_cast<int64_t>(64L)))];
}
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
return tmp30;
}
;
auto tmp33 =
[&]
{
if (tmp29.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp31 = tmp28();
auto tmp32 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp31)::blendv(tmp32, tmp31, tmp29.template cast<float,1>());
}
}
()
;
return tmp33;
}
;
auto tmp36 =
[&]
{
if (tmp22.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp34 = tmp23();
auto tmp35 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp34)::blendv(tmp35, tmp34, tmp22.template cast<float,1>());
}
}
()
;
auto tmp37 = at::vec::Vectorized<float>(tmp17);
auto tmp38 = tmp37 + tmp36;
auto tmp39 = tmp3 + tmp38;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp39);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr0[static_cast<int64_t>(x1 + (1024L*x0))] = static_cast<float>(tmp_acc0);
}
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(1024L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (1024L*x1) + (1048576L*x0)), 16);
auto tmp40 = out_ptr0[static_cast<int64_t>(x1 + (1024L*x0))];
auto tmp1 = static_cast<float>(0.1767766952966369);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 * tmp2;
auto tmp4 = 31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L)));
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = static_cast<int64_t>(2048);
auto tmp7 = tmp5 < tmp6;
auto tmp8 = [&]
{
auto tmp9 = static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))) % static_cast<int64_t>(64L);
auto tmp10 = c10::convert<int64_t>(tmp9);
auto tmp11 = static_cast<int64_t>(63);
auto tmp12 = tmp10 < tmp11;
auto tmp14 = tmp12 & tmp7;
auto tmp13 = [&]
{
auto tmp15 = in_ptr1[static_cast<int64_t>((63L*(c10::div_floor_integer(static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))), static_cast<int64_t>(64L)))) + (2016L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (64512L*x0) + (static_cast<int64_t>((31L + (63L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(32L))))) % static_cast<int64_t>(64L)))];
return tmp15;
}
;
auto tmp16 = tmp12 ? tmp13() : static_cast<float>(0.0);
return tmp16;
}
;
auto tmp17 = tmp7 ? tmp8() : static_cast<float>(0.0);
auto tmp18 = 31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(32L));
auto tmp19 = c10::convert<int64_t>(tmp18);
auto tmp20 = at::vec::VectorizedN<int64_t,2>::arange(tmp19, 1);
auto tmp21 = at::vec::VectorizedN<int64_t,2>(tmp6);
auto tmp22 = at::vec::VecMask<int64_t,2>(tmp20 < tmp21);
auto tmp23 = [&]
{
auto tmp24 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp22.is_masked(x2_inner))
{
tmpbuf[x2_inner] = static_cast<int64_t>(static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(32L)))) % static_cast<int64_t>(64L));
}
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp25 = static_cast<int64_t>(63);
auto tmp26 = at::vec::VectorizedN<int64_t,2>(tmp25);
auto tmp27 = at::vec::VecMask<int64_t,2>(tmp24 < tmp26);
auto tmp29 = tmp27 & tmp22;
auto tmp28 = [&]
{
auto tmp30 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp29.is_masked(x2_inner))
{
tmpbuf[x2_inner] = in_ptr2[static_cast<int64_t>((63L*(c10::div_floor_integer(static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(32L)))), static_cast<int64_t>(64L)))) + (2016L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (64512L*x0) + (static_cast<int64_t>((31L + (63L*(static_cast<int64_t>(x1) % static_cast<int64_t>(32L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(32L)))) % static_cast<int64_t>(64L)))];
}
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
return tmp30;
}
;
auto tmp33 =
[&]
{
if (tmp29.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp31 = tmp28();
auto tmp32 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp31)::blendv(tmp32, tmp31, tmp29.template cast<float,1>());
}
}
()
;
return tmp33;
}
;
auto tmp36 =
[&]
{
if (tmp22.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp34 = tmp23();
auto tmp35 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp34)::blendv(tmp35, tmp34, tmp22.template cast<float,1>());
}
}
()
;
auto tmp37 = at::vec::Vectorized<float>(tmp17);
auto tmp38 = tmp37 + tmp36;
auto tmp39 = tmp3 + tmp38;
auto tmp41 = at::vec::Vectorized<float>(tmp40);
auto tmp42 = tmp39 - tmp41;
auto tmp43 = tmp42.exp();
tmp43.store(out_ptr1 + static_cast<int64_t>(x2 + (1024L*x1) + (1048576L*x0)));
}
}
}
}
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4096L); x0+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (1024L*x0)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0);
}
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(1024L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (1024L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<int64_t>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(out_ptr3 + static_cast<int64_t>(x1 + (1024L*x0)));
}
}
}
}
''')
cpp_fused__native_batch_norm_legit_no_training_silu_16 = async_compile.cpp_pybinding(['float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
const float* in_ptr4)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(128L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>((32L*x0) + (32768L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(32L)))) + (static_cast<int64_t>(x1) % static_cast<int64_t>(32L))), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1), 16);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<int64_t>(x1), 16);
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<int64_t>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = tmp2 * tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = tmp6 + tmp7;
auto tmp9 = decltype(tmp8)(1)/(decltype(tmp8)(1) + tmp8.neg().exp());
auto tmp10 = tmp8 * tmp9;
tmp10.store(in_out_ptr0 + static_cast<int64_t>(x1 + (128L*x0)));
}
}
}
}
''')
cpp_fused_silu_17 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(524288L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_mean_18 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (256L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(256.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_mul_19 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (256L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (256L*x0)));
}
}
}
}
''')
cpp_fused_silu_20 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(262144L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_mean_21 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (256L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(256.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_mul_22 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x1 + (256L*x0)), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 * tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x1 + (256L*x0)));
}
}
}
}
''')
cpp_fused_silu_23 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(262144L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_clone_24 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(16L); x2+=static_cast<int64_t>(1L))
{
for(int64_t x3=static_cast<int64_t>(0L); x3<static_cast<int64_t>(64L); x3+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x3 + (64L*x0) + (768L*x1) + (12288L*x2)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x3 + (64L*x2) + (1024L*x1) + (16384L*x0)));
}
}
}
}
}
}
''')
cpp_fused_clone_25 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(64L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (64L*x0) + (768L*x1)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x2 + (64L*x1) + (16384L*x0)));
}
}
}
}
}
''')
cpp_fused__softmax_add_mul_26 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(256L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0)), 16);
auto tmp1 = static_cast<float>(0.125);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 * tmp2;
auto tmp4 = 15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L)));
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = static_cast<int64_t>(512);
auto tmp7 = tmp5 < tmp6;
auto tmp8 = [&]
{
auto tmp9 = static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L);
auto tmp10 = c10::convert<int64_t>(tmp9);
auto tmp11 = static_cast<int64_t>(31);
auto tmp12 = tmp10 < tmp11;
auto tmp14 = tmp12 & tmp7;
auto tmp13 = [&]
{
auto tmp15 = in_ptr1[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))), static_cast<int64_t>(32L)))) + (496L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L)))];
return tmp15;
}
;
auto tmp16 = tmp12 ? tmp13() : static_cast<float>(0.0);
return tmp16;
}
;
auto tmp17 = tmp7 ? tmp8() : static_cast<float>(0.0);
auto tmp18 = 15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L));
auto tmp19 = c10::convert<int64_t>(tmp18);
auto tmp20 = at::vec::VectorizedN<int64_t,2>::arange(tmp19, 1);
auto tmp21 = at::vec::VectorizedN<int64_t,2>(tmp6);
auto tmp22 = at::vec::VecMask<int64_t,2>(tmp20 < tmp21);
auto tmp23 = [&]
{
auto tmp24 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp22.is_masked(x2_inner))
{
tmpbuf[x2_inner] = static_cast<int64_t>(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L));
}
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp25 = static_cast<int64_t>(31);
auto tmp26 = at::vec::VectorizedN<int64_t,2>(tmp25);
auto tmp27 = at::vec::VecMask<int64_t,2>(tmp24 < tmp26);
auto tmp29 = tmp27 & tmp22;
auto tmp28 = [&]
{
auto tmp30 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp29.is_masked(x2_inner))
{
tmpbuf[x2_inner] = in_ptr2[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))), static_cast<int64_t>(32L)))) + (496L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L)))];
}
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
return tmp30;
}
;
auto tmp33 =
[&]
{
if (tmp29.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp31 = tmp28();
auto tmp32 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp31)::blendv(tmp32, tmp31, tmp29.template cast<float,1>());
}
}
()
;
return tmp33;
}
;
auto tmp36 =
[&]
{
if (tmp22.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp34 = tmp23();
auto tmp35 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp34)::blendv(tmp35, tmp34, tmp22.template cast<float,1>());
}
}
()
;
auto tmp37 = at::vec::Vectorized<float>(tmp17);
auto tmp38 = tmp37 + tmp36;
auto tmp39 = tmp3 + tmp38;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp39);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr0[static_cast<int64_t>(x1 + (256L*x0))] = static_cast<float>(tmp_acc0);
}
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(256L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0)), 16);
auto tmp40 = out_ptr0[static_cast<int64_t>(x1 + (256L*x0))];
auto tmp1 = static_cast<float>(0.125);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 * tmp2;
auto tmp4 = 15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L)));
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = static_cast<int64_t>(512);
auto tmp7 = tmp5 < tmp6;
auto tmp8 = [&]
{
auto tmp9 = static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L);
auto tmp10 = c10::convert<int64_t>(tmp9);
auto tmp11 = static_cast<int64_t>(31);
auto tmp12 = tmp10 < tmp11;
auto tmp14 = tmp12 & tmp7;
auto tmp13 = [&]
{
auto tmp15 = in_ptr1[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))), static_cast<int64_t>(32L)))) + (496L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L)))];
return tmp15;
}
;
auto tmp16 = tmp12 ? tmp13() : static_cast<float>(0.0);
return tmp16;
}
;
auto tmp17 = tmp7 ? tmp8() : static_cast<float>(0.0);
auto tmp18 = 15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L));
auto tmp19 = c10::convert<int64_t>(tmp18);
auto tmp20 = at::vec::VectorizedN<int64_t,2>::arange(tmp19, 1);
auto tmp21 = at::vec::VectorizedN<int64_t,2>(tmp6);
auto tmp22 = at::vec::VecMask<int64_t,2>(tmp20 < tmp21);
auto tmp23 = [&]
{
auto tmp24 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp22.is_masked(x2_inner))
{
tmpbuf[x2_inner] = static_cast<int64_t>(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L));
}
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp25 = static_cast<int64_t>(31);
auto tmp26 = at::vec::VectorizedN<int64_t,2>(tmp25);
auto tmp27 = at::vec::VecMask<int64_t,2>(tmp24 < tmp26);
auto tmp29 = tmp27 & tmp22;
auto tmp28 = [&]
{
auto tmp30 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp29.is_masked(x2_inner))
{
tmpbuf[x2_inner] = in_ptr2[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))), static_cast<int64_t>(32L)))) + (496L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L)))];
}
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
return tmp30;
}
;
auto tmp33 =
[&]
{
if (tmp29.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp31 = tmp28();
auto tmp32 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp31)::blendv(tmp32, tmp31, tmp29.template cast<float,1>());
}
}
()
;
return tmp33;
}
;
auto tmp36 =
[&]
{
if (tmp22.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp34 = tmp23();
auto tmp35 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp34)::blendv(tmp35, tmp34, tmp22.template cast<float,1>());
}
}
()
;
auto tmp37 = at::vec::Vectorized<float>(tmp17);
auto tmp38 = tmp37 + tmp36;
auto tmp39 = tmp3 + tmp38;
auto tmp41 = at::vec::Vectorized<float>(tmp40);
auto tmp42 = tmp39 - tmp41;
auto tmp43 = tmp42.exp();
tmp43.store(out_ptr1 + static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0)));
}
}
}
}
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (256L*x0)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0);
}
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (256L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<int64_t>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(out_ptr3 + static_cast<int64_t>(x1 + (256L*x0)));
}
}
}
}
''')
cpp_fused__native_batch_norm_legit_no_training_silu_27 = async_compile.cpp_pybinding(['float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
const float* in_ptr4)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>((64L*x0) + (16384L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(64L)))) + (static_cast<int64_t>(x1) % static_cast<int64_t>(64L))), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1), 16);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<int64_t>(x1), 16);
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<int64_t>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = tmp2 * tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = tmp6 + tmp7;
auto tmp9 = decltype(tmp8)(1)/(decltype(tmp8)(1) + tmp8.neg().exp());
auto tmp10 = tmp8 * tmp9;
tmp10.store(in_out_ptr0 + static_cast<int64_t>(x1 + (256L*x0)));
}
}
}
}
''')
cpp_fused_silu_28 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(262144L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_clone_29 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(16L); x2+=static_cast<int64_t>(1L))
{
for(int64_t x3=static_cast<int64_t>(0L); x3<static_cast<int64_t>(128L); x3+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x3 + (128L*x0) + (1536L*x1) + (24576L*x2)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x3 + (128L*x2) + (2048L*x1) + (32768L*x0)));
}
}
}
}
}
}
''')
cpp_fused_clone_30 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(128L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (128L*x0) + (1536L*x1)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x2 + (128L*x1) + (32768L*x0)));
}
}
}
}
}
''')
cpp_fused__softmax_add_mul_31 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(256L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0)), 16);
auto tmp1 = static_cast<float>(0.08838834764831845);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 * tmp2;
auto tmp4 = 15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L)));
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = static_cast<int64_t>(512);
auto tmp7 = tmp5 < tmp6;
auto tmp8 = [&]
{
auto tmp9 = static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L);
auto tmp10 = c10::convert<int64_t>(tmp9);
auto tmp11 = static_cast<int64_t>(31);
auto tmp12 = tmp10 < tmp11;
auto tmp14 = tmp12 & tmp7;
auto tmp13 = [&]
{
auto tmp15 = in_ptr1[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))), static_cast<int64_t>(32L)))) + (496L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L)))];
return tmp15;
}
;
auto tmp16 = tmp12 ? tmp13() : static_cast<float>(0.0);
return tmp16;
}
;
auto tmp17 = tmp7 ? tmp8() : static_cast<float>(0.0);
auto tmp18 = 15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L));
auto tmp19 = c10::convert<int64_t>(tmp18);
auto tmp20 = at::vec::VectorizedN<int64_t,2>::arange(tmp19, 1);
auto tmp21 = at::vec::VectorizedN<int64_t,2>(tmp6);
auto tmp22 = at::vec::VecMask<int64_t,2>(tmp20 < tmp21);
auto tmp23 = [&]
{
auto tmp24 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp22.is_masked(x2_inner))
{
tmpbuf[x2_inner] = static_cast<int64_t>(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L));
}
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp25 = static_cast<int64_t>(31);
auto tmp26 = at::vec::VectorizedN<int64_t,2>(tmp25);
auto tmp27 = at::vec::VecMask<int64_t,2>(tmp24 < tmp26);
auto tmp29 = tmp27 & tmp22;
auto tmp28 = [&]
{
auto tmp30 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp29.is_masked(x2_inner))
{
tmpbuf[x2_inner] = in_ptr2[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))), static_cast<int64_t>(32L)))) + (496L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L)))];
}
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
return tmp30;
}
;
auto tmp33 =
[&]
{
if (tmp29.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp31 = tmp28();
auto tmp32 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp31)::blendv(tmp32, tmp31, tmp29.template cast<float,1>());
}
}
()
;
return tmp33;
}
;
auto tmp36 =
[&]
{
if (tmp22.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp34 = tmp23();
auto tmp35 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp34)::blendv(tmp35, tmp34, tmp22.template cast<float,1>());
}
}
()
;
auto tmp37 = at::vec::Vectorized<float>(tmp17);
auto tmp38 = tmp37 + tmp36;
auto tmp39 = tmp3 + tmp38;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp39);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr0[static_cast<int64_t>(x1 + (256L*x0))] = static_cast<float>(tmp_acc0);
}
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(256L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0)), 16);
auto tmp40 = out_ptr0[static_cast<int64_t>(x1 + (256L*x0))];
auto tmp1 = static_cast<float>(0.08838834764831845);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 * tmp2;
auto tmp4 = 15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L)));
auto tmp5 = c10::convert<int64_t>(tmp4);
auto tmp6 = static_cast<int64_t>(512);
auto tmp7 = tmp5 < tmp6;
auto tmp8 = [&]
{
auto tmp9 = static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L);
auto tmp10 = c10::convert<int64_t>(tmp9);
auto tmp11 = static_cast<int64_t>(31);
auto tmp12 = tmp10 < tmp11;
auto tmp14 = tmp12 & tmp7;
auto tmp13 = [&]
{
auto tmp15 = in_ptr1[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))), static_cast<int64_t>(32L)))) + (496L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(16L))))) % static_cast<int64_t>(32L)))];
return tmp15;
}
;
auto tmp16 = tmp12 ? tmp13() : static_cast<float>(0.0);
return tmp16;
}
;
auto tmp17 = tmp7 ? tmp8() : static_cast<float>(0.0);
auto tmp18 = 15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(16L));
auto tmp19 = c10::convert<int64_t>(tmp18);
auto tmp20 = at::vec::VectorizedN<int64_t,2>::arange(tmp19, 1);
auto tmp21 = at::vec::VectorizedN<int64_t,2>(tmp6);
auto tmp22 = at::vec::VecMask<int64_t,2>(tmp20 < tmp21);
auto tmp23 = [&]
{
auto tmp24 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp22.is_masked(x2_inner))
{
tmpbuf[x2_inner] = static_cast<int64_t>(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L));
}
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp25 = static_cast<int64_t>(31);
auto tmp26 = at::vec::VectorizedN<int64_t,2>(tmp25);
auto tmp27 = at::vec::VecMask<int64_t,2>(tmp24 < tmp26);
auto tmp29 = tmp27 & tmp22;
auto tmp28 = [&]
{
auto tmp30 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp29.is_masked(x2_inner))
{
tmpbuf[x2_inner] = in_ptr2[static_cast<int64_t>((31L*(c10::div_floor_integer(static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))), static_cast<int64_t>(32L)))) + (496L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(16L)))) + (7936L*x0) + (static_cast<int64_t>((15L + (31L*(static_cast<int64_t>(x1) % static_cast<int64_t>(16L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(16L)))) % static_cast<int64_t>(32L)))];
}
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
return tmp30;
}
;
auto tmp33 =
[&]
{
if (tmp29.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp31 = tmp28();
auto tmp32 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp31)::blendv(tmp32, tmp31, tmp29.template cast<float,1>());
}
}
()
;
return tmp33;
}
;
auto tmp36 =
[&]
{
if (tmp22.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp34 = tmp23();
auto tmp35 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp34)::blendv(tmp35, tmp34, tmp22.template cast<float,1>());
}
}
()
;
auto tmp37 = at::vec::Vectorized<float>(tmp17);
auto tmp38 = tmp37 + tmp36;
auto tmp39 = tmp3 + tmp38;
auto tmp41 = at::vec::Vectorized<float>(tmp40);
auto tmp42 = tmp39 - tmp41;
auto tmp43 = tmp42.exp();
tmp43.store(out_ptr1 + static_cast<int64_t>(x2 + (256L*x1) + (65536L*x0)));
}
}
}
}
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (256L*x0)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0);
}
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(256L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (256L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<int64_t>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(out_ptr3 + static_cast<int64_t>(x1 + (256L*x0)));
}
}
}
}
''')
cpp_fused__native_batch_norm_legit_no_training_avg_pool2d_silu_32 = async_compile.cpp_pybinding(['float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
const float* in_ptr4)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(8L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(8L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(512L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>((256L*x1) + (4096L*x0) + (32768L*(c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(128L)))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(128L))), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(128L + (256L*x1) + (4096L*x0) + (32768L*(c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(128L)))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(128L))), 16);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(2048L + (256L*x1) + (4096L*x0) + (32768L*(c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(128L)))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(128L))), 16);
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(2176L + (256L*x1) + (4096L*x0) + (32768L*(c10::div_floor_integer(static_cast<int64_t>(x2), static_cast<int64_t>(128L)))) + (static_cast<int64_t>(x2) % static_cast<int64_t>(128L))), 16);
auto tmp10 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x2), 16);
auto tmp12 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<int64_t>(x2), 16);
auto tmp14 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<int64_t>(x2), 16);
auto tmp16 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<int64_t>(x2), 16);
auto tmp2 = tmp1 + tmp0;
auto tmp4 = tmp3 + tmp2;
auto tmp6 = tmp5 + tmp4;
auto tmp7 = static_cast<float>(0.25);
auto tmp8 = at::vec::Vectorized<float>(tmp7);
auto tmp9 = tmp6 * tmp8;
auto tmp11 = tmp9 - tmp10;
auto tmp13 = tmp11 * tmp12;
auto tmp15 = tmp13 * tmp14;
auto tmp17 = tmp15 + tmp16;
auto tmp18 = decltype(tmp17)(1)/(decltype(tmp17)(1) + tmp17.neg().exp());
auto tmp19 = tmp17 * tmp18;
tmp19.store(in_out_ptr0 + static_cast<int64_t>(x2 + (512L*x1) + (4096L*x0)));
}
}
}
}
}
''')
cpp_fused_silu_33 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(98304L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
cpp_fused_clone_34 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(8L); x1+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(8L); x2+=static_cast<int64_t>(1L))
{
for(int64_t x3=static_cast<int64_t>(0L); x3<static_cast<int64_t>(128L); x3+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x3 + (128L*x0) + (1536L*x1) + (12288L*x2)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x3 + (128L*x2) + (1024L*x1) + (8192L*x0)));
}
}
}
}
}
}
''')
cpp_fused_clone_35 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(1L))
{
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(128L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (128L*x0) + (1536L*x1)), 16);
tmp0.store(out_ptr0 + static_cast<int64_t>(x2 + (128L*x1) + (8192L*x0)));
}
}
}
}
}
''')
cpp_fused__softmax_add_mul_36 = async_compile.cpp_pybinding(['const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
float* out_ptr0,
float* out_ptr1,
float* out_ptr2,
float* out_ptr3)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(4L); x0+=static_cast<int64_t>(1L))
{
#pragma GCC ivdep
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = -std::numeric_limits<float>::infinity();
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(64L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (64L*x1) + (4096L*x0)), 16);
auto tmp1 = static_cast<float>(0.08838834764831845);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 * tmp2;
auto tmp4 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = static_cast<int64_t>(7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>((x2 + x2_inner)), static_cast<int64_t>(8L))));
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp5 = static_cast<int64_t>(128);
auto tmp6 = at::vec::VectorizedN<int64_t,2>(tmp5);
auto tmp7 = at::vec::VecMask<int64_t,2>(tmp4 < tmp6);
auto tmp8 = [&]
{
auto tmp9 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp7.is_masked(x2_inner))
{
tmpbuf[x2_inner] = static_cast<int64_t>(static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>((x2 + x2_inner)), static_cast<int64_t>(8L))))) % static_cast<int64_t>(16L));
}
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp10 = static_cast<int64_t>(15);
auto tmp11 = at::vec::VectorizedN<int64_t,2>(tmp10);
auto tmp12 = at::vec::VecMask<int64_t,2>(tmp9 < tmp11);
auto tmp14 = tmp12 & tmp7;
auto tmp13 = [&]
{
auto tmp15 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp14.is_masked(x2_inner))
{
tmpbuf[x2_inner] = in_ptr1[static_cast<int64_t>((15L*(c10::div_floor_integer(static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>((x2 + x2_inner)), static_cast<int64_t>(8L))))), static_cast<int64_t>(16L)))) + (120L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (960L*x0) + (static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>((x2 + x2_inner)), static_cast<int64_t>(8L))))) % static_cast<int64_t>(16L)))];
}
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
return tmp15;
}
;
auto tmp18 =
[&]
{
if (tmp14.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp16 = tmp13();
auto tmp17 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp16)::blendv(tmp17, tmp16, tmp14.template cast<float,1>());
}
}
()
;
return tmp18;
}
;
auto tmp21 =
[&]
{
if (tmp7.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp19 = tmp8();
auto tmp20 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp19)::blendv(tmp20, tmp19, tmp7.template cast<float,1>());
}
}
()
;
auto tmp22 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = static_cast<int64_t>(7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(8L)));
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp23 = at::vec::VecMask<int64_t,2>(tmp22 < tmp6);
auto tmp24 = [&]
{
auto tmp25 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp23.is_masked(x2_inner))
{
tmpbuf[x2_inner] = static_cast<int64_t>(static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(8L)))) % static_cast<int64_t>(16L));
}
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp26 = static_cast<int64_t>(15);
auto tmp27 = at::vec::VectorizedN<int64_t,2>(tmp26);
auto tmp28 = at::vec::VecMask<int64_t,2>(tmp25 < tmp27);
auto tmp30 = tmp28 & tmp23;
auto tmp29 = [&]
{
auto tmp31 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp30.is_masked(x2_inner))
{
tmpbuf[x2_inner] = in_ptr2[static_cast<int64_t>((15L*(c10::div_floor_integer(static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(8L)))), static_cast<int64_t>(16L)))) + (120L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (960L*x0) + (static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(8L)))) % static_cast<int64_t>(16L)))];
}
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
return tmp31;
}
;
auto tmp34 =
[&]
{
if (tmp30.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp32 = tmp29();
auto tmp33 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp32)::blendv(tmp33, tmp32, tmp30.template cast<float,1>());
}
}
()
;
return tmp34;
}
;
auto tmp37 =
[&]
{
if (tmp23.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp35 = tmp24();
auto tmp36 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp35)::blendv(tmp36, tmp35, tmp23.template cast<float,1>());
}
}
()
;
auto tmp38 = tmp21 + tmp37;
auto tmp39 = tmp3 + tmp38;
tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp39);
}
tmp_acc0 = max_propagate_nan(tmp_acc0, at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return at::vec::maximum(x, y); }, tmp_acc0_vec));
out_ptr0[static_cast<int64_t>(x1 + (64L*x0))] = static_cast<float>(tmp_acc0);
}
for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(64L); x2+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x2 + (64L*x1) + (4096L*x0)), 16);
auto tmp40 = out_ptr0[static_cast<int64_t>(x1 + (64L*x0))];
auto tmp1 = static_cast<float>(0.08838834764831845);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 * tmp2;
auto tmp4 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = static_cast<int64_t>(7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>((x2 + x2_inner)), static_cast<int64_t>(8L))));
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp5 = static_cast<int64_t>(128);
auto tmp6 = at::vec::VectorizedN<int64_t,2>(tmp5);
auto tmp7 = at::vec::VecMask<int64_t,2>(tmp4 < tmp6);
auto tmp8 = [&]
{
auto tmp9 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp7.is_masked(x2_inner))
{
tmpbuf[x2_inner] = static_cast<int64_t>(static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>((x2 + x2_inner)), static_cast<int64_t>(8L))))) % static_cast<int64_t>(16L));
}
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp10 = static_cast<int64_t>(15);
auto tmp11 = at::vec::VectorizedN<int64_t,2>(tmp10);
auto tmp12 = at::vec::VecMask<int64_t,2>(tmp9 < tmp11);
auto tmp14 = tmp12 & tmp7;
auto tmp13 = [&]
{
auto tmp15 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp14.is_masked(x2_inner))
{
tmpbuf[x2_inner] = in_ptr1[static_cast<int64_t>((15L*(c10::div_floor_integer(static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>((x2 + x2_inner)), static_cast<int64_t>(8L))))), static_cast<int64_t>(16L)))) + (120L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (960L*x0) + (static_cast<int64_t>((7L + (15L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (c10::div_floor_integer(static_cast<int64_t>((x2 + x2_inner)), static_cast<int64_t>(8L))))) % static_cast<int64_t>(16L)))];
}
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
return tmp15;
}
;
auto tmp18 =
[&]
{
if (tmp14.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp16 = tmp13();
auto tmp17 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp16)::blendv(tmp17, tmp16, tmp14.template cast<float,1>());
}
}
()
;
return tmp18;
}
;
auto tmp21 =
[&]
{
if (tmp7.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp19 = tmp8();
auto tmp20 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp19)::blendv(tmp20, tmp19, tmp7.template cast<float,1>());
}
}
()
;
auto tmp22 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
tmpbuf[x2_inner] = static_cast<int64_t>(7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(8L)));
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp23 = at::vec::VecMask<int64_t,2>(tmp22 < tmp6);
auto tmp24 = [&]
{
auto tmp25 =
[&]
{
__at_align__ std::array<int64_t, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp23.is_masked(x2_inner))
{
tmpbuf[x2_inner] = static_cast<int64_t>(static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(8L)))) % static_cast<int64_t>(16L));
}
}
return at::vec::VectorizedN<int64_t,2>::loadu(tmpbuf.data(), 16);
}
()
;
auto tmp26 = static_cast<int64_t>(15);
auto tmp27 = at::vec::VectorizedN<int64_t,2>(tmp26);
auto tmp28 = at::vec::VecMask<int64_t,2>(tmp25 < tmp27);
auto tmp30 = tmp28 & tmp23;
auto tmp29 = [&]
{
auto tmp31 =
[&]
{
__at_align__ std::array<float, 16> tmpbuf;
#pragma GCC unroll 16
for (long x2_inner = 0; x2_inner < 16; x2_inner++)
{
if (tmp30.is_masked(x2_inner))
{
tmpbuf[x2_inner] = in_ptr2[static_cast<int64_t>((15L*(c10::div_floor_integer(static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(8L)))), static_cast<int64_t>(16L)))) + (120L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(8L)))) + (960L*x0) + (static_cast<int64_t>((7L + (15L*(static_cast<int64_t>(x1) % static_cast<int64_t>(8L))) + (static_cast<int64_t>((x2 + x2_inner)) % static_cast<int64_t>(8L)))) % static_cast<int64_t>(16L)))];
}
}
return at::vec::Vectorized<float>::loadu(tmpbuf.data(), 16);
}
()
;
return tmp31;
}
;
auto tmp34 =
[&]
{
if (tmp30.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp32 = tmp29();
auto tmp33 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp32)::blendv(tmp33, tmp32, tmp30.template cast<float,1>());
}
}
()
;
return tmp34;
}
;
auto tmp37 =
[&]
{
if (tmp23.all_zero())
{
return at::vec::Vectorized<float>(static_cast<float>(0.0));
}
else
{
auto tmp35 = tmp24();
auto tmp36 = at::vec::Vectorized<float>(static_cast<float>(0.0));
return decltype(tmp35)::blendv(tmp36, tmp35, tmp23.template cast<float,1>());
}
}
()
;
auto tmp38 = tmp21 + tmp37;
auto tmp39 = tmp3 + tmp38;
auto tmp41 = at::vec::Vectorized<float>(tmp40);
auto tmp42 = tmp39 - tmp41;
auto tmp43 = tmp42.exp();
tmp43.store(out_ptr1 + static_cast<int64_t>(x2 + (64L*x1) + (4096L*x0)));
}
}
}
}
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(256L); x0+=static_cast<int64_t>(1L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (64L*x0)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr2[static_cast<int64_t>(x0)] = static_cast<float>(tmp_acc0);
}
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr1 + static_cast<int64_t>(x1 + (64L*x0)), 16);
auto tmp1 = out_ptr2[static_cast<int64_t>(x0)];
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(out_ptr3 + static_cast<int64_t>(x1 + (64L*x0)));
}
}
}
}
''')
# Fused inference batch-norm + SiLU kernel for a 64x512 activation block.
# For each of 64 rows it gathers 512 channel values stored as four
# 128-channel groups (group stride 8192, row stride 128 in in_ptr0),
# computes (x - in_ptr1) * in_ptr2 * in_ptr3 + in_ptr4 per channel
# (presumably mean / rsqrt(var+eps) / weight / bias, per the kernel name —
# confirm against the graph), then applies SiLU x * sigmoid(x) and stores
# the result contiguously into in_out_ptr0. Auto-generated TorchInductor
# C++ source; do not hand-edit the string body.
cpp_fused__native_batch_norm_legit_no_training_silu_37 = async_compile.cpp_pybinding(['float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0,
const float* in_ptr1,
const float* in_ptr2,
const float* in_ptr3,
const float* in_ptr4)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(64L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(512L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>((128L*x0) + (8192L*(c10::div_floor_integer(static_cast<int64_t>(x1), static_cast<int64_t>(128L)))) + (static_cast<int64_t>(x1) % static_cast<int64_t>(128L))), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<int64_t>(x1), 16);
auto tmp3 = at::vec::Vectorized<float>::loadu(in_ptr2 + static_cast<int64_t>(x1), 16);
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<int64_t>(x1), 16);
auto tmp7 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<int64_t>(x1), 16);
auto tmp2 = tmp0 - tmp1;
auto tmp4 = tmp2 * tmp3;
auto tmp6 = tmp4 * tmp5;
auto tmp8 = tmp6 + tmp7;
auto tmp9 = decltype(tmp8)(1)/(decltype(tmp8)(1) + tmp8.neg().exp());
auto tmp10 = tmp8 * tmp9;
tmp10.store(in_out_ptr0 + static_cast<int64_t>(x1 + (512L*x0)));
}
}
}
}
''')
# Elementwise SiLU over a contiguous 98304-float buffer, 16 lanes per
# vector step: out_ptr0[i] = in_ptr0[i] * sigmoid(in_ptr0[i]).
# 98304 = 1536 channels * 8 * 8 spatial at the call site in call().
# Auto-generated TorchInductor C++ source; do not hand-edit the string body.
cpp_fused_silu_38 = async_compile.cpp_pybinding(['const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(const float* in_ptr0,
float* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(98304L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = decltype(tmp0)(1)/(decltype(tmp0)(1) + tmp0.neg().exp());
auto tmp2 = tmp0 * tmp1;
tmp2.store(out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
# Global-average-pool kernel: for each of 1280 channels, sums 64 values
# (channel-last layout: in_ptr0[x0 + 1280*x1], x1 over the 8x8 spatial grid
# at the call site) into in_out_ptr0, then a second pass divides by 64.0
# in place to produce the mean. Auto-generated TorchInductor C++ source;
# do not hand-edit the string body.
cpp_fused_mean_39 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
auto out_ptr0 = in_out_ptr0;
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1280L); x0+=static_cast<int64_t>(16L))
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(64L); x1+=static_cast<int64_t>(1L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0 + (1280L*x1)), 16);
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
tmp_acc0_vec.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1280L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = static_cast<float>(64.0);
auto tmp2 = at::vec::Vectorized<float>(tmp1);
auto tmp3 = tmp0 / tmp2;
tmp3.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
}
}
''')
# In-place bias add for the 1000-element classifier output:
# in_out_ptr0[i] += in_ptr0[i]. The first 992 elements are processed with
# 16-wide vector loads; the 992..1000 remainder uses 8-wide loads so the
# non-multiple-of-16 tail stays vectorized. Auto-generated TorchInductor
# C++ source; do not hand-edit the string body.
cpp_fused_addmm_40 = async_compile.cpp_pybinding(['float*', 'const float*'], '''
#include "/tmp/torchinductor_leslie/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(992L); x0+=static_cast<int64_t>(16L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 16);
auto tmp2 = tmp0 + tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x0));
}
for(int64_t x0=static_cast<int64_t>(992L); x0<static_cast<int64_t>(1000L); x0+=static_cast<int64_t>(8L))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<int64_t>(x0), 8);
auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), 8);
auto tmp2 = tmp0 + tmp1;
tmp2.store(in_out_ptr0 + static_cast<int64_t>(x0), 8);
}
}
}
''')
# Block until every cpp_pybinding kernel defined above has finished
# compiling (binding the callables into this module's globals), then drop
# the compiler handle — it is not needed once the kernels are resolved.
async_compile.wait(globals())
del async_compile
def call(args):
    """Run the compiled inference graph on a single (1, 3, 256, 256) input.

    ``args`` is a single-element list holding the input tensor; the list is
    cleared (ownership transferred) so intermediates can be freed eagerly.
    Returns a 1-tuple with the final (1, 1000) classifier output.

    Relies on the module-level ``_frozen_param*`` constants being populated
    before invocation. Statement order and the ``del``/``# reuse`` buffer
    aliasing are load-bearing (generated by TorchInductor's memory planner);
    do not reorder.
    """
    arg224_1, = args
    args.clear()
    assert_size_stride(arg224_1, (1, 3, 256, 256), (196608, 65536, 256, 1))
    # Repack the input into channels-last strides with a fused SiLU pass.
    buf0 = empty_strided_cpu((1, 3, 256, 256), (196608, 1, 768, 3), torch.float32)
    cpp_fused_silu_0(arg224_1, buf0)
    del arg224_1
    # Stem: conv+swish stages reducing spatial size 256 -> 128 -> 64.
    buf1 = torch.ops.mkldnn._convolution_pointwise.default(buf0, _frozen_param598, _frozen_param564, [1, 1], [2, 2], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf1, (1, 24, 128, 128), (393216, 1, 3072, 24))
    del buf0
    buf2 = torch.ops.mkldnn._convolution_pointwise.default(buf1, _frozen_param599, _frozen_param565, [1, 1], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf2, (1, 32, 128, 128), (524288, 1, 4096, 32))
    del buf1
    buf3 = torch.ops.mkldnn._convolution_pointwise.default(buf2, _frozen_param600, _frozen_param566, [1, 1], [2, 2], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf3, (1, 64, 64, 64), (262144, 1, 4096, 64))
    del buf2
    buf4 = torch.ops.mkldnn._convolution_pointwise.default(buf3, _frozen_param601, _frozen_param567, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf4, (1, 64, 64, 64), (262144, 1, 4096, 64))
    buf5 = torch.ops.mkldnn._convolution_pointwise.default(buf4, _frozen_param602, _frozen_param568, [1, 1], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf5, (1, 64, 64, 64), (262144, 1, 4096, 64))
    # Squeeze-excitation-style gating: global mean -> 1x1 conv+relu ->
    # 1x1 conv+sigmoid -> channel-wise multiply back into the activation.
    buf6 = empty_strided_cpu((1, 64, 1, 1), (64, 1, 64, 64), torch.float32)
    buf7 = reinterpret_tensor(buf6, (1, 64, 1, 1), (64, 1, 1, 1), 0); del buf6  # reuse
    buf8 = empty_strided_cpu((8, ), (1, ), torch.float32)
    cpp_fused_mean_1(buf7, buf5, buf8)
    buf9 = torch.ops.mkldnn._convolution_pointwise.default(buf7, _frozen_param603, buf8, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
    assert_size_stride(buf9, (1, 8, 1, 1), (8, 1, 8, 8))
    del buf7
    del buf8
    buf10 = torch.ops.mkldnn._convolution_pointwise.default(buf9, _frozen_param604, _frozen_param18, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
    assert_size_stride(buf10, (1, 64, 1, 1), (64, 1, 64, 64))
    buf11 = buf5; del buf5  # reuse
    cpp_fused_mul_2(buf11, buf10)
    # Expand to 256 channels and add the residual branch via a fused
    # conv+binary-add on buf3.
    buf12 = torch.ops.mkldnn._convolution_pointwise.default(buf11, _frozen_param605, _frozen_param569, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
    assert_size_stride(buf12, (1, 256, 64, 64), (1048576, 1, 16384, 256))
    del buf11
    buf13 = torch.ops.mkldnn._convolution_pointwise_.binary(buf12, buf3, _frozen_param606, _frozen_param570, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
    del buf3
    del buf4
    buf16 = empty_strided_cpu((1, 256, 64, 64), (1048576, 1, 16384, 256), torch.float32)
    cpp_fused_silu_3(buf12, buf16)
    # Second bottleneck block at 64x64 (conv+swish x2, SE gate, residual add).
    buf17 = torch.ops.mkldnn._convolution_pointwise.default(buf16, _frozen_param607, _frozen_param571, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf17, (1, 64, 64, 64), (262144, 1, 4096, 64))
    buf18 = torch.ops.mkldnn._convolution_pointwise.default(buf17, _frozen_param608, _frozen_param572, [1, 1], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf18, (1, 64, 64, 64), (262144, 1, 4096, 64))
    buf19 = buf10; del buf10  # reuse
    buf20 = reinterpret_tensor(buf19, (1, 64, 1, 1), (64, 1, 1, 1), 0); del buf19  # reuse
    buf21 = reinterpret_tensor(buf9, (8, ), (1, ), 0); del buf9  # reuse
    cpp_fused_mean_4(buf20, buf18, buf21)
    buf22 = torch.ops.mkldnn._convolution_pointwise.default(buf20, _frozen_param609, buf21, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
    assert_size_stride(buf22, (1, 8, 1, 1), (8, 1, 8, 8))
    del buf20
    del buf21
    buf23 = torch.ops.mkldnn._convolution_pointwise.default(buf22, _frozen_param610, _frozen_param34, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
    assert_size_stride(buf23, (1, 64, 1, 1), (64, 1, 64, 64))
    buf24 = buf18; del buf18  # reuse
    cpp_fused_mul_5(buf24, buf23)
    del buf23
    buf25 = torch.ops.mkldnn._convolution_pointwise_.binary(buf16, buf24, _frozen_param611, _frozen_param573, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
    buf28 = buf12; del buf12  # reuse
    cpp_fused_silu_6(buf16, buf28)
    del buf16
    # Downsample stage to 32x32 with a 128-channel bottleneck + SE gate.
    buf29 = torch.ops.mkldnn._convolution_pointwise.default(buf28, _frozen_param612, _frozen_param574, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf29, (1, 128, 64, 64), (524288, 1, 8192, 128))
    buf30 = torch.ops.mkldnn._convolution_pointwise.default(buf29, _frozen_param613, _frozen_param575, [1, 1], [2, 2], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf30, (1, 128, 32, 32), (131072, 1, 4096, 128))
    buf31 = empty_strided_cpu((1, 128, 1, 1), (128, 1, 128, 128), torch.float32)
    buf32 = reinterpret_tensor(buf31, (1, 128, 1, 1), (128, 1, 1, 1), 0); del buf31  # reuse
    buf33 = reinterpret_tensor(buf22, (8, ), (1, ), 0); del buf22  # reuse
    cpp_fused_mean_7(buf32, buf30, buf33)
    buf34 = torch.ops.mkldnn._convolution_pointwise.default(buf32, _frozen_param614, buf33, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
    assert_size_stride(buf34, (1, 8, 1, 1), (8, 1, 8, 8))
    del buf32
    del buf33
    buf35 = torch.ops.mkldnn._convolution_pointwise.default(buf34, _frozen_param615, _frozen_param47, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
    assert_size_stride(buf35, (1, 128, 1, 1), (128, 1, 128, 128))
    buf36 = buf30; del buf30  # reuse
    cpp_fused_mul_8(buf36, buf35)
    buf37 = torch.ops.mkldnn._convolution_pointwise.default(buf36, _frozen_param616, _frozen_param576, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
    assert_size_stride(buf37, (1, 512, 32, 32), (524288, 1, 16384, 512))
    del buf36
    # Strided residual shortcut (stride 2) fused with the add.
    buf38 = torch.ops.mkldnn._convolution_pointwise_.binary(buf37, buf28, _frozen_param617, _frozen_param577, [0, 0], [2, 2], [1, 1], 1, 'add', 1.0, None, [None], None)
    del buf28
    buf41 = reinterpret_tensor(buf29, (1, 512, 32, 32), (524288, 1, 16384, 512), 0); del buf29  # reuse
    cpp_fused_silu_9(buf37, buf41)
    buf42 = torch.ops.mkldnn._convolution_pointwise.default(buf41, _frozen_param618, _frozen_param578, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf42, (1, 128, 32, 32), (131072, 1, 4096, 128))
    buf43 = torch.ops.mkldnn._convolution_pointwise.default(buf42, _frozen_param619, _frozen_param579, [1, 1], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf43, (1, 128, 32, 32), (131072, 1, 4096, 128))
    buf44 = buf35; del buf35  # reuse
    buf45 = reinterpret_tensor(buf44, (1, 128, 1, 1), (128, 1, 1, 1), 0); del buf44  # reuse
    buf46 = reinterpret_tensor(buf34, (8, ), (1, ), 0); del buf34  # reuse
    cpp_fused_mean_10(buf45, buf43, buf46)
    buf47 = torch.ops.mkldnn._convolution_pointwise.default(buf45, _frozen_param620, buf46, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
    assert_size_stride(buf47, (1, 8, 1, 1), (8, 1, 8, 8))
    del buf45
    del buf46
    buf48 = torch.ops.mkldnn._convolution_pointwise.default(buf47, _frozen_param621, _frozen_param63, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
    assert_size_stride(buf48, (1, 128, 1, 1), (128, 1, 128, 128))
    del buf47
    buf49 = buf43; del buf43  # reuse
    cpp_fused_mul_11(buf49, buf48)
    del buf48
    buf50 = torch.ops.mkldnn._convolution_pointwise_.binary(buf41, buf49, _frozen_param622, _frozen_param580, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
    buf53 = buf37; del buf37  # reuse
    cpp_fused_silu_12(buf41, buf53)
    # First self-attention block at 32x32: buf55 packs q/k/v (3 x 128ch,
    # 4 heads of 32); bmm(q, k^T), relative-position terms via mkl linears,
    # softmax, bmm with v.
    buf54 = torch.ops.mkldnn._convolution_pointwise.default(buf53, _frozen_param623, _frozen_param581, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf54, (1, 128, 32, 32), (131072, 1, 4096, 128))
    buf55 = torch.ops.mkldnn._convolution_pointwise.default(buf54, _frozen_param624, None, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
    assert_size_stride(buf55, (1, 384, 32, 32), (393216, 1, 12288, 384))
    buf56 = empty_strided_cpu((4, 1024, 1024), (1048576, 1024, 1), torch.float32)
    # Topologically Sorted Source Nodes: [matmul], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf55, (4, 1024, 32), (32, 384, 1), 0), reinterpret_tensor(buf55, (4, 32, 1024), (32, 1, 384), 128), out=buf56)
    buf57 = reinterpret_tensor(buf49, (4, 32, 32, 32), (32768, 1024, 32, 1), 0); del buf49  # reuse
    cpp_fused_clone_13(buf55, buf57)
    buf58 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf57, (4096, 32), (32, 1), 0), _frozen_param628, _frozen_param627, None, 4096)
    buf59 = buf57; del buf57  # reuse
    cpp_fused_clone_14(buf55, buf59)
    buf60 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf59, (4096, 32), (32, 1), 0), _frozen_param626, _frozen_param625, None, 4096)
    buf61 = empty_strided_cpu((4, 1024, 1), (1024, 1, 4096), torch.float32)
    buf62 = empty_strided_cpu((4, 1024, 1024), (1048576, 1024, 1), torch.float32)
    buf63 = empty_strided_cpu((4, 1024, 1), (1024, 1, 4096), torch.float32)
    buf64 = empty_strided_cpu((4, 1024, 1024), (1048576, 1024, 1), torch.float32)
    cpp_fused__softmax_add_mul_15(buf56, buf58, buf60, buf61, buf62, buf63, buf64)
    del buf56
    del buf58
    del buf60
    del buf61
    del buf62
    del buf63
    buf65 = reinterpret_tensor(buf59, (4, 1024, 32), (32768, 32, 1), 0); del buf59  # reuse
    # Topologically Sorted Source Nodes: [attn_1, matmul_3], Original ATen: [aten._softmax, aten.bmm]
    extern_kernels.bmm(buf64, reinterpret_tensor(buf55, (4, 1024, 32), (32, 384, 1), 256), out=buf65)
    del buf55
    del buf64
    buf66 = buf42; del buf42  # reuse
    buf67 = buf66; del buf66  # reuse
    cpp_fused__native_batch_norm_legit_no_training_silu_16(buf67, buf65, _frozen_param305, _frozen_param306, _frozen_param307, _frozen_param308)
    del buf65
    buf68 = torch.ops.mkldnn._convolution_pointwise_.binary(buf53, buf67, _frozen_param629, _frozen_param582, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
    del buf54
    buf71 = buf41; del buf41  # reuse
    cpp_fused_silu_17(buf53, buf71)
    del buf53
    # Downsample stage to 16x16 with a 256-channel bottleneck + SE gate.
    buf72 = torch.ops.mkldnn._convolution_pointwise.default(buf71, _frozen_param630, _frozen_param583, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf72, (1, 256, 32, 32), (262144, 1, 8192, 256))
    buf73 = torch.ops.mkldnn._convolution_pointwise.default(buf72, _frozen_param631, _frozen_param584, [1, 1], [2, 2], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf73, (1, 256, 16, 16), (65536, 1, 4096, 256))
    buf74 = empty_strided_cpu((1, 256, 1, 1), (256, 1, 256, 256), torch.float32)
    buf75 = reinterpret_tensor(buf74, (1, 256, 1, 1), (256, 1, 1, 1), 0); del buf74  # reuse
    cpp_fused_mean_18(buf75, buf73)
    buf76 = torch.ops.mkldnn._convolution_pointwise.default(buf75, _frozen_param632, _frozen_param85, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
    assert_size_stride(buf76, (1, 16, 1, 1), (16, 1, 16, 16))
    del buf75
    buf77 = torch.ops.mkldnn._convolution_pointwise.default(buf76, _frozen_param633, _frozen_param87, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
    assert_size_stride(buf77, (1, 256, 1, 1), (256, 1, 256, 256))
    del buf76
    buf78 = buf73; del buf73  # reuse
    cpp_fused_mul_19(buf78, buf77)
    buf79 = torch.ops.mkldnn._convolution_pointwise.default(buf78, _frozen_param634, _frozen_param585, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
    assert_size_stride(buf79, (1, 1024, 16, 16), (262144, 1, 16384, 1024))
    del buf78
    buf80 = torch.ops.mkldnn._convolution_pointwise_.binary(buf79, buf71, _frozen_param635, _frozen_param586, [0, 0], [2, 2], [1, 1], 1, 'add', 1.0, None, [None], None)
    del buf71
    buf83 = reinterpret_tensor(buf72, (1, 1024, 16, 16), (262144, 1, 16384, 1024), 0); del buf72  # reuse
    cpp_fused_silu_20(buf79, buf83)
    buf84 = torch.ops.mkldnn._convolution_pointwise.default(buf83, _frozen_param636, _frozen_param587, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf84, (1, 256, 16, 16), (65536, 1, 4096, 256))
    buf85 = torch.ops.mkldnn._convolution_pointwise.default(buf84, _frozen_param637, _frozen_param588, [1, 1], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf85, (1, 256, 16, 16), (65536, 1, 4096, 256))
    buf86 = buf77; del buf77  # reuse
    buf87 = reinterpret_tensor(buf86, (1, 256, 1, 1), (256, 1, 1, 1), 0); del buf86  # reuse
    cpp_fused_mean_21(buf87, buf85)
    buf88 = torch.ops.mkldnn._convolution_pointwise.default(buf87, _frozen_param638, _frozen_param101, [0, 0], [1, 1], [1, 1], 1, 'relu', [None], '')
    assert_size_stride(buf88, (1, 16, 1, 1), (16, 1, 16, 16))
    buf89 = torch.ops.mkldnn._convolution_pointwise.default(buf88, _frozen_param639, _frozen_param103, [0, 0], [1, 1], [1, 1], 1, 'sigmoid', [None], '')
    assert_size_stride(buf89, (1, 256, 1, 1), (256, 1, 256, 256))
    del buf88
    buf90 = buf85; del buf85  # reuse
    cpp_fused_mul_22(buf90, buf89)
    buf91 = torch.ops.mkldnn._convolution_pointwise_.binary(buf83, buf90, _frozen_param640, _frozen_param589, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
    buf94 = buf79; del buf79  # reuse
    cpp_fused_silu_23(buf83, buf94)
    # Second self-attention block at 16x16 (4 heads of 64 channels, q/k/v
    # packed into the 768-channel buf96).
    buf95 = torch.ops.mkldnn._convolution_pointwise.default(buf94, _frozen_param641, _frozen_param590, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf95, (1, 256, 16, 16), (65536, 1, 4096, 256))
    buf96 = torch.ops.mkldnn._convolution_pointwise.default(buf95, _frozen_param642, None, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
    assert_size_stride(buf96, (1, 768, 16, 16), (196608, 1, 12288, 768))
    buf97 = reinterpret_tensor(buf83, (4, 256, 256), (65536, 256, 1), 0); del buf83  # reuse
    # Topologically Sorted Source Nodes: [matmul_4], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf96, (4, 256, 64), (64, 768, 1), 0), reinterpret_tensor(buf96, (4, 64, 256), (64, 1, 768), 256), out=buf97)
    buf98 = reinterpret_tensor(buf90, (4, 16, 16, 64), (16384, 1024, 64, 1), 0); del buf90  # reuse
    cpp_fused_clone_24(buf96, buf98)
    buf99 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf98, (1024, 64), (64, 1), 0), _frozen_param646, _frozen_param645, None, 1024)
    buf100 = buf98; del buf98  # reuse
    cpp_fused_clone_25(buf96, buf100)
    buf101 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf100, (1024, 64), (64, 1), 0), _frozen_param644, _frozen_param643, None, 1024)
    buf102 = empty_strided_cpu((4, 256, 1), (256, 1, 1024), torch.float32)
    buf103 = reinterpret_tensor(buf24, (4, 256, 256), (65536, 256, 1), 0); del buf24  # reuse
    buf104 = empty_strided_cpu((4, 256, 1), (256, 1, 1024), torch.float32)
    buf105 = reinterpret_tensor(buf17, (4, 256, 256), (65536, 256, 1), 0); del buf17  # reuse
    cpp_fused__softmax_add_mul_26(buf97, buf99, buf101, buf102, buf103, buf104, buf105)
    del buf101
    del buf99
    buf106 = reinterpret_tensor(buf100, (4, 256, 64), (16384, 64, 1), 0); del buf100  # reuse
    # Topologically Sorted Source Nodes: [attn_3, matmul_7], Original ATen: [aten._softmax, aten.bmm]
    extern_kernels.bmm(buf105, reinterpret_tensor(buf96, (4, 256, 64), (64, 768, 1), 512), out=buf106)
    del buf96
    buf107 = buf84; del buf84  # reuse
    buf108 = buf107; del buf107  # reuse
    cpp_fused__native_batch_norm_legit_no_training_silu_27(buf108, buf106, _frozen_param349, _frozen_param350, _frozen_param351, _frozen_param352)
    del buf106
    buf109 = torch.ops.mkldnn._convolution_pointwise_.binary(buf94, buf108, _frozen_param647, _frozen_param591, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
    del buf108
    del buf95
    buf112 = reinterpret_tensor(buf105, (1, 1024, 16, 16), (262144, 1, 16384, 1024), 0); del buf105  # reuse
    cpp_fused_silu_28(buf94, buf112)
    # Third self-attention block (4 heads of 128 channels, 1536-channel
    # q/k/v pack), followed by a fused BN + avg-pool + SiLU to 8x8.
    buf113 = torch.ops.mkldnn._convolution_pointwise.default(buf112, _frozen_param648, _frozen_param592, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf113, (1, 512, 16, 16), (131072, 1, 8192, 512))
    buf114 = torch.ops.mkldnn._convolution_pointwise.default(buf113, _frozen_param649, None, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
    assert_size_stride(buf114, (1, 1536, 16, 16), (393216, 1, 24576, 1536))
    buf115 = reinterpret_tensor(buf94, (4, 256, 256), (65536, 256, 1), 0); del buf94  # reuse
    # Topologically Sorted Source Nodes: [matmul_8], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf114, (4, 256, 128), (128, 1536, 1), 0), reinterpret_tensor(buf114, (4, 128, 256), (128, 1, 1536), 512), out=buf115)
    buf116 = reinterpret_tensor(buf67, (4, 16, 16, 128), (32768, 2048, 128, 1), 0); del buf67  # reuse
    cpp_fused_clone_29(buf114, buf116)
    buf117 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf116, (1024, 128), (128, 1), 0), _frozen_param653, _frozen_param652, None, 1024)
    buf118 = buf116; del buf116  # reuse
    cpp_fused_clone_30(buf114, buf118)
    buf119 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf118, (1024, 128), (128, 1), 0), _frozen_param651, _frozen_param650, None, 1024)
    buf120 = buf104; del buf104  # reuse
    buf121 = buf97; del buf97  # reuse
    buf122 = buf102; del buf102  # reuse
    buf123 = buf103; del buf103  # reuse
    cpp_fused__softmax_add_mul_31(buf115, buf117, buf119, buf120, buf121, buf122, buf123)
    del buf115
    del buf117
    del buf119
    del buf120
    del buf121
    del buf122
    buf124 = reinterpret_tensor(buf118, (4, 256, 128), (32768, 128, 1), 0); del buf118  # reuse
    # Topologically Sorted Source Nodes: [attn_5, matmul_11], Original ATen: [aten._softmax, aten.bmm]
    extern_kernels.bmm(buf123, reinterpret_tensor(buf114, (4, 256, 128), (128, 1536, 1), 1024), out=buf124)
    del buf114
    del buf123
    buf125 = empty_strided_cpu((1, 512, 8, 8), (32768, 1, 4096, 512), torch.float32)
    buf126 = buf125; del buf125  # reuse
    cpp_fused__native_batch_norm_legit_no_training_avg_pool2d_silu_32(buf126, buf124, _frozen_param363, _frozen_param364, _frozen_param365, _frozen_param366)
    del buf124
    buf127 = torch.ops.mkldnn._convolution_pointwise.default(buf126, _frozen_param654, _frozen_param593, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
    assert_size_stride(buf127, (1, 1536, 8, 8), (98304, 1, 12288, 1536))
    buf128 = torch.ops.mkldnn._convolution_pointwise_.binary(buf127, buf112, _frozen_param655, _frozen_param594, [0, 0], [2, 2], [1, 1], 1, 'add', 1.0, None, [None], None)
    del buf112
    del buf113
    buf131 = empty_strided_cpu((1, 1536, 8, 8), (98304, 1, 12288, 1536), torch.float32)
    cpp_fused_silu_33(buf127, buf131)
    del buf127
    # Final self-attention block at 8x8 (4 heads of 128 channels).
    buf132 = torch.ops.mkldnn._convolution_pointwise.default(buf131, _frozen_param656, _frozen_param595, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf132, (1, 512, 8, 8), (32768, 1, 4096, 512))
    buf133 = torch.ops.mkldnn._convolution_pointwise.default(buf132, _frozen_param657, None, [0, 0], [1, 1], [1, 1], 1, 'none', [None], '')
    assert_size_stride(buf133, (1, 1536, 8, 8), (98304, 1, 12288, 1536))
    buf134 = empty_strided_cpu((4, 64, 64), (4096, 64, 1), torch.float32)
    # Topologically Sorted Source Nodes: [matmul_12], Original ATen: [aten.bmm]
    extern_kernels.bmm(reinterpret_tensor(buf133, (4, 64, 128), (128, 1536, 1), 0), reinterpret_tensor(buf133, (4, 128, 64), (128, 1, 1536), 512), out=buf134)
    buf135 = reinterpret_tensor(buf126, (4, 8, 8, 128), (8192, 1024, 128, 1), 0); del buf126  # reuse
    cpp_fused_clone_34(buf133, buf135)
    buf136 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf135, (256, 128), (128, 1), 0), _frozen_param661, _frozen_param660, None, 256)
    buf137 = buf135; del buf135  # reuse
    cpp_fused_clone_35(buf133, buf137)
    buf138 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf137, (256, 128), (128, 1), 0), _frozen_param659, _frozen_param658, None, 256)
    buf139 = reinterpret_tensor(buf89, (4, 64, 1), (64, 1, 256), 0); del buf89  # reuse
    buf140 = empty_strided_cpu((4, 64, 64), (4096, 64, 1), torch.float32)
    buf141 = reinterpret_tensor(buf87, (4, 64, 1), (64, 1, 256), 0); del buf87  # reuse
    buf142 = empty_strided_cpu((4, 64, 64), (4096, 64, 1), torch.float32)
    cpp_fused__softmax_add_mul_36(buf134, buf136, buf138, buf139, buf140, buf141, buf142)
    del buf134
    del buf136
    del buf138
    del buf139
    del buf140
    del buf141
    buf143 = reinterpret_tensor(buf137, (4, 64, 128), (8192, 128, 1), 0); del buf137  # reuse
    # Topologically Sorted Source Nodes: [attn_7, matmul_15], Original ATen: [aten._softmax, aten.bmm]
    extern_kernels.bmm(buf142, reinterpret_tensor(buf133, (4, 64, 128), (128, 1536, 1), 1024), out=buf143)
    del buf142
    buf144 = empty_strided_cpu((1, 512, 8, 8), (32768, 1, 4096, 512), torch.float32)
    buf145 = buf144; del buf144  # reuse
    cpp_fused__native_batch_norm_legit_no_training_silu_37(buf145, buf143, _frozen_param381, _frozen_param382, _frozen_param383, _frozen_param384)
    del buf143
    buf146 = torch.ops.mkldnn._convolution_pointwise_.binary(buf131, buf145, _frozen_param662, _frozen_param596, [0, 0], [1, 1], [1, 1], 1, 'add', 1.0, None, [None], None)
    del buf132
    del buf145
    buf149 = buf133; del buf133  # reuse
    cpp_fused_silu_38(buf131, buf149)
    del buf131
    # Head: 1x1 conv+swish to 1280 channels, global average pool, final
    # linear, then fused bias add (cpp_fused_addmm_40) into the logits.
    buf150 = torch.ops.mkldnn._convolution_pointwise.default(buf149, _frozen_param663, _frozen_param597, [0, 0], [1, 1], [1, 1], 1, 'swish', [None], '')
    assert_size_stride(buf150, (1, 1280, 8, 8), (81920, 1, 10240, 1280))
    del buf149
    buf151 = empty_strided_cpu((1, 1280, 1, 1), (1280, 1, 1280, 1280), torch.float32)
    buf152 = buf151; del buf151  # reuse
    cpp_fused_mean_39(buf152, buf150)
    del buf150
    buf153 = torch.ops.mkl._mkl_linear.default(reinterpret_tensor(buf152, (1, 1280), (0, 1), 0), _frozen_param665, _frozen_param664, None, 1)
    del buf152
    buf154 = buf153; del buf153  # reuse
    cpp_fused_addmm_40(buf154, _frozen_param147)
    return (buf154, )
def benchmark_compiled_module(times=10, repeat=10):
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
global _frozen_param18
_frozen_param18 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param34
_frozen_param34 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param47
_frozen_param47 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param63
_frozen_param63 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param85
_frozen_param85 = rand_strided((16, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param87
_frozen_param87 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param101
_frozen_param101 = rand_strided((16, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param103
_frozen_param103 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param147
_frozen_param147 = rand_strided((1000, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param564
_frozen_param564 = rand_strided((24, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param598
_frozen_param598 = rand_strided((24, 3, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param565
_frozen_param565 = rand_strided((32, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param599
_frozen_param599 = rand_strided((32, 24, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param566
_frozen_param566 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param600
_frozen_param600 = rand_strided((64, 32, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param567
_frozen_param567 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param601
_frozen_param601 = rand_strided((64, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param568
_frozen_param568 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param602
_frozen_param602 = rand_strided((64, 64, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param603
_frozen_param603 = rand_strided((8, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param604
_frozen_param604 = rand_strided((64, 8, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param569
_frozen_param569 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param605
_frozen_param605 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param570
_frozen_param570 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param606
_frozen_param606 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param571
_frozen_param571 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param607
_frozen_param607 = rand_strided((64, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param572
_frozen_param572 = rand_strided((64, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param608
_frozen_param608 = rand_strided((64, 64, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param609
_frozen_param609 = rand_strided((8, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param610
_frozen_param610 = rand_strided((64, 8, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param573
_frozen_param573 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param611
_frozen_param611 = rand_strided((256, 64, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param574
_frozen_param574 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param612
_frozen_param612 = rand_strided((128, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param575
_frozen_param575 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param613
_frozen_param613 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param614
_frozen_param614 = rand_strided((8, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param615
_frozen_param615 = rand_strided((128, 8, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param576
_frozen_param576 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param616
_frozen_param616 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param577
_frozen_param577 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param617
_frozen_param617 = rand_strided((512, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param578
_frozen_param578 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param618
_frozen_param618 = rand_strided((128, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param579
_frozen_param579 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param619
_frozen_param619 = rand_strided((128, 128, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param620
_frozen_param620 = rand_strided((8, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param621
_frozen_param621 = rand_strided((128, 8, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param580
_frozen_param580 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param622
_frozen_param622 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param581
_frozen_param581 = rand_strided((128, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param623
_frozen_param623 = rand_strided((128, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param624
_frozen_param624 = rand_strided((384, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param625
_frozen_param625 = rand_strided((63, 32), (32, 1), device='cpu', dtype=torch.float32)
global _frozen_param626
_frozen_param626 = rand_strided((1982689, 1), (1, 0), device='cpu', dtype=torch.float32)
global _frozen_param627
_frozen_param627 = rand_strided((63, 32), (32, 1), device='cpu', dtype=torch.float32)
global _frozen_param628
_frozen_param628 = rand_strided((1982689, 1), (1, 0), device='cpu', dtype=torch.float32)
global _frozen_param305
_frozen_param305 = rand_strided((128, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param306
_frozen_param306 = rand_strided((128, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param307
_frozen_param307 = rand_strided((128, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param308
_frozen_param308 = rand_strided((128, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param582
_frozen_param582 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param629
_frozen_param629 = rand_strided((512, 128, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param583
_frozen_param583 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param630
_frozen_param630 = rand_strided((256, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param584
_frozen_param584 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param631
_frozen_param631 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param632
_frozen_param632 = rand_strided((16, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param633
_frozen_param633 = rand_strided((256, 16, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param585
_frozen_param585 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param634
_frozen_param634 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param586
_frozen_param586 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param635
_frozen_param635 = rand_strided((1024, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param587
_frozen_param587 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param636
_frozen_param636 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param588
_frozen_param588 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param637
_frozen_param637 = rand_strided((256, 256, 3, 3), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param638
_frozen_param638 = rand_strided((16, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param639
_frozen_param639 = rand_strided((256, 16, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param589
_frozen_param589 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param640
_frozen_param640 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param590
_frozen_param590 = rand_strided((256, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param641
_frozen_param641 = rand_strided((256, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param642
_frozen_param642 = rand_strided((768, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param643
_frozen_param643 = rand_strided((31, 64), (64, 1), device='cpu', dtype=torch.float32)
global _frozen_param644
_frozen_param644 = rand_strided((1982689, 1), (1, 0), device='cpu', dtype=torch.float32)
global _frozen_param645
_frozen_param645 = rand_strided((31, 64), (64, 1), device='cpu', dtype=torch.float32)
global _frozen_param646
_frozen_param646 = rand_strided((1982689, 1), (1, 0), device='cpu', dtype=torch.float32)
global _frozen_param349
_frozen_param349 = rand_strided((256, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param350
_frozen_param350 = rand_strided((256, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param351
_frozen_param351 = rand_strided((256, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param352
_frozen_param352 = rand_strided((256, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param591
_frozen_param591 = rand_strided((1024, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param647
_frozen_param647 = rand_strided((1024, 256, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param592
_frozen_param592 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param648
_frozen_param648 = rand_strided((512, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param649
_frozen_param649 = rand_strided((1536, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param650
_frozen_param650 = rand_strided((31, 128), (128, 1), device='cpu', dtype=torch.float32)
global _frozen_param651
_frozen_param651 = rand_strided((1982689, 1), (1, 0), device='cpu', dtype=torch.float32)
global _frozen_param652
_frozen_param652 = rand_strided((31, 128), (128, 1), device='cpu', dtype=torch.float32)
global _frozen_param653
_frozen_param653 = rand_strided((1982689, 1), (1, 0), device='cpu', dtype=torch.float32)
global _frozen_param363
_frozen_param363 = rand_strided((512, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param364
_frozen_param364 = rand_strided((512, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param365
_frozen_param365 = rand_strided((512, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param366
_frozen_param366 = rand_strided((512, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param593
_frozen_param593 = rand_strided((1536, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param654
_frozen_param654 = rand_strided((1536, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param594
_frozen_param594 = rand_strided((1536, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param655
_frozen_param655 = rand_strided((1536, 1024, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param595
_frozen_param595 = rand_strided((512, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param656
_frozen_param656 = rand_strided((512, 1536, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param657
_frozen_param657 = rand_strided((1536, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param658
_frozen_param658 = rand_strided((15, 128), (128, 1), device='cpu', dtype=torch.float32)
global _frozen_param659
_frozen_param659 = rand_strided((1982689, 1), (1, 0), device='cpu', dtype=torch.float32)
global _frozen_param660
_frozen_param660 = rand_strided((15, 128), (128, 1), device='cpu', dtype=torch.float32)
global _frozen_param661
_frozen_param661 = rand_strided((1982689, 1), (1, 0), device='cpu', dtype=torch.float32)
global _frozen_param381
_frozen_param381 = rand_strided((512, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param382
_frozen_param382 = rand_strided((512, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param383
_frozen_param383 = rand_strided((512, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param384
_frozen_param384 = rand_strided((512, 1, 1), (1, 1, 1), device='cpu', dtype=torch.float32)
global _frozen_param596
_frozen_param596 = rand_strided((1536, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param662
_frozen_param662 = rand_strided((1536, 512, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param597
_frozen_param597 = rand_strided((1280, ), (1, ), device='cpu', dtype=torch.float32)
global _frozen_param663
_frozen_param663 = rand_strided((1280, 1536, 1, 1), (1, 0, 0, 0), device='cpu', dtype=torch.float32)
global _frozen_param664
_frozen_param664 = rand_strided((1000, 1280), (1280, 1), device='cpu', dtype=torch.float32)
global _frozen_param665
_frozen_param665 = rand_strided((3490017, 1), (1, 0), device='cpu', dtype=torch.float32)
arg224_1 = rand_strided((1, 3, 256, 256), (196608, 65536, 256, 1), device='cpu', dtype=torch.float32)
fn = lambda: call([arg224_1])
return print_performance(fn, times=times, repeat=repeat)
if __name__ == "__main__":
from torch._inductor.wrapper_benchmark import compiled_module_main
compiled_module_main('sebotnet33ts_256', benchmark_compiled_module)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment