# AOT ID: ['0_inference']
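# TorchInductor-generated AOT wrapper for an inference graph: a fused C++
# index_put kernel plus the Python glue that validates inputs and invokes it.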
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
alloc_from_pool = torch.ops.inductor._alloc_from_pool
async_compile = AsyncCompile()
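# Compile the fused C++ kernel below off the main thread; the
# async_compile.wait() call further down blocks until compilation finishes.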
cpp_fused_index_put_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'float*'], '''
#include "/tmp/torchinductor_leslie/2r/c2rnilspx43ivnzu4uieul65kx65dfhfbptbh5og4wk6rqebuxoo.h"
extern "C" void kernel(const int64_t* in_ptr0,
const float* in_ptr1,
float* out_ptr0)
{
{
#pragma GCC ivdep
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2L); x0+=static_cast<int64_t>(1L))
{
for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(32L); x1+=static_cast<int64_t>(16L))
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x0)];
auto tmp1 = static_cast<int64_t>(10);
auto tmp2 = decltype(tmp0)(tmp0 + tmp1);
auto tmp3 = 2L;
auto tmp4 = c10::convert<int64_t>(tmp3);
auto tmp5 = decltype(tmp2)(tmp2 + tmp4);
auto tmp6 = tmp2 < 0;
auto tmp7 = tmp6 ? tmp5 : tmp2;
auto tmp8 = tmp7;
auto tmp9 = c10::convert<int64_t>(tmp8);
AOTI_TORCH_CHECK((0 <= tmp9) & (tmp9 < 2L), "index out of bounds: 0 <= tmp9 < 2L");
auto tmp11 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<int64_t>(x1 + (32L*tmp7)), static_cast<int64_t>(16));
auto tmp12 = tmp11 + tmp11;
tmp12.store(out_ptr0 + static_cast<int64_t>(x1 + (32L*tmp7)));
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
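# Entry point for the compiled module: args is [index tensor, float tensor].
# arg1_1 is passed as both input and output buffer, so the module returns an
# empty tuple and communicates its result by mutating arg1_1 in place.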
def call(args):
    arg0_1, arg1_1 = args
    args.clear()
    assert_size_stride(arg0_1, (1, 2), (2, 1))
    assert_size_stride(arg1_1, (2, 32), (32, 1))
    # arg1_1 serves as both in_ptr1 and out_ptr0: the fused kernel writes into it.
    cpp_fused_index_put_0(arg0_1, arg1_1, arg1_1)
    del arg0_1
    del arg1_1
    return ()
def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    arg0_1 = rand_strided((1, 2), (2, 1), device='cpu', dtype=torch.int64)
    arg1_1 = rand_strided((2, 32), (32, 1), device='cpu', dtype=torch.float32)
    fn = lambda: call([arg0_1, arg1_1])
    return print_performance(fn, times=times, repeat=repeat)
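# For reference, a minimal eager-mode sketch (a hypothetical reconstruction,
# not part of the generated output) of a function that could lower to the
# fused kernel above: the index is offset by 10, negative results wrap around
# dim 0 (size 2), and the selected rows of x are doubled in place.
def _eager_reference_sketch(index, x):
    # hypothetical: equivalent to x[idx] = x[idx] + x[idx] for idx = index + 10
    x[index + 10] += x[index + 10]
    return x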
if __name__ == "__main__":
    from torch._inductor.wrapper_benchmark import compiled_module_main
    compiled_module_main('None', benchmark_compiled_module)