davidberard98 · May 9, 2025 21:16
diff --git a/test_max_autotune.log b/test_max_autotune.log
 AUTOTUNE addmm(4096x16, 4096x3, 3x16)
  triton_mm_4 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_5 0.0035 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_2 0.0035 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=2
  triton_mm_3 0.0035 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_6 0.0035 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_8 0.0035 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_10 0.0035 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_9 0.0036 ms 93.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8
  triton_mm_1 0.0036 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=2
  triton_mm_0 0.0038 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.2860 seconds and 0.1056 seconds precompiling for 11 choices
 frames [('total', 1), ('ok', 1)]
 inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 inductor [('triton_bundler_save_kernel', 84), ('async_compile_cache_miss', 13), ('select_algorithm_num_precompiles', 11), ('benchmarking.InductorBenchmarker.benchmark_gpu', 11), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_4096_16_3', 1)]
 .sAUTOTUNE baddbmm(64x2048x192, 64x2048x64, 64x64x192)
  triton_bmm_25 0.0939 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_bmm_29 0.0939 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_bmm_18 0.0939 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_bmm_24 0.0939 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_27 0.0939 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_20 0.0942 ms 99.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_17 0.0942 ms 99.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_19 0.0942 ms 99.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_bmm_22 0.0942 ms 99.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_23 0.0942 ms 99.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.6409 seconds and 0.3905 seconds precompiling for 19 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 140), ('async_compile_cache_miss', 21), ('select_algorithm_num_precompiles', 19), ('benchmarking.InductorBenchmarker.benchmark_gpu', 19), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.baddbmm_2048_192_64', 1)]
 .Process SpawnProcess-1:
 Traceback (most recent call last):
  File "/root/miniconda3/envs/triton/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/root/miniconda3/envs/triton/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 64, in benchmark_choice
    result = choice.benchmark(*args, out=out)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 73, in benchmark
    raise RuntimeError("This choice caller will always throw")
 RuntimeError: This choice caller will always throw
 .timings is tensor([0.0077, 0.0077, 0.0077]), out tensor([[ 5.8251, -2.6431],
        [ 4.6481,  1.1723]], device='cuda:0'), expected_out None
 .AUTOTUNE addmm(4x4, 4x4, 4x4)
  triton_mm_30 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1
  triton_mm_31 0.0035 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1
  triton_mm_32 0.0035 ms 98.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1
  triton_mm_34 0.0036 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1
  triton_mm_33 0.0036 ms 93.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1
 SingleProcess AUTOTUNE benchmarking takes 0.1741 seconds and 0.0496 seconds precompiling for 5 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 9), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_4_4_4', 2)]
 .AUTOTUNE mm(32x32, 32x32)
  triton_mm_41 0.0033 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_44 0.0034 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_45 0.0034 ms 96.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_43 0.0036 ms 90.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_42 0.0037 ms 88.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_40 0.0038 ms 86.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
  mm 0.0061 ms 53.9% 
 SingleProcess AUTOTUNE benchmarking takes 0.2061 seconds and 0.0794 seconds precompiling for 7 choices
 frames [('total', 3), ('ok', 3)]
 stats [('calls_captured', 11), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)]
 inductor [('triton_bundler_save_kernel', 168), ('benchmarking.InductorBenchmarker.benchmark_gpu', 33), ('async_compile_cache_miss', 13), ('select_algorithm_num_precompiles', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 3), ('extern_calls', 2), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_32_32_32', 3)]
 .AUTOTUNE mm(32x32, 32x32)
  triton_mm_62 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_59 0.0034 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_60 0.0035 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_63 0.0035 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_58 0.0038 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_61 0.0038 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.1909 seconds and 0.0791 seconds precompiling for 6 choices
 frames [('total', 3), ('ok', 3)]
 stats [('calls_captured', 11), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)]
 inductor [('triton_bundler_save_kernel', 154), ('benchmarking.InductorBenchmarker.benchmark_gpu', 24), ('async_compile_cache_miss', 17), ('select_algorithm_num_precompiles', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 3), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_32_32_32', 3)]
 .AUTOTUNE mm(168084x3, 3x64)
  triton_mm_80 0.0603 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_81 0.0609 ms 98.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_82 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_84 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_85 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_86 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_87 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_88 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_89 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8
  triton_mm_90 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.4810 seconds and 0.1080 seconds precompiling for 16 choices
 AUTOTUNE mm(159856x3, 3x64)
  triton_mm_102 0.0573 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_95 0.0588 ms 97.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_104 0.0589 ms 97.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8
  triton_mm_101 0.0589 ms 97.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_105 0.0590 ms 97.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_93 0.0591 ms 97.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_94 0.0591 ms 97.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_99 0.0591 ms 97.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_100 0.0591 ms 97.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_103 0.0591 ms 97.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.4557 seconds and 0.1030 seconds precompiling for 16 choices
 frames [('total', 8), ('ok', 8)]
 unimplemented []
 graph_break [("Dynamic shape operator\n  Explanation: Operator `aten.nonzero.default`'s output shape depends on input Tensor data.\n  Hint: Enable tracing of dynamic shape operators with `torch._dynamo.config.capture_dynamic_output_shape_ops = True`\n\n  Developer debug context: aten.nonzero.default\n", 3)]
 inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('ok', 3), ('autograd_cache_saved', 1)]
 inductor [('triton_bundler_save_kernel', 280), ('benchmarking.InductorBenchmarker.benchmark_gpu', 40), ('async_compile_cache_miss', 36), ('select_algorithm_num_precompiles', 30), ('fxgraph_cache_miss', 3), ('select_algorithm_precompile', 2), ('benchmarking.InductorBenchmarker.benchmark', 2), ('select_algorithm_autotune', 2), ('extern_calls', 2)]
 aten_mm_info [('aten.mm_168084_64_3', 1), ('aten.mm_4*s0*s1_64_3', 1)]
 .frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info []
 .frames [('total', 1)]
 inline_call []
 stats [('calls_captured', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 .AUTOTUNE convolution(32x3x64x64, 64x3x3x3)
  convolution 0.0563 ms 100.0% 
  triton_convolution2d_106 0.0922 ms 61.1% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=2, num_warps=4
  triton_convolution2d_110 0.0939 ms 60.0% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=2, num_warps=8
  triton_convolution2d_111 0.1120 ms 50.3% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=256, BLOCK_N=64, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=2, num_warps=8
  triton_convolution2d_109 0.1203 ms 46.8% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=2, num_warps=8
  triton_convolution2d_108 0.1393 ms 40.4% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=1024, BLOCK_N=16, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=1, num_warps=8
  triton_convolution2d_107 0.2045 ms 27.5% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=256, BLOCK_N=64, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=2, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.2998 seconds and 0.3119 seconds precompiling for 7 choices
 frames [('total', 1), ('ok', 1)]
 inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 147), ('benchmarking.InductorBenchmarker.benchmark_gpu', 49), ('async_compile_cache_miss', 9), ('select_algorithm_num_precompiles', 6), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info []
 .frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('async_compile_cache_miss', 6), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 .E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0):
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_H : tl.constexpr = 3
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_W : tl.constexpr = 3
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_H : tl.constexpr = 1
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_W : tl.constexpr = 1
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_H : tl.constexpr = 0
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_W : tl.constexpr = 0
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     GROUPS : tl.constexpr = 1
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     UNROLL : tl.constexpr = False
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     ALLOW_TF32 : tl.constexpr = True
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_M : tl.constexpr = 16
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_N : tl.constexpr = 16
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K : tl.constexpr = 16
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     X = arg_X
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     W = arg_W
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     # Tensor dimensions
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     BATCH = 0
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     IN_C = 256
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     IN_H = 14
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     IN_W = 14
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_C = 256
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_H = 12
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_W = 12
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     # Strides:
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xn = 50176
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xc = 196
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xh = 14
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xw = 1
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_out = 2304
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_in = 1
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wh = 768
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     stride_ww = 256
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_w = nhw % OUT_W
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     nh = nhw // OUT_W
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_h = nh % OUT_H
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = nh // OUT_H
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     group = 0
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_IN_C = IN_C
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_OUT_C = OUT_C
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     w_base = (
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     # Could be simplified, but slightly slower:
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     # for i in range(KERNEL_H):
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     #     for j in range(KERNEL_W):
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     #         for k in range(0, GROUP_IN_C, BLOCK_K):
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         k = (ijk % BLOCK_K_COUNT) * BLOCK_K
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         ij = ijk // BLOCK_K_COUNT
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         i = ij // KERNEL_W
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         j = ij % KERNEL_W
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_c = tl.arange(0, BLOCK_K) + k
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         x_ptrs = x_base + (
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_h * stride_xh)[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_w * stride_xw)[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_c * stride_xc)[None, :]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         mask_x = (
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_n < BATCH)[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h >= 0)[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h < IN_H)[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w >= 0)[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w < IN_W)[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_c < GROUP_IN_C)[None, :]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         w_ptrs = w_base + (
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     mask = (
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         (idx_n < BATCH)[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_h < OUT_H)[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_w < OUT_W)[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_c < GROUP_OUT_C)[None, :]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = idx_n[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     idx_h = idx_y_h[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     idx_w = idx_y_w[:, None]
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     # inductor generates a suffix
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 1, 'num_stages': 1, 'debug': True, 'cc': 120}
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last):
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     binary = triton.compile(*compile_args, **compile_kwargs)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     return CompiledKernel(src, metadata_group, hash)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text()
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]                                                                              ^^^^^^^^^^^^^^^^
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     with self.open(mode='r', encoding=encoding, errors=errors) as f:
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]     return io.open(self, mode, buffering, encoding, errors, newline)
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/OKBX2FGS2GKNCNNM3EGQETF6HKRQ3TWANAB6W2NPWMSAR3BN6HGA/triton_convolution2d.ttir'
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0):
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_H : tl.constexpr = 3
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_W : tl.constexpr = 3
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_H : tl.constexpr = 1
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_W : tl.constexpr = 1
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_H : tl.constexpr = 0
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_W : tl.constexpr = 0
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     GROUPS : tl.constexpr = 1
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     UNROLL : tl.constexpr = False
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     ALLOW_TF32 : tl.constexpr = True
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_M : tl.constexpr = 16
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_N : tl.constexpr = 64
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K : tl.constexpr = 16
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     X = arg_X
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     W = arg_W
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     # Tensor dimensions
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     BATCH = 0
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     IN_C = 256
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     IN_H = 14
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     IN_W = 14
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_C = 256
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_H = 12
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_W = 12
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     # Strides:
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xn = 50176
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xc = 196
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xh = 14
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xw = 1
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_out = 2304
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_in = 1
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wh = 768
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     stride_ww = 256
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_w = nhw % OUT_W
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     nh = nhw // OUT_W
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_h = nh % OUT_H
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = nh // OUT_H
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     group = 0
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_IN_C = IN_C
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_OUT_C = OUT_C
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     w_base = (
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     # Could be simplified, but slightly slower:
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     # for i in range(KERNEL_H):
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     #     for j in range(KERNEL_W):
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     #         for k in range(0, GROUP_IN_C, BLOCK_K):
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         k = (ijk % BLOCK_K_COUNT) * BLOCK_K
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         ij = ijk // BLOCK_K_COUNT
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         i = ij // KERNEL_W
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         j = ij % KERNEL_W
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_c = tl.arange(0, BLOCK_K) + k
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         x_ptrs = x_base + (
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_h * stride_xh)[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_w * stride_xw)[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_c * stride_xc)[None, :]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         mask_x = (
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_n < BATCH)[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h >= 0)[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h < IN_H)[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w >= 0)[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w < IN_W)[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_c < GROUP_IN_C)[None, :]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         w_ptrs = w_base + (
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     mask = (
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         (idx_n < BATCH)[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_h < OUT_H)[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_w < OUT_W)[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_c < GROUP_OUT_C)[None, :]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = idx_n[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     idx_h = idx_y_h[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     idx_w = idx_y_w[:, None]
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     # inductor generates a suffix
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 4, 'num_stages': 2, 'debug': True, 'cc': 120}
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last):
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     binary = triton.compile(*compile_args, **compile_kwargs)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     return CompiledKernel(src, metadata_group, hash)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text()
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]                                                                              ^^^^^^^^^^^^^^^^
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     with self.open(mode='r', encoding=encoding, errors=errors) as f:
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]     return io.open(self, mode, buffering, encoding, errors, newline)
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/ADUSF5KEFEADV5PKAOTDGCRC6SJ7HGCR67UP5W6TLE2DQIRRAVWA/triton_convolution2d.ttir'
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0):
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_H : tl.constexpr = 3
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_W : tl.constexpr = 3
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_H : tl.constexpr = 1
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_W : tl.constexpr = 1
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_H : tl.constexpr = 0
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_W : tl.constexpr = 0
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     GROUPS : tl.constexpr = 1
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     UNROLL : tl.constexpr = False
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     ALLOW_TF32 : tl.constexpr = True
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_M : tl.constexpr = 16
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_N : tl.constexpr = 64
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K : tl.constexpr = 32
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     X = arg_X
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     W = arg_W
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     # Tensor dimensions
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     BATCH = 0
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     IN_C = 256
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     IN_H = 14
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     IN_W = 14
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_C = 256
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_H = 12
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_W = 12
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     # Strides:
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xn = 50176
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xc = 196
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xh = 14
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xw = 1
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_out = 2304
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_in = 1
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wh = 768
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     stride_ww = 256
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_w = nhw % OUT_W
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     nh = nhw // OUT_W
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_h = nh % OUT_H
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = nh // OUT_H
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     group = 0
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_IN_C = IN_C
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_OUT_C = OUT_C
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     w_base = (
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     # Could be simplified, but slightly slower:
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     # for i in range(KERNEL_H):
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     #     for j in range(KERNEL_W):
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     #         for k in range(0, GROUP_IN_C, BLOCK_K):
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         k = (ijk % BLOCK_K_COUNT) * BLOCK_K
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         ij = ijk // BLOCK_K_COUNT
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         i = ij // KERNEL_W
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         j = ij % KERNEL_W
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_c = tl.arange(0, BLOCK_K) + k
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         x_ptrs = x_base + (
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_h * stride_xh)[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_w * stride_xw)[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_c * stride_xc)[None, :]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         mask_x = (
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_n < BATCH)[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h >= 0)[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h < IN_H)[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w >= 0)[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w < IN_W)[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_c < GROUP_IN_C)[None, :]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         w_ptrs = w_base + (
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     mask = (
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         (idx_n < BATCH)[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_h < OUT_H)[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_w < OUT_W)[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_c < GROUP_OUT_C)[None, :]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = idx_n[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     idx_h = idx_y_h[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     idx_w = idx_y_w[:, None]
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     # inductor generates a suffix
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 4, 'num_stages': 2, 'debug': True, 'cc': 120}
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last):
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     binary = triton.compile(*compile_args, **compile_kwargs)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     return CompiledKernel(src, metadata_group, hash)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text()
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]                                                                              ^^^^^^^^^^^^^^^^
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     with self.open(mode='r', encoding=encoding, errors=errors) as f:
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]     return io.open(self, mode, buffering, encoding, errors, newline)
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/XVSZCZAGZM4IIKDSICNUCKNWZDCSZB3WTBJWXA6NA6STV4YQ4IUA/triton_convolution2d.ttir'
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0):
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_H : tl.constexpr = 3
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_W : tl.constexpr = 3
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_H : tl.constexpr = 1
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_W : tl.constexpr = 1
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_H : tl.constexpr = 0
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_W : tl.constexpr = 0
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     GROUPS : tl.constexpr = 1
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     UNROLL : tl.constexpr = False
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     ALLOW_TF32 : tl.constexpr = True
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_M : tl.constexpr = 16
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_N : tl.constexpr = 128
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K : tl.constexpr = 32
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     X = arg_X
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     W = arg_W
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     # Tensor dimensions
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     BATCH = 0
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     IN_C = 256
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     IN_H = 14
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     IN_W = 14
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_C = 256
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_H = 12
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_W = 12
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     # Strides:
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xn = 50176
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xc = 196
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xh = 14
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xw = 1
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_out = 2304
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_in = 1
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wh = 768
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     stride_ww = 256
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_w = nhw % OUT_W
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     nh = nhw // OUT_W
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_h = nh % OUT_H
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = nh // OUT_H
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     group = 0
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_IN_C = IN_C
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_OUT_C = OUT_C
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     w_base = (
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     # Could be simplified, but slightly slower:
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     # for i in range(KERNEL_H):
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     #     for j in range(KERNEL_W):
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     #         for k in range(0, GROUP_IN_C, BLOCK_K):
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         k = (ijk % BLOCK_K_COUNT) * BLOCK_K
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         ij = ijk // BLOCK_K_COUNT
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         i = ij // KERNEL_W
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         j = ij % KERNEL_W
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_c = tl.arange(0, BLOCK_K) + k
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         x_ptrs = x_base + (
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_h * stride_xh)[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_w * stride_xw)[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_c * stride_xc)[None, :]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         mask_x = (
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_n < BATCH)[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h >= 0)[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h < IN_H)[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w >= 0)[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w < IN_W)[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_c < GROUP_IN_C)[None, :]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         w_ptrs = w_base + (
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     mask = (
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         (idx_n < BATCH)[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_h < OUT_H)[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_w < OUT_W)[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_c < GROUP_OUT_C)[None, :]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = idx_n[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     idx_h = idx_y_h[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     idx_w = idx_y_w[:, None]
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     # inductor generates a suffix
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 8, 'num_stages': 2, 'debug': True, 'cc': 120}
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last):
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     binary = triton.compile(*compile_args, **compile_kwargs)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     return CompiledKernel(src, metadata_group, hash)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text()
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]                                                                              ^^^^^^^^^^^^^^^^
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     with self.open(mode='r', encoding=encoding, errors=errors) as f:
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]     return io.open(self, mode, buffering, encoding, errors, newline)
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/HFCWHWC5X7TRLLBPY4QK6UJSXJVFBJ3TWILWKLHULF27TZOFFDGQ/triton_convolution2d.ttir'
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0):
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_H : tl.constexpr = 3
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_W : tl.constexpr = 3
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_H : tl.constexpr = 1
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_W : tl.constexpr = 1
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_H : tl.constexpr = 0
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_W : tl.constexpr = 0
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     GROUPS : tl.constexpr = 1
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     UNROLL : tl.constexpr = False
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     ALLOW_TF32 : tl.constexpr = True
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_M : tl.constexpr = 16
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_N : tl.constexpr = 256
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K : tl.constexpr = 32
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     X = arg_X
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     W = arg_W
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     # Tensor dimensions
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     BATCH = 0
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     IN_C = 256
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     IN_H = 14
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     IN_W = 14
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_C = 256
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_H = 12
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_W = 12
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     # Strides:
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xn = 50176
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xc = 196
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xh = 14
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xw = 1
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_out = 2304
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_in = 1
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wh = 768
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     stride_ww = 256
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_w = nhw % OUT_W
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     nh = nhw // OUT_W
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_h = nh % OUT_H
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = nh // OUT_H
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     group = 0
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_IN_C = IN_C
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_OUT_C = OUT_C
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     w_base = (
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     # Could be simplified, but slightly slower:
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     # for i in range(KERNEL_H):
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     #     for j in range(KERNEL_W):
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     #         for k in range(0, GROUP_IN_C, BLOCK_K):
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         k = (ijk % BLOCK_K_COUNT) * BLOCK_K
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         ij = ijk // BLOCK_K_COUNT
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         i = ij // KERNEL_W
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         j = ij % KERNEL_W
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_c = tl.arange(0, BLOCK_K) + k
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         x_ptrs = x_base + (
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_h * stride_xh)[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_w * stride_xw)[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_c * stride_xc)[None, :]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         mask_x = (
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_n < BATCH)[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h >= 0)[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h < IN_H)[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w >= 0)[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w < IN_W)[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_c < GROUP_IN_C)[None, :]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         w_ptrs = w_base + (
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     mask = (
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         (idx_n < BATCH)[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_h < OUT_H)[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_w < OUT_W)[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_c < GROUP_OUT_C)[None, :]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = idx_n[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     idx_h = idx_y_h[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     idx_w = idx_y_w[:, None]
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     # inductor generates a suffix
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 8, 'num_stages': 2, 'debug': True, 'cc': 120}
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last):
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     binary = triton.compile(*compile_args, **compile_kwargs)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     return CompiledKernel(src, metadata_group, hash)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text()
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]                                                                              ^^^^^^^^^^^^^^^^
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     with self.open(mode='r', encoding=encoding, errors=errors) as f:
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]     return io.open(self, mode, buffering, encoding, errors, newline)
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/TYN3MXM4E73ZH7RBDASGFLNCZ6AHOJBB52BSNH55QJSE4NPI3X3Q/triton_convolution2d.ttir'
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0):
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_H : tl.constexpr = 3
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     KERNEL_W : tl.constexpr = 3
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_H : tl.constexpr = 1
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     STRIDE_W : tl.constexpr = 1
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_H : tl.constexpr = 0
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     PADDING_W : tl.constexpr = 0
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     GROUPS : tl.constexpr = 1
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     UNROLL : tl.constexpr = False
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     ALLOW_TF32 : tl.constexpr = True
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_M : tl.constexpr = 16
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_N : tl.constexpr = 256
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K : tl.constexpr = 16
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     X = arg_X
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     W = arg_W
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     # Tensor dimensions
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     BATCH = 0
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     IN_C = 256
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     IN_H = 14
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     IN_W = 14
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_C = 256
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_H = 12
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     OUT_W = 12
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     # Strides:
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xn = 50176
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xc = 196
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xh = 14
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     stride_xw = 1
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_out = 2304
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wc_in = 1
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     stride_wh = 768
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     stride_ww = 256
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_w = nhw % OUT_W
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     nh = nhw // OUT_W
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_h = nh % OUT_H
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = nh // OUT_H
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     group = 0
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_IN_C = IN_C
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     GROUP_OUT_C = OUT_C
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     w_base = (
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     # Could be simplified, but slightly slower:
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     # for i in range(KERNEL_H):
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     #     for j in range(KERNEL_W):
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     #         for k in range(0, GROUP_IN_C, BLOCK_K):
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         k = (ijk % BLOCK_K_COUNT) * BLOCK_K
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         ij = ijk // BLOCK_K_COUNT
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         i = ij // KERNEL_W
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         j = ij % KERNEL_W
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         idx_x_c = tl.arange(0, BLOCK_K) + k
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         x_ptrs = x_base + (
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_h * stride_xh)[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_w * stride_xw)[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]             + (idx_x_c * stride_xc)[None, :]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         mask_x = (
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_n < BATCH)[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h >= 0)[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_h < IN_H)[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w >= 0)[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_w < IN_W)[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]             & (idx_x_c < GROUP_IN_C)[None, :]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         w_ptrs = w_base + (
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]             (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         )
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     mask = (
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         (idx_n < BATCH)[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_h < OUT_H)[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_w < OUT_W)[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]         & (idx_y_c < GROUP_OUT_C)[None, :]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     )
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     idx_n = idx_n[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     idx_h = idx_y_h[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     idx_w = idx_y_w[:, None]
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     # inductor generates a suffix
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] 
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 4, 'num_stages': 2, 'debug': True, 'cc': 120}
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last):
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     binary = triton.compile(*compile_args, **compile_kwargs)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     return CompiledKernel(src, metadata_group, hash)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]   File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text()
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]                                                                              ^^^^^^^^^^^^^^^^
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     with self.open(mode='r', encoding=encoding, errors=errors) as f:
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]   File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]     return io.open(self, mode, buffering, encoding, errors, newline)
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/A2N73RSRHPH2Z2I7RZ5P43Y7EPOVGA52QQJESLGX5WFQHWKKTOYQ/triton_convolution2d.ttir'
 AUTOTUNE convolution(0x256x14x14, 256x256x1x1)
  convolution 0.0000 ms <DIVIDED BY ZERO ERROR>
  conv1x1_via_mm 0.0000 ms <DIVIDED BY ZERO ERROR>
  triton_convolution2d_118 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=256, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=2, num_warps=4
  triton_convolution2d_119 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=64, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=2, num_warps=4
  triton_convolution2d_120 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=1, num_warps=1
  triton_convolution2d_121 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=128, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=2, num_warps=8
  triton_convolution2d_122 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=2, num_warps=4
  triton_convolution2d_123 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=256, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=2, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1891 seconds and 0.1215 seconds precompiling for 8 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 6), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info []
 .E0509 19:03:37.800000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:37.800000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196608, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:37.800000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:37.858000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:37.858000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:37.858000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:37.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:37.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:37.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:37.992000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:37.992000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:37.992000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:37.995000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:37.995000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:37.995000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(256x256, 256x256)
  triton_mm_145 0.0061 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_144 0.0061 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_146 0.0061 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  mm 0.0074 ms 83.0% 
  triton_mm_150 0.0082 ms 74.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_153 0.0116 ms 52.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_157 0.0116 ms 52.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_154 0.0118 ms 51.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_156 0.0119 ms 51.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_152 0.0119 ms 51.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.3662 seconds and 0.1939 seconds precompiling for 20 choices
 E0509 19:03:38.839000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:38.839000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:38.839000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:38.895000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:38.895000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147456, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:38.895000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:38.988000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:38.988000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:38.988000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(256x256, 256x256)
  triton_mm_164 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_165 0.0048 ms 85.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_166 0.0054 ms 76.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_169 0.0056 ms 73.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_163 0.0057 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  mm 0.0059 ms 69.9% 
  triton_mm_173 0.0072 ms 57.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_176 0.0075 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_172 0.0075 ms 54.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_175 0.0076 ms 53.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.3083 seconds and 0.0002 seconds precompiling for 20 choices
 inductor []
 .E0509 19:03:39.899000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:39.899000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:39.899000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:39.946000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:39.946000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:39.946000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:39.950000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:39.950000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:39.950000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:39.954000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:39.954000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:39.954000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(2560x2560, 2560x2560)
  triton_mm_persistent_tma_220 0.6267 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_219 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_221 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_222 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=4
  triton_mm_persistent_tma_223 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1509 seconds and 0.5661 seconds precompiling for 5 choices
 /workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmp51okix8i' mode='r' encoding='UTF-8'>
  for evt in json.load(open(f.name))["traceEvents"]
 ResourceWarning: Enable tracemalloc to get the object allocation traceback
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 7), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2560_2560_2560', 1)]
 .E0509 19:03:41.098000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:41.098000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:41.098000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:41.103000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:41.103000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:41.103000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:41.154000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:41.154000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:41.154000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:41.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:41.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:41.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:41.163000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:41.163000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 180264, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:41.163000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE scaled_mm(2560x2560, 2560x2560, , )
  triton_scaled_mm_device_tma_232 0.5816 ms 100.0% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8
  triton_scaled_mm_device_tma_224 0.5834 ms 99.7% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8
  triton_scaled_mm_device_tma_225 0.7854 ms 74.1% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8
  triton_scaled_mm_device_tma_228 0.9073 ms 64.1% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=4
  triton_scaled_mm_device_tma_226 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8
  triton_scaled_mm_device_tma_227 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=4
  triton_scaled_mm_device_tma_229 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=4
  triton_scaled_mm_device_tma_230 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=8
  triton_scaled_mm_device_tma_231 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=6, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.3013 seconds and 0.6370 seconds precompiling for 9 choices
 /workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmp3h_w8d9r' mode='r' encoding='UTF-8'>
  for evt in json.load(open(f.name))["traceEvents"]
 ResourceWarning: Enable tracemalloc to get the object allocation traceback
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 11), ('select_algorithm_num_precompiles', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten._scaled_mm.default_2560_2560_2560', 1)]
 .[W509 19:03:41.700913810 Context.cpp:469] Warning: Setting the SM carveout for matmuls is a temporary experimental mitigation for performance issues, while more robust solutions are developed. It may be removed at any moment without notice. (function operator())
 E0509 19:03:42.186000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:42.186000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:42.186000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:42.233000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:42.233000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:42.233000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:42.237000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:42.237000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:42.237000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:42.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:42.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:42.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(2560x2560, 2560x2560)
  triton_mm_persistent_tma_253 0.6273 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_252 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_254 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_255 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=4
  triton_mm_persistent_tma_256 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1578 seconds and 0.5631 seconds precompiling for 5 choices
 /workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmp507pmtl1' mode='r' encoding='UTF-8'>
  for evt in json.load(open(f.name))["traceEvents"]
 ResourceWarning: Enable tracemalloc to get the object allocation traceback
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 7), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2560_2560_2560', 1)]
 .E0509 19:03:43.308000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:43.308000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:43.308000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:43.313000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:43.313000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:43.313000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:43.360000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:43.360000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:43.360000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:43.365000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:43.365000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:43.365000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:43.370000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:43.370000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 180264, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:43.370000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE scaled_mm(2560x2560, 2560x2560, , )
  triton_scaled_mm_device_tma_265 0.5819 ms 100.0% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8
  triton_scaled_mm_device_tma_257 0.5834 ms 99.7% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8
  triton_scaled_mm_device_tma_258 0.7820 ms 74.4% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8
  triton_scaled_mm_device_tma_261 0.9103 ms 63.9% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=4
  triton_scaled_mm_device_tma_259 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8
  triton_scaled_mm_device_tma_260 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=4
  triton_scaled_mm_device_tma_262 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=4
  triton_scaled_mm_device_tma_263 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=8
  triton_scaled_mm_device_tma_264 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=6, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.2965 seconds and 0.5921 seconds precompiling for 9 choices
 /workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmpqk38a3b3' mode='r' encoding='UTF-8'>
  for evt in json.load(open(f.name))["traceEvents"]
 ResourceWarning: Enable tracemalloc to get the object allocation traceback
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 11), ('select_algorithm_num_precompiles', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten._scaled_mm.default_2560_2560_2560', 1)]
 .E0509 19:03:44.409000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:44.409000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:44.409000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:44.458000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:44.458000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:44.458000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:44.463000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:44.463000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:44.463000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:44.467000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:44.467000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:44.467000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(2560x2560, 2560x2560)
  triton_mm_persistent_tma_286 1.0383 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_285 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_287 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_288 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4
  triton_mm_persistent_tma_289 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1597 seconds and 0.5892 seconds precompiling for 5 choices
 /workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmpdxhdeq4z' mode='r' encoding='UTF-8'>
  for evt in json.load(open(f.name))["traceEvents"]
 ResourceWarning: Enable tracemalloc to get the object allocation traceback
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 7), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2560_2560_2560', 1)]
 .E0509 19:03:45.522000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:45.522000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:45.522000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:45.527000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:45.527000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:45.527000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:45.577000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:45.577000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:45.577000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:45.581000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:45.581000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:45.581000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:45.586000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:45.586000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 180264, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:45.586000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE scaled_mm(2560x2560, 2560x2560, , )
  triton_scaled_mm_device_tma_298 0.9646 ms 100.0% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8
  triton_scaled_mm_device_tma_290 0.9656 ms 99.9% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8
  triton_scaled_mm_device_tma_291 1.1715 ms 82.3% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8
  triton_scaled_mm_device_tma_294 1.4828 ms 65.1% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=4
  triton_scaled_mm_device_tma_292 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8
  triton_scaled_mm_device_tma_293 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=4
  triton_scaled_mm_device_tma_295 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=4
  triton_scaled_mm_device_tma_296 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=8
  triton_scaled_mm_device_tma_297 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=6, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.3033 seconds and 0.5743 seconds precompiling for 9 choices
 /workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmp_e__0eio' mode='r' encoding='UTF-8'>
  for evt in json.load(open(f.name))["traceEvents"]
 ResourceWarning: Enable tracemalloc to get the object allocation traceback
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 11), ('select_algorithm_num_precompiles', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten._scaled_mm.default_2560_2560_2560', 1)]
 .frames [('total', 1)]
 stats [('calls_captured', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)]
 graph_break []
 aten_mm_info [('aten.mm_16_16_16', 1)]
 .frames [('total', 1)]
 stats [('calls_captured', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)]
 graph_break []
 aten_mm_info [('aten.mm_16_16_16', 1)]
 .W0509 19:03:46.433000 434429 torch/_export/__init__.py:67] +============================+
 W0509 19:03:46.434000 434429 torch/_export/__init__.py:68] |     !!!   WARNING   !!!    |
 W0509 19:03:46.434000 434429 torch/_export/__init__.py:69] +============================+
 W0509 19:03:46.434000 434429 torch/_export/__init__.py:70] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead.
 E0509 19:03:47.184000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:47.184000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196608, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:47.184000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:47.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:47.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:47.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:47.302000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:47.302000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:47.302000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:47.381000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:47.381000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:47.381000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:03:47.385000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:03:47.385000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:47.385000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(256x256, 256x256)
  triton_mm_311 0.0058 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_310 0.0059 ms 98.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_312 0.0059 ms 98.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  mm 0.0076 ms 76.4% 
  triton_mm_316 0.0077 ms 75.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_319 0.0114 ms 50.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_315 0.0116 ms 50.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_320 0.0116 ms 49.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_318 0.0117 ms 49.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_323 0.0117 ms 49.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.3755 seconds and 0.4873 seconds precompiling for 20 choices
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 26), ('async_compile_cache_miss', 23), ('select_algorithm_num_precompiles', 19), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_256_256_256', 1)]
 .sCompiled module path: /tmp/tmpyumonn_d/kf/ckf24tdynyym2ganzaqomb25tiktyvf3wzxn5e5qf53scj65g2w3.py
 Compiled module path: /tmp/tmpyumonn_d/7w/c7w2sojsp5z6u6hlqvzlw4ghmkbikuq6sflwfr4myo77rhxygnrz.py
 frames [('total', 96), ('ok', 92)]
 inline_call []
 unimplemented [('Attempt to trace generator\n  Explanation: Generators cannot be compiled directly with `torch.compile`.\n  Hint: Call a generator from inside of a non-generator Python function and compile that function instead.\n  Hint: This graph break is fundamental - it is unlikely that Dynamo will ever be able to trace through your code. Consider finding a workaround.\n\n  Developer debug context: \n', 4)]
 graph_break [('Tensor.backward', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1), ('autograd_cache_saved', 1)]
 inductor [('extern_calls', 5), ('fxgraph_cache_miss', 2)]
 aten_mm_info [('aten.mm_128_128_128', 2)]
 .E0509 19:03:52.871000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Runtime error during autotuning: 
 E0509 19:03:52.871000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:52.871000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Ignoring this choice.
 E0509 19:03:52.875000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Runtime error during autotuning: 
 E0509 19:03:52.875000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:52.875000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Ignoring this choice.
 AUTOTUNE mm(128x128, 128x128)
  triton_mm_330 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_331 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  mm 0.0058 ms 70.3% 
  triton_mm_335 0.0058 ms 70.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_329 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_332 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_336 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_334 0.0075 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_337 0.0078 ms 52.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_338 0.0079 ms 52.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.4238 seconds and 0.4223 seconds precompiling for 20 choices
 Compiled module path: /tmp/tmpq56gtiip/zz/czz4ecoozipcry2vnodb4iw5zfsawonb6iza5re7zvzy32bkehs6.py
 E0509 19:03:53.868000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Runtime error during autotuning: 
 E0509 19:03:53.868000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:53.868000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Ignoring this choice.
 E0509 19:03:53.872000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Runtime error during autotuning: 
 E0509 19:03:53.872000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:03:53.872000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Ignoring this choice.
 AUTOTUNE mm(128x128, 128x128)
  triton_mm_348 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_349 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_350 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  mm 0.0055 ms 74.0% 
  triton_mm_354 0.0056 ms 72.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_351 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_355 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_357 0.0075 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_360 0.0075 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_361 0.0076 ms 54.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.3251 seconds and 0.3140 seconds precompiling for 20 choices
 Compiled module path: /tmp/tmpq56gtiip/y4/cy4phkkchieez3q5cck2pcmpqw4o3goqhrjlltrh5q2u7ga2cryr.py
 frames [('total', 5), ('ok', 5)]
 inline_call []
 unimplemented []
 graph_break [('Tensor.backward', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1), ('autograd_cache_saved', 1)]
 inductor [('triton_bundler_save_kernel', 322), ('async_compile_cache_miss', 45), ('benchmarking.InductorBenchmarker.benchmark_gpu', 44), ('select_algorithm_num_precompiles', 38), ('extern_calls', 3), ('fxgraph_cache_miss', 2), ('select_algorithm_precompile', 2), ('benchmarking.InductorBenchmarker.benchmark', 2), ('select_algorithm_autotune', 2), ('async_compile_cache_hit', 2)]
 aten_mm_info [('aten.mm_128_128_128', 2)]
 .AUTOTUNE addmm(100x100, 100x10, 10x100)
  triton_mm_367 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_368 0.0034 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_366 0.0035 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_373 0.0036 ms 92.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_369 0.0037 ms 91.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_371 0.0037 ms 90.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_370 0.0038 ms 89.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_372 0.0041 ms 82.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_375 0.0041 ms 82.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_376 0.0041 ms 82.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
 SubProcess AUTOTUNE benchmarking takes 0.8253 seconds and 0.2758 seconds precompiling for 18 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('async_compile_cache_miss', 18), ('select_algorithm_num_precompiles', 16), ('triton_bundler_save_kernel', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('benchmarking.InductorBenchmarker.benchmark', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_100_100_10', 1)]
 .AUTOTUNE addmm(100x100, 100x10, 10x100)
  triton_mm_383 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_384 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_386 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_388 0.0049 ms 84.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_387 0.0051 ms 80.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_389 0.0054 ms 75.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_385 0.0055 ms 74.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_391 0.0057 ms 72.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_382 0.0058 ms 70.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_390 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
 SubProcess AUTOTUNE benchmarking takes 3.3739 seconds and 0.4662 seconds precompiling for 18 choices
 frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('async_compile_cache_miss', 18), ('select_algorithm_num_precompiles', 16), ('triton_bundler_save_kernel', 7), ('benchmarking.InductorBenchmarker.benchmark', 2), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_s1_s1_s0', 1)]
 .E0509 19:04:02.553000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:02.553000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:02.553000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:02.580000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:02.580000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 139792, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:02.580000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:02.585000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:02.585000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 139792, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:02.585000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:02.589000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:02.589000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 107032, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:02.589000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE addmm(168x248, 168x88, 88x248)
  triton_mm_persistent_tma_418 0.0100 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_417 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_419 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_420 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4
  triton_mm_persistent_tma_421 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1399 seconds and 0.5961 seconds precompiling for 5 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 84), ('benchmarking.InductorBenchmarker.benchmark_gpu', 11), ('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_168_248_88', 1)]
 .E0509 19:04:03.956000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:03.956000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:03.956000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 Eframes [('total', 1)]
 stats [('calls_captured', 4)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_8*s1_8*s0_8*s2', 1)]
 E0509 19:04:04.819000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:04.819000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:04.819000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:04.850000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:04.850000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:04.850000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:04.855000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:04.855000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:04.855000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:04.859000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:04.859000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:04.859000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE addmm(168x248, 168x88, 88x248)
  triton_mm_persistent_tma_466 0.0119 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_465 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_467 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_468 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4
  triton_mm_persistent_tma_469 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1379 seconds and 0.6032 seconds precompiling for 5 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 84), ('benchmarking.InductorBenchmarker.benchmark_gpu', 11), ('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_168_248_88', 1)]
 .E0509 19:04:06.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:06.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:06.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 Eframes [('total', 1)]
 stats [('calls_captured', 5)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_8*s1_8*s0_8*s2', 1)]
 E0509 19:04:07.131000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:07.131000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:07.131000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:07.162000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:07.162000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:07.162000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:07.167000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:07.167000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:07.167000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:07.171000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:07.171000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:07.171000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE addmm(168x248, 168x88, 88x248)
  triton_mm_persistent_tma_514 0.0117 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_513 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_515 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_516 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4
  triton_mm_persistent_tma_517 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1419 seconds and 0.7292 seconds precompiling for 5 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 84), ('benchmarking.InductorBenchmarker.benchmark_gpu', 11), ('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_168_248_88', 1)]
 .E0509 19:04:08.692000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:08.692000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:08.692000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 Eframes [('total', 1)]
 stats [('calls_captured', 5)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_8*s2_8*s0_8*s1', 1)]
 E0509 19:04:09.642000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:09.642000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147472, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:09.642000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:09.671000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:09.671000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196624, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:09.671000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:09.678000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:09.678000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196624, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:09.678000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:09.682000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:09.682000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131096, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:09.682000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE addmm(168x248, 168x88, 88x248)
  triton_mm_persistent_tma_562 0.0117 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_561 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_563 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_564 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4
  triton_mm_persistent_tma_565 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1404 seconds and 0.6975 seconds precompiling for 5 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 84), ('benchmarking.InductorBenchmarker.benchmark_gpu', 11), ('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_168_248_88', 1)]
 .E0509 19:04:10.979000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:10.979000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147472, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:10.979000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 Eframes [('total', 1)]
 stats [('calls_captured', 6)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_8*s2_8*s0_8*s1', 1)]
 frames [('total', 1)]
 stats [('calls_captured', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_21_31_11', 1)]
 .frames [('total', 1)]
 stats [('calls_captured', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_s2_s1_s0', 1)]
 .frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_0_100_10', 1)]
 .frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_0_s1_s0', 1)]
 .AUTOTUNE mm_plus_mm(2048x64, 64x1536, 2048x64, 64x1536)
  triton_mm_plus_mm_600 0.0287 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_plus_mm_601 0.0302 ms 94.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_plus_mm_606 0.0304 ms 94.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_plus_mm_602 0.0362 ms 79.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16
  triton_mm_plus_mm_607 0.0365 ms 78.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_plus_mm_605 0.0407 ms 70.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8
  triton_mm_plus_mm_604 0.0423 ms 67.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_plus_mm_603 0.0426 ms 67.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  _mm_plus_mm 0.0448 ms 64.0% 
 SingleProcess AUTOTUNE benchmarking takes 0.2765 seconds and 0.1758 seconds precompiling for 9 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 63), ('async_compile_cache_miss', 10), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 8), ('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info []
 .AUTOTUNE mm_plus_mm(2048x64, 64x1536, 2048x64, 64x1536)
  triton_mm_plus_mm_608 0.0292 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_plus_mm_609 0.0304 ms 96.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_plus_mm_614 0.0307 ms 95.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_plus_mm_610 0.0365 ms 79.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16
  triton_mm_plus_mm_615 0.0368 ms 79.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_plus_mm_613 0.0406 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8
  triton_mm_plus_mm_612 0.0410 ms 71.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_plus_mm_611 0.0427 ms 68.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  _mm_plus_mm 0.0447 ms 65.2% 
 SingleProcess AUTOTUNE benchmarking takes 0.2755 seconds and 0.1820 seconds precompiling for 9 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 63), ('async_compile_cache_miss', 10), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 8), ('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info []
 .AUTOTUNE mm_plus_mm(2048x64, 64x1536, 2048x64, 64x1536)
  triton_mm_plus_mm_616 0.0280 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_plus_mm_617 0.0286 ms 97.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_plus_mm_622 0.0297 ms 94.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_plus_mm_618 0.0359 ms 77.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16
  triton_mm_plus_mm_623 0.0382 ms 73.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_plus_mm_621 0.0409 ms 68.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8
  triton_mm_plus_mm_620 0.0420 ms 66.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_plus_mm_619 0.0427 ms 65.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  _mm_plus_mm 0.0447 ms 62.5% 
 SubProcess AUTOTUNE benchmarking takes 1.2338 seconds and 0.1758 seconds precompiling for 9 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 8), ('triton_bundler_save_kernel', 7), ('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('benchmarking.InductorBenchmarker.benchmark_gpu', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info []
 .AUTOTUNE mm_plus_mm(2048x64, 64x1536, 2048x64, 64x1536)
  triton_mm_plus_mm_624 0.0281 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_plus_mm_625 0.0286 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_plus_mm_630 0.0291 ms 96.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_plus_mm_626 0.0364 ms 77.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16
  triton_mm_plus_mm_631 0.0371 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_plus_mm_629 0.0399 ms 70.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8
  triton_mm_plus_mm_628 0.0423 ms 66.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_plus_mm_627 0.0424 ms 66.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  _mm_plus_mm 0.0450 ms 62.4% 
 SubProcess AUTOTUNE benchmarking takes 0.1845 seconds and 0.1787 seconds precompiling for 9 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 8), ('triton_bundler_save_kernel', 7), ('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('benchmarking.InductorBenchmarker.benchmark_gpu', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info []
 .frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1)]
 graph_break []
 aten_mm_info [('aten.mm_0_1536_64', 2)]
 .frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1)]
 graph_break []
 aten_mm_info [('aten.mm_0_s1_s0', 2)]
 .AUTOTUNE mm(100x10, 10x100)
  triton_mm_633 0.0031 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_636 0.0035 ms 89.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_635 0.0036 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_632 0.0036 ms 86.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_634 0.0036 ms 86.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_638 0.0036 ms 86.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_637 0.0038 ms 83.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_639 0.0038 ms 82.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_640 0.0041 ms 77.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_641 0.0041 ms 76.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
 SubProcess AUTOTUNE benchmarking takes 1.9723 seconds and 0.1850 seconds precompiling for 17 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 19), ('select_algorithm_num_precompiles', 16), ('benchmarking.InductorBenchmarker.benchmark_gpu', 7), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_100_100_10', 1)]
 .AUTOTUNE mm(100x10, 10x100)
  triton_mm_652 0.0037 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_650 0.0038 ms 97.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_648 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_649 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_651 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_653 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_654 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_655 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_658 0.0047 ms 78.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_660 0.0051 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
 SubProcess AUTOTUNE benchmarking takes 2.7352 seconds and 0.3131 seconds precompiling for 17 choices
 frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 19), ('select_algorithm_num_precompiles', 16), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_s0_s0_s1', 1)]
 .E0509 19:04:22.750000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:22.750000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:22.750000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:22.778000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:22.778000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 139792, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:22.778000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:22.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:22.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 139792, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:22.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:22.790000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:22.790000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 107032, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:22.790000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(168x88, 88x248)
  triton_mm_persistent_tma_684 0.0097 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_683 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_685 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_686 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4
  triton_mm_persistent_tma_687 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1341 seconds and 0.5370 seconds precompiling for 5 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_168_248_88', 1)]
 .E0509 19:04:24.023000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:24.023000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:24.023000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 Eframes [('total', 1)]
 stats [('calls_captured', 3)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)]
 graph_break []
 aten_mm_info [('aten.mm_8*s0_8*s2_8*s1', 1)]
 E0509 19:04:24.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:24.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:24.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:24.809000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:24.809000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:24.809000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:24.813000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:24.813000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:24.813000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:24.818000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:24.818000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:24.818000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(168x88, 88x248)
  triton_mm_persistent_tma_732 0.0102 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_731 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_733 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_734 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4
  triton_mm_persistent_tma_735 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1279 seconds and 0.5386 seconds precompiling for 5 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_168_248_88', 1)]
 .E0509 19:04:26.040000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:26.040000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:26.040000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 Eframes [('total', 1)]
 stats [('calls_captured', 4)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)]
 graph_break []
 aten_mm_info [('aten.mm_8*s0_8*s2_8*s1', 1)]
 E0509 19:04:26.887000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:26.887000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:26.887000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:26.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:26.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:26.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:26.922000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:26.922000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:26.922000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:26.926000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:26.926000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:26.926000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(168x88, 88x248)
  triton_mm_persistent_tma_780 0.0100 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_779 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_781 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_782 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4
  triton_mm_persistent_tma_783 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1326 seconds and 0.6240 seconds precompiling for 5 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_168_248_88', 1)]
 .E0509 19:04:28.267000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:28.267000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:28.267000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 Eframes [('total', 1)]
 stats [('calls_captured', 4)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)]
 graph_break []
 aten_mm_info [('aten.mm_8*s1_8*s2_8*s0', 1)]
 E0509 19:04:29.132000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:29.132000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147472, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:29.132000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:29.159000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:29.159000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196624, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:29.159000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:29.164000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:29.164000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196624, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:29.164000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:29.169000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:29.169000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131096, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:29.169000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(168x88, 88x248)
  triton_mm_persistent_tma_828 0.0116 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_827 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_829 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8
  triton_mm_persistent_tma_830 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4
  triton_mm_persistent_tma_831 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1342 seconds and 0.6350 seconds precompiling for 5 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_168_248_88', 1)]
 .E0509 19:04:30.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:30.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147472, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:30.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 Eframes [('total', 1)]
 stats [('calls_captured', 5)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)]
 graph_break []
 aten_mm_info [('aten.mm_8*s1_8*s2_8*s0', 1)]
 frames [('total', 1)]
 stats [('calls_captured', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info [('aten.mm_21_31_11', 1)]
 .frames [('total', 1)]
 stats [('calls_captured', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info [('aten.mm_s2_s1_s0', 1)]
 .frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.mm_0_100_10', 1)]
 .frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.mm_0_s1_s0', 1)]
 .W0509 19:04:30.598000 434429 torch/_inductor/kernel/mm_common.py:447] [0/0] No choices for GEMM, chose not to fallback to ATen backend. To temporarily change this behavior, set autotune_fallback_to_aten to True via TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN=1, but this knob is being deprecated. The long term fix is to include Aten in max_autotune_gemm_backends.
 frames [('total', 1)]
 stats [('calls_captured', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)]
 inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 1)]
 .E0509 19:04:36.188000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:36.188000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114688, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:36.188000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE addmm(50257x768, 50257x32768, 32768x768)
  bias_addmm 27.0531 ms 100.0% 
  addmm 27.2312 ms 99.3% 
  triton_mm_882 28.6515 ms 94.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_875 29.2564 ms 92.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_883 31.4952 ms 85.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_872 31.8126 ms 85.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_877 31.9027 ms 84.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_876 34.2344 ms 79.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_871 34.5999 ms 78.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_880 34.7853 ms 77.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 5.9978 seconds and 0.3724 seconds precompiling for 21 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 133), ('benchmarking.InductorBenchmarker.benchmark_gpu', 23), ('async_compile_cache_miss', 19), ('select_algorithm_num_precompiles', 19), ('benchmarking.InductorBenchmarker.benchmark', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_50257_768_32768', 1)]
 .E0509 19:04:43.404000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:43.404000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114688, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:43.404000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE bmm(1x50257x32768, 1x32768x768)
  bmm 27.0459 ms 100.0% 
  triton_bmm_901 28.0566 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_895 28.9188 ms 93.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_bmm_894 29.8875 ms 90.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_892 30.8091 ms 87.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_bmm_902 31.2515 ms 86.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_891 32.2224 ms 83.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_890 32.4854 ms 83.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_898 33.7879 ms 80.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_896 34.5201 ms 78.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 5.7565 seconds and 0.3608 seconds precompiling for 20 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 133), ('benchmarking.InductorBenchmarker.benchmark_gpu', 22), ('async_compile_cache_miss', 19), ('select_algorithm_num_precompiles', 19), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.bmm_50257_768_32768', 1)]
 .E0509 19:04:50.648000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:50.648000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114688, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:50.648000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(50257x32768, 32768x768)
  mm 27.0220 ms 100.0% 
  triton_mm_920 28.6218 ms 94.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_913 29.2509 ms 92.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_917 30.7866 ms 87.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_921 31.4286 ms 86.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_910 31.7901 ms 85.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_915 31.8812 ms 84.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_914 34.2467 ms 78.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_918 34.8518 ms 77.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_909 35.6383 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 5.8182 seconds and 0.3117 seconds precompiling for 20 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 133), ('benchmarking.InductorBenchmarker.benchmark_gpu', 22), ('async_compile_cache_miss', 19), ('select_algorithm_num_precompiles', 19), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.mm_50257_768_32768', 1)]
 .Einductor [('select_algorithm_num_precompiles', 10), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 .E0509 19:04:53.503000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:53.503000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196608, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:53.503000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:53.562000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:53.562000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:53.562000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:53.620000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:53.620000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:53.620000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:53.698000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:53.698000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:53.698000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:53.702000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:53.702000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:53.702000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(256x256, 256x256)
  triton_mm_944 0.0066 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_943 0.0076 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_945 0.0078 ms 84.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  mm 0.0082 ms 80.9% 
  triton_mm_949 0.0082 ms 80.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_951 0.0119 ms 55.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_952 0.0119 ms 55.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_956 0.0120 ms 55.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_948 0.0123 ms 53.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_953 0.0123 ms 53.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.3827 seconds and 0.3606 seconds precompiling for 20 choices
 E0509 19:04:54.134000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:54.134000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196608, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:54.134000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:54.193000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:54.193000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:54.193000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:54.251000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:54.251000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:54.251000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:54.328000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:54.328000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:54.328000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:54.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:54.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:54.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(256x256, 256x256)
  triton_mm_925 0.0075 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_926 0.0075 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_924 0.0076 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  mm 0.0079 ms 94.7% 
  triton_mm_930 0.0082 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_933 0.0118 ms 63.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_932 0.0119 ms 63.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_934 0.0123 ms 61.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_929 0.0123 ms 60.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_936 0.0123 ms 60.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.2869 seconds and 0.0001 seconds precompiling for 20 choices
 E0509 19:04:54.718000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:54.718000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:54.718000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:54.775000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:54.775000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147456, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:54.775000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:54.869000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:54.869000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:54.869000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(256x256, 256x256)
  triton_mm_964 0.0055 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_965 0.0056 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_962 0.0057 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_963 0.0059 ms 94.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  mm 0.0061 ms 90.1% 
  triton_mm_968 0.0061 ms 90.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_971 0.0075 ms 73.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_972 0.0076 ms 73.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_975 0.0077 ms 72.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_970 0.0082 ms 67.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.3099 seconds and 0.0001 seconds precompiling for 20 choices
 E0509 19:04:55.039000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:55.039000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:55.039000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:55.101000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:55.101000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147456, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:55.101000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 E0509 19:04:55.200000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: 
 E0509 19:04:55.200000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. 
 E0509 19:04:55.200000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice.
 AUTOTUNE mm(256x256, 256x256)
  triton_mm_982 0.0051 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_981 0.0058 ms 88.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_983 0.0058 ms 87.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_984 0.0059 ms 86.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  mm 0.0061 ms 82.8% 
  triton_mm_987 0.0061 ms 82.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_991 0.0076 ms 67.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_990 0.0077 ms 66.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_994 0.0077 ms 66.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_993 0.0079 ms 64.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.3306 seconds and 0.0001 seconds precompiling for 20 choices
 frames [('total', 1), ('ok', 1)]
 inline_call []
 unimplemented []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 637), ('benchmarking.InductorBenchmarker.benchmark_gpu', 89), ('async_compile_cache_miss', 49), ('select_algorithm_num_precompiles', 38), ('benchmarking.InductorBenchmarker.benchmark', 4), ('select_algorithm_autotune', 4), ('select_algorithm_precompile', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info [('aten.mm_256_256_256', 4)]
 .frames [('total', 1), ('ok', 1)]
 inline_call []
 unimplemented []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_256_256_256', 4)]
 .AUTOTUNE mm(5x4, 4x3)
  triton_mm_1075 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1
  triton_mm_1076 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1
  triton_mm_1077 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1
  triton_mm_1079 0.0036 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1
  triton_mm_1078 0.0037 ms 91.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1
 SubProcess AUTOTUNE benchmarking takes 0.6042 seconds and 0.0867 seconds precompiling for 5 choices
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 8), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('select_algorithm_num_precompiles', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_s1_s2_s0', 1)]
 .AUTOTUNE mm(100x10, 10x100)
  triton_mm_1081 0.0031 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1084 0.0033 ms 92.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1080 0.0034 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1082 0.0034 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1083 0.0037 ms 82.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1085 0.0037 ms 82.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1089 0.0038 ms 81.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_1086 0.0038 ms 80.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_1087 0.0038 ms 80.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1088 0.0041 ms 75.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.3161 seconds and 1.0394 seconds precompiling for 17 choices
 AUTOTUNE mm(100x10, 10x100)
  triton_mm_1096 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1100 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1097 0.0036 ms 97.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1099 0.0036 ms 96.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1098 0.0036 ms 95.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1101 0.0037 ms 94.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1103 0.0037 ms 94.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1102 0.0038 ms 91.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_1104 0.0041 ms 85.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_1105 0.0041 ms 85.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.3056 seconds and 0.9424 seconds precompiling for 17 choices
 AUTOTUNE mm(100x10, 10x100)
  triton_mm_1119 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1116 0.0034 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1115 0.0035 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1112 0.0035 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1113 0.0035 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1118 0.0035 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_1114 0.0036 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1117 0.0037 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1122 0.0038 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_1120 0.0041 ms 82.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.3135 seconds and 0.7753 seconds precompiling for 17 choices
 AUTOTUNE mm(100x10, 10x100)
  triton_mm_1130 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1128 0.0034 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1135 0.0034 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1129 0.0035 ms 95.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1132 0.0036 ms 92.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1131 0.0037 ms 90.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1133 0.0038 ms 89.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1137 0.0038 ms 89.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_mm_1138 0.0038 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_1134 0.0040 ms 84.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.3112 seconds and 1.1342 seconds precompiling for 17 choices
 AUTOTUNE mm(100x10, 10x100)
  triton_mm_1144 0.0029 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1147 0.0035 ms 83.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1148 0.0036 ms 82.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1146 0.0036 ms 81.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1145 0.0036 ms 79.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1149 0.0038 ms 77.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1151 0.0038 ms 77.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1152 0.0038 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_1154 0.0040 ms 72.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_1150 0.0041 ms 71.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.3073 seconds and 1.0976 seconds precompiling for 17 choices
 Cache Stats:
  autotune_local: puts: 0, misses: 0, hits: 0, 
  autotune_remote: puts: 2, misses: 2, hits: 3, 
  bundled_autotune: puts: 0, misses: 0, hits: 0, 
  fx_graph: puts: 0, misses: 0, hits: 0, 
  triton: puts: 0, misses: 0, hits: 0, 
  aot_autograd: puts: 0, misses: 0, hits: 0, 
  dynamo_pgo: puts: 0, misses: 0, hits: 0, 
 Cache Entries:
  autotune_remote:
    'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::51358a087e4794b75273376732b5ed9b0fe534dff4bfcda33e6a91b52c4cd428:c1': b'{"XBLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1...
    'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::eb21653aa258b18c3119d2ff55ce0f124ed954d0df0253f8e37c5c036f80a418:c1': b'{"XBLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1...
 Cache Stats:
  autotune_local: puts: 0, misses: 0, hits: 0, 
  autotune_remote: puts: 2, misses: 2, hits: 3, 
  bundled_autotune: puts: 0, misses: 0, hits: 0, 
  fx_graph: puts: 0, misses: 0, hits: 0, 
  triton: puts: 0, misses: 0, hits: 0, 
  aot_autograd: puts: 0, misses: 0, hits: 0, 
  dynamo_pgo: puts: 0, misses: 0, hits: 0, 
 Cache Entries:
  autotune_remote:
    'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::2104d7bc884978e3985bdb2c8f01e511e320b9198192788d15ae25e1469d3425:c1': b'{"XBLOCK": 256, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1...
    'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::e53bbadf30ab7b008055c8eed484fc260c45957d83de810cbd62287008666588:c1': b'{"XBLOCK": 256, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1...
 frames [('total', 10), ('ok', 10)]
 stats [('calls_captured', 20), ('unique_graphs', 10)]
 aot_autograd [('total', 10), ('autograd_cache_bypass', 10), ('ok', 10)]
 inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 105), ('select_algorithm_num_precompiles', 80), ('async_compile_cache_miss', 15), ('select_algorithm_precompile', 5), ('benchmarking.InductorBenchmarker.benchmark', 5), ('select_algorithm_autotune', 5), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.mm_100_100_10', 6)]
 inline_call []
 .AUTOTUNE mm(100x10, 10x100)
  triton_mm_1161 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1160 0.0036 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1162 0.0036 ms 94.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1164 0.0038 ms 91.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1163 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1165 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1166 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_1167 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1168 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_1170 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.2971 seconds and 1.4003 seconds precompiling for 17 choices
 AUTOTUNE mm(100x10, 10x100)
  triton_mm_1177 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1180 0.0037 ms 95.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1179 0.0038 ms 94.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1176 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1178 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1181 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1182 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_1183 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1186 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_1187 0.0054 ms 65.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.2962 seconds and 0.9495 seconds precompiling for 17 choices
 AUTOTUNE mm(100x10, 10x100)
  triton_mm_1193 0.0038 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1194 0.0038 ms 99.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1196 0.0038 ms 99.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1195 0.0038 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1192 0.0041 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1197 0.0041 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1198 0.0041 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_1199 0.0041 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1200 0.0041 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_1202 0.0041 ms 91.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.2969 seconds and 1.4035 seconds precompiling for 17 choices
 AUTOTUNE mm(100x10, 10x100)
  triton_mm_1209 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1210 0.0037 ms 93.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1211 0.0038 ms 91.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1208 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1212 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1213 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1214 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_1215 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1218 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_1219 0.0054 ms 63.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.3139 seconds and 1.3179 seconds precompiling for 17 choices
 AUTOTUNE mm(100x10, 10x100)
  triton_mm_1227 0.0037 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1225 0.0038 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1226 0.0038 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1228 0.0038 ms 97.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1224 0.0041 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1229 0.0041 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1230 0.0041 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8
  triton_mm_1231 0.0041 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1234 0.0041 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_1235 0.0054 ms 68.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
 SingleProcess AUTOTUNE benchmarking takes 0.2981 seconds and 1.5387 seconds precompiling for 17 choices
 Cache Stats:
  autotune_local: puts: 0, misses: 0, hits: 0, 
  autotune_remote: puts: 2, misses: 2, hits: 3, 
  bundled_autotune: puts: 0, misses: 0, hits: 0, 
  fx_graph: puts: 0, misses: 0, hits: 0, 
  triton: puts: 0, misses: 0, hits: 0, 
  aot_autograd: puts: 0, misses: 0, hits: 0, 
  dynamo_pgo: puts: 0, misses: 0, hits: 0, 
 Cache Entries:
  autotune_remote:
    'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::103bad2bba162792a246553a9660490b13c42cb9fc556ee026a2ee76225c5e39:c1': b'{"XBLOCK": 256, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1...
    'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::75d7fc095dca95edbe9b36a582f8a7a52af37ea17ffa0cf4b1f698f3da660974:c1': b'{"XBLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1...
 Cache Stats:
  autotune_local: puts: 0, misses: 0, hits: 0, 
  autotune_remote: puts: 2, misses: 2, hits: 3, 
  bundled_autotune: puts: 0, misses: 0, hits: 0, 
  fx_graph: puts: 0, misses: 0, hits: 0, 
  triton: puts: 0, misses: 0, hits: 0, 
  aot_autograd: puts: 0, misses: 0, hits: 0, 
  dynamo_pgo: puts: 0, misses: 0, hits: 0, 
 Cache Entries:
  autotune_remote:
    'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::74d8befde4020dffc0fb6b69ebb0e418c3f11ac8bc30ece94f24cd314c0d60da:c1': b'{"XBLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1...
    'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::945b170871dfe60acaf2756015ad6acb1a693ea876400ff39314054d40efc61c:c1': b'{"XBLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1...
 frames [('total', 40), ('ok', 40)]
 stats [('calls_captured', 20), ('unique_graphs', 10)]
 aot_autograd [('total', 10), ('autograd_cache_bypass', 10), ('ok', 10)]
 inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 93), ('select_algorithm_num_precompiles', 80), ('async_compile_cache_miss', 15), ('select_algorithm_precompile', 5), ('benchmarking.InductorBenchmarker.benchmark', 5), ('select_algorithm_autotune', 5), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.mm_s0_s0_s1', 6)]
 inline_call []
 .AUTOTUNE mm(1x63, 63x128)
  triton_mm_1243 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=2
  triton_mm_1241 0.0051 ms 80.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=2
  triton_mm_1242 0.0056 ms 73.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1240 0.0072 ms 57.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1558 seconds and 0.1524 seconds precompiling for 4 choices
 frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_1_s1_s0', 1)]
 .AUTOTUNE mm(1x64, 64x128)
  triton_mm_1245 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=2
  triton_mm_1246 0.0057 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_1247 0.0057 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=2
  triton_mm_1244 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1575 seconds and 0.1463 seconds precompiling for 4 choices
 frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_1_s1_s0', 1)]
 .AUTOTUNE mm(20x1, 1x1)
  triton_mm_1251 0.0028 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=2
  triton_mm_1250 0.0034 ms 84.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=2
  triton_mm_1248 0.0035 ms 82.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1249 0.0035 ms 81.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1526 seconds and 0.0779 seconds precompiling for 4 choices
 frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_s0_1_1', 1)]
 .AUTOTUNE mm(64x128, 128x256)
  triton_mm_1254 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1255 0.0038 ms 94.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1253 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1252 0.0076 ms 47.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1604 seconds and 0.0812 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('select_algorithm_num_precompiles', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_64_256_128', 1)]
 .sAUTOTUNE mm(128x128, 128x128)
  triton_mm_1259 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1257 0.0037 ms 94.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1258 0.0041 ms 87.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1256 0.0061 ms 57.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1626 seconds and 0.0937 seconds precompiling for 4 choices
 Eframes [('total', 2), ('ok', 2)]
 stats [('calls_captured', 5), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 9), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 2), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_128_128_128', 2)]
 AUTOTUNE mm(256x256, 256x128)
  triton_mm_1265 0.0058 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1266 0.0059 ms 98.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1267 0.0061 ms 94.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1264 0.0123 ms 47.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1550 seconds and 0.1321 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('select_algorithm_num_precompiles', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_256_128_256', 1)]
 .AUTOTUNE mm(64x128, 128x256)
  triton_mm_1271 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1270 0.0056 ms 73.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1269 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1268 0.0082 ms 50.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1526 seconds and 0.1262 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_64_256_128', 1)]
 .AUTOTUNE mm(64x64, 64x64)
  triton_mm_1275 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1274 0.0040 ms 89.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1273 0.0041 ms 86.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1272 0.0058 ms 61.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1591 seconds and 0.1152 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_64_64_64', 1)]
 .AUTOTUNE mm(64x120, 120x64)
  triton_mm_1278 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1279 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1277 0.0073 ms 55.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1276 0.0078 ms 52.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1522 seconds and 0.1398 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_64_64_120', 1)]
 .AUTOTUNE mm(64x128, 128x256)
  triton_mm_1283 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1282 0.0055 ms 74.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1281 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1280 0.0082 ms 50.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1560 seconds and 0.1241 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_64_256_128', 1)]
 .AUTOTUNE mm(128x128, 128x128)
  triton_mm_1287 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1286 0.0055 ms 74.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1285 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1284 0.0079 ms 51.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1569 seconds and 0.1299 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_128_128_128', 1)]
 .AUTOTUNE mm(63x120, 120x250)
  triton_mm_1291 0.0050 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1290 0.0054 ms 91.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1289 0.0058 ms 85.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1288 0.0098 ms 51.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1597 seconds and 0.1381 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_63_250_120', 1)]
 .AUTOTUNE mm(128x128, 128x128)
  triton_mm_1295 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1293 0.0055 ms 74.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1294 0.0055 ms 74.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1292 0.0075 ms 54.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1517 seconds and 0.1158 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 147), ('benchmarking.InductorBenchmarker.benchmark_gpu', 17), ('async_compile_cache_miss', 12), ('select_algorithm_num_precompiles', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_128_128_128', 1)]
 .AUTOTUNE mm(128x32, 32x128)
  triton_mm_1301 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1302 0.0038 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1300 0.0038 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1303 0.0038 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.1504 seconds and 0.0751 seconds precompiling for 4 choices
 AUTOTUNE mm(128x16, 16x128)
  triton_mm_1296 0.0033 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
  triton_mm_1297 0.0034 ms 96.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1298 0.0038 ms 86.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1299 0.0038 ms 86.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
 SingleProcess AUTOTUNE benchmarking takes 0.0811 seconds and 0.0003 seconds precompiling for 4 choices
 AUTOTUNE mm(128x128, 128x128)
  triton_mm_1306 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1307 0.0054 ms 75.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1305 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1304 0.0076 ms 53.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.0714 seconds and 0.1176 seconds precompiling for 4 choices
 frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 7), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 203), ('benchmarking.InductorBenchmarker.benchmark_gpu', 25), ('async_compile_cache_miss', 23), ('select_algorithm_num_precompiles', 12), ('select_algorithm_precompile', 3), ('select_algorithm_autotune', 3), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info [('aten.mm_128_128_16', 1), ('aten.mm_128_128_32', 1), ('aten.mm_128_128_128', 1)]
 .AUTOTUNE mm(64x127, 127x64)
  triton_mm_1309 0.0054 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1311 0.0054 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1310 0.0055 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1308 0.0097 ms 56.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1544 seconds and 0.1414 seconds precompiling for 4 choices
 frames [('total', 3), ('ok', 3)]
 inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)]
 inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 3), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_64_64_127', 3)]
 .AUTOTUNE mm(252x248, 248x128)
  triton_mm_1321 0.0059 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1322 0.0059 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1323 0.0059 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1320 0.0140 ms 41.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1544 seconds and 0.1310 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 63), ('async_compile_cache_miss', 8), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('select_algorithm_num_precompiles', 4), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_252_128_248', 1)]
 .AUTOTUNE mm(252x248, 248x128)
  triton_mm_1325 0.0056 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1326 0.0058 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1327 0.0058 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1324 0.0137 ms 41.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1596 seconds and 0.1340 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 63), ('async_compile_cache_miss', 8), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('select_algorithm_num_precompiles', 4), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_252_128_248', 1)]
 .AUTOTUNE mm(252x248, 248x128)
  triton_mm_1329 0.0055 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1330 0.0059 ms 93.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1331 0.0059 ms 93.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1328 0.0123 ms 44.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1566 seconds and 0.1339 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 63), ('async_compile_cache_miss', 8), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('select_algorithm_num_precompiles', 4), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_252_128_248', 1)]
 .AUTOTUNE mm(64x128, 128x256)
  triton_mm_1334 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1335 0.0053 ms 77.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1333 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1332 0.0079 ms 52.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1609 seconds and 0.1504 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_64_256_128', 1)]
 .AUTOTUNE mm(128x128, 128x128)
  triton_mm_1338 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1339 0.0056 ms 72.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1337 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1336 0.0076 ms 54.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1566 seconds and 0.1363 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_128_128_128', 1)]
 .AUTOTUNE mm(63x120, 120x250)
  triton_mm_1342 0.0054 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1343 0.0055 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1341 0.0061 ms 88.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1340 0.0100 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1582 seconds and 0.1255 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_63_250_120', 1)]
 .AUTOTUNE mm(256x256, 256x256)
  triton_mm_1347 0.0059 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1345 0.0061 ms 95.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1346 0.0061 ms 95.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1344 0.0134 ms 43.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1592 seconds and 0.1350 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('select_algorithm_num_precompiles', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_256_256_256', 1)]
 .AUTOTUNE mm(256x256, 256x256)
  triton_mm_1350 0.0061 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1349 0.0061 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1351 0.0061 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1348 0.0126 ms 48.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1628 seconds and 0.1518 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('select_algorithm_num_precompiles', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_256_256_256', 1)]
 .AUTOTUNE mm(64x64, 64x64)
  triton_mm_1354 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1353 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1355 0.0041 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1352 0.0056 ms 65.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1525 seconds and 0.0915 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_64_64_64', 1)]
 .AUTOTUNE mm(64x128, 128x256)
  triton_mm_1358 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1359 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1357 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1356 0.0079 ms 51.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1556 seconds and 0.1287 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_64_256_128', 1)]
 .AUTOTUNE mm(128x128, 128x128)
  triton_mm_1362 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1363 0.0055 ms 74.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1361 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1360 0.0079 ms 51.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1567 seconds and 0.1207 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_128_128_128', 1)]
 .AUTOTUNE mm(63x120, 120x250)
  triton_mm_1366 0.0053 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1367 0.0055 ms 96.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8
  triton_mm_1365 0.0059 ms 91.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_1364 0.0100 ms 53.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2
 SingleProcess AUTOTUNE benchmarking takes 0.1642 seconds and 0.1487 seconds precompiling for 4 choices
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)]
 graph_break []
 aten_mm_info [('aten.mm_63_250_120', 1)]
 .Exception in TuningProcess
 Traceback (most recent call last):
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 118, in process_main
    TuningProcess.workloop(request_queue, response_queue)
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 135, in workloop
    response_queue.put(obj.benchmark())
                       ^^^^^^^^^^^^^^^
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 1220, in benchmark
    assert visible_devices == self.parent_visible_devices
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 AssertionError
 /workspace/pytorch/torch/_inductor/autotune_process.py:382: UserWarning: Failed to benchmark choice 'test'. It will be ignored. Please debug the root cause in case the choice can bring perf gains.
  warnings.warn(
 ..
 ======================================================================
 ERROR: test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True)
 ----------------------------------------------------------------------
 Traceback (most recent call last):
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper
    method(*args, **kwargs)
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test
    test(self, **param_kwargs)
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 437, in test_max_autotune_addmm_persistent_tma
    c_actual = torch.compile(addmm, dynamic=dynamic)(x, a, b)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn
    raise e.remove_dynamo_frames() from None  # see TORCHDYNAMO_VERBOSE=1
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner
    raise InductorError(e, currentframe()).with_traceback(
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile
    compiled_fn = graph.compile_to_module().call
                  ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module
    return self._compile_to_module()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module
    self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
                                                             ^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen
    self._update_scheduler()
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__
    self._init(nodes)
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init
    self.finalize_multi_template_buffers()
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers
    min_node_unfused, _ = multi_node.get_min_choice()
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice
    min_choice = min(self.choice_timings, key=self.choice_timings.get)  # type: ignore[arg-type]
                     ^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings
    self._choice_timings = self._choice_timings_fn()
                           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings
    timings = do_autotuning(precompile_fn)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning
    timings = self.lookup(
              ^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup
    timings = benchmark(choices)
              ^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune
    return make_benchmark_fn()(choices)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process
    raise e
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process
    timing = benchmark_choice_in_current_process(choice, inputs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process
    result = choice.benchmark(*inpts, out=output)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark
    return self.bmreq.benchmark(*args, output_tensor=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark
    out = self.do_bench(fn, *input_tensors, output_tensor)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench
    out = benchmarker.benchmark_gpu(fn)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu
    _callable()
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace
    run_method(
  File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run
    return launcher(
           ^^^^^^^^^
  File "<string>", line 5, in launcher
  File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__
    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
 torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index

 Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


 To execute this test, run the following from the base repo dir:
    python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True

 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

 ======================================================================
 ERROR: test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True)
 ----------------------------------------------------------------------
 Traceback (most recent call last):
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper
    method(*args, **kwargs)
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test
    test(self, **param_kwargs)
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 437, in test_max_autotune_addmm_persistent_tma
    c_actual = torch.compile(addmm, dynamic=dynamic)(x, a, b)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn
    raise e.remove_dynamo_frames() from None  # see TORCHDYNAMO_VERBOSE=1
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner
    raise InductorError(e, currentframe()).with_traceback(
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile
    compiled_fn = graph.compile_to_module().call
                  ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module
    return self._compile_to_module()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module
    self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
                                                             ^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen
    self._update_scheduler()
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__
    self._init(nodes)
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init
    self.finalize_multi_template_buffers()
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers
    min_node_unfused, _ = multi_node.get_min_choice()
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice
    min_choice = min(self.choice_timings, key=self.choice_timings.get)  # type: ignore[arg-type]
                     ^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings
    self._choice_timings = self._choice_timings_fn()
                           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings
    timings = do_autotuning(precompile_fn)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning
    timings = self.lookup(
              ^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup
    timings = benchmark(choices)
              ^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune
    return make_benchmark_fn()(choices)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process
    raise e
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process
    timing = benchmark_choice_in_current_process(choice, inputs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process
    result = choice.benchmark(*inpts, out=output)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark
    return self.bmreq.benchmark(*args, output_tensor=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark
    out = self.do_bench(fn, *input_tensors, output_tensor)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench
    out = benchmarker.benchmark_gpu(fn)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu
    _callable()
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace
    run_method(
  File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run
    return launcher(
           ^^^^^^^^^
  File "<string>", line 5, in launcher
  File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__
    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
 torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index

 Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


 To execute this test, run the following from the base repo dir:
    python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True

 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

 ======================================================================
 ERROR: test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True)
 ----------------------------------------------------------------------
 Traceback (most recent call last):
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper
    method(*args, **kwargs)
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test
    test(self, **param_kwargs)
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 437, in test_max_autotune_addmm_persistent_tma
    c_actual = torch.compile(addmm, dynamic=dynamic)(x, a, b)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn
    raise e.remove_dynamo_frames() from None  # see TORCHDYNAMO_VERBOSE=1
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner
    raise InductorError(e, currentframe()).with_traceback(
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile
    compiled_fn = graph.compile_to_module().call
                  ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module
    return self._compile_to_module()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module
    self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
                                                             ^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen
    self._update_scheduler()
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__
    self._init(nodes)
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init
    self.finalize_multi_template_buffers()
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers
    min_node_unfused, _ = multi_node.get_min_choice()
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice
    min_choice = min(self.choice_timings, key=self.choice_timings.get)  # type: ignore[arg-type]
                     ^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings
    self._choice_timings = self._choice_timings_fn()
                           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings
    timings = do_autotuning(precompile_fn)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning
    timings = self.lookup(
              ^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup
    timings = benchmark(choices)
              ^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune
    return make_benchmark_fn()(choices)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process
    raise e
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process
    timing = benchmark_choice_in_current_process(choice, inputs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process
    result = choice.benchmark(*inpts, out=output)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark
    return self.bmreq.benchmark(*args, output_tensor=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark
    out = self.do_bench(fn, *input_tensors, output_tensor)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench
    out = benchmarker.benchmark_gpu(fn)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu
    _callable()
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace
    run_method(
  File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run
    return launcher(
           ^^^^^^^^^
  File "<string>", line 5, in launcher
  File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__
    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
 torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index

 Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


 To execute this test, run the following from the base repo dir:
    python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True

 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

 ======================================================================
 ERROR: test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True)
 ----------------------------------------------------------------------
 Traceback (most recent call last):
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper
    method(*args, **kwargs)
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test
    test(self, **param_kwargs)
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 437, in test_max_autotune_addmm_persistent_tma
    c_actual = torch.compile(addmm, dynamic=dynamic)(x, a, b)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn
    raise e.remove_dynamo_frames() from None  # see TORCHDYNAMO_VERBOSE=1
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner
    raise InductorError(e, currentframe()).with_traceback(
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile
    compiled_fn = graph.compile_to_module().call
                  ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module
    return self._compile_to_module()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module
    self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
                                                             ^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen
    self._update_scheduler()
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__
    self._init(nodes)
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init
    self.finalize_multi_template_buffers()
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers
    min_node_unfused, _ = multi_node.get_min_choice()
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice
    min_choice = min(self.choice_timings, key=self.choice_timings.get)  # type: ignore[arg-type]
                     ^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings
    self._choice_timings = self._choice_timings_fn()
                           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings
    timings = do_autotuning(precompile_fn)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning
    timings = self.lookup(
              ^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup
    timings = benchmark(choices)
              ^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune
    return make_benchmark_fn()(choices)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process
    raise e
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process
    timing = benchmark_choice_in_current_process(choice, inputs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process
    result = choice.benchmark(*inpts, out=output)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark
    return self.bmreq.benchmark(*args, output_tensor=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark
    out = self.do_bench(fn, *input_tensors, output_tensor)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench
    out = benchmarker.benchmark_gpu(fn)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu
    _callable()
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace
    run_method(
  File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run
    return launcher(
           ^^^^^^^^^
  File "<string>", line 5, in launcher
  File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__
    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
 torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index

 Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


 To execute this test, run the following from the base repo dir:
    python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True

 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

 ======================================================================
 ERROR: test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True)
 ----------------------------------------------------------------------
 Traceback (most recent call last):
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper
    method(*args, **kwargs)
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test
    test(self, **param_kwargs)
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 262, in test_max_autotune_regular_mm_persistent_tma
    c_actual = torch.compile(mm, dynamic=dynamic)(a, b)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn
    raise e.remove_dynamo_frames() from None  # see TORCHDYNAMO_VERBOSE=1
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner
    raise InductorError(e, currentframe()).with_traceback(
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile
    compiled_fn = graph.compile_to_module().call
                  ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module
    return self._compile_to_module()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module
    self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
                                                             ^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen
    self._update_scheduler()
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__
    self._init(nodes)
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init
    self.finalize_multi_template_buffers()
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers
    min_node_unfused, _ = multi_node.get_min_choice()
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice
    min_choice = min(self.choice_timings, key=self.choice_timings.get)  # type: ignore[arg-type]
                     ^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings
    self._choice_timings = self._choice_timings_fn()
                           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings
    timings = do_autotuning(precompile_fn)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning
    timings = self.lookup(
              ^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup
    timings = benchmark(choices)
              ^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune
    return make_benchmark_fn()(choices)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process
    raise e
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process
    timing = benchmark_choice_in_current_process(choice, inputs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process
    result = choice.benchmark(*inpts, out=output)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark
    return self.bmreq.benchmark(*args, output_tensor=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark
    out = self.do_bench(fn, *input_tensors, output_tensor)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench
    out = benchmarker.benchmark_gpu(fn)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu
    _callable()
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace
    run_method(
  File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run
    return launcher(
           ^^^^^^^^^
  File "<string>", line 5, in launcher
  File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__
    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
 torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index

 Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


 To execute this test, run the following from the base repo dir:
    python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True

 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

 ======================================================================
 ERROR: test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True)
 ----------------------------------------------------------------------
 Traceback (most recent call last):
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper
    method(*args, **kwargs)
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test
    test(self, **param_kwargs)
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 262, in test_max_autotune_regular_mm_persistent_tma
    c_actual = torch.compile(mm, dynamic=dynamic)(a, b)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn
    raise e.remove_dynamo_frames() from None  # see TORCHDYNAMO_VERBOSE=1
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner
    raise InductorError(e, currentframe()).with_traceback(
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile
    compiled_fn = graph.compile_to_module().call
                  ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module
    return self._compile_to_module()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module
    self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
                                                             ^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen
    self._update_scheduler()
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__
    self._init(nodes)
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init
    self.finalize_multi_template_buffers()
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers
    min_node_unfused, _ = multi_node.get_min_choice()
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice
    min_choice = min(self.choice_timings, key=self.choice_timings.get)  # type: ignore[arg-type]
                     ^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings
    self._choice_timings = self._choice_timings_fn()
                           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings
    timings = do_autotuning(precompile_fn)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning
    timings = self.lookup(
              ^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup
    timings = benchmark(choices)
              ^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune
    return make_benchmark_fn()(choices)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process
    raise e
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process
    timing = benchmark_choice_in_current_process(choice, inputs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process
    result = choice.benchmark(*inpts, out=output)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark
    return self.bmreq.benchmark(*args, output_tensor=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark
    out = self.do_bench(fn, *input_tensors, output_tensor)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench
    out = benchmarker.benchmark_gpu(fn)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu
    _callable()
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace
    run_method(
  File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run
    return launcher(
           ^^^^^^^^^
  File "<string>", line 5, in launcher
  File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__
    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
 torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index

 Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


 To execute this test, run the following from the base repo dir:
    python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True

 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

 ======================================================================
 ERROR: test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True)
 ----------------------------------------------------------------------
 Traceback (most recent call last):
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper
    method(*args, **kwargs)
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test
    test(self, **param_kwargs)
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 262, in test_max_autotune_regular_mm_persistent_tma
    c_actual = torch.compile(mm, dynamic=dynamic)(a, b)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn
    raise e.remove_dynamo_frames() from None  # see TORCHDYNAMO_VERBOSE=1
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner
    raise InductorError(e, currentframe()).with_traceback(
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile
    compiled_fn = graph.compile_to_module().call
                  ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module
    return self._compile_to_module()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module
    self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
                                                             ^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen
    self._update_scheduler()
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__
    self._init(nodes)
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init
    self.finalize_multi_template_buffers()
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers
    min_node_unfused, _ = multi_node.get_min_choice()
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice
    min_choice = min(self.choice_timings, key=self.choice_timings.get)  # type: ignore[arg-type]
                     ^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings
    self._choice_timings = self._choice_timings_fn()
                           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings
    timings = do_autotuning(precompile_fn)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning
    timings = self.lookup(
              ^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup
    timings = benchmark(choices)
              ^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune
    return make_benchmark_fn()(choices)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process
    raise e
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process
    timing = benchmark_choice_in_current_process(choice, inputs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process
    result = choice.benchmark(*inpts, out=output)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark
    return self.bmreq.benchmark(*args, output_tensor=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark
    out = self.do_bench(fn, *input_tensors, output_tensor)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench
    out = benchmarker.benchmark_gpu(fn)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu
    _callable()
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace
    run_method(
  File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run
    return launcher(
           ^^^^^^^^^
  File "<string>", line 5, in launcher
  File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__
    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
 torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index

 Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


 To execute this test, run the following from the base repo dir:
    python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True

 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

 ======================================================================
 ERROR: test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True)
 ----------------------------------------------------------------------
 Traceback (most recent call last):
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper
    method(*args, **kwargs)
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test
    test(self, **param_kwargs)
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 262, in test_max_autotune_regular_mm_persistent_tma
    c_actual = torch.compile(mm, dynamic=dynamic)(a, b)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn
    raise e.remove_dynamo_frames() from None  # see TORCHDYNAMO_VERBOSE=1
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner
    raise InductorError(e, currentframe()).with_traceback(
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile
    compiled_fn = graph.compile_to_module().call
                  ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module
    return self._compile_to_module()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module
    self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
                                                             ^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen
    self._update_scheduler()
  File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__
    self._init(nodes)
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init
    self.finalize_multi_template_buffers()
  File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers
    min_node_unfused, _ = multi_node.get_min_choice()
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice
    min_choice = min(self.choice_timings, key=self.choice_timings.get)  # type: ignore[arg-type]
                     ^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings
    self._choice_timings = self._choice_timings_fn()
                           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings
    timings = do_autotuning(precompile_fn)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning
    timings = self.lookup(
              ^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup
    timings = benchmark(choices)
              ^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune
    return make_benchmark_fn()(choices)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process
    raise e
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process
    timing = benchmark_choice_in_current_process(choice, inputs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process
    result = choice.benchmark(*inpts, out=output)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark
    return self.bmreq.benchmark(*args, output_tensor=out)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark
    out = self.do_bench(fn, *input_tensors, output_tensor)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench
    out = benchmarker.benchmark_gpu(fn)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu
    _callable()
  File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace
    run_method(
  File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run
    return launcher(
           ^^^^^^^^^
  File "<string>", line 5, in launcher
  File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__
    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
 torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index

 Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


 To execute this test, run the following from the base repo dir:
    python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True

 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

 ======================================================================
 ERROR: test_non_contiguous_input_mm_plus_mm (__main__.TestMaxAutotune.test_non_contiguous_input_mm_plus_mm)
 ----------------------------------------------------------------------
 Traceback (most recent call last):
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper
    method(*args, **kwargs)
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 1928, in wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 1040, in test_non_contiguous_input_mm_plus_mm
    x2 = rand_strided((50257, 32768), (1, 50304), device=GPU_TYPE)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/pytorch/torch/_dynamo/testing.py", line 411, in rand_strided
    buffer = torch.randn(needed_size, dtype=dtype, device=device)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.14 GiB. GPU 0 has a total capacity of 15.48 GiB of which 4.01 GiB is free. Including non-PyTorch memory, this process has 11.10 GiB memory in use. Process 437762 has 360.00 MiB memory in use. Of the allocated memory 6.23 GiB is allocated by PyTorch, and 2.98 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

 To execute this test, run the following from the base repo dir:
    python test/inductor/test_max_autotune.py TestMaxAutotune.test_non_contiguous_input_mm_plus_mm

 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

 ======================================================================
 ERROR: test_low_precision (__main__.TestPrologueFusion.test_low_precision)
 ----------------------------------------------------------------------
 Traceback (most recent call last):
  File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper
    method(*args, **kwargs)
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 1407, in test_low_precision
    self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=2)
  File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 1321, in check_code
    ).run(code_str)
      ^^^^^^^^^^^^^
 RuntimeError: Expected to not find ".run(" but found it
        # Topologically Sorted Source Nodes: [to, add, matmul], Original ATen: [aten._to_copy, aten.add, aten.mm]
        stream0 = get_raw_stream(0)
        triton_tem_fused__to_copy_add_mm_1.run(buf0, arg1_1, buf1, 8, 1, 1, stream=stream0)
                                          ~~~~~ <--- HERE
        del arg1_1
        del buf0
 From CHECK-NOT: .run(


 To execute this test, run the following from the base repo dir:
    python test/inductor/test_max_autotune.py TestPrologueFusion.test_low_precision

 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

 ----------------------------------------------------------------------
 Ran 102 tests in 152.328s

 FAILED (errors=10, skipped=3)
diff --git a/test_torchinductor.log b/test_torchinductor.log
 test_config_option_dont_assume_alignment_cudagraphs_cpu (__main__.CpuTests.test_config_option_dont_assume_alignment_cudagraphs_cpu) ... frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 inductor [('fxgraph_cache_miss', 2), ('cudagraph_skips', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_AllenaiLongformerBase_repro_cuda (__main__.GPUTests.test_AllenaiLongformerBase_repro_cuda) ... inline_call []
 stats [('calls_captured', 22), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 10), ('pattern_matcher_count', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test__dyn_quant_matmul_4bit_cuda (__main__.GPUTests.test__dyn_quant_matmul_4bit_cuda) ... skipped 'No _dyn_quant_matmul_4bit implementation on CUDA'
 test__dyn_quant_pack_4bit_weight_cuda (__main__.GPUTests.test__dyn_quant_pack_4bit_weight_cuda) ... skipped 'No _dyn_quant_pack_4bit_weight implementation on CUDA'
 test__unsafe_masked_index_cuda (__main__.GPUTests.test__unsafe_masked_index_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test__unsafe_masked_index_put_accumulate_cuda (__main__.GPUTests.test__unsafe_masked_index_put_accumulate_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_abs_cuda (__main__.GPUTests.test_abs_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_adaptive_avg_pool1d_argmax_cuda (__main__.GPUTests.test_adaptive_avg_pool1d_argmax_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_adaptive_avg_pool2d1_cuda (__main__.GPUTests.test_adaptive_avg_pool2d1_cuda) ... inline_call []
 stats [('calls_captured', 15), ('unique_graphs', 5)]
 aot_autograd [('total', 5), ('autograd_cache_miss', 5), ('autograd_cache_saved', 5), ('ok', 5)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 10), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 5), ('extern_calls', 2), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_adaptive_avg_pool2d2_cuda (__main__.GPUTests.test_adaptive_avg_pool2d2_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_adaptive_avg_pool2d_low_prec_cuda (__main__.GPUTests.test_adaptive_avg_pool2d_low_prec_cuda) ... frames [('total', 2), ('ok', 2)]
 inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_adaptive_avg_pool_errors_with_long_cuda (__main__.GPUTests.test_adaptive_avg_pool_errors_with_long_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('not_ok', 2), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 3), ('extern_calls', 2), ('async_compile_cache_miss', 1), ('intermediate_hooks', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_adaptive_avg_pool_with_output_size_0_cuda (__main__.GPUTests.test_adaptive_avg_pool_with_output_size_0_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_adaptive_max_pool2d1_cuda (__main__.GPUTests.test_adaptive_max_pool2d1_cuda) ... inline_call []
 stats [('calls_captured', 15), ('unique_graphs', 5)]
 aot_autograd [('total', 5), ('autograd_cache_miss', 5), ('autograd_cache_saved', 5), ('ok', 5)]
 inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 7), ('async_compile_cache_hit', 7), ('fxgraph_cache_miss', 5)]
 graph_break []
 aten_mm_info []
 ok
 test_adaptive_max_pool2d2_cuda (__main__.GPUTests.test_adaptive_max_pool2d2_cuda) ... inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 3), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_adaptive_max_pool2d3_cuda (__main__.GPUTests.test_adaptive_max_pool2d3_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_adaptive_pool_errors_with_long_cuda (__main__.GPUTests.test_adaptive_pool_errors_with_long_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('not_ok', 2), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 3), ('extern_calls', 2), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_add_complex3_cuda (__main__.GPUTests.test_add_complex3_cuda) ... /workspace/pytorch/torch/_inductor/lowering.py:1917: UserWarning: Torchinductor does not support code generation for complex operators. Performance may be worse than eager.
  warnings.warn(
 frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 6), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('intermediate_hooks', 3), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_add_complex4_cuda (__main__.GPUTests.test_add_complex4_cuda) ... /workspace/pytorch/test/inductor/test_torchinductor.py:1365: UserWarning: ComplexHalf support is experimental and many operators don't support it yet. (Triggered internally at /workspace/pytorch/aten/src/ATen/EmptyTensor.cpp:56.)
  x = torch.tensor(
 frames [('total', 3), ('ok', 3)]
 stats [('calls_captured', 9), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)]
 inductor [('pattern_matcher_nodes', 33), ('pattern_matcher_count', 27), ('triton_bundler_save_kernel', 21), ('extern_calls', 18), ('intermediate_hooks', 9), ('fxgraph_cache_miss', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3)]
 graph_break []
 aten_mm_info []
 ok
 test_add_complex5_cuda (__main__.GPUTests.test_add_complex5_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('extern_calls', 16), ('intermediate_hooks', 8), ('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_add_complex6_cuda (__main__.GPUTests.test_add_complex6_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('extern_calls', 16), ('triton_bundler_save_kernel', 7), ('intermediate_hooks', 6), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_add_complex_cuda (__main__.GPUTests.test_add_complex_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('extern_calls', 16), ('intermediate_hooks', 8), ('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_add_const_float_cuda (__main__.GPUTests.test_add_const_float_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_add_const_int_cuda (__main__.GPUTests.test_add_const_int_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('ok', 6), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 4), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_add_inplace_permuted_cuda (__main__.GPUTests.test_add_inplace_permuted_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_adding_tensor_offsets_cuda (__main__.GPUTests.test_adding_tensor_offsets_cuda) ... stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_addmm_cuda (__main__.GPUTests.test_addmm_cuda) ... /workspace/pytorch/torch/_inductor/compile_fx.py:236: UserWarning: TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.
  warnings.warn(
 inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('extern_calls', 2)]
 graph_break []
 aten_mm_info [('aten.mm_8_8_8', 2)]
 ok
 test_addmv_cuda (__main__.GPUTests.test_addmv_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_alexnet_prefix_cuda (__main__.GPUTests.test_alexnet_prefix_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2), ('extern_calls', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_aliased_buffer_reuse_cuda (__main__.GPUTests.test_aliased_buffer_reuse_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.mm_4_4_4', 1)]
 ok
 test_angle_cuda (__main__.GPUTests.test_angle_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('extern_calls', 12), ('intermediate_hooks', 6), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_any_cuda (__main__.GPUTests.test_any_cuda) ... inline_call []
 stats [('calls_captured', 32), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_aoti_eager_cache_hit_cuda (__main__.GPUTests.test_aoti_eager_cache_hit_cuda) ... W0509 18:19:03.396000 415353 torch/_export/__init__.py:67] +============================+
 W0509 18:19:03.397000 415353 torch/_export/__init__.py:68] |     !!!   WARNING   !!!    |
 W0509 18:19:03.397000 415353 torch/_export/__init__.py:69] +============================+
 W0509 18:19:03.397000 415353 torch/_export/__init__.py:70] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead.
 W0509 18:19:03.397000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs
 W0509 18:19:03.404000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs
 /workspace/pytorch/torch/library.py:288: UserWarning: Warning only once for all operators,  other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
    registered at /workspace/pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: CUDA
  previous kernel: registered at /workspace/pytorch/build/aten/src/ATen/RegisterCPU_2.cpp:1215
       new kernel: registered at /dev/null:137 (Triggered internally at /workspace/pytorch/aten/src/ATen/core/dispatch/OperatorEntry.cpp:154.)
  impl_fn(self.ns, name.split("::")[-1], dispatch_key)
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 inductor [('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_aoti_eager_dtype_device_layout_cuda (__main__.GPUTests.test_aoti_eager_dtype_device_layout_cuda) ... W0509 18:19:05.915000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.tril_indices.default
 W0509 18:19:05.934000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.tril_indices.default
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 inductor [('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_aoti_eager_override_registration_cuda (__main__.GPUTests.test_aoti_eager_override_registration_cuda) ... W0509 18:19:08.680000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:08.686000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:11.034000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.acos.default
 W0509 18:19:11.043000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.acos.default
 W0509 18:19:13.407000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:13.419000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:15.796000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:15.803000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:18.121000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:18.128000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:20.467000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:20.478000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:22.880000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:22.891000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:25.336000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:25.348000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default
 W0509 18:19:27.735000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor
 W0509 18:19:27.747000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor
 W0509 18:19:30.224000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor
 W0509 18:19:30.239000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor
 frames [('total', 2), ('ok', 2)]
 inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 12)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_aoti_eager_support_out_cuda (__main__.GPUTests.test_aoti_eager_support_out_cuda) ... W0509 18:19:32.749000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor_out
 W0509 18:19:32.766000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor_out
 W0509 18:19:35.383000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor_out
 W0509 18:19:35.399000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor_out
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_aoti_eager_support_str_cuda (__main__.GPUTests.test_aoti_eager_support_str_cuda) ... W0509 18:19:37.717000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.div.Tensor_mode
 W0509 18:19:37.730000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.div.Tensor_mode
 W0509 18:19:40.191000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.div.Tensor_mode
 W0509 18:19:40.200000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.div.Tensor_mode
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_aoti_eager_with_persistent_cache_cuda (__main__.GPUTests.test_aoti_eager_with_persistent_cache_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)]
 inductor [('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_aoti_eager_with_scalar_cuda (__main__.GPUTests.test_aoti_eager_with_scalar_cuda) ... W0509 18:19:45.042000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add
 W0509 18:19:45.049000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add
 W0509 18:19:47.511000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor
 W0509 18:19:47.524000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor
 W0509 18:19:50.205000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor
 W0509 18:19:50.214000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor
 W0509 18:19:52.607000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor
 W0509 18:19:52.615000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor
 W0509 18:19:54.974000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar
 W0509 18:19:54.986000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar
 W0509 18:19:57.475000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar
 W0509 18:19:57.486000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar
 W0509 18:19:59.947000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar
 W0509 18:19:59.955000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar
 stats [('calls_captured', 7), ('unique_graphs', 7)]
 inductor [('async_compile_cache_miss', 7), ('async_compile_cache_hit', 7)]
 graph_break []
 aten_mm_info []
 ok
 test_arange1_cuda (__main__.GPUTests.test_arange1_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_arange2_cuda (__main__.GPUTests.test_arange2_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_arange3_cuda (__main__.GPUTests.test_arange3_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_arange4_cuda (__main__.GPUTests.test_arange4_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_arange5_cuda (__main__.GPUTests.test_arange5_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_arange6_cuda (__main__.GPUTests.test_arange6_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_argmax_argmin1_cuda (__main__.GPUTests.test_argmax_argmin1_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_argmax_argmin2_cuda (__main__.GPUTests.test_argmax_argmin2_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_argmax_argmin3_cuda (__main__.GPUTests.test_argmax_argmin3_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_argmax_argmin_with_duplicates_cuda (__main__.GPUTests.test_argmax_argmin_with_duplicates_cuda) ... inline_call []
 stats [('calls_captured', 24), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('ok', 6), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 3)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 18), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 3)]
 graph_break []
 aten_mm_info []
 ok
 test_argmax_argmin_with_nan_cuda (__main__.GPUTests.test_argmax_argmin_with_nan_cuda) ... inline_call []
 stats [('calls_captured', 24), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)]
 inductor [('triton_bundler_save_kernel', 84), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 6)]
 graph_break []
 aten_mm_info []
 ok
 test_argmax_min_int32_cuda (__main__.GPUTests.test_argmax_min_int32_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_argmax_to_float_cuda (__main__.GPUTests.test_argmax_to_float_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_as_strided_cuda (__main__.GPUTests.test_as_strided_cuda) ... inline_call []
 stats [('calls_captured', 16), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_bypass', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_as_strided_scatter_cuda (__main__.GPUTests.test_as_strided_scatter_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d1_cuda (__main__.GPUTests.test_avg_pool2d1_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d2_cuda (__main__.GPUTests.test_avg_pool2d2_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d3_cuda (__main__.GPUTests.test_avg_pool2d3_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d4_cuda (__main__.GPUTests.test_avg_pool2d4_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d5_cuda (__main__.GPUTests.test_avg_pool2d5_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d6_cuda (__main__.GPUTests.test_avg_pool2d6_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d7_cuda (__main__.GPUTests.test_avg_pool2d7_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d8_cuda (__main__.GPUTests.test_avg_pool2d8_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d_backward2_cuda (__main__.GPUTests.test_avg_pool2d_backward2_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d_backward3_cuda (__main__.GPUTests.test_avg_pool2d_backward3_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d_backward4_cuda (__main__.GPUTests.test_avg_pool2d_backward4_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool2d_backward_cuda (__main__.GPUTests.test_avg_pool2d_backward_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool3d_backward2_cuda (__main__.GPUTests.test_avg_pool3d_backward2_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool3d_backward3_cuda (__main__.GPUTests.test_avg_pool3d_backward3_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool3d_backward4_cuda (__main__.GPUTests.test_avg_pool3d_backward4_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool3d_backward_cuda (__main__.GPUTests.test_avg_pool3d_backward_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_avg_pool_errors_with_uint_cuda (__main__.GPUTests.test_avg_pool_errors_with_uint_cuda) ... E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] failed while attempting to run meta for aten.avg_pool2d.default
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] Traceback (most recent call last):
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]     r = func(*args, **kwargs)
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]     return self._op(*args, **kwargs)
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]     torch._check(
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]     raise error_type(message_evaluated)
 E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] RuntimeError: "avg_pool2d" not implemented for 'torch.uint8'
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] failed while attempting to run meta for aten.avg_pool2d.default
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] Traceback (most recent call last):
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]     r = func(*args, **kwargs)
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]     return self._op(*args, **kwargs)
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]     torch._check(
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1]     raise error_type(message_evaluated)
 E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] RuntimeError: "avg_pool2d" not implemented for 'torch.uint16'
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] failed while attempting to run meta for aten.avg_pool2d.default
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] Traceback (most recent call last):
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]     r = func(*args, **kwargs)
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]     return self._op(*args, **kwargs)
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]     torch._check(
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2]     raise error_type(message_evaluated)
 E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] RuntimeError: "avg_pool2d" not implemented for 'torch.uint32'
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] failed while attempting to run meta for aten.avg_pool2d.default
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] Traceback (most recent call last):
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]     r = func(*args, **kwargs)
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]     return self._op(*args, **kwargs)
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]     torch._check(
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3]     raise error_type(message_evaluated)
 E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] RuntimeError: "avg_pool2d" not implemented for 'torch.uint64'
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] failed while attempting to run meta for aten.avg_pool2d.default
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] Traceback (most recent call last):
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]     r = func(*args, **kwargs)
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]     return self._op(*args, **kwargs)
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]     torch._check(
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4]     raise error_type(message_evaluated)
 E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] RuntimeError: "avg_pool2d" not implemented for 'torch.uint8'
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] failed while attempting to run meta for aten.avg_pool2d.default
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] Traceback (most recent call last):
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]     r = func(*args, **kwargs)
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]     return self._op(*args, **kwargs)
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]     torch._check(
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5]     raise error_type(message_evaluated)
 E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] RuntimeError: "avg_pool2d" not implemented for 'torch.uint16'
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] failed while attempting to run meta for aten.avg_pool2d.default
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] Traceback (most recent call last):
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]     r = func(*args, **kwargs)
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]     return self._op(*args, **kwargs)
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]     torch._check(
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6]     raise error_type(message_evaluated)
 E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] RuntimeError: "avg_pool2d" not implemented for 'torch.uint32'
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] failed while attempting to run meta for aten.avg_pool2d.default
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] Traceback (most recent call last):
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]     r = func(*args, **kwargs)
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]     return self._op(*args, **kwargs)
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]     torch._check(
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7]     raise error_type(message_evaluated)
 E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] RuntimeError: "avg_pool2d" not implemented for 'torch.uint64'
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] failed while attempting to run meta for aten.avg_pool3d.default
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] Traceback (most recent call last):
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]     r = func(*args, **kwargs)
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]     return self._op(*args, **kwargs)
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]   File "/workspace/pytorch/torch/_prims_common/wrappers.py", line 308, in _fn
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]     result = fn(*args, **kwargs)
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]              ^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2925, in meta_avg_pool3d
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]     torch._check(
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8]     raise error_type(message_evaluated)
 E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] RuntimeError: "avg_pool3d" not implemented for 'torch.uint8'
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] failed while attempting to run meta for aten.avg_pool3d.default
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] Traceback (most recent call last):
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]     r = func(*args, **kwargs)
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]     return self._op(*args, **kwargs)
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]   File "/workspace/pytorch/torch/_prims_common/wrappers.py", line 308, in _fn
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]     result = fn(*args, **kwargs)
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]              ^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2925, in meta_avg_pool3d
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]     torch._check(
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9]     raise error_type(message_evaluated)
 E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] RuntimeError: "avg_pool3d" not implemented for 'torch.uint16'
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] failed while attempting to run meta for aten.avg_pool3d.default
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] Traceback (most recent call last):
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]     r = func(*args, **kwargs)
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]     return self._op(*args, **kwargs)
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]   File "/workspace/pytorch/torch/_prims_common/wrappers.py", line 308, in _fn
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]     result = fn(*args, **kwargs)
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]              ^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2925, in meta_avg_pool3d
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]     torch._check(
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10]     raise error_type(message_evaluated)
 E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] RuntimeError: "avg_pool3d" not implemented for 'torch.uint32'
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] failed while attempting to run meta for aten.avg_pool3d.default
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] Traceback (most recent call last):
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]     r = func(*args, **kwargs)
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]     return self._op(*args, **kwargs)
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]   File "/workspace/pytorch/torch/_prims_common/wrappers.py", line 308, in _fn
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]     result = fn(*args, **kwargs)
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]              ^^^^^^^^^^^^^^^^^^^
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]   File "/workspace/pytorch/torch/_meta_registrations.py", line 2925, in meta_avg_pool3d
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]     torch._check(
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]   File "/workspace/pytorch/torch/__init__.py", line 1660, in _check
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]     _check_with(RuntimeError, cond, message)
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11]     raise error_type(message_evaluated)
 E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] RuntimeError: "avg_pool3d" not implemented for 'torch.uint64'
 frames [('total', 12)]
 ok
 test_baddbmm_cuda (__main__.GPUTests.test_baddbmm_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 8)]
 aot_autograd [('total', 8), ('ok', 8), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('autograd_cache_hit', 4)]
 inductor [('extern_calls', 8), ('fxgraph_cache_miss', 4), ('fxgraph_cache_hit', 4)]
 graph_break []
 aten_mm_info [('aten.baddbmm_128_100_64', 4)]
 ok
 test_batch_norm_2d_2_cuda (__main__.GPUTests.test_batch_norm_2d_2_cuda) ... frames [('total', 1), ('ok', 1)]
 inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_batch_norm_2d_cuda (__main__.GPUTests.test_batch_norm_2d_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_bernoulli1_cuda (__main__.GPUTests.test_bernoulli1_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2), ('extern_calls', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_bernoulli2_cuda (__main__.GPUTests.test_bernoulli2_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('extern_calls', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_bfloat16_to_int16_cuda (__main__.GPUTests.test_bfloat16_to_int16_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_bitwise2_cuda (__main__.GPUTests.test_bitwise2_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_bitwise3_cuda (__main__.GPUTests.test_bitwise3_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_bitwise_cuda (__main__.GPUTests.test_bitwise_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_bmm1_cuda (__main__.GPUTests.test_bmm1_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('extern_calls', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info [('aten.bmm_8_8_8', 2), ('aten.bmm_16_10_8', 2)]
 ok
 test_bmm2_cuda (__main__.GPUTests.test_bmm2_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.bmm_8_8_8', 1)]
 ok
 test_bool_cuda (__main__.GPUTests.test_bool_cuda) ... inline_call []
 stats [('calls_captured', 18), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 15), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_both_scalars_cuda (__main__.GPUTests.test_both_scalars_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_bucketize_add_autotune_cuda (__main__.GPUTests.test_bucketize_add_autotune_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 56), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_bucketize_computed_offsets_cuda (__main__.GPUTests.test_bucketize_computed_offsets_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_bucketize_default_kwargs_cuda (__main__.GPUTests.test_bucketize_default_kwargs_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_bucketize_int_cuda (__main__.GPUTests.test_bucketize_int_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_bucketize_nd_tiling_False_cuda (__main__.GPUTests.test_bucketize_nd_tiling_False_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_hit', 3), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1)]
 inductor [('async_compile_cache_miss', 7), ('triton_bundler_save_kernel', 7), ('async_compile_cache_hit', 4), ('fxgraph_cache_hit', 3), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_bucketize_nd_tiling_True_cuda (__main__.GPUTests.test_bucketize_nd_tiling_True_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_hit', 3), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1)]
 inductor [('async_compile_cache_miss', 7), ('triton_bundler_save_kernel', 7), ('async_compile_cache_hit', 4), ('fxgraph_cache_hit', 3), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_buffer_batch_norm_cuda (__main__.GPUTests.test_buffer_batch_norm_cuda) ... frames [('total', 1), ('ok', 1)]
 inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_buffer_copied_in_graph_cuda (__main__.GPUTests.test_buffer_copied_in_graph_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_buffer_copied_in_graph_with_different_shapes_cuda (__main__.GPUTests.test_buffer_copied_in_graph_with_different_shapes_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_4_4_2', 1)]
 ok
 test_buffer_use_after_remove_cuda (__main__.GPUTests.test_buffer_use_after_remove_cuda) ... frames [('total', 1), ('ok', 1)]
 inline_call []
 stats [('calls_captured', 42), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1), ('autograd_cache_saved', 1)]
 inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_nodes', 16), ('pattern_matcher_count', 13), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 2), ('extern_calls', 2)]
 graph_break []
 aten_mm_info [('aten.bmm_6_2_3', 1), ('aten.bmm_6_3_2', 1)]
 ok
 test_builtins_round_cuda (__main__.GPUTests.test_builtins_round_cuda) ... stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_builtins_round_float_ndigits_neg_cuda (__main__.GPUTests.test_builtins_round_float_ndigits_neg_cuda) ... stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_builtins_round_float_ndigits_pos_cuda (__main__.GPUTests.test_builtins_round_float_ndigits_pos_cuda) ... stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_builtins_round_float_ndigits_zero_cuda (__main__.GPUTests.test_builtins_round_float_ndigits_zero_cuda) ... stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_builtins_round_int_ndigits_pos_cuda (__main__.GPUTests.test_builtins_round_int_ndigits_pos_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_builtins_round_int_ndigits_zero_cuda (__main__.GPUTests.test_builtins_round_int_ndigits_zero_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cat_cuda (__main__.GPUTests.test_cat_cuda) ... inline_call []
 stats [('calls_captured', 32), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 10), ('async_compile_cache_hit', 10), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_cat_empty_cuda (__main__.GPUTests.test_cat_empty_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)]
 inductor [('triton_bundler_save_kernel', 42), ('fxgraph_cache_miss', 6), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6)]
 graph_break []
 aten_mm_info []
 ok
 test_cat_empty_index_cuda (__main__.GPUTests.test_cat_empty_index_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_cat_extern_kernel_cuda (__main__.GPUTests.test_cat_extern_kernel_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_256_1600_1024', 1), ('aten.mm_256_256_100', 1)]
 ok
 test_cat_inplace_cuda (__main__.GPUTests.test_cat_inplace_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cat_negative_dim_cuda (__main__.GPUTests.test_cat_negative_dim_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)]
 inductor [('triton_bundler_save_kernel', 42), ('fxgraph_cache_miss', 6), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6)]
 graph_break []
 aten_mm_info []
 ok
 test_cat_of_loops_and_extern_kernel_cuda (__main__.GPUTests.test_cat_of_loops_and_extern_kernel_cuda) ... frames [('total', 1), ('ok', 1)]
 inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cat_single_empty_cuda (__main__.GPUTests.test_cat_single_empty_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_cat_uint8_cuda (__main__.GPUTests.test_cat_uint8_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cat_unbacked_2d_cuda (__main__.GPUTests.test_cat_unbacked_2d_cuda) ... inline_call []
 stats [('calls_captured', 24), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 8), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_cat_unbacked_empty_1d_cuda (__main__.GPUTests.test_cat_unbacked_empty_1d_cuda) ... inline_call []
 stats [('calls_captured', 24), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 8), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_cat_unbacked_legacy_empty_cuda (__main__.GPUTests.test_cat_unbacked_legacy_empty_cuda) ... inline_call []
 ok
 test_cat_upcasting_cuda (__main__.GPUTests.test_cat_upcasting_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_cauchy_cuda (__main__.GPUTests.test_cauchy_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_check_stack_no_cycles_cuda (__main__.GPUTests.test_check_stack_no_cycles_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_chunk_recompiles_cuda (__main__.GPUTests.test_chunk_recompiles_cuda) ... stats [('calls_captured', 19), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_bypass', 4), ('ok', 4)]
 inductor [('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_clamp_cuda (__main__.GPUTests.test_clamp_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_clamp_type_promotion_cuda (__main__.GPUTests.test_clamp_type_promotion_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_clone_cuda (__main__.GPUTests.test_clone_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_compar_cuda (__main__.GPUTests.test_compar_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 18), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_complex_fallback_cuda (__main__.GPUTests.test_complex_fallback_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)]
 inductor [('extern_calls', 9), ('triton_bundler_save_kernel', 7), ('intermediate_hooks', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_complex_memory_overlap_cuda (__main__.GPUTests.test_complex_memory_overlap_cuda) ... ok
 test_computed_buffer_inlining_cuda (__main__.GPUTests.test_computed_buffer_inlining_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_concat_add_inplace_cuda (__main__.GPUTests.test_concat_add_inplace_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_config_option_dont_assume_alignment_cuda (__main__.GPUTests.test_config_option_dont_assume_alignment_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 15), ('unique_graphs', 5)]
 aot_autograd [('total', 5), ('ok', 5), ('autograd_cache_hit', 4), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1)]
 inductor [('async_compile_cache_miss', 9), ('triton_bundler_save_kernel', 7), ('async_compile_cache_hit', 5), ('fxgraph_cache_hit', 4), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_config_option_dont_assume_alignment_cudagraphs_cuda (__main__.GPUTests.test_config_option_dont_assume_alignment_cudagraphs_cuda) ... frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_config_option_dont_assume_alignment_recompiles_cuda (__main__.GPUTests.test_config_option_dont_assume_alignment_recompiles_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_consecutive_split_cumprod_cuda (__main__.GPUTests.test_consecutive_split_cumprod_cuda) ... inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_consecutive_split_cumsum_cuda (__main__.GPUTests.test_consecutive_split_cumsum_cuda) ... inline_call []
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_const_int32_to_float_cuda (__main__.GPUTests.test_const_int32_to_float_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_constant_pad_1d_cuda (__main__.GPUTests.test_constant_pad_1d_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_constant_pad_2d_cuda (__main__.GPUTests.test_constant_pad_2d_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_constant_pad_3d_cuda (__main__.GPUTests.test_constant_pad_3d_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_constant_pad_fill_dtype_cuda (__main__.GPUTests.test_constant_pad_fill_dtype_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_constant_pad_float64_cuda (__main__.GPUTests.test_constant_pad_float64_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_constant_pad_nd_inplace_cuda (__main__.GPUTests.test_constant_pad_nd_inplace_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_conv2d_backward_channels_last_cuda (__main__.GPUTests.test_conv2d_backward_channels_last_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 3), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_conv2d_channels_last_cuda (__main__.GPUTests.test_conv2d_channels_last_cuda) ... skipped 'only support cpu conv2d channels_last'
 test_conv3d_channels_last_use_block_ptr_False_cuda (__main__.GPUTests.test_conv3d_channels_last_use_block_ptr_False_cuda) ... skipped 'only support cpu conv3d channels_last'
 test_conv3d_channels_last_use_block_ptr_True_cuda (__main__.GPUTests.test_conv3d_channels_last_use_block_ptr_True_cuda) ... skipped 'only support cpu conv3d channels_last'
 test_conv3d_cuda (__main__.GPUTests.test_conv3d_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_conv_backward_cuda (__main__.GPUTests.test_conv_backward_cuda) ... inline_call []
 stats [('calls_captured', 28), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('extern_calls', 28), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_conv_bn_fuse_cuda (__main__.GPUTests.test_conv_bn_fuse_cuda) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test'
 test_conv_functional_bn_fuse_cuda (__main__.GPUTests.test_conv_functional_bn_fuse_cuda) ... skipped 'only support cpu conv bn test'
 test_conv_inference_heuristics_cuda (__main__.GPUTests.test_conv_inference_heuristics_cuda) ... frames [('total', 2), ('ok', 2)]
 inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('extern_calls', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_conv_shape_check_cuda (__main__.GPUTests.test_conv_shape_check_cuda) ... frames [('total', 3)]
 inline_call []
 ok
 test_conv_with_as_strided_cuda (__main__.GPUTests.test_conv_with_as_strided_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_count', 14), ('pattern_matcher_nodes', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_convolution1_cuda (__main__.GPUTests.test_convolution1_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_convolution2_cuda (__main__.GPUTests.test_convolution2_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_convolution3_cuda (__main__.GPUTests.test_convolution3_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_convolution4_cuda (__main__.GPUTests.test_convolution4_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_convolution5_cuda (__main__.GPUTests.test_convolution5_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('intermediate_hooks', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_cos_cuda (__main__.GPUTests.test_cos_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_cudnn_rnn_cuda (__main__.GPUTests.test_cudnn_rnn_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 6), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cumprod_zero_dim_cuda (__main__.GPUTests.test_cumprod_zero_dim_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_cumsum_cuda (__main__.GPUTests.test_cumsum_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)]
 inductor [('triton_bundler_save_kernel', 84), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 6)]
 graph_break []
 aten_mm_info []
 ok
 test_cumsum_inf_cuda (__main__.GPUTests.test_cumsum_inf_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_cumsum_no_mask_cuda (__main__.GPUTests.test_cumsum_no_mask_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_cumsum_pattern_matcher_issue_cuda (__main__.GPUTests.test_cumsum_pattern_matcher_issue_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_cumsum_zero_dim_cuda (__main__.GPUTests.test_cumsum_zero_dim_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_custom_op_1_cuda (__main__.GPUTests.test_custom_op_1_cuda) ... inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_custom_op_2_cuda (__main__.GPUTests.test_custom_op_2_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 3), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_custom_op_3_cuda (__main__.GPUTests.test_custom_op_3_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_custom_op_default_layout_constraint_cuda (__main__.GPUTests.test_custom_op_default_layout_constraint_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_bypass', 1), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_custom_op_fixed_layout_channels_last_cuda (__main__.GPUTests.test_custom_op_fixed_layout_channels_last_cuda) ... frames [('total', 1), ('ok', 1)]
 inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 3), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_custom_op_fixed_layout_sequential_cuda (__main__.GPUTests.test_custom_op_fixed_layout_sequential_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('extern_calls', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_custom_op_unbacked_symints_cuda (__main__.GPUTests.test_custom_op_unbacked_symints_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 7), ('intermediate_hooks', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_custom_scan_op_compiled_cuda (__main__.GPUTests.test_custom_scan_op_compiled_cuda) ... inline_call []
 stats [('calls_captured', 51), ('unique_graphs', 6)]
 aot_autograd [('total', 3), ('autograd_cache_bypass', 3), ('ok', 3)]
 inductor [('fxgraph_cache_bypass', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3)]
 graph_break []
 aten_mm_info []
 frames [('total', 1), ('ok', 1)]
 ok
 test_custom_scan_op_cuda (__main__.GPUTests.test_custom_scan_op_cuda) ... inline_call []
 stats [('calls_captured', 24), ('unique_graphs', 2)]
 ok
 test_custom_scan_op_multi_input_cuda (__main__.GPUTests.test_custom_scan_op_multi_input_cuda) ... inline_call []
 stats [('calls_captured', 18), ('unique_graphs', 1)]
 ok
 test_custom_scan_would_split_cuda (__main__.GPUTests.test_custom_scan_would_split_cuda) ... frames [('total', 1), ('ok', 1)]
 inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_bypass', 1), ('ok', 1)]
 inductor [('fxgraph_cache_bypass', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_data_type_propogation_cuda (__main__.GPUTests.test_data_type_propogation_cuda) ... skipped 'triton not supported'
 test_dense_mask_index_cuda (__main__.GPUTests.test_dense_mask_index_cuda)
 There will be a little difference for reduce order between aten and inductor ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_deterministic_codegen_cuda (__main__.GPUTests.test_deterministic_codegen_cuda) ... /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 stats [('calls_captured', 27), ('unique_graphs', 9)]
 aot_autograd [('total', 9), ('ok', 9)]
 inductor [('pattern_matcher_nodes', 36), ('pattern_matcher_count', 9), ('async_compile_cache_miss', 9), ('async_compile_cache_hit', 9)]
 graph_break []
 aten_mm_info []
 ok
 test_deterministic_codegen_on_graph_break_cuda (__main__.GPUTests.test_deterministic_codegen_on_graph_break_cuda) ... /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 frames [('total', 2), ('ok', 2)]
 inline_call []
 unimplemented []
 graph_break [('Call to `torch._dynamo.graph_break()`\n  Explanation: User-inserted graph break. Message: None\n  Hint: Remove the `torch._dynamo.graph_break()` call.\n\n  Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}`\n', 1)]
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2)]
 inductor [('pattern_matcher_nodes', 8), ('pattern_matcher_count', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 aten_mm_info []
 ok
 test_deterministic_codegen_with_suffix_cuda (__main__.GPUTests.test_deterministic_codegen_with_suffix_cuda) ... /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches
  warn_once(
 stats [('calls_captured', 7), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2)]
 inductor [('pattern_matcher_nodes', 8), ('pattern_matcher_count', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.mm_16_256_256', 1)]
 ok
 test_device_assert_cuda (__main__.GPUTests.test_device_assert_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_diagonal_copy_cuda (__main__.GPUTests.test_diagonal_copy_cuda) ... /workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead.
  check(
 /workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead.
  check(
 /workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead.
  check(
 /workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead.
  check(
 /workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead.
  check(
 /workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead.
  check(
 inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)]
 inductor [('triton_bundler_save_kernel', 42), ('fxgraph_cache_miss', 6), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6)]
 graph_break []
 aten_mm_info []
 ok
 test_dist_bf16_cuda (__main__.GPUTests.test_dist_bf16_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_dist_cuda (__main__.GPUTests.test_dist_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_div1_cuda (__main__.GPUTests.test_div1_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_div2_cuda (__main__.GPUTests.test_div2_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_div3_cuda (__main__.GPUTests.test_div3_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_div4_cuda (__main__.GPUTests.test_div4_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_div5_cuda (__main__.GPUTests.test_div5_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_div6_cuda (__main__.GPUTests.test_div6_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_div7_cuda (__main__.GPUTests.test_div7_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_div8_cuda (__main__.GPUTests.test_div8_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_div9_cuda (__main__.GPUTests.test_div9_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_div_by_zero_cuda (__main__.GPUTests.test_div_by_zero_cuda) ... inline_call []
 stats [('calls_captured', 22), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_div_precision_cuda (__main__.GPUTests.test_div_precision_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_nodes', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('pattern_matcher_count', 4), ('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_div_prim_cuda (__main__.GPUTests.test_div_prim_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_div_softmax_symfloat_cuda (__main__.GPUTests.test_div_softmax_symfloat_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 11), ('unique_graphs', 2)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('pattern_matcher_nodes', 6), ('pattern_matcher_count', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_div_zero_dim_cuda (__main__.GPUTests.test_div_zero_dim_cuda) ... inline_call []
 stats [('calls_captured', 40), ('unique_graphs', 8)]
 aot_autograd [('total', 8), ('ok', 8), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 10), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 6), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_dropout2_cuda (__main__.GPUTests.test_dropout2_cuda) ... stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2), ('autograd_cache_saved', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_dropout3_cuda (__main__.GPUTests.test_dropout3_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1), ('autograd_cache_saved', 1)]
 inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 6), ('extern_calls', 6), ('pattern_matcher_count', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info [('aten.mm_8_32_32', 3), ('aten.mm_32_32_8', 2)]
 ok
 test_dropout_cuda (__main__.GPUTests.test_dropout_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_dropout_deterministic_cuda (__main__.GPUTests.test_dropout_deterministic_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('cudagraph_recorded_non_static_inputs', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_dropout_trivial_0_cuda (__main__.GPUTests.test_dropout_trivial_0_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_dropout_trivial_1_cuda (__main__.GPUTests.test_dropout_trivial_1_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_dtype_mismatch_issue_cuda (__main__.GPUTests.test_dtype_mismatch_issue_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_nodes', 8), ('fxgraph_cache_miss', 2), ('pattern_matcher_count', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_dtype_sympy_expr_cuda (__main__.GPUTests.test_dtype_sympy_expr_cuda) ... stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_dtypeview_bfloat16_bfloat16_cuda (__main__.GPUTests.test_dtypeview_bfloat16_bfloat16_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('extern_calls', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 4)]
 ok
 test_dtypeview_bfloat16_float16_cuda (__main__.GPUTests.test_dtypeview_bfloat16_float16_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 4)]
 ok
 test_dtypeview_bfloat16_float32_cuda (__main__.GPUTests.test_dtypeview_bfloat16_float32_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_bfloat16_float64_cuda (__main__.GPUTests.test_dtypeview_bfloat16_float64_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_bfloat16_int16_cuda (__main__.GPUTests.test_dtypeview_bfloat16_int16_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 4)]
 ok
 test_dtypeview_bfloat16_int32_cuda (__main__.GPUTests.test_dtypeview_bfloat16_int32_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_bfloat16_int64_cuda (__main__.GPUTests.test_dtypeview_bfloat16_int64_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_bfloat16_int8_cuda (__main__.GPUTests.test_dtypeview_bfloat16_int8_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_bfloat16_uint8_cuda (__main__.GPUTests.test_dtypeview_bfloat16_uint8_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_float16_bfloat16_cuda (__main__.GPUTests.test_dtypeview_float16_bfloat16_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 4)]
 ok
 test_dtypeview_float16_float16_cuda (__main__.GPUTests.test_dtypeview_float16_float16_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('extern_calls', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 4)]
 ok
 test_dtypeview_float16_float32_cuda (__main__.GPUTests.test_dtypeview_float16_float32_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_float16_float64_cuda (__main__.GPUTests.test_dtypeview_float16_float64_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_float16_int16_cuda (__main__.GPUTests.test_dtypeview_float16_int16_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 4)]
 ok
 test_dtypeview_float16_int32_cuda (__main__.GPUTests.test_dtypeview_float16_int32_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_float16_int64_cuda (__main__.GPUTests.test_dtypeview_float16_int64_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_float16_int8_cuda (__main__.GPUTests.test_dtypeview_float16_int8_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_float16_uint8_cuda (__main__.GPUTests.test_dtypeview_float16_uint8_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_float32_bfloat16_cuda (__main__.GPUTests.test_dtypeview_float32_bfloat16_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_float32_float16_cuda (__main__.GPUTests.test_dtypeview_float32_float16_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_float32_float32_cuda (__main__.GPUTests.test_dtypeview_float32_float32_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_float32_float64_cuda (__main__.GPUTests.test_dtypeview_float32_float64_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_4_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_float32_int16_cuda (__main__.GPUTests.test_dtypeview_float32_int16_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_float32_int32_cuda (__main__.GPUTests.test_dtypeview_float32_int32_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_float32_int64_cuda (__main__.GPUTests.test_dtypeview_float32_int64_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_4_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_float32_int8_cuda (__main__.GPUTests.test_dtypeview_float32_int8_cuda) ... ok
 test_dtypeview_float32_uint8_cuda (__main__.GPUTests.test_dtypeview_float32_uint8_cuda) ... ok
 test_dtypeview_float64_bfloat16_cuda (__main__.GPUTests.test_dtypeview_float64_bfloat16_cuda) ... ok
 test_dtypeview_float64_float16_cuda (__main__.GPUTests.test_dtypeview_float64_float16_cuda) ... ok
 test_dtypeview_float64_float32_cuda (__main__.GPUTests.test_dtypeview_float64_float32_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_float64_float64_cuda (__main__.GPUTests.test_dtypeview_float64_float64_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_float64_int16_cuda (__main__.GPUTests.test_dtypeview_float64_int16_cuda) ... ok
 test_dtypeview_float64_int32_cuda (__main__.GPUTests.test_dtypeview_float64_int32_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_float64_int64_cuda (__main__.GPUTests.test_dtypeview_float64_int64_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_float64_int8_cuda (__main__.GPUTests.test_dtypeview_float64_int8_cuda) ... ok
 test_dtypeview_float64_uint8_cuda (__main__.GPUTests.test_dtypeview_float64_uint8_cuda) ... ok
 test_dtypeview_fusion_cuda (__main__.GPUTests.test_dtypeview_fusion_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 12), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('async_compile_cache_miss', 7), ('triton_bundler_save_kernel', 7), ('async_compile_cache_hit', 4), ('fxgraph_cache_hit', 3), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 inline_call []
 ok
 test_dtypeview_int16_bfloat16_cuda (__main__.GPUTests.test_dtypeview_int16_bfloat16_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 4)]
 ok
 test_dtypeview_int16_float16_cuda (__main__.GPUTests.test_dtypeview_int16_float16_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 4)]
 ok
 test_dtypeview_int16_float32_cuda (__main__.GPUTests.test_dtypeview_int16_float32_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_int16_float64_cuda (__main__.GPUTests.test_dtypeview_int16_float64_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_int16_int16_cuda (__main__.GPUTests.test_dtypeview_int16_int16_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('extern_calls', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 4)]
 ok
 test_dtypeview_int16_int32_cuda (__main__.GPUTests.test_dtypeview_int16_int32_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_int16_int64_cuda (__main__.GPUTests.test_dtypeview_int16_int64_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_int16_int8_cuda (__main__.GPUTests.test_dtypeview_int16_int8_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_int16_uint8_cuda (__main__.GPUTests.test_dtypeview_int16_uint8_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_int32_bfloat16_cuda (__main__.GPUTests.test_dtypeview_int32_bfloat16_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_int32_float16_cuda (__main__.GPUTests.test_dtypeview_int32_float16_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_int32_float32_cuda (__main__.GPUTests.test_dtypeview_int32_float32_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_int32_float64_cuda (__main__.GPUTests.test_dtypeview_int32_float64_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_4_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_int32_int16_cuda (__main__.GPUTests.test_dtypeview_int32_int16_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_int32_int32_cuda (__main__.GPUTests.test_dtypeview_int32_int32_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_int32_int64_cuda (__main__.GPUTests.test_dtypeview_int32_int64_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_4_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_int32_int8_cuda (__main__.GPUTests.test_dtypeview_int32_int8_cuda) ... ok
 test_dtypeview_int32_uint8_cuda (__main__.GPUTests.test_dtypeview_int32_uint8_cuda) ... ok
 test_dtypeview_int64_bfloat16_cuda (__main__.GPUTests.test_dtypeview_int64_bfloat16_cuda) ... ok
 test_dtypeview_int64_float16_cuda (__main__.GPUTests.test_dtypeview_int64_float16_cuda) ... ok
 test_dtypeview_int64_float32_cuda (__main__.GPUTests.test_dtypeview_int64_float32_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_int64_float64_cuda (__main__.GPUTests.test_dtypeview_int64_float64_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_int64_int16_cuda (__main__.GPUTests.test_dtypeview_int64_int16_cuda) ... ok
 test_dtypeview_int64_int32_cuda (__main__.GPUTests.test_dtypeview_int64_int32_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)]
 ok
 test_dtypeview_int64_int64_cuda (__main__.GPUTests.test_dtypeview_int64_int64_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_2_2', 2)]
 ok
 test_dtypeview_int64_int8_cuda (__main__.GPUTests.test_dtypeview_int64_int8_cuda) ... ok
 test_dtypeview_int64_uint8_cuda (__main__.GPUTests.test_dtypeview_int64_uint8_cuda) ... ok
 test_dtypeview_int8_bfloat16_cuda (__main__.GPUTests.test_dtypeview_int8_bfloat16_cuda) ... ok
 test_dtypeview_int8_float16_cuda (__main__.GPUTests.test_dtypeview_int8_float16_cuda) ... ok
 test_dtypeview_int8_float32_cuda (__main__.GPUTests.test_dtypeview_int8_float32_cuda) ... ok
 test_dtypeview_int8_float64_cuda (__main__.GPUTests.test_dtypeview_int8_float64_cuda) ... ok
 test_dtypeview_int8_int16_cuda (__main__.GPUTests.test_dtypeview_int8_int16_cuda) ... ok
 test_dtypeview_int8_int32_cuda (__main__.GPUTests.test_dtypeview_int8_int32_cuda) ... ok
 test_dtypeview_int8_int64_cuda (__main__.GPUTests.test_dtypeview_int8_int64_cuda) ... ok
 test_dtypeview_int8_int8_cuda (__main__.GPUTests.test_dtypeview_int8_int8_cuda) ... ok
 test_dtypeview_int8_uint8_cuda (__main__.GPUTests.test_dtypeview_int8_uint8_cuda) ... ok
 test_dtypeview_uint8_bfloat16_cuda (__main__.GPUTests.test_dtypeview_uint8_bfloat16_cuda) ... ok
 test_dtypeview_uint8_float16_cuda (__main__.GPUTests.test_dtypeview_uint8_float16_cuda) ... ok
 test_dtypeview_uint8_float32_cuda (__main__.GPUTests.test_dtypeview_uint8_float32_cuda) ... ok
 test_dtypeview_uint8_float64_cuda (__main__.GPUTests.test_dtypeview_uint8_float64_cuda) ... ok
 test_dtypeview_uint8_int16_cuda (__main__.GPUTests.test_dtypeview_uint8_int16_cuda) ... ok
 test_dtypeview_uint8_int32_cuda (__main__.GPUTests.test_dtypeview_uint8_int32_cuda) ... ok
 test_dtypeview_uint8_int64_cuda (__main__.GPUTests.test_dtypeview_uint8_int64_cuda) ... ok
 test_dtypeview_uint8_int8_cuda (__main__.GPUTests.test_dtypeview_uint8_int8_cuda) ... ok
 test_dtypeview_uint8_uint8_cuda (__main__.GPUTests.test_dtypeview_uint8_uint8_cuda) ... ok
 test_elu_cuda (__main__.GPUTests.test_elu_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_embedding_bag_byte_unpack_cuda (__main__.GPUTests.test_embedding_bag_byte_unpack_cuda) ... skipped 'No cuda implementation (it returns empty)'
 test_embedding_bag_cuda (__main__.GPUTests.test_embedding_bag_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 10), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_embedding_cuda (__main__.GPUTests.test_embedding_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_empty1_cuda (__main__.GPUTests.test_empty1_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_empty2_cuda (__main__.GPUTests.test_empty2_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_empty_strided_cuda (__main__.GPUTests.test_empty_strided_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_erfc_cuda (__main__.GPUTests.test_erfc_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_erfinv_cuda (__main__.GPUTests.test_erfinv_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_exact_stride_cuda (__main__.GPUTests.test_exact_stride_cuda) ... inline_call []
 stats [('calls_captured', 9), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 frames [('total', 1), ('ok', 1)]
 ok
 test_exp2_cuda (__main__.GPUTests.test_exp2_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_exp_cuda (__main__.GPUTests.test_exp_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_expand_as_cuda (__main__.GPUTests.test_expand_as_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_expand_cuda (__main__.GPUTests.test_expand_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_expanded_reduction_cuda (__main__.GPUTests.test_expanded_reduction_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_expm1_cuda (__main__.GPUTests.test_expm1_cuda) ... inline_call []
 stats [('calls_captured', 60), ('unique_graphs', 20)]
 aot_autograd [('total', 20), ('ok', 20), ('autograd_cache_miss', 10), ('autograd_cache_saved', 10), ('autograd_cache_hit', 10)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 30), ('async_compile_cache_hit', 20), ('fxgraph_cache_miss', 10), ('fxgraph_cache_hit', 10)]
 graph_break []
 aten_mm_info []
 ok
 test_fallback_mutable_op_basic_cuda (__main__.GPUTests.test_fallback_mutable_op_basic_cuda) ... inductor [('fxgraph_cache_bypass', 1), ('extern_calls', 1)]
 aten_mm_info []
 ok
 test_fallback_mutable_op_list_cuda (__main__.GPUTests.test_fallback_mutable_op_list_cuda) ... inductor [('fxgraph_cache_bypass', 2), ('extern_calls', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1)]
 aten_mm_info []
 frames [('total', 1), ('ok', 1)]
 inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 graph_break []
 ok
 test_fallback_mutable_op_list_tensor_cuda (__main__.GPUTests.test_fallback_mutable_op_list_tensor_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_bypass', 1), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_fallback_mutable_op_no_mutated_tensors_cuda (__main__.GPUTests.test_fallback_mutable_op_no_mutated_tensors_cuda) ... inductor [('fxgraph_cache_bypass', 1), ('extern_calls', 1)]
 aten_mm_info []
 ok
 test_fallback_mutable_op_with_return_cuda (__main__.GPUTests.test_fallback_mutable_op_with_return_cuda) ... inductor [('extern_calls', 2), ('fxgraph_cache_bypass', 1), ('intermediate_hooks', 1)]
 aten_mm_info []
 ok
 test_fft_real_input_cuda (__main__.GPUTests.test_fft_real_input_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 3), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_fft_real_input_real_output_cuda (__main__.GPUTests.test_fft_real_input_real_output_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 5), ('intermediate_hooks', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_fill1_cuda (__main__.GPUTests.test_fill1_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_fill2_cuda (__main__.GPUTests.test_fill2_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_flip_cat_cuda (__main__.GPUTests.test_flip_cat_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_flip_cuda (__main__.GPUTests.test_flip_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_float16_to_int16_cuda (__main__.GPUTests.test_float16_to_int16_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_float32_to_int32_cuda (__main__.GPUTests.test_float32_to_int32_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_float_index_expression_cuda (__main__.GPUTests.test_float_index_expression_cuda) ... ok
 test_float_index_expression_type_promotion_cuda (__main__.GPUTests.test_float_index_expression_type_promotion_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_floordiv_cuda (__main__.GPUTests.test_floordiv_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_fmin_fmax_cuda (__main__.GPUTests.test_fmin_fmax_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_fmod_cuda (__main__.GPUTests.test_fmod_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_fmod_zero_dim_cuda (__main__.GPUTests.test_fmod_zero_dim_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_forced_buffer_realize_cuda (__main__.GPUTests.test_forced_buffer_realize_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_fractional_max_pool2d1_cuda (__main__.GPUTests.test_fractional_max_pool2d1_cuda) ... inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_fractional_max_pool2d2_cuda (__main__.GPUTests.test_fractional_max_pool2d2_cuda) ... inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 3), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_fractional_max_pool2d3_cuda (__main__.GPUTests.test_fractional_max_pool2d3_cuda) ... inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_fractional_max_pool2d4_cuda (__main__.GPUTests.test_fractional_max_pool2d4_cuda) ... inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_full_boolean_cuda (__main__.GPUTests.test_full_boolean_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_full_like_cuda (__main__.GPUTests.test_full_like_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_full_truncation_cuda (__main__.GPUTests.test_full_truncation_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 7)]
 aot_autograd [('total', 7), ('autograd_cache_miss', 7), ('autograd_cache_saved', 7), ('ok', 7)]
 inductor [('triton_bundler_save_kernel', 49), ('fxgraph_cache_miss', 7), ('async_compile_cache_miss', 7), ('async_compile_cache_hit', 7)]
 graph_break []
 aten_mm_info []
 ok
 test_functionalize_rng_wrappers_cuda (__main__.GPUTests.test_functionalize_rng_wrappers_cuda) ... inductor [('extern_calls', 10), ('intermediate_hooks', 2), ('fxgraph_cache_bypass', 1)]
 aten_mm_info []
 ok
 test_fuse_large_params_cuda (__main__.GPUTests.test_fuse_large_params_cuda) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test'
 test_fuse_tiled_cuda (__main__.GPUTests.test_fuse_tiled_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_fusing_write_into_disjoint_read_cuda (__main__.GPUTests.test_fusing_write_into_disjoint_read_cuda) ... inline_call []
 stats [('calls_captured', 16), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_gather1_cuda (__main__.GPUTests.test_gather1_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_gather2_cuda (__main__.GPUTests.test_gather2_cuda) ... ok
 test_gather3_cuda (__main__.GPUTests.test_gather3_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('pattern_matcher_count', 16), ('pattern_matcher_nodes', 16), ('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_gather_scatter_cuda (__main__.GPUTests.test_gather_scatter_cuda) ... inline_call []
 stats [('calls_captured', 11), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_gelu_cuda (__main__.GPUTests.test_gelu_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_generate_rand_fp8_cuda (__main__.GPUTests.test_generate_rand_fp8_cuda)
 PyTorch can not generate fp8 tensors with a normal distribution because of ... ok
 test_getitem_cuda (__main__.GPUTests.test_getitem_cuda) ... frames [('total', 1), ('ok', 1)]
 inline_call []
 ok
 test_glu_cuda (__main__.GPUTests.test_glu_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_graph_partition_arange1_cuda (__main__.GPUTests.test_graph_partition_arange1_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_graph_partition_arange2_cuda (__main__.GPUTests.test_graph_partition_arange2_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_graph_partition_argmax_cuda (__main__.GPUTests.test_graph_partition_argmax_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_graph_partition_both_scalars_cuda (__main__.GPUTests.test_graph_partition_both_scalars_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_graph_partition_constant_tensor1_cuda (__main__.GPUTests.test_graph_partition_constant_tensor1_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_graph_partition_constant_tensor2_cuda (__main__.GPUTests.test_graph_partition_constant_tensor2_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_graph_partition_misaligned_input_cuda (__main__.GPUTests.test_graph_partition_misaligned_input_cuda) ... frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_graph_partition_no_inputs_cuda (__main__.GPUTests.test_graph_partition_no_inputs_cuda) ... frames [('total', 2), ('ok', 2)]
 unimplemented []
 graph_break [('Attempted to call function marked as skipped\n  Explanation: Dynamo developers have intentionally marked that the function `manual_seed` in file `/workspace/pytorch/torch/_compile.py` should not be traced.\n  Hint: Avoid calling the function `manual_seed`.\n  Hint: Remove the function `manual_seed` or the file `/workspace/pytorch/torch/_compile.py` from torch/_dynamo/trace_rules.py. More graph breaks may occur as a result of attempting to trace into the function.\n  Hint: Please file an issue to PyTorch.\n\n  Developer debug context: module: torch.random, qualname: manual_seed, skip reason: <missing reason>\n', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 aten_mm_info []
 ok
 test_graph_partition_refcount_cuda (__main__.GPUTests.test_graph_partition_refcount_cuda) ... inductor [('fxgraph_cache_bypass', 2), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('cudagraph_recorded_non_static_inputs', 2)]
 aten_mm_info [('aten.mm_5_5_5', 2)]
 ok
 test_graph_partition_scalar_inputs_cuda (__main__.GPUTests.test_graph_partition_scalar_inputs_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_graph_partition_unbacked_symint_as_output_cuda (__main__.GPUTests.test_graph_partition_unbacked_symint_as_output_cuda) ... frames [('total', 5), ('ok', 5)]
 unimplemented []
 graph_break [("Dynamic shape operator\n  Explanation: Operator `aten.repeat_interleave.Tensor`'s output shape depends on input Tensor data.\n  Hint: Enable tracing of dynamic shape operators with `torch._dynamo.config.capture_dynamic_output_shape_ops = True`\n\n  Developer debug context: aten.repeat_interleave.Tensor\n", 1)]
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1)]
 aten_mm_info []
 ok
 test_grid_sampler_2d_cuda (__main__.GPUTests.test_grid_sampler_2d_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('pattern_matcher_count', 34), ('pattern_matcher_nodes', 34), ('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_hardsigmoid_cuda (__main__.GPUTests.test_hardsigmoid_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_hardswish_cuda (__main__.GPUTests.test_hardswish_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_hardtanh_cuda (__main__.GPUTests.test_hardtanh_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_horizonal_fusion1_cuda (__main__.GPUTests.test_horizonal_fusion1_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_horizonal_fusion2_cuda (__main__.GPUTests.test_horizonal_fusion2_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_index1_cuda (__main__.GPUTests.test_index1_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_index2_cuda (__main__.GPUTests.test_index2_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_index3_cuda (__main__.GPUTests.test_index3_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_index_dynamic_shapes_cuda (__main__.GPUTests.test_index_dynamic_shapes_cuda) ... inline_call []
 stats [('calls_captured', 64), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_index_propagation_abs_cuda (__main__.GPUTests.test_index_propagation_abs_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_index_propagation_cuda (__main__.GPUTests.test_index_propagation_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_index_propagation_device_assert_masked_cuda (__main__.GPUTests.test_index_propagation_device_assert_masked_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_index_propagation_flip_cuda (__main__.GPUTests.test_index_propagation_flip_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_index_propagation_floordiv_cuda (__main__.GPUTests.test_index_propagation_floordiv_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_index_propagation_nested_indirect_indexing_cuda (__main__.GPUTests.test_index_propagation_nested_indirect_indexing_cuda) ... frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 3), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_index_propagation_remainder_cuda (__main__.GPUTests.test_index_propagation_remainder_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_index_put1_cuda (__main__.GPUTests.test_index_put1_cuda) ... inline_call []
 stats [('calls_captured', 24), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 98), ('async_compile_cache_miss', 14), ('async_compile_cache_hit', 14), ('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_index_put2_cuda (__main__.GPUTests.test_index_put2_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_index_put3_cuda (__main__.GPUTests.test_index_put3_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_index_put4_cuda (__main__.GPUTests.test_index_put4_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_index_put_as_masked_fill_cuda (__main__.GPUTests.test_index_put_as_masked_fill_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_index_put_deterministic_fallback_cuda (__main__.GPUTests.test_index_put_deterministic_fallback_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_index_put_failed_reinplace_cuda (__main__.GPUTests.test_index_put_failed_reinplace_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_index_put_fallback1_cuda (__main__.GPUTests.test_index_put_fallback1_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_index_put_fallback2_cuda (__main__.GPUTests.test_index_put_fallback2_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_index_put_index_cuda (__main__.GPUTests.test_index_put_index_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_index_put_reinplace_cuda (__main__.GPUTests.test_index_put_reinplace_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_index_select_cuda (__main__.GPUTests.test_index_select_cuda) ... inline_call []
 stats [('calls_captured', 16), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 84), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_index_tensor_cuda (__main__.GPUTests.test_index_tensor_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_indirect_load_broadcast_cuda (__main__.GPUTests.test_indirect_load_broadcast_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_inductor_assert_cuda (__main__.GPUTests.test_inductor_assert_cuda) ... frames [('total', 4), ('ok', 4)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_inductor_layout_optimization_input_mutations_cuda (__main__.GPUTests.test_inductor_layout_optimization_input_mutations_cuda) ... frames [('total', 2), ('ok', 2)]
 inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('extern_calls', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_inf_cuda (__main__.GPUTests.test_inf_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_inner_fn_str_and_stride_cuda (__main__.GPUTests.test_inner_fn_str_and_stride_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_bypass', 1), ('ok', 1)]
 inductor [('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_inplace_activations_cuda (__main__.GPUTests.test_inplace_activations_cuda) ... inline_call []
 stats [('calls_captured', 32), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_inplace_add_cuda (__main__.GPUTests.test_inplace_add_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_inplace_mixed_dtype_ops_cuda (__main__.GPUTests.test_inplace_mixed_dtype_ops_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_inplace_resize_as_cuda (__main__.GPUTests.test_inplace_resize_as_cuda) ... E0509 18:22:34.546000 415353 torch/_dynamo/utils.py:2906] Accuracy failed: allclose not within tol=0.0001
 frames [('total', 2), ('ok', 2)]
 unimplemented []
 graph_break [('Unsupported function call\n  Explanation: Dynamo does not know how to trace the function `DelayGraphBreakVariable()`\n  Hint: Avoid calling `DelayGraphBreakVariable()` in your code.\n  Hint: Please report an issue to PyTorch.\n\n  Developer debug context: call_function DelayGraphBreakVariable() [LazyVariableTracker()] {}\n', 1)]
 ok
 test_inplace_where_pointwise_cuda (__main__.GPUTests.test_inplace_where_pointwise_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_input_mutation1_cuda (__main__.GPUTests.test_input_mutation1_cuda) ... skipping cudagraphs due to mutated inputs (1 instances). Found from : 
   File "/workspace/pytorch/test/inductor/test_torchinductor.py", line 7321, in fn
    a.copy_(b)

 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1), ('cudagraph_skips', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_input_mutation2_cuda (__main__.GPUTests.test_input_mutation2_cuda) ... stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_input_mutation3_cuda (__main__.GPUTests.test_input_mutation3_cuda) ... stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_bypass', 1), ('ok', 1)]
 inductor [('pattern_matcher_nodes', 9), ('pattern_matcher_count', 7), ('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_input_mutation4_cuda (__main__.GPUTests.test_input_mutation4_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_input_mutation5_cuda (__main__.GPUTests.test_input_mutation5_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_insignificant_strides_cuda (__main__.GPUTests.test_insignificant_strides_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_int8_weight_only_quant_cuda (__main__.GPUTests.test_int8_weight_only_quant_cuda) ... skipped 'No _weight_int8pack_mm implementation on CUDA'
 test_int_input_dynamic_shapes_cuda (__main__.GPUTests.test_int_input_dynamic_shapes_cuda) ... frames [('total', 9), ('ok', 9)]
 stats [('calls_captured', 5), ('unique_graphs', 5)]
 aot_autograd [('total', 5), ('ok', 5), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 5), ('fxgraph_cache_miss', 4), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 inline_call []
 ok
 test_invalid_operand_issue1_cuda (__main__.GPUTests.test_invalid_operand_issue1_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_isin_tensor_scalar_cuda (__main__.GPUTests.test_isin_tensor_scalar_cuda) ... stats [('calls_captured', 8), ('unique_graphs', 8)]
 aot_autograd [('total', 8), ('ok', 8), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('autograd_cache_hit', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4), ('fxgraph_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_isinf2_cuda (__main__.GPUTests.test_isinf2_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_isinf_cuda (__main__.GPUTests.test_isinf_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_issue102546_cuda (__main__.GPUTests.test_issue102546_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_kernel_names_cuda (__main__.GPUTests.test_kernel_names_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_kwargs_cuda (__main__.GPUTests.test_kwargs_cuda) ... skipped 'histogramdd only supports cpu'
 test_l1_loss_cuda (__main__.GPUTests.test_l1_loss_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_large_block_sizes_cuda (__main__.GPUTests.test_large_block_sizes_cuda)
 Inductor will try triton configs like x = 64 and y = 1024 which will ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 35), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_large_broadcast_reduction_cuda (__main__.GPUTests.test_large_broadcast_reduction_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_large_grid_cuda (__main__.GPUTests.test_large_grid_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_large_offset_pointwise_cuda (__main__.GPUTests.test_large_offset_pointwise_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_large_pointwise_cuda (__main__.GPUTests.test_large_pointwise_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_large_strided_reduction_cuda (__main__.GPUTests.test_large_strided_reduction_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_large_tensor_reduction_cuda (__main__.GPUTests.test_large_tensor_reduction_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_layer_norm_cuda (__main__.GPUTests.test_layer_norm_cuda) ... W0509 18:22:45.117000 415353 torch/_inductor/debug.py:454] [0/0] model__752_inference_718 debug trace: /workspace/pytorch/torch_compile_debug/run_2025_05_09_18_22_44_935431-pid_415353/torchinductor/model__752_inference_718.0
 inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_leaky_relu_cuda (__main__.GPUTests.test_leaky_relu_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_lerp_cuda (__main__.GPUTests.test_lerp_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_lgamma_cuda (__main__.GPUTests.test_lgamma_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_like_channels_last_cuda (__main__.GPUTests.test_like_channels_last_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 4), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('extern_calls', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_like_rands2_cuda (__main__.GPUTests.test_like_rands2_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_like_rands3_cuda (__main__.GPUTests.test_like_rands3_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_like_rands_cuda (__main__.GPUTests.test_like_rands_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 8), ('intermediate_hooks', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_linear1_cuda (__main__.GPUTests.test_linear1_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_nodes', 8), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('pattern_matcher_count', 6), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_16_8', 2)]
 ok
 test_linear2_cuda (__main__.GPUTests.test_linear2_cuda) ... inline_call []
 stats [('calls_captured', 16), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('pattern_matcher_nodes', 32), ('triton_bundler_save_kernel', 28), ('pattern_matcher_count', 24), ('benchmarking.InductorBenchmarker.benchmark_gpu', 16), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info [('aten.mm_2_8_8', 8)]
 ok
 test_linear_dynamic_maxautotune_cuda (__main__.GPUTests.test_linear_dynamic_maxautotune_cuda) ... AUTOTUNE addmm(10x1, 10x1, 1x1)
  triton_mm_1 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1
  triton_mm_0 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1
  triton_mm_2 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1
  triton_mm_3 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1
  triton_mm_4 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1
 SingleProcess AUTOTUNE benchmarking takes 0.1773 seconds and 0.1201 seconds precompiling for 5 choices
 AUTOTUNE mm(1x10, 10x1)
  triton_mm_5 0.0038 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1
  triton_mm_7 0.0038 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1
  triton_mm_9 0.0038 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1
  triton_mm_6 0.0041 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1
  triton_mm_8 0.0041 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1
 SingleProcess AUTOTUNE benchmarking takes 0.1778 seconds and 0.1066 seconds precompiling for 5 choices
 AUTOTUNE addmm(10x1, 10x1, 1x1)
  triton_mm_12 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1
  triton_mm_14 0.0037 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1
  triton_mm_11 0.0038 ms 96.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1
  triton_mm_13 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1
  triton_mm_10 0.0041 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1
 SingleProcess AUTOTUNE benchmarking takes 0.1805 seconds and 0.1140 seconds precompiling for 5 choices
 AUTOTUNE addmm(10x1, 10x1, 1x1)
  triton_mm_17 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1
  triton_mm_18 0.0036 ms 98.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1
  triton_mm_16 0.0036 ms 97.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1
  triton_mm_15 0.0038 ms 92.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1
  triton_mm_19 0.0038 ms 92.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1
 SingleProcess AUTOTUNE benchmarking takes 0.0914 seconds and 0.0863 seconds precompiling for 5 choices
 AUTOTUNE mm(1x10, 10x1)
  triton_mm_21 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1
  triton_mm_20 0.0038 ms 91.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1
  triton_mm_23 0.0038 ms 91.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1
  triton_mm_24 0.0038 ms 91.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1
  triton_mm_22 0.0038 ms 90.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1
 SingleProcess AUTOTUNE benchmarking takes 0.0891 seconds and 0.0764 seconds precompiling for 5 choices
 AUTOTUNE addmm(10x1, 10x1, 1x1)
  triton_mm_26 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1
  triton_mm_28 0.0037 ms 93.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1
  triton_mm_25 0.0038 ms 90.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1
  triton_mm_27 0.0038 ms 90.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1
  triton_mm_29 0.0038 ms 90.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1
 SingleProcess AUTOTUNE benchmarking takes 0.0900 seconds and 0.0725 seconds precompiling for 5 choices
 frames [('total', 9), ('ok', 9)]
 inline_call []
 stats [('calls_captured', 5), ('unique_graphs', 5)]
 aot_autograd [('total', 5), ('ok', 5), ('autograd_cache_miss', 4), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 294), ('async_compile_cache_miss', 68), ('benchmarking.InductorBenchmarker.benchmark_gpu', 48), ('select_algorithm_num_precompiles', 40), ('pattern_matcher_nodes', 12), ('pattern_matcher_count', 8), ('select_algorithm_precompile', 8), ('select_algorithm_autotune', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 6), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info [('aten.addmm_s0_1_1', 2), ('aten.mm_1_1_s0', 2), ('aten.addmm_10_1_1', 2)]
 ok
 test_linear_float64_cuda (__main__.GPUTests.test_linear_float64_cuda) ... skipped 'cuda failed for float64 linear'
 test_linear_mixed_dtype_cuda (__main__.GPUTests.test_linear_mixed_dtype_cuda) ... frames [('total', 2), ('ok', 1)]
 inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_9_3_3', 1)]
 ok
 test_linspace1_cuda (__main__.GPUTests.test_linspace1_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_linspace2_cuda (__main__.GPUTests.test_linspace2_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_linspace3_cuda (__main__.GPUTests.test_linspace3_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_linspace4_cuda (__main__.GPUTests.test_linspace4_cuda) ... skipped 'requires multiple cuda devices'
 test_list_clearing_cuda (__main__.GPUTests.test_list_clearing_cuda) ... inductor [('fxgraph_cache_bypass', 2), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('cudagraph_recorded_non_static_inputs', 2)]
 aten_mm_info [('aten.mm_5_5_5', 2)]
 ok
 test_log1p_cuda (__main__.GPUTests.test_log1p_cuda) ... inline_call []
 stats [('calls_captured', 60), ('unique_graphs', 20)]
 aot_autograd [('total', 20), ('ok', 20), ('autograd_cache_miss', 10), ('autograd_cache_saved', 10), ('autograd_cache_hit', 10)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 30), ('async_compile_cache_hit', 20), ('fxgraph_cache_miss', 10), ('fxgraph_cache_hit', 10)]
 graph_break []
 aten_mm_info []
 ok
 test_log2_cuda (__main__.GPUTests.test_log2_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_log_fp64_cuda (__main__.GPUTests.test_log_fp64_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_log_softmax_cuda (__main__.GPUTests.test_log_softmax_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 24), ('pattern_matcher_count', 6), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_logaddexp_cuda (__main__.GPUTests.test_logaddexp_cuda) ... skipped 'Not implemented for CUDA'
 test_logcumsumexp_cuda (__main__.GPUTests.test_logcumsumexp_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)]
 inductor [('triton_bundler_save_kernel', 84), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 6)]
 graph_break []
 aten_mm_info []
 ok
 test_logcumsumexp_zero_dim_cuda (__main__.GPUTests.test_logcumsumexp_zero_dim_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_logsumexp_cuda (__main__.GPUTests.test_logsumexp_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_long_tensor_cuda (__main__.GPUTests.test_long_tensor_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_low_memory_max_pool_cuda (__main__.GPUTests.test_low_memory_max_pool_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_masked_fill_cuda (__main__.GPUTests.test_masked_fill_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_masked_fill_promotion_cuda (__main__.GPUTests.test_masked_fill_promotion_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_masked_scatter_cuda (__main__.GPUTests.test_masked_scatter_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_matmul_layer_norm_cuda (__main__.GPUTests.test_matmul_layer_norm_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_1600_256_256', 1)]
 ok
 test_max_min_cuda (__main__.GPUTests.test_max_min_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d1_cuda (__main__.GPUTests.test_max_pool2d1_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d2_cuda (__main__.GPUTests.test_max_pool2d2_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d3_cuda (__main__.GPUTests.test_max_pool2d3_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d4_cuda (__main__.GPUTests.test_max_pool2d4_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d5_cuda (__main__.GPUTests.test_max_pool2d5_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d6_cuda (__main__.GPUTests.test_max_pool2d6_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d7_cuda (__main__.GPUTests.test_max_pool2d7_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d8_cuda (__main__.GPUTests.test_max_pool2d8_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d_with_indices_backward2_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward2_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d_with_indices_backward3_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward3_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d_with_indices_backward4_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward4_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d_with_indices_backward5_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward5_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d_with_indices_backward6_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward6_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_max_pool2d_with_indices_backward_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_mean_cuda (__main__.GPUTests.test_mean_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_min_max_reduction_cuda (__main__.GPUTests.test_min_max_reduction_cuda) ... inline_call []
 stats [('calls_captured', 48), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('ok', 6), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 3)]
 inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 9), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 3)]
 graph_break []
 aten_mm_info []
 ok
 test_min_max_reduction_nan_cuda (__main__.GPUTests.test_min_max_reduction_nan_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_misaligned_address_issue1_cuda (__main__.GPUTests.test_misaligned_address_issue1_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_mix_device_index_cuda (__main__.GPUTests.test_mix_device_index_cuda)
 A tiny repro for this meta internal issue: https://fb.workplace.com/groups/1075192433118967/posts/1567334737238065 ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 6), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('intermediate_hooks', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_mixed_mm2_cuda (__main__.GPUTests.test_mixed_mm2_cuda) ... skipped 'Not supported in Python 3.12+'
 test_mixed_mm3_cuda (__main__.GPUTests.test_mixed_mm3_cuda) ... skipped 'Not supported in Python 3.12+'
 test_mixed_mm_cuda (__main__.GPUTests.test_mixed_mm_cuda) ... skipped 'Not supported in Python 3.12+'
 test_mm_mixed_dtype_cuda (__main__.GPUTests.test_mm_mixed_dtype_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.mm_2_3_3', 1)]
 ok
 test_mm_views_cuda (__main__.GPUTests.test_mm_views_cuda) ... inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info [('aten.mm_32_32_32', 1)]
 ok
 test_move_arange_cuda (__main__.GPUTests.test_move_arange_cuda) ... inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_mul_index_expr_cuda (__main__.GPUTests.test_mul_index_expr_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_mul_softmax_symfloat_cuda (__main__.GPUTests.test_mul_softmax_symfloat_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 11), ('unique_graphs', 2)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('pattern_matcher_nodes', 6), ('pattern_matcher_count', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_multi_device_cuda (__main__.GPUTests.test_multi_device_cuda) ... W0509 18:23:21.436000 415353 torch/_inductor/utils.py:1762] [0/0] DeviceCopy in input program
 W0509 18:23:21.438000 415353 torch/_inductor/utils.py:1762] [0/0] DeviceCopy in input program
 W0509 18:23:21.438000 415353 torch/_inductor/utils.py:1762] [0/0] DeviceCopy in input program
 inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 3), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_multi_gpu_device_cuda (__main__.GPUTests.test_multi_gpu_device_cuda) ... skipped 'requires multiple cuda devices'
 test_multi_gpu_recompile_on_index_cuda (__main__.GPUTests.test_multi_gpu_recompile_on_index_cuda) ... skipped 'requires multiple cuda devices'
 test_multi_threading_cuda (__main__.GPUTests.test_multi_threading_cuda) ... frames [('total', 2), ('ok', 2)]
 inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_4_3_2', 1)]
 ok
 test_multilayer_any_cuda (__main__.GPUTests.test_multilayer_any_cuda) ... inline_call []
 stats [('calls_captured', 16), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 18), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_multilayer_prime_size_cuda (__main__.GPUTests.test_multilayer_prime_size_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 9), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_multilayer_sum_low_prec_cuda (__main__.GPUTests.test_multilayer_sum_low_prec_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_multilayer_var_cuda (__main__.GPUTests.test_multilayer_var_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_multilayer_var_lowp_cuda (__main__.GPUTests.test_multilayer_var_lowp_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_mutable_custom_op_fixed_layout2_cuda (__main__.GPUTests.test_mutable_custom_op_fixed_layout2_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('extern_calls', 8), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('intermediate_hooks', 4), ('fxgraph_cache_bypass', 2), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_mutable_custom_op_fixed_layout_cuda (__main__.GPUTests.test_mutable_custom_op_fixed_layout_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
 inductor [('fxgraph_cache_bypass', 1), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_mutations_loop_fusion_cuda (__main__.GPUTests.test_mutations_loop_fusion_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 9), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_nan_to_num_cuda (__main__.GPUTests.test_nan_to_num_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_narrow_cuda (__main__.GPUTests.test_narrow_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_neg_index_cuda (__main__.GPUTests.test_neg_index_cuda) ... frames [('total', 9), ('ok', 9)]
 stats [('calls_captured', 17), ('unique_graphs', 9)]
 aot_autograd [('total', 9), ('autograd_cache_miss', 9), ('ok', 9), ('autograd_cache_saved', 8), ('autograd_cache_bypass', 1)]
 inductor [('triton_bundler_save_kernel', 56), ('fxgraph_cache_miss', 9), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3)]
 graph_break []
 aten_mm_info []
 ok
 test_neg_max_uint8_cuda (__main__.GPUTests.test_neg_max_uint8_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_new_empty_cuda (__main__.GPUTests.test_new_empty_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_new_empty_strided_cuda (__main__.GPUTests.test_new_empty_strided_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_new_ones_cuda (__main__.GPUTests.test_new_ones_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_nll_loss_backward_cuda (__main__.GPUTests.test_nll_loss_backward_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 6), ('pattern_matcher_count', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_nll_loss_forward_cuda (__main__.GPUTests.test_nll_loss_forward_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_no_mega_fusion_during_lowering_cuda (__main__.GPUTests.test_no_mega_fusion_during_lowering_cuda) ... --> 7
 inline_call []
 stats [('calls_captured', 50), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_no_op_reduction_cuda (__main__.GPUTests.test_no_op_reduction_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_no_specization_over_symbolic_value_cuda (__main__.GPUTests.test_no_specization_over_symbolic_value_cuda) ... stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_nonzero_unbacked_refinement_cuda (__main__.GPUTests.test_nonzero_unbacked_refinement_cuda) ... inline_call []
 stats [('calls_captured', 30), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)]
 inductor [('extern_calls', 12), ('triton_bundler_save_kernel', 7), ('intermediate_hooks', 3), ('async_compile_cache_miss', 3), ('fxgraph_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 frames [('total', 1), ('ok', 1)]
 ok
 test_one_hot_cuda (__main__.GPUTests.test_one_hot_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_output_strides_cuda (__main__.GPUTests.test_output_strides_cuda) ... /workspace/pytorch/torch/_dynamo/utils.py:3284: UserWarning: The use of `x.T` on tensors of dimension other than 2 to reverse their shape is deprecated and it will throw an error in a future release. Consider `x.mT` to transpose batches of matrices or `x.permute(*torch.arange(x.ndim - 1, -1, -1))` to reverse the dimensions of a tensor. (Triggered internally at /workspace/pytorch/aten/src/ATen/native/TensorShape.cpp:4413.)
  return node.target(*args, **kwargs)
 /workspace/pytorch/test/inductor/test_torchinductor.py:6789: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
  self.assertEqual(inp.storage(), out.storage())
 frames [('total', 3), ('ok', 3)]
 unimplemented []
 graph_break [('Call to `torch._dynamo.graph_break()`\n  Explanation: User-inserted graph break. Message: None\n  Hint: Remove the `torch._dynamo.graph_break()` call.\n\n  Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}`\n', 1)]
 stats [('calls_captured', 7), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('ok', 3), ('autograd_cache_bypass', 2), ('autograd_cache_saved', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 3), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1)]
 aten_mm_info []
 ok
 test_pad_cast_cuda (__main__.GPUTests.test_pad_cast_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pad_single_cuda (__main__.GPUTests.test_pad_single_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pad_view_cuda (__main__.GPUTests.test_pad_view_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pattern_matcher_multi_user_cuda (__main__.GPUTests.test_pattern_matcher_multi_user_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 7), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_permute1_cuda (__main__.GPUTests.test_permute1_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_permute2_cuda (__main__.GPUTests.test_permute2_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_philox_rand_cuda (__main__.GPUTests.test_philox_rand_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pixel_shuffle_channels_last_cuda (__main__.GPUTests.test_pixel_shuffle_channels_last_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_airy_ai_cuda (__main__.GPUTests.test_pointwise_airy_ai_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_bessel_j0_cuda (__main__.GPUTests.test_pointwise_bessel_j0_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_bessel_j1_cuda (__main__.GPUTests.test_pointwise_bessel_j1_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_bessel_y0_cuda (__main__.GPUTests.test_pointwise_bessel_y0_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_bessel_y1_cuda (__main__.GPUTests.test_pointwise_bessel_y1_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_chebyshev_polynomial_t_cuda (__main__.GPUTests.test_pointwise_chebyshev_polynomial_t_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_chebyshev_polynomial_u_cuda (__main__.GPUTests.test_pointwise_chebyshev_polynomial_u_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_chebyshev_polynomial_v_cuda (__main__.GPUTests.test_pointwise_chebyshev_polynomial_v_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_chebyshev_polynomial_w_cuda (__main__.GPUTests.test_pointwise_chebyshev_polynomial_w_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_digamma_cuda (__main__.GPUTests.test_pointwise_digamma_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_entr_cuda (__main__.GPUTests.test_pointwise_entr_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_erf_cuda (__main__.GPUTests.test_pointwise_erf_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_erfc_cuda (__main__.GPUTests.test_pointwise_erfc_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_erfcx_cuda (__main__.GPUTests.test_pointwise_erfcx_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_erfinv_cuda (__main__.GPUTests.test_pointwise_erfinv_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_exp2_cuda (__main__.GPUTests.test_pointwise_exp2_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_expit_cuda (__main__.GPUTests.test_pointwise_expit_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_expm1_cuda (__main__.GPUTests.test_pointwise_expm1_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_gammainc_cuda (__main__.GPUTests.test_pointwise_gammainc_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_gammaincc_cuda (__main__.GPUTests.test_pointwise_gammaincc_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_gammaln_cuda (__main__.GPUTests.test_pointwise_gammaln_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_hermite_polynomial_h_cuda (__main__.GPUTests.test_pointwise_hermite_polynomial_h_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_hermite_polynomial_he_cuda (__main__.GPUTests.test_pointwise_hermite_polynomial_he_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_i0_cuda (__main__.GPUTests.test_pointwise_i0_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_i0e_cuda (__main__.GPUTests.test_pointwise_i0e_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_i1_cuda (__main__.GPUTests.test_pointwise_i1_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_i1e_cuda (__main__.GPUTests.test_pointwise_i1e_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_laguerre_polynomial_l_cuda (__main__.GPUTests.test_pointwise_laguerre_polynomial_l_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_legendre_polynomial_p_cuda (__main__.GPUTests.test_pointwise_legendre_polynomial_p_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_log1p_cuda (__main__.GPUTests.test_pointwise_log1p_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_log_ndtr_cuda (__main__.GPUTests.test_pointwise_log_ndtr_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_logit_cuda (__main__.GPUTests.test_pointwise_logit_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_modified_bessel_i0_cuda (__main__.GPUTests.test_pointwise_modified_bessel_i0_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_modified_bessel_i1_cuda (__main__.GPUTests.test_pointwise_modified_bessel_i1_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_modified_bessel_k0_cuda (__main__.GPUTests.test_pointwise_modified_bessel_k0_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_modified_bessel_k1_cuda (__main__.GPUTests.test_pointwise_modified_bessel_k1_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_multigammaln_cuda (__main__.GPUTests.test_pointwise_multigammaln_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_ndtr_cuda (__main__.GPUTests.test_pointwise_ndtr_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_ndtri_cuda (__main__.GPUTests.test_pointwise_ndtri_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_polygamma_cuda (__main__.GPUTests.test_pointwise_polygamma_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_psi_cuda (__main__.GPUTests.test_pointwise_psi_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_round_cuda (__main__.GPUTests.test_pointwise_round_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_scaled_modified_bessel_k0_cuda (__main__.GPUTests.test_pointwise_scaled_modified_bessel_k0_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_scaled_modified_bessel_k1_cuda (__main__.GPUTests.test_pointwise_scaled_modified_bessel_k1_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_shifted_chebyshev_polynomial_t_cuda (__main__.GPUTests.test_pointwise_shifted_chebyshev_polynomial_t_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_shifted_chebyshev_polynomial_u_cuda (__main__.GPUTests.test_pointwise_shifted_chebyshev_polynomial_u_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_shifted_chebyshev_polynomial_v_cuda (__main__.GPUTests.test_pointwise_shifted_chebyshev_polynomial_v_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_shifted_chebyshev_polynomial_w_cuda (__main__.GPUTests.test_pointwise_shifted_chebyshev_polynomial_w_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_sinc_cuda (__main__.GPUTests.test_pointwise_sinc_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_spherical_bessel_j0_cuda (__main__.GPUTests.test_pointwise_spherical_bessel_j0_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_xlog1py_cuda (__main__.GPUTests.test_pointwise_xlog1py_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_xlogy_cuda (__main__.GPUTests.test_pointwise_xlogy_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pointwise_zeta_cuda (__main__.GPUTests.test_pointwise_zeta_cuda) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_polar_cuda (__main__.GPUTests.test_polar_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('extern_calls', 4), ('intermediate_hooks', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pow1_cuda (__main__.GPUTests.test_pow1_cuda) ... inline_call []
 stats [('calls_captured', 34), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pow2_cuda (__main__.GPUTests.test_pow2_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_pow3_cuda (__main__.GPUTests.test_pow3_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_pow_int_cuda (__main__.GPUTests.test_pow_int_cuda) ... inline_call []
 stats [('calls_captured', 20), ('unique_graphs', 10)]
 aot_autograd [('total', 10), ('ok', 10), ('autograd_cache_miss', 5), ('autograd_cache_saved', 5), ('autograd_cache_hit', 5)]
 inductor [('triton_bundler_save_kernel', 35), ('extern_calls', 20), ('async_compile_cache_miss', 15), ('intermediate_hooks', 10), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 5), ('fxgraph_cache_hit', 5)]
 graph_break []
 aten_mm_info []
 ok
 test_pow_symfloat_cuda (__main__.GPUTests.test_pow_symfloat_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_prepare_softmax_with_fast_math_cuda (__main__.GPUTests.test_prepare_softmax_with_fast_math_cuda)
 Measure on a A100, perf is 3.487ms v.s. 3.358ms without or with flushing to zero. A 4% speedup. ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_prod_cuda (__main__.GPUTests.test_prod_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_profiler_mark_wrapper_call_cuda (__main__.GPUTests.test_profiler_mark_wrapper_call_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_rand_like_deterministic_cuda (__main__.GPUTests.test_rand_like_deterministic_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_randint_cuda (__main__.GPUTests.test_randint_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 21), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_randint_distribution_cuda (__main__.GPUTests.test_randint_distribution_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_randint_int64_mod_cuda (__main__.GPUTests.test_randint_int64_mod_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_randint_kernel_count_cuda (__main__.GPUTests.test_randint_kernel_count_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_randn_generator_cuda (__main__.GPUTests.test_randn_generator_cuda) ... inline_call [("Failed to convert args/kwargs to proxy\n  Explanation: Missing `as_proxy()` implementation for some arg/kwarg.\n\n\n  Developer debug context: call_function args: ListVariable(length=2) UserDefinedObjectVariable(Generator) ConstantVariable(device: device(type='cuda', index=0))\n", 1)]
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_randn_like_empty_cuda (__main__.GPUTests.test_randn_like_empty_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_randn_with_dtype_and_device_cuda (__main__.GPUTests.test_randn_with_dtype_and_device_cuda) ... skipped 'only support cpu randn_with_dtype_and_device test'
 test_reduction1_cuda (__main__.GPUTests.test_reduction1_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_reduction2_cuda (__main__.GPUTests.test_reduction2_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 5), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_reduction3_cuda (__main__.GPUTests.test_reduction3_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 5), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_reduction4_cuda (__main__.GPUTests.test_reduction4_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_reduction5_cuda (__main__.GPUTests.test_reduction5_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 5), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_reduction_config_limit_cuda (__main__.GPUTests.test_reduction_config_limit_cuda)
 This unit-test tests whether we exceed cudaDeviceProperties.maxGridSize in ... ok
 test_reflection_pad2d_backward_cuda (__main__.GPUTests.test_reflection_pad2d_backward_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 14)]
 aot_autograd [('total', 14), ('autograd_cache_miss', 14), ('autograd_cache_saved', 14), ('ok', 14)]
 inductor [('pattern_matcher_count', 112), ('pattern_matcher_nodes', 112), ('triton_bundler_save_kernel', 98), ('fxgraph_cache_miss', 14), ('async_compile_cache_miss', 14), ('async_compile_cache_hit', 14)]
 graph_break []
 aten_mm_info []
 ok
 test_reflection_pad2d_cuda (__main__.GPUTests.test_reflection_pad2d_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_reinterpret_dtypeview_cuda (__main__.GPUTests.test_reinterpret_dtypeview_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 12), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_bypass', 3), ('ok', 3)]
 inductor [('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('fxgraph_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 inline_call []
 ok
 test_relu_cuda (__main__.GPUTests.test_relu_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_remainder_cuda (__main__.GPUTests.test_remainder_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_remove_no_ops_cuda (__main__.GPUTests.test_remove_no_ops_cuda) ... frames [('total', 14), ('ok', 14)]
 inline_call []
 stats [('calls_captured', 42), ('unique_graphs', 14)]
 aot_autograd [('total', 14), ('autograd_cache_miss', 14), ('autograd_cache_saved', 14), ('ok', 14)]
 inductor [('triton_bundler_save_kernel', 42), ('extern_calls', 14), ('async_compile_cache_miss', 14), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 8), ('fxgraph_cache_hit', 6)]
 graph_break []
 aten_mm_info [('aten.mm_256_256_256', 8)]
 ok
 test_remove_noop_clone_cuda (__main__.GPUTests.test_remove_noop_clone_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_nodes', 12), ('pattern_matcher_count', 10), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_remove_noop_copy_cuda (__main__.GPUTests.test_remove_noop_copy_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_repeat_as_strided_cuda (__main__.GPUTests.test_repeat_as_strided_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_repeat_cuda (__main__.GPUTests.test_repeat_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_repeat_interleave_2_cuda (__main__.GPUTests.test_repeat_interleave_2_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('extern_calls', 4), ('intermediate_hooks', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_repeat_interleave_cuda (__main__.GPUTests.test_repeat_interleave_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_require_stride_expanded_cuda (__main__.GPUTests.test_require_stride_expanded_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2), ('extern_calls', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_resize_as_cuda (__main__.GPUTests.test_resize_as_cuda) ... stats [('calls_captured', 102), ('unique_graphs', 102)]
 aot_autograd [('total', 102), ('ok', 102), ('autograd_cache_miss', 86), ('autograd_cache_saved', 86), ('autograd_cache_hit', 16)]
 inductor [('triton_bundler_save_kernel', 476), ('async_compile_cache_miss', 136), ('async_compile_cache_hit', 102), ('fxgraph_cache_miss', 68), ('fxgraph_cache_hit', 34)]
 graph_break []
 aten_mm_info []
 ok
 test_resize_cuda (__main__.GPUTests.test_resize_cuda) ... stats [('calls_captured', 34), ('unique_graphs', 34)]
 aot_autograd [('total', 34), ('autograd_cache_miss', 34), ('autograd_cache_saved', 34), ('ok', 34)]
 inductor [('triton_bundler_save_kernel', 238), ('fxgraph_cache_miss', 34), ('async_compile_cache_miss', 34), ('async_compile_cache_hit', 34)]
 graph_break []
 aten_mm_info []
 ok
 test_reuse_buffers_with_aliasing_cuda (__main__.GPUTests.test_reuse_buffers_with_aliasing_cuda) ... inline_call []
 stats [('calls_captured', 30), ('unique_graphs', 5)]
 aot_autograd [('total', 5), ('ok', 5), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 2)]
 inductor [('extern_calls', 40), ('intermediate_hooks', 20), ('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 frames [('total', 1), ('ok', 1)]
 ok
 test_roi_align_cuda (__main__.GPUTests.test_roi_align_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_roll_cuda (__main__.GPUTests.test_roll_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_count', 10), ('pattern_matcher_nodes', 10), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_round_correctness_cuda (__main__.GPUTests.test_round_correctness_cuda) ... skipped 'need to debug tl.libdevice on A100/V100'
 test_round_cuda (__main__.GPUTests.test_round_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_rsqrt_cuda (__main__.GPUTests.test_rsqrt_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_rsqrt_dynamic_shapes_cuda (__main__.GPUTests.test_rsqrt_dynamic_shapes_cuda) ... frames [('total', 9), ('ok', 9)]
 stats [('calls_captured', 16), ('unique_graphs', 5)]
 aot_autograd [('total', 5), ('autograd_cache_miss', 5), ('autograd_cache_saved', 5), ('ok', 5)]
 inductor [('triton_bundler_save_kernel', 35), ('fxgraph_cache_miss', 5), ('extern_calls', 5), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2)]
 graph_break []
 aten_mm_info [('aten.bmm_s2_s2_s2', 2), ('aten.bmm_4_4_4', 2), ('aten.bmm_s1_s1_s1', 1)]
 inline_call []
 ok
 test_scalar_cpu_tensor_arg_cuda (__main__.GPUTests.test_scalar_cpu_tensor_arg_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_scalar_input_cuda (__main__.GPUTests.test_scalar_input_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_scalar_output_cuda (__main__.GPUTests.test_scalar_output_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_scaled_dot_product_attention_cuda (__main__.GPUTests.test_scaled_dot_product_attention_cuda) ... /workspace/pytorch/torch/_inductor/lowering.py:7007: UserWarning: 
 Online softmax is disabled on the fly since Inductor decides to
 split the reduction. Cut an issue to PyTorch if this is an
 important use case and you want to speed it up with online
 softmax.

  warnings.warn(
 inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 35), ('pattern_matcher_nodes', 12), ('pattern_matcher_count', 8), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 5), ('extern_calls', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info [('aten.bmm_2_2_2', 2)]
 ok
 test_scaled_dot_product_efficient_attention_cuda (__main__.GPUTests.test_scaled_dot_product_efficient_attention_cuda) ... inline_call []
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 3), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_scatter1_cuda (__main__.GPUTests.test_scatter1_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_scatter2_cuda (__main__.GPUTests.test_scatter2_cuda) ... skipped 'unstable on sm86'
 test_scatter3_cuda (__main__.GPUTests.test_scatter3_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_scatter4_cuda (__main__.GPUTests.test_scatter4_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 4), ('extern_calls', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_scatter5_cuda (__main__.GPUTests.test_scatter5_cuda) ... /workspace/pytorch/test/inductor/test_torchinductor.py:8207: UserWarning: The reduce argument of torch.scatter with Tensor src is deprecated and will be removed in a future PyTorch release. Use torch.scatter_reduce instead for more reduction options. (Triggered internally at /workspace/pytorch/aten/src/ATen/native/TensorAdvancedIndexing.cpp:232.)
  a.scatter_(dim, index, b, reduce=reduce)
 inline_call []
 stats [('calls_captured', 16), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 10), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 4), ('extern_calls', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_scatter6_cuda (__main__.GPUTests.test_scatter6_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 4), ('extern_calls', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_scatter_add1_cuda (__main__.GPUTests.test_scatter_add1_cuda) ... skipped 'Flaky test, needs debugging'
 test_scatter_add2_cuda (__main__.GPUTests.test_scatter_add2_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_scatter_add3_cuda (__main__.GPUTests.test_scatter_add3_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 4), ('extern_calls', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_scatter_bf16_cuda (__main__.GPUTests.test_scatter_bf16_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('ok', 6), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 3)]
 inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 9), ('extern_calls', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 3)]
 graph_break []
 aten_mm_info []
 ok
 test_scatter_reduce1_cuda (__main__.GPUTests.test_scatter_reduce1_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_scatter_reduce2_cuda (__main__.GPUTests.test_scatter_reduce2_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4), ('extern_calls', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_scatter_reduce3_cuda (__main__.GPUTests.test_scatter_reduce3_cuda) ... inline_call []
 stats [('calls_captured', 16), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 10), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 4), ('extern_calls', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_scheduler_vertical_fusion1_cuda (__main__.GPUTests.test_scheduler_vertical_fusion1_cuda) ... inline_call []
 stats [('calls_captured', 34), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sdpa_prefer_nd_tiling_False_use_block_ptr_False_cuda (__main__.GPUTests.test_sdpa_prefer_nd_tiling_False_use_block_ptr_False_cuda) ... inline_call []
 stats [('calls_captured', 30), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 9), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_23760_8_128', 2)]
 frames [('total', 1), ('ok', 1)]
 ok
 test_sdpa_prefer_nd_tiling_False_use_block_ptr_True_cuda (__main__.GPUTests.test_sdpa_prefer_nd_tiling_False_use_block_ptr_True_cuda) ... inline_call []
 stats [('calls_captured', 30), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 9), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_23760_8_128', 2)]
 frames [('total', 1), ('ok', 1)]
 ok
 test_sdpa_prefer_nd_tiling_True_use_block_ptr_False_cuda (__main__.GPUTests.test_sdpa_prefer_nd_tiling_True_use_block_ptr_False_cuda) ... inline_call []
 stats [('calls_captured', 30), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 9), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_23760_8_128', 2)]
 frames [('total', 1), ('ok', 1)]
 ok
 test_sdpa_prefer_nd_tiling_True_use_block_ptr_True_cuda (__main__.GPUTests.test_sdpa_prefer_nd_tiling_True_use_block_ptr_True_cuda) ... inline_call []
 stats [('calls_captured', 30), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 9), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_23760_8_128', 2)]
 frames [('total', 1), ('ok', 1)]
 ok
 test_sdpa_unaligned_mask_cuda (__main__.GPUTests.test_sdpa_unaligned_mask_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sdpa_unaligned_mask_freezing_cuda (__main__.GPUTests.test_sdpa_unaligned_mask_freezing_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_bypass', 1), ('ok', 1)]
 inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_searchsorted_cuda (__main__.GPUTests.test_searchsorted_cuda) ... inline_call []
 stats [('calls_captured', 40), ('unique_graphs', 40)]
 aot_autograd [('total', 40), ('autograd_cache_miss', 40), ('autograd_cache_saved', 40), ('ok', 40)]
 inductor [('triton_bundler_save_kernel', 280), ('fxgraph_cache_miss', 40), ('async_compile_cache_miss', 40), ('async_compile_cache_hit', 40)]
 graph_break []
 aten_mm_info []
 ok
 test_select_scatter_cuda (__main__.GPUTests.test_select_scatter_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_setitem_with_int_parameter_cuda (__main__.GPUTests.test_setitem_with_int_parameter_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3), ('autograd_cache_guard_miss', 1)]
 inductor [('triton_bundler_save_kernel', 21), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3)]
 graph_break []
 aten_mm_info []
 ok
 test_sgn_cuda (__main__.GPUTests.test_sgn_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sgn_extremal_cuda (__main__.GPUTests.test_sgn_extremal_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_shape_padding_cuda (__main__.GPUTests.test_shape_padding_cuda) ... inline_call []
 stats [('calls_captured', 24), ('unique_graphs', 24)]
 aot_autograd [('total', 24), ('ok', 24), ('autograd_cache_miss', 12), ('autograd_cache_saved', 12), ('autograd_cache_hit', 12)]
 inductor [('extern_calls', 24), ('fxgraph_cache_hit', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 12), ('fxgraph_cache_miss', 10), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6)]
 graph_break []
 aten_mm_info [('aten.bmm_11_13_15', 4), ('aten.mm_11_13_15', 2), ('aten.addmm_11_13_15', 2), ('aten.baddbmm_11_13_15', 2)]
 ok
 test_shape_prop_torch_ones_cuda (__main__.GPUTests.test_shape_prop_torch_ones_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_should_pad_bench_for_bmm_cuda (__main__.GPUTests.test_should_pad_bench_for_bmm_cuda) ... ok
 test_sigmoid_cuda (__main__.GPUTests.test_sigmoid_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sign_dtype_cuda (__main__.GPUTests.test_sign_dtype_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_signbit_cuda (__main__.GPUTests.test_signbit_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_silu_cuda (__main__.GPUTests.test_silu_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_simplify_loops_cuda (__main__.GPUTests.test_simplify_loops_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sin_cuda (__main__.GPUTests.test_sin_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_single_elem_cuda (__main__.GPUTests.test_single_elem_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_single_elem_indirect_cuda (__main__.GPUTests.test_single_elem_indirect_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_size_asserts_for_multi_output_fallback_cuda (__main__.GPUTests.test_size_asserts_for_multi_output_fallback_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 3), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('extern_calls', 3), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_sizehint_issue1_cuda (__main__.GPUTests.test_sizehint_issue1_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 10), ('pattern_matcher_nodes', 10), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_slice1_cuda (__main__.GPUTests.test_slice1_cuda) ... inline_call []
 stats [('calls_captured', 20), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_slice2_cuda (__main__.GPUTests.test_slice2_cuda) ... inline_call []
 stats [('calls_captured', 16), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_slice3_cuda (__main__.GPUTests.test_slice3_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_slice4_cuda (__main__.GPUTests.test_slice4_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_slice_mutation1_cuda (__main__.GPUTests.test_slice_mutation1_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_slice_mutation2_cuda (__main__.GPUTests.test_slice_mutation2_cuda) ... stats [('calls_captured', 6), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_slice_mutation3_cuda (__main__.GPUTests.test_slice_mutation3_cuda) ... stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_slice_scatter2_cuda (__main__.GPUTests.test_slice_scatter2_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_slice_scatter3_cuda (__main__.GPUTests.test_slice_scatter3_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_slice_scatter4_cuda (__main__.GPUTests.test_slice_scatter4_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_slice_scatter5_cuda (__main__.GPUTests.test_slice_scatter5_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_slice_scatter_cuda (__main__.GPUTests.test_slice_scatter_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_slice_scatter_reinplace_cuda (__main__.GPUTests.test_slice_scatter_reinplace_cuda) ... inline_call []
 stats [('calls_captured', 7), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('pattern_matcher_nodes', 7), ('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 6), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.mm_256_64_64', 1), ('aten.bmm_32_33_64', 1)]
 ok
 test_slice_view_with_graph_break_cuda (__main__.GPUTests.test_slice_view_with_graph_break_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 7), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_softmax_backward_data_cuda (__main__.GPUTests.test_softmax_backward_data_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_softmax_cuda (__main__.GPUTests.test_softmax_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 24), ('pattern_matcher_count', 6), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_softmax_one_kernel_loop_cuda (__main__.GPUTests.test_softmax_one_kernel_loop_cuda) ... inline_call []
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_softmax_one_kernel_persist_cuda (__main__.GPUTests.test_softmax_one_kernel_persist_cuda) ... inline_call []
 stats [('calls_captured', 5), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_sort_bool_cuda (__main__.GPUTests.test_sort_bool_cuda) ... inline_call []
 stats [('calls_captured', 16), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sort_cuda (__main__.GPUTests.test_sort_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sort_stable_cuda (__main__.GPUTests.test_sort_stable_cuda) ... inline_call []
 stats [('calls_captured', 24), ('unique_graphs', 8)]
 aot_autograd [('total', 8), ('autograd_cache_miss', 8), ('autograd_cache_saved', 8), ('ok', 8)]
 inductor [('triton_bundler_save_kernel', 56), ('fxgraph_cache_miss', 8), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8)]
 graph_break []
 aten_mm_info []
 ok
 test_sort_transpose_cuda (__main__.GPUTests.test_sort_transpose_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_split_cuda (__main__.GPUTests.test_split_cuda) ... inline_call []
 stats [('calls_captured', 22), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('ok', 4), ('autograd_cache_bypass', 2), ('autograd_cache_saved', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_split_cumprod_cuda (__main__.GPUTests.test_split_cumprod_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_split_cumprod_low_prec_cuda (__main__.GPUTests.test_split_cumprod_low_prec_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_split_cumsum_cuda (__main__.GPUTests.test_split_cumsum_cuda) ... inline_call []
 stats [('calls_captured', 16), ('unique_graphs', 16)]
 aot_autograd [('total', 16), ('autograd_cache_miss', 16), ('autograd_cache_saved', 16), ('ok', 16)]
 inductor [('triton_bundler_save_kernel', 112), ('fxgraph_cache_miss', 16), ('async_compile_cache_miss', 16), ('async_compile_cache_hit', 16)]
 graph_break []
 aten_mm_info []
 ok
 test_split_cumsum_index_cuda (__main__.GPUTests.test_split_cumsum_index_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_split_cumsum_low_prec_cuda (__main__.GPUTests.test_split_cumsum_low_prec_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_split_failed_cuda (__main__.GPUTests.test_split_failed_cuda) ... E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] failed while attempting to run meta for aten.split_with_sizes.default
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] Traceback (most recent call last):
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]   File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]     r = func(*args, **kwargs)
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]         ^^^^^^^^^^^^^^^^^^^^^
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]   File "/workspace/pytorch/torch/_ops.py", line 756, in __call__
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]     return self._op(*args, **kwargs)
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]            ^^^^^^^^^^^^^^^^^^^^^^^^^
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]   File "/workspace/pytorch/torch/_refs/__init__.py", line 4167, in split_with_sizes
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]     torch._check_with(
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]   File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0]     raise error_type(message_evaluated)
 E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] ValueError: Split sizes add up to 4 but got the tensor's size of 5
 frames [('total', 1)]
 ok
 test_split_with_integer_cuda (__main__.GPUTests.test_split_with_integer_cuda) ... frames [('total', 6), ('ok', 6)]
 stats [('calls_captured', 12), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_bypass', 3), ('ok', 3)]
 inductor [('fxgraph_cache_miss', 3)]
 graph_break []
 aten_mm_info []
 ok
 test_split_with_list_cuda (__main__.GPUTests.test_split_with_list_cuda) ... inline_call []
 stats [('calls_captured', 52), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)]
 inductor [('triton_bundler_save_kernel', 140), ('async_compile_cache_miss', 20), ('async_compile_cache_hit', 20), ('fxgraph_cache_miss', 6)]
 graph_break []
 aten_mm_info []
 ok
 test_split_with_sizes_with_unbacked_symints_cuda (__main__.GPUTests.test_split_with_sizes_with_unbacked_symints_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 35), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_bypass', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1)]
 inductor [('extern_calls', 9), ('fxgraph_cache_miss', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_split_with_unbacked_symints_cuda (__main__.GPUTests.test_split_with_unbacked_symints_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 13), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_sqrt_dynamic_shapes_cuda (__main__.GPUTests.test_sqrt_dynamic_shapes_cuda) ... skipped 'sqrt dynamic shapes only supports cpu'
 test_squeeze1_cuda (__main__.GPUTests.test_squeeze1_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_squeeze2_cuda (__main__.GPUTests.test_squeeze2_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_squeeze_varargs_cuda (__main__.GPUTests.test_squeeze_varargs_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_stack_cuda (__main__.GPUTests.test_stack_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_std_cuda (__main__.GPUTests.test_std_cuda) ... inline_call []
 stats [('calls_captured', 16), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_stride_preservation_with_stride_modifying_fx_pass_cuda (__main__.GPUTests.test_stride_preservation_with_stride_modifying_fx_pass_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_bypass', 1), ('ok', 1)]
 inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_bypass', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_strided_inputs_cuda (__main__.GPUTests.test_strided_inputs_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_sum1_cuda (__main__.GPUTests.test_sum1_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sum2_cuda (__main__.GPUTests.test_sum2_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sum3_cuda (__main__.GPUTests.test_sum3_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sum4_cuda (__main__.GPUTests.test_sum4_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sum5_cuda (__main__.GPUTests.test_sum5_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sum_dtype_cuda (__main__.GPUTests.test_sum_dtype_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_sum_int_cuda (__main__.GPUTests.test_sum_int_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 3)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)]
 inductor [('triton_bundler_save_kernel', 21), ('fxgraph_cache_miss', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3)]
 graph_break []
 aten_mm_info []
 ok
 test_sum_keepdims_cuda (__main__.GPUTests.test_sum_keepdims_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_tan_cuda (__main__.GPUTests.test_tan_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_tanh_cuda (__main__.GPUTests.test_tanh_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_tensor1_cuda (__main__.GPUTests.test_tensor1_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_tensor2_cuda (__main__.GPUTests.test_tensor2_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_tensor3_cuda (__main__.GPUTests.test_tensor3_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_tensor_index_put_slice_cuda (__main__.GPUTests.test_tensor_index_put_slice_cuda) ... frames [('total', 10), ('ok', 10)]
 stats [('calls_captured', 90), ('unique_graphs', 10)]
 aot_autograd [('total', 10), ('autograd_cache_miss', 10), ('autograd_cache_saved', 10), ('ok', 10)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 10), ('pattern_matcher_count', 10), ('pattern_matcher_nodes', 10), ('fxgraph_cache_miss', 8), ('fxgraph_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_tensor_index_slice_cuda (__main__.GPUTests.test_tensor_index_slice_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 16), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_tmp_not_defined_issue1_use_block_ptr_False_cuda (__main__.GPUTests.test_tmp_not_defined_issue1_use_block_ptr_False_cuda) ... inline_call []
 stats [('calls_captured', 22), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_tmp_not_defined_issue1_use_block_ptr_True_cuda (__main__.GPUTests.test_tmp_not_defined_issue1_use_block_ptr_True_cuda) ... inline_call []
 stats [('calls_captured', 22), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_tmp_not_defined_issue2_cuda (__main__.GPUTests.test_tmp_not_defined_issue2_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_tmp_not_defined_issue3_cuda (__main__.GPUTests.test_tmp_not_defined_issue3_cuda) ... inline_call []
 stats [('calls_captured', 66), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_nodes', 16), ('async_compile_cache_miss', 15), ('extern_calls', 12), ('pattern_matcher_count', 10), ('async_compile_cache_hit', 10), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info [('aten.addmm_6144_1001_6', 1)]
 ok
 test_to_device_constant_cuda (__main__.GPUTests.test_to_device_constant_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_to_device_cuda (__main__.GPUTests.test_to_device_cuda) ... W0509 18:25:06.079000 415353 torch/_inductor/utils.py:1762] [0/0] DeviceCopy in input program
 W0509 18:25:06.104000 415353 torch/_inductor/utils.py:1762] [0/0] DeviceCopy in input program
 inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_to_dtype_cuda (__main__.GPUTests.test_to_dtype_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_to_memory_format_cuda (__main__.GPUTests.test_to_memory_format_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_topk_cuda (__main__.GPUTests.test_topk_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('extern_calls', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_transpose_add_cuda (__main__.GPUTests.test_transpose_add_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_transpose_cuda (__main__.GPUTests.test_transpose_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_transposed_propagates_cuda (__main__.GPUTests.test_transposed_propagates_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_triu_cuda (__main__.GPUTests.test_triu_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 12), ('pattern_matcher_nodes', 12), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_uint4x2_mixed_mm_cuda (__main__.GPUTests.test_uint4x2_mixed_mm_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info [('aten.mm_8_8_8', 2)]
 ok
 test_uint_cuda (__main__.GPUTests.test_uint_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_unbacked_floordiv_simplify_cuda (__main__.GPUTests.test_unbacked_floordiv_simplify_cuda) ... inline_call []
 stats [('calls_captured', 52), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_unbacked_floordiv_simplify_errors_cuda (__main__.GPUTests.test_unbacked_floordiv_simplify_errors_cuda) ... frames [('total', 1)]
 ok
 test_unbind_cuda (__main__.GPUTests.test_unbind_cuda) ... inline_call []
 stats [('calls_captured', 20), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_unfold_zero_dimension_tensor_cuda (__main__.GPUTests.test_unfold_zero_dimension_tensor_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('fxgraph_cache_miss', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_unroll_small_reduction_cuda (__main__.GPUTests.test_unroll_small_reduction_cuda) ... inline_call []
 stats [('calls_captured', 72), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_unspec_inputs_bfloat16_cuda (__main__.GPUTests.test_unspec_inputs_bfloat16_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_unspec_inputs_float16_cuda (__main__.GPUTests.test_unspec_inputs_float16_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_unspec_inputs_float32_cuda (__main__.GPUTests.test_unspec_inputs_float32_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_unspec_inputs_float64_cuda (__main__.GPUTests.test_unspec_inputs_float64_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 9), ('unique_graphs', 2)]
 aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_unspec_inputs_int16_cuda (__main__.GPUTests.test_unspec_inputs_int16_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_unspec_inputs_int32_cuda (__main__.GPUTests.test_unspec_inputs_int32_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_unspec_inputs_int64_cuda (__main__.GPUTests.test_unspec_inputs_int64_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_unspec_inputs_int8_cuda (__main__.GPUTests.test_unspec_inputs_int8_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_unspec_inputs_uint8_cuda (__main__.GPUTests.test_unspec_inputs_uint8_cuda) ... frames [('total', 5), ('ok', 5)]
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_unsqueeze_cuda (__main__.GPUTests.test_unsqueeze_cuda) ... inline_call []
 stats [('calls_captured', 20), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_unsqueeze_inplace_cuda (__main__.GPUTests.test_unsqueeze_inplace_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_upsample_bicubic2d_cuda (__main__.GPUTests.test_upsample_bicubic2d_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_upsample_bilinear2d_a_cuda (__main__.GPUTests.test_upsample_bilinear2d_a_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_count', 16), ('pattern_matcher_nodes', 16), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_upsample_bilinear2d_b_cuda (__main__.GPUTests.test_upsample_bilinear2d_b_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_upsample_cat_conv_cuda (__main__.GPUTests.test_upsample_cat_conv_cuda) ... skipped 'only support cpu upsample_cat_conv test'
 test_upsample_nearest1d_cuda (__main__.GPUTests.test_upsample_nearest1d_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 56), ('pattern_matcher_count', 10), ('pattern_matcher_nodes', 10), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_upsample_nearest2d_backward_cuda (__main__.GPUTests.test_upsample_nearest2d_backward_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_upsample_nearest2d_cuda (__main__.GPUTests.test_upsample_nearest2d_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 56), ('pattern_matcher_count', 20), ('pattern_matcher_nodes', 20), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_upsample_nearest3d_cuda (__main__.GPUTests.test_upsample_nearest3d_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 56), ('pattern_matcher_count', 30), ('pattern_matcher_nodes', 30), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_var_correction_cuda (__main__.GPUTests.test_var_correction_cuda) ... /workspace/pytorch/test/inductor/test_torchinductor.py:5381: UserWarning: var(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /workspace/pytorch/aten/src/ATen/native/ReduceOps.cpp:1839.)
  torch.var(x, dim=dim, correction=10),
 /workspace/pytorch/test/inductor/test_torchinductor.py:5381: UserWarning: var(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /workspace/pytorch/aten/src/ATen/native/ReduceOps.cpp:1839.)
  torch.var(x, dim=dim, correction=10),
 /workspace/pytorch/test/inductor/test_torchinductor.py:5381: UserWarning: var(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /workspace/pytorch/aten/src/ATen/native/ReduceOps.cpp:1839.)
  torch.var(x, dim=dim, correction=10),
 /workspace/pytorch/test/inductor/test_torchinductor.py:5381: UserWarning: var(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /workspace/pytorch/aten/src/ATen/native/ReduceOps.cpp:1839.)
  torch.var(x, dim=dim, correction=10),
 inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_var_mean_tile_reduction_False_cuda (__main__.GPUTests.test_var_mean_tile_reduction_False_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_var_mean_tile_reduction_True_cuda (__main__.GPUTests.test_var_mean_tile_reduction_True_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_vdd_clamp_cuda (__main__.GPUTests.test_vdd_clamp_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_vectorized_ops_masked_cuda (__main__.GPUTests.test_vectorized_ops_masked_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_vectorized_ops_masked_var_novec_cuda (__main__.GPUTests.test_vectorized_ops_masked_var_novec_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_vertical_fusion1_cuda (__main__.GPUTests.test_vertical_fusion1_cuda) ... inline_call []
 stats [('calls_captured', 18), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_view_as_complex_cuda (__main__.GPUTests.test_view_as_complex_cuda) ... frames [('total', 1), ('ok', 1)]
 stats [('calls_captured', 2), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('intermediate_hooks', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_view_as_real_cuda (__main__.GPUTests.test_view_as_real_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 4), ('async_compile_cache_miss', 3), ('intermediate_hooks', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_view_detach_cuda (__main__.GPUTests.test_view_detach_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_view_on_aliased_cuda (__main__.GPUTests.test_view_on_aliased_cuda) ... inline_call []
 stats [('calls_captured', 20), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 3), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_view_uint8_through_differing_bitwidths_cuda (__main__.GPUTests.test_view_uint8_through_differing_bitwidths_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 6)]
 aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('ok', 6)]
 ok
 test_views1_cuda (__main__.GPUTests.test_views1_cuda) ... inline_call []
 stats [('calls_captured', 140), ('unique_graphs', 56)]
 aot_autograd [('total', 56), ('autograd_cache_miss', 56), ('autograd_cache_saved', 56), ('ok', 56)]
 inductor [('triton_bundler_save_kernel', 392), ('pattern_matcher_count', 56), ('pattern_matcher_nodes', 56), ('fxgraph_cache_miss', 56), ('async_compile_cache_miss', 56), ('async_compile_cache_hit', 56)]
 graph_break []
 aten_mm_info []
 ok
 test_views2_cuda (__main__.GPUTests.test_views2_cuda) ... inline_call []
 stats [('calls_captured', 30), ('unique_graphs', 12)]
 aot_autograd [('total', 12), ('autograd_cache_miss', 12), ('autograd_cache_saved', 12), ('ok', 12)]
 inductor [('triton_bundler_save_kernel', 84), ('pattern_matcher_count', 12), ('pattern_matcher_nodes', 12), ('fxgraph_cache_miss', 12), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12)]
 graph_break []
 aten_mm_info []
 ok
 test_views3_cuda (__main__.GPUTests.test_views3_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_nodes', 8), ('pattern_matcher_count', 6), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_views4_cuda (__main__.GPUTests.test_views4_cuda) ... inline_call []
 stats [('calls_captured', 6), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 8), ('pattern_matcher_count', 6), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_views5_cuda (__main__.GPUTests.test_views5_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)]
 inductor [('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_views6_cuda (__main__.GPUTests.test_views6_cuda) ... inline_call []
 stats [('calls_captured', 10), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_views7_cuda (__main__.GPUTests.test_views7_cuda) ... inline_call []
 stats [('calls_captured', 12), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_weight_norm_bwd_cuda (__main__.GPUTests.test_weight_norm_bwd_cuda)
 Weight norm backward eager kernel does not support non-contiguous ... frames [('total', 9), ('ok', 9)]
 inline_call []
 unimplemented []
 graph_break [('Tensor.backward', 1)]
 stats [('calls_captured', 8), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2), ('autograd_cache_saved', 1)]
 inductor [('triton_bundler_save_kernel', 63), ('benchmarking.InductorBenchmarker.benchmark_gpu', 12), ('async_compile_cache_miss', 9), ('extern_calls', 9), ('async_compile_cache_hit', 9), ('pattern_matcher_nodes', 6), ('pattern_matcher_count', 4), ('fxgraph_cache_miss', 2)]
 aten_mm_info [('aten.mm_2_1025_2', 2), ('aten.addmm_2_2_1025', 1), ('aten.addmm_2_1_2', 1), ('aten.mm_2_2_1', 1), ('aten.mm_1_2_2', 1)]
 ok
 test_where_broadcast_cuda (__main__.GPUTests.test_where_broadcast_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_bypass', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_where_with_logical_op_cuda (__main__.GPUTests.test_where_with_logical_op_cuda) ... inline_call []
 stats [('calls_captured', 8), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_xblock_divides_xnumel_cuda (__main__.GPUTests.test_xblock_divides_xnumel_cuda) ... inline_call []
 stats [('calls_captured', 4), ('unique_graphs', 4)]
 aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)]
 inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)]
 graph_break []
 aten_mm_info []
 ok
 test_zero_dim_reductions_cuda (__main__.GPUTests.test_zero_dim_reductions_cuda) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_zero_element_mutation_cuda (__main__.GPUTests.test_zero_element_mutation_cuda) ... inline_call []
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_zeros_cuda (__main__.GPUTests.test_zeros_cuda) ... inline_call []
 stats [('calls_captured', 14), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)]
 inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast1_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_broadcast1_broadcast1) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast1_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_broadcast1_broadcast2) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast1_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_broadcast1_broadcast3) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast1_dense (__main__.SweepInputsGPUTest.test_cuda_broadcast1_dense) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast1_double (__main__.SweepInputsGPUTest.test_cuda_broadcast1_double) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast1_int (__main__.SweepInputsGPUTest.test_cuda_broadcast1_int) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast1_strided (__main__.SweepInputsGPUTest.test_cuda_broadcast1_strided) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast1_transposed (__main__.SweepInputsGPUTest.test_cuda_broadcast1_transposed) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast2_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_broadcast2_broadcast1) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast2_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_broadcast2_broadcast2) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast2_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_broadcast2_broadcast3) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast2_dense (__main__.SweepInputsGPUTest.test_cuda_broadcast2_dense) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast2_double (__main__.SweepInputsGPUTest.test_cuda_broadcast2_double) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast2_int (__main__.SweepInputsGPUTest.test_cuda_broadcast2_int) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast2_strided (__main__.SweepInputsGPUTest.test_cuda_broadcast2_strided) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast2_transposed (__main__.SweepInputsGPUTest.test_cuda_broadcast2_transposed) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast3_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_broadcast3_broadcast1) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast3_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_broadcast3_broadcast2) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast3_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_broadcast3_broadcast3) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast3_dense (__main__.SweepInputsGPUTest.test_cuda_broadcast3_dense) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast3_double (__main__.SweepInputsGPUTest.test_cuda_broadcast3_double) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast3_int (__main__.SweepInputsGPUTest.test_cuda_broadcast3_int) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast3_strided (__main__.SweepInputsGPUTest.test_cuda_broadcast3_strided) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_broadcast3_transposed (__main__.SweepInputsGPUTest.test_cuda_broadcast3_transposed) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_dense_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_dense_broadcast1) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_dense_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_dense_broadcast2) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_dense_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_dense_broadcast3) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_dense_dense (__main__.SweepInputsGPUTest.test_cuda_dense_dense) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_dense_double (__main__.SweepInputsGPUTest.test_cuda_dense_double) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_dense_int (__main__.SweepInputsGPUTest.test_cuda_dense_int) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_dense_strided (__main__.SweepInputsGPUTest.test_cuda_dense_strided) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_dense_transposed (__main__.SweepInputsGPUTest.test_cuda_dense_transposed) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_double_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_double_broadcast1) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_double_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_double_broadcast2) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_double_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_double_broadcast3) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_double_dense (__main__.SweepInputsGPUTest.test_cuda_double_dense) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_double_double (__main__.SweepInputsGPUTest.test_cuda_double_double) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_double_int (__main__.SweepInputsGPUTest.test_cuda_double_int) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_double_strided (__main__.SweepInputsGPUTest.test_cuda_double_strided) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_double_transposed (__main__.SweepInputsGPUTest.test_cuda_double_transposed) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_int_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_int_broadcast1) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_int_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_int_broadcast2) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_int_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_int_broadcast3) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_int_dense (__main__.SweepInputsGPUTest.test_cuda_int_dense) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_int_double (__main__.SweepInputsGPUTest.test_cuda_int_double) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_int_int (__main__.SweepInputsGPUTest.test_cuda_int_int) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_int_strided (__main__.SweepInputsGPUTest.test_cuda_int_strided) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_int_transposed (__main__.SweepInputsGPUTest.test_cuda_int_transposed) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_strided_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_strided_broadcast1) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_strided_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_strided_broadcast2) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_strided_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_strided_broadcast3) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_strided_dense (__main__.SweepInputsGPUTest.test_cuda_strided_dense) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_strided_double (__main__.SweepInputsGPUTest.test_cuda_strided_double) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_strided_int (__main__.SweepInputsGPUTest.test_cuda_strided_int) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_strided_strided (__main__.SweepInputsGPUTest.test_cuda_strided_strided) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_strided_transposed (__main__.SweepInputsGPUTest.test_cuda_strided_transposed) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_transposed_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_transposed_broadcast1) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_transposed_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_transposed_broadcast2) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_transposed_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_transposed_broadcast3) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_transposed_dense (__main__.SweepInputsGPUTest.test_cuda_transposed_dense) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_transposed_double (__main__.SweepInputsGPUTest.test_cuda_transposed_double) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_transposed_int (__main__.SweepInputsGPUTest.test_cuda_transposed_int) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_transposed_strided (__main__.SweepInputsGPUTest.test_cuda_transposed_strided) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_cuda_transposed_transposed (__main__.SweepInputsGPUTest.test_cuda_transposed_transposed) ... inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)]
 inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok
 test_ctr_not_moved_to_cuda_when_used_in_index_put (__main__.TritonCodeGenTests.test_ctr_not_moved_to_cuda_when_used_in_index_put) ... frames [('total', 2), ('ok', 2)]
 stats [('calls_captured', 2), ('unique_graphs', 2)]
 aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)]
 inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)]
 graph_break []
 aten_mm_info []
 ok

 ----------------------------------------------------------------------
 Ran 835 tests in 418.647s

 OK (skipped=26)