Last active
May 9, 2025 21:16
-
-
Save davidberard98/1279c372f49c3161b6986f289e2ebc10 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
AUTOTUNE addmm(4096x16, 4096x3, 3x16) | |
triton_mm_4 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_5 0.0035 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_2 0.0035 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=2 | |
triton_mm_3 0.0035 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_6 0.0035 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_8 0.0035 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_10 0.0035 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_9 0.0036 ms 93.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 | |
triton_mm_1 0.0036 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=2 | |
triton_mm_0 0.0038 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.2860 seconds and 0.1056 seconds precompiling for 11 choices | |
frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
inductor [('triton_bundler_save_kernel', 84), ('async_compile_cache_miss', 13), ('select_algorithm_num_precompiles', 11), ('benchmarking.InductorBenchmarker.benchmark_gpu', 11), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_4096_16_3', 1)] | |
.sAUTOTUNE baddbmm(64x2048x192, 64x2048x64, 64x64x192) | |
triton_bmm_25 0.0939 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_bmm_29 0.0939 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_bmm_18 0.0939 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_bmm_24 0.0939 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_bmm_27 0.0939 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_bmm_20 0.0942 ms 99.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_bmm_17 0.0942 ms 99.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_bmm_19 0.0942 ms 99.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_bmm_22 0.0942 ms 99.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_bmm_23 0.0942 ms 99.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.6409 seconds and 0.3905 seconds precompiling for 19 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 140), ('async_compile_cache_miss', 21), ('select_algorithm_num_precompiles', 19), ('benchmarking.InductorBenchmarker.benchmark_gpu', 19), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.baddbmm_2048_192_64', 1)] | |
.Process SpawnProcess-1: | |
Traceback (most recent call last): | |
File "/root/miniconda3/envs/triton/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap | |
self.run() | |
File "/root/miniconda3/envs/triton/lib/python3.12/multiprocessing/process.py", line 108, in run | |
self._target(*self._args, **self._kwargs) | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 64, in benchmark_choice | |
result = choice.benchmark(*args, out=out) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 73, in benchmark | |
raise RuntimeError("This choice caller will always throw") | |
RuntimeError: This choice caller will always throw | |
.timings is tensor([0.0077, 0.0077, 0.0077]), out tensor([[ 5.8251, -2.6431], | |
[ 4.6481, 1.1723]], device='cuda:0'), expected_out None | |
.AUTOTUNE addmm(4x4, 4x4, 4x4) | |
triton_mm_30 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1 | |
triton_mm_31 0.0035 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1 | |
triton_mm_32 0.0035 ms 98.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1 | |
triton_mm_34 0.0036 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1 | |
triton_mm_33 0.0036 ms 93.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1 | |
SingleProcess AUTOTUNE benchmarking takes 0.1741 seconds and 0.0496 seconds precompiling for 5 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 9), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_4_4_4', 2)] | |
.AUTOTUNE mm(32x32, 32x32) | |
triton_mm_41 0.0033 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_44 0.0034 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_45 0.0034 ms 96.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_43 0.0036 ms 90.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_42 0.0037 ms 88.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_40 0.0038 ms 86.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
mm 0.0061 ms 53.9% | |
SingleProcess AUTOTUNE benchmarking takes 0.2061 seconds and 0.0794 seconds precompiling for 7 choices | |
frames [('total', 3), ('ok', 3)] | |
stats [('calls_captured', 11), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)] | |
inductor [('triton_bundler_save_kernel', 168), ('benchmarking.InductorBenchmarker.benchmark_gpu', 33), ('async_compile_cache_miss', 13), ('select_algorithm_num_precompiles', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 3), ('extern_calls', 2), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_32_32_32', 3)] | |
.AUTOTUNE mm(32x32, 32x32) | |
triton_mm_62 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_59 0.0034 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_60 0.0035 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_63 0.0035 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_58 0.0038 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_61 0.0038 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.1909 seconds and 0.0791 seconds precompiling for 6 choices | |
frames [('total', 3), ('ok', 3)] | |
stats [('calls_captured', 11), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)] | |
inductor [('triton_bundler_save_kernel', 154), ('benchmarking.InductorBenchmarker.benchmark_gpu', 24), ('async_compile_cache_miss', 17), ('select_algorithm_num_precompiles', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 3), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_32_32_32', 3)] | |
.AUTOTUNE mm(168084x3, 3x64) | |
triton_mm_80 0.0603 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_81 0.0609 ms 98.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_82 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_84 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_85 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_86 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_87 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_88 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_89 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 | |
triton_mm_90 0.0612 ms 98.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.4810 seconds and 0.1080 seconds precompiling for 16 choices | |
AUTOTUNE mm(159856x3, 3x64) | |
triton_mm_102 0.0573 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_95 0.0588 ms 97.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_104 0.0589 ms 97.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8 | |
triton_mm_101 0.0589 ms 97.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_105 0.0590 ms 97.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_93 0.0591 ms 97.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_94 0.0591 ms 97.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_99 0.0591 ms 97.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_100 0.0591 ms 97.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_103 0.0591 ms 97.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.4557 seconds and 0.1030 seconds precompiling for 16 choices | |
frames [('total', 8), ('ok', 8)] | |
unimplemented [] | |
graph_break [("Dynamic shape operator\n Explanation: Operator `aten.nonzero.default`'s output shape depends on input Tensor data.\n Hint: Enable tracing of dynamic shape operators with `torch._dynamo.config.capture_dynamic_output_shape_ops = True`\n\n Developer debug context: aten.nonzero.default\n", 3)] | |
inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('ok', 3), ('autograd_cache_saved', 1)] | |
inductor [('triton_bundler_save_kernel', 280), ('benchmarking.InductorBenchmarker.benchmark_gpu', 40), ('async_compile_cache_miss', 36), ('select_algorithm_num_precompiles', 30), ('fxgraph_cache_miss', 3), ('select_algorithm_precompile', 2), ('benchmarking.InductorBenchmarker.benchmark', 2), ('select_algorithm_autotune', 2), ('extern_calls', 2)] | |
aten_mm_info [('aten.mm_168084_64_3', 1), ('aten.mm_4*s0*s1_64_3', 1)] | |
.frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [] | |
.frames [('total', 1)] | |
inline_call [] | |
stats [('calls_captured', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
.AUTOTUNE convolution(32x3x64x64, 64x3x3x3) | |
convolution 0.0563 ms 100.0% | |
triton_convolution2d_106 0.0922 ms 61.1% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=2, num_warps=4 | |
triton_convolution2d_110 0.0939 ms 60.0% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=2, num_warps=8 | |
triton_convolution2d_111 0.1120 ms 50.3% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=256, BLOCK_N=64, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=2, num_warps=8 | |
triton_convolution2d_109 0.1203 ms 46.8% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=2, num_warps=8 | |
triton_convolution2d_108 0.1393 ms 40.4% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=1024, BLOCK_N=16, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=1, num_warps=8 | |
triton_convolution2d_107 0.2045 ms 27.5% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=256, BLOCK_N=64, GROUPS=1, KERNEL_H=3, KERNEL_W=3, PADDING_H=1, PADDING_W=1, STRIDE_H=1, STRIDE_W=1, UNROLL=False, num_stages=2, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.2998 seconds and 0.3119 seconds precompiling for 7 choices | |
frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 147), ('benchmarking.InductorBenchmarker.benchmark_gpu', 49), ('async_compile_cache_miss', 9), ('select_algorithm_num_precompiles', 6), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [] | |
.frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('async_compile_cache_miss', 6), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
.E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0): | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_H : tl.constexpr = 3 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_W : tl.constexpr = 3 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_H : tl.constexpr = 1 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_W : tl.constexpr = 1 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_H : tl.constexpr = 0 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_W : tl.constexpr = 0 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] GROUPS : tl.constexpr = 1 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] UNROLL : tl.constexpr = False | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ALLOW_TF32 : tl.constexpr = True | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_M : tl.constexpr = 16 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_N : tl.constexpr = 16 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K : tl.constexpr = 16 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] X = arg_X | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] W = arg_W | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] # Tensor dimensions | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] BATCH = 0 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] IN_C = 256 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] IN_H = 14 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] IN_W = 14 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] OUT_C = 256 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] OUT_H = 12 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] OUT_W = 12 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] # Strides: | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] stride_xn = 50176 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] stride_xc = 196 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] stride_xh = 14 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] stride_xw = 1 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_out = 2304 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_in = 1 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] stride_wh = 768 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] stride_ww = 256 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_w = nhw % OUT_W | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] nh = nhw // OUT_W | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_h = nh % OUT_H | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = nh // OUT_H | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] group = 0 | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_IN_C = IN_C | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_OUT_C = OUT_C | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] w_base = ( | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] # Could be simplified, but slightly slower: | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] # for i in range(KERNEL_H): | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] # for j in range(KERNEL_W): | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] # for k in range(0, GROUP_IN_C, BLOCK_K): | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT): | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] k = (ijk % BLOCK_K_COUNT) * BLOCK_K | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ij = ijk // BLOCK_K_COUNT | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] i = ij // KERNEL_W | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] j = ij % KERNEL_W | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_c = tl.arange(0, BLOCK_K) + k | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] x_ptrs = x_base + ( | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_h * stride_xh)[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_w * stride_xw)[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_c * stride_xc)[None, :] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] mask_x = ( | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h >= 0)[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h < IN_H)[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w >= 0)[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w < IN_W)[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_c < GROUP_IN_C)[None, :] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] w_ptrs = w_base + ( | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] mask = ( | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_h < OUT_H)[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_w < OUT_W)[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_c < GROUP_OUT_C)[None, :] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = idx_n[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] idx_c = idx_y_c[None, :] + group * GROUP_OUT_C | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] idx_h = idx_y_h[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] idx_w = idx_y_w[:, None] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] # inductor generates a suffix | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 1, 'num_stages': 1, 'debug': True, 'cc': 120} | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last): | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] binary = triton.compile(*compile_args, **compile_kwargs) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] return CompiledKernel(src, metadata_group, hash) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__ | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text() | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] with self.open(mode='r', encoding=encoding, errors=errors) as f: | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] return io.open(self, mode, buffering, encoding, errors, newline) | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.433000 434590 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/OKBX2FGS2GKNCNNM3EGQETF6HKRQ3TWANAB6W2NPWMSAR3BN6HGA/triton_convolution2d.ttir' | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0): | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_H : tl.constexpr = 3 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_W : tl.constexpr = 3 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_H : tl.constexpr = 1 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_W : tl.constexpr = 1 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_H : tl.constexpr = 0 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_W : tl.constexpr = 0 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] GROUPS : tl.constexpr = 1 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] UNROLL : tl.constexpr = False | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ALLOW_TF32 : tl.constexpr = True | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_M : tl.constexpr = 16 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_N : tl.constexpr = 64 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K : tl.constexpr = 16 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] X = arg_X | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] W = arg_W | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] # Tensor dimensions | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] BATCH = 0 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] IN_C = 256 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] IN_H = 14 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] IN_W = 14 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] OUT_C = 256 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] OUT_H = 12 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] OUT_W = 12 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] # Strides: | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] stride_xn = 50176 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] stride_xc = 196 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] stride_xh = 14 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] stride_xw = 1 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_out = 2304 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_in = 1 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] stride_wh = 768 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] stride_ww = 256 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_w = nhw % OUT_W | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] nh = nhw // OUT_W | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_h = nh % OUT_H | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = nh // OUT_H | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] group = 0 | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_IN_C = IN_C | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_OUT_C = OUT_C | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] w_base = ( | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] # Could be simplified, but slightly slower: | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] # for i in range(KERNEL_H): | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] # for j in range(KERNEL_W): | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] # for k in range(0, GROUP_IN_C, BLOCK_K): | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT): | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] k = (ijk % BLOCK_K_COUNT) * BLOCK_K | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ij = ijk // BLOCK_K_COUNT | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] i = ij // KERNEL_W | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] j = ij % KERNEL_W | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_c = tl.arange(0, BLOCK_K) + k | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] x_ptrs = x_base + ( | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_h * stride_xh)[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_w * stride_xw)[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_c * stride_xc)[None, :] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] mask_x = ( | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h >= 0)[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h < IN_H)[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w >= 0)[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w < IN_W)[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_c < GROUP_IN_C)[None, :] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] w_ptrs = w_base + ( | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] mask = ( | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_h < OUT_H)[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_w < OUT_W)[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_c < GROUP_OUT_C)[None, :] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = idx_n[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] idx_c = idx_y_c[None, :] + group * GROUP_OUT_C | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] idx_h = idx_y_h[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] idx_w = idx_y_w[:, None] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] # inductor generates a suffix | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 4, 'num_stages': 2, 'debug': True, 'cc': 120} | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last): | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] binary = triton.compile(*compile_args, **compile_kwargs) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] return CompiledKernel(src, metadata_group, hash) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__ | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text() | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] with self.open(mode='r', encoding=encoding, errors=errors) as f: | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] return io.open(self, mode, buffering, encoding, errors, newline) | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.447000 434576 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/ADUSF5KEFEADV5PKAOTDGCRC6SJ7HGCR67UP5W6TLE2DQIRRAVWA/triton_convolution2d.ttir' | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0): | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_H : tl.constexpr = 3 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_W : tl.constexpr = 3 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_H : tl.constexpr = 1 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_W : tl.constexpr = 1 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_H : tl.constexpr = 0 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_W : tl.constexpr = 0 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] GROUPS : tl.constexpr = 1 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] UNROLL : tl.constexpr = False | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ALLOW_TF32 : tl.constexpr = True | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_M : tl.constexpr = 16 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_N : tl.constexpr = 64 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K : tl.constexpr = 32 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] X = arg_X | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] W = arg_W | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] # Tensor dimensions | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] BATCH = 0 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] IN_C = 256 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] IN_H = 14 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] IN_W = 14 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] OUT_C = 256 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] OUT_H = 12 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] OUT_W = 12 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] # Strides: | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] stride_xn = 50176 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] stride_xc = 196 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] stride_xh = 14 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] stride_xw = 1 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_out = 2304 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_in = 1 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] stride_wh = 768 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] stride_ww = 256 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_w = nhw % OUT_W | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] nh = nhw // OUT_W | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_h = nh % OUT_H | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = nh // OUT_H | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] group = 0 | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_IN_C = IN_C | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_OUT_C = OUT_C | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] w_base = ( | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] # Could be simplified, but slightly slower: | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] # for i in range(KERNEL_H): | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] # for j in range(KERNEL_W): | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] # for k in range(0, GROUP_IN_C, BLOCK_K): | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT): | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] k = (ijk % BLOCK_K_COUNT) * BLOCK_K | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ij = ijk // BLOCK_K_COUNT | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] i = ij // KERNEL_W | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] j = ij % KERNEL_W | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_c = tl.arange(0, BLOCK_K) + k | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] x_ptrs = x_base + ( | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_h * stride_xh)[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_w * stride_xw)[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_c * stride_xc)[None, :] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] mask_x = ( | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h >= 0)[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h < IN_H)[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w >= 0)[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w < IN_W)[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_c < GROUP_IN_C)[None, :] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] w_ptrs = w_base + ( | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] mask = ( | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_h < OUT_H)[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_w < OUT_W)[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_c < GROUP_OUT_C)[None, :] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = idx_n[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] idx_c = idx_y_c[None, :] + group * GROUP_OUT_C | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] idx_h = idx_y_h[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] idx_w = idx_y_w[:, None] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] # inductor generates a suffix | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 4, 'num_stages': 2, 'debug': True, 'cc': 120} | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last): | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] binary = triton.compile(*compile_args, **compile_kwargs) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] return CompiledKernel(src, metadata_group, hash) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__ | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text() | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] with self.open(mode='r', encoding=encoding, errors=errors) as f: | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] return io.open(self, mode, buffering, encoding, errors, newline) | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.448000 434562 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/XVSZCZAGZM4IIKDSICNUCKNWZDCSZB3WTBJWXA6NA6STV4YQ4IUA/triton_convolution2d.ttir' | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0): | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_H : tl.constexpr = 3 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_W : tl.constexpr = 3 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_H : tl.constexpr = 1 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_W : tl.constexpr = 1 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_H : tl.constexpr = 0 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_W : tl.constexpr = 0 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] GROUPS : tl.constexpr = 1 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] UNROLL : tl.constexpr = False | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ALLOW_TF32 : tl.constexpr = True | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_M : tl.constexpr = 16 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_N : tl.constexpr = 128 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K : tl.constexpr = 32 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] X = arg_X | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] W = arg_W | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] # Tensor dimensions | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] BATCH = 0 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] IN_C = 256 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] IN_H = 14 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] IN_W = 14 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] OUT_C = 256 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] OUT_H = 12 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] OUT_W = 12 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] # Strides: | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] stride_xn = 50176 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] stride_xc = 196 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] stride_xh = 14 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] stride_xw = 1 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_out = 2304 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_in = 1 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] stride_wh = 768 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] stride_ww = 256 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_w = nhw % OUT_W | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] nh = nhw // OUT_W | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_h = nh % OUT_H | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = nh // OUT_H | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] group = 0 | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_IN_C = IN_C | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_OUT_C = OUT_C | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] w_base = ( | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] # Could be simplified, but slightly slower: | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] # for i in range(KERNEL_H): | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] # for j in range(KERNEL_W): | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] # for k in range(0, GROUP_IN_C, BLOCK_K): | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT): | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] k = (ijk % BLOCK_K_COUNT) * BLOCK_K | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ij = ijk // BLOCK_K_COUNT | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] i = ij // KERNEL_W | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] j = ij % KERNEL_W | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_c = tl.arange(0, BLOCK_K) + k | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] x_ptrs = x_base + ( | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_h * stride_xh)[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_w * stride_xw)[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_c * stride_xc)[None, :] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] mask_x = ( | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h >= 0)[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h < IN_H)[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w >= 0)[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w < IN_W)[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_c < GROUP_IN_C)[None, :] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] w_ptrs = w_base + ( | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] mask = ( | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_h < OUT_H)[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_w < OUT_W)[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_c < GROUP_OUT_C)[None, :] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = idx_n[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] idx_c = idx_y_c[None, :] + group * GROUP_OUT_C | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] idx_h = idx_y_h[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] idx_w = idx_y_w[:, None] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] # inductor generates a suffix | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 8, 'num_stages': 2, 'debug': True, 'cc': 120} | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last): | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] binary = triton.compile(*compile_args, **compile_kwargs) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] return CompiledKernel(src, metadata_group, hash) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__ | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text() | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] with self.open(mode='r', encoding=encoding, errors=errors) as f: | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] return io.open(self, mode, buffering, encoding, errors, newline) | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.452000 434534 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/HFCWHWC5X7TRLLBPY4QK6UJSXJVFBJ3TWILWKLHULF27TZOFFDGQ/triton_convolution2d.ttir' | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0): | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_H : tl.constexpr = 3 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_W : tl.constexpr = 3 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_H : tl.constexpr = 1 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_W : tl.constexpr = 1 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_H : tl.constexpr = 0 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_W : tl.constexpr = 0 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] GROUPS : tl.constexpr = 1 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] UNROLL : tl.constexpr = False | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ALLOW_TF32 : tl.constexpr = True | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_M : tl.constexpr = 16 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_N : tl.constexpr = 256 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K : tl.constexpr = 32 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] X = arg_X | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] W = arg_W | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] # Tensor dimensions | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] BATCH = 0 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] IN_C = 256 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] IN_H = 14 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] IN_W = 14 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] OUT_C = 256 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] OUT_H = 12 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] OUT_W = 12 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] # Strides: | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] stride_xn = 50176 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] stride_xc = 196 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] stride_xh = 14 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] stride_xw = 1 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_out = 2304 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_in = 1 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] stride_wh = 768 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] stride_ww = 256 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_w = nhw % OUT_W | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] nh = nhw // OUT_W | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_h = nh % OUT_H | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = nh // OUT_H | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] group = 0 | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_IN_C = IN_C | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_OUT_C = OUT_C | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] w_base = ( | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] # Could be simplified, but slightly slower: | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] # for i in range(KERNEL_H): | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] # for j in range(KERNEL_W): | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] # for k in range(0, GROUP_IN_C, BLOCK_K): | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT): | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] k = (ijk % BLOCK_K_COUNT) * BLOCK_K | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ij = ijk // BLOCK_K_COUNT | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] i = ij // KERNEL_W | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] j = ij % KERNEL_W | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_c = tl.arange(0, BLOCK_K) + k | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] x_ptrs = x_base + ( | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_h * stride_xh)[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_w * stride_xw)[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_c * stride_xc)[None, :] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] mask_x = ( | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h >= 0)[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h < IN_H)[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w >= 0)[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w < IN_W)[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_c < GROUP_IN_C)[None, :] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] w_ptrs = w_base + ( | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] mask = ( | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_h < OUT_H)[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_w < OUT_W)[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_c < GROUP_OUT_C)[None, :] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = idx_n[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] idx_c = idx_y_c[None, :] + group * GROUP_OUT_C | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] idx_h = idx_y_h[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] idx_w = idx_y_w[:, None] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] # inductor generates a suffix | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 8, 'num_stages': 2, 'debug': True, 'cc': 120} | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last): | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] binary = triton.compile(*compile_args, **compile_kwargs) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] return CompiledKernel(src, metadata_group, hash) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__ | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text() | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] with self.open(mode='r', encoding=encoding, errors=errors) as f: | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] return io.open(self, mode, buffering, encoding, errors, newline) | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.454000 434598 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/TYN3MXM4E73ZH7RBDASGFLNCZ6AHOJBB52BSNH55QJSE4NPI3X3Q/triton_convolution2d.ttir' | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] Triton compilation failed: Placeholder.DESCRIPTIVE_NAME | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] def triton_convolution2d(arg_X, arg_W, out_ptr0): | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_H : tl.constexpr = 3 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] KERNEL_W : tl.constexpr = 3 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_H : tl.constexpr = 1 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] STRIDE_W : tl.constexpr = 1 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_H : tl.constexpr = 0 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] PADDING_W : tl.constexpr = 0 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] GROUPS : tl.constexpr = 1 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] UNROLL : tl.constexpr = False | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ALLOW_TF32 : tl.constexpr = True | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_M : tl.constexpr = 16 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_N : tl.constexpr = 256 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K : tl.constexpr = 16 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] X = arg_X | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] W = arg_W | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] # Tensor dimensions | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] BATCH = 0 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] IN_C = 256 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] IN_H = 14 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] IN_W = 14 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] OUT_C = 256 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] OUT_H = 12 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] OUT_W = 12 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] # Strides: | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] stride_xn = 50176 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] stride_xc = 196 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] stride_xh = 14 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] stride_xw = 1 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_out = 2304 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] stride_wc_in = 1 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] stride_wh = 768 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] stride_ww = 256 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_w = nhw % OUT_W | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] nh = nhw // OUT_W | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_h = nh % OUT_H | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = nh // OUT_H | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] group = 0 | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_IN_C = IN_C | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] GROUP_OUT_C = OUT_C | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] w_base = ( | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] # Could be simplified, but slightly slower: | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] # for i in range(KERNEL_H): | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] # for j in range(KERNEL_W): | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] # for k in range(0, GROUP_IN_C, BLOCK_K): | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT): | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] k = (ijk % BLOCK_K_COUNT) * BLOCK_K | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ij = ijk // BLOCK_K_COUNT | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] i = ij // KERNEL_W | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] j = ij % KERNEL_W | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] idx_x_c = tl.arange(0, BLOCK_K) + k | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] x_ptrs = x_base + ( | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_h * stride_xh)[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_w * stride_xw)[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] + (idx_x_c * stride_xc)[None, :] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] mask_x = ( | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h >= 0)[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_h < IN_H)[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w >= 0)[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_w < IN_W)[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_x_c < GROUP_IN_C)[None, :] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] w_ptrs = w_base + ( | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] mask = ( | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] (idx_n < BATCH)[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_h < OUT_H)[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_w < OUT_W)[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] & (idx_y_c < GROUP_OUT_C)[None, :] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] idx_n = idx_n[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] idx_c = idx_y_c[None, :] + group * GROUP_OUT_C | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] idx_h = idx_y_h[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] idx_w = idx_y_w[:, None] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] # inductor generates a suffix | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] xindex = idx_w + 12*idx_h + 144*idx_c + 36864*idx_n | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] tl.store(out_ptr0 + (tl.broadcast_to(idx_c + 256*idx_w + 3072*idx_h + 36864*idx_n, acc.shape)), acc, mask) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] metadata: {'signature': {'arg_X': '*fp32', 'arg_W': '*fp32', 'out_ptr0': '*fp32'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 4, 'num_stages': 2, 'debug': True, 'cc': 120} | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] Traceback (most recent call last): | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 537, in _precompile_config | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] binary = triton.compile(*compile_args, **compile_kwargs) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 316, in compile | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] return CompiledKernel(src, metadata_group, hash) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] File "/workspace/triton/python/triton/compiler/compiler.py", line 382, in __init__ | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text() | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1027, in read_text | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] with self.open(mode='r', encoding=encoding, errors=errors) as f: | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] File "/root/miniconda3/envs/triton/lib/python3.12/pathlib.py", line 1013, in open | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] return io.open(self, mode, buffering, encoding, errors, newline) | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 19:03:36.480000 434560 torch/_inductor/runtime/triton_heuristics.py:539] FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp8jpqfdds/triton/A2N73RSRHPH2Z2I7RZ5P43Y7EPOVGA52QQJESLGX5WFQHWKKTOYQ/triton_convolution2d.ttir' | |
AUTOTUNE convolution(0x256x14x14, 256x256x1x1) | |
convolution 0.0000 ms <DIVIDED BY ZERO ERROR> | |
conv1x1_via_mm 0.0000 ms <DIVIDED BY ZERO ERROR> | |
triton_convolution2d_118 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=256, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=2, num_warps=4 | |
triton_convolution2d_119 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=64, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=2, num_warps=4 | |
triton_convolution2d_120 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=1, num_warps=1 | |
triton_convolution2d_121 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=128, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=2, num_warps=8 | |
triton_convolution2d_122 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=2, num_warps=4 | |
triton_convolution2d_123 0.0004 ms 0.0% ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=256, GROUPS=1, KERNEL_H=1, KERNEL_W=1, PADDING_H=0, PADDING_W=0, STRIDE_H=1, STRIDE_W=1, UNROLL=True, num_stages=2, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1891 seconds and 0.1215 seconds precompiling for 8 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 6), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [] | |
.E0509 19:03:37.800000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:37.800000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196608, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:37.800000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:37.858000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:37.858000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:37.858000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:37.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:37.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:37.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:37.992000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:37.992000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:37.992000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:37.995000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:37.995000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:37.995000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(256x256, 256x256) | |
triton_mm_145 0.0061 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_144 0.0061 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_146 0.0061 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
mm 0.0074 ms 83.0% | |
triton_mm_150 0.0082 ms 74.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_153 0.0116 ms 52.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_157 0.0116 ms 52.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_154 0.0118 ms 51.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_156 0.0119 ms 51.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_152 0.0119 ms 51.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.3662 seconds and 0.1939 seconds precompiling for 20 choices | |
E0509 19:03:38.839000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:38.839000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:38.839000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:38.895000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:38.895000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147456, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:38.895000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:38.988000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:38.988000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:38.988000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(256x256, 256x256) | |
triton_mm_164 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_165 0.0048 ms 85.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_166 0.0054 ms 76.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_169 0.0056 ms 73.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_163 0.0057 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
mm 0.0059 ms 69.9% | |
triton_mm_173 0.0072 ms 57.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_176 0.0075 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_172 0.0075 ms 54.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_175 0.0076 ms 53.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.3083 seconds and 0.0002 seconds precompiling for 20 choices | |
inductor [] | |
.E0509 19:03:39.899000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:39.899000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:39.899000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:39.946000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:39.946000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:39.946000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:39.950000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:39.950000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:39.950000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:39.954000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:39.954000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:39.954000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(2560x2560, 2560x2560) | |
triton_mm_persistent_tma_220 0.6267 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_219 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_221 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_222 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=4 | |
triton_mm_persistent_tma_223 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1509 seconds and 0.5661 seconds precompiling for 5 choices | |
/workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmp51okix8i' mode='r' encoding='UTF-8'> | |
for evt in json.load(open(f.name))["traceEvents"] | |
ResourceWarning: Enable tracemalloc to get the object allocation traceback | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 7), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2560_2560_2560', 1)] | |
.E0509 19:03:41.098000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:41.098000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:41.098000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:41.103000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:41.103000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:41.103000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:41.154000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:41.154000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:41.154000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:41.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:41.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:41.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:41.163000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:41.163000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 180264, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:41.163000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE scaled_mm(2560x2560, 2560x2560, , ) | |
triton_scaled_mm_device_tma_232 0.5816 ms 100.0% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8 | |
triton_scaled_mm_device_tma_224 0.5834 ms 99.7% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8 | |
triton_scaled_mm_device_tma_225 0.7854 ms 74.1% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8 | |
triton_scaled_mm_device_tma_228 0.9073 ms 64.1% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=4 | |
triton_scaled_mm_device_tma_226 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8 | |
triton_scaled_mm_device_tma_227 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=4 | |
triton_scaled_mm_device_tma_229 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=4 | |
triton_scaled_mm_device_tma_230 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=8 | |
triton_scaled_mm_device_tma_231 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=6, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.3013 seconds and 0.6370 seconds precompiling for 9 choices | |
/workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmp3h_w8d9r' mode='r' encoding='UTF-8'> | |
for evt in json.load(open(f.name))["traceEvents"] | |
ResourceWarning: Enable tracemalloc to get the object allocation traceback | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 11), ('select_algorithm_num_precompiles', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten._scaled_mm.default_2560_2560_2560', 1)] | |
.[W509 19:03:41.700913810 Context.cpp:469] Warning: Setting the SM carveout for matmuls is a temporary experimental mitigation for performance issues, while more robust solutions are developed. It may be removed at any moment without notice. (function operator()) | |
E0509 19:03:42.186000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:42.186000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:42.186000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:42.233000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:42.233000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:42.233000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:42.237000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:42.237000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:42.237000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:42.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:42.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:42.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(2560x2560, 2560x2560) | |
triton_mm_persistent_tma_253 0.6273 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_252 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_254 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_255 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=3, num_warps=4 | |
triton_mm_persistent_tma_256 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=70, TMA_SIZE=128, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1578 seconds and 0.5631 seconds precompiling for 5 choices | |
/workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmp507pmtl1' mode='r' encoding='UTF-8'> | |
for evt in json.load(open(f.name))["traceEvents"] | |
ResourceWarning: Enable tracemalloc to get the object allocation traceback | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 7), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2560_2560_2560', 1)] | |
.E0509 19:03:43.308000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:43.308000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:43.308000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:43.313000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:43.313000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:43.313000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:43.360000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:43.360000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:43.360000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:43.365000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:43.365000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:43.365000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:43.370000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:43.370000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 180264, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:43.370000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE scaled_mm(2560x2560, 2560x2560, , ) | |
triton_scaled_mm_device_tma_265 0.5819 ms 100.0% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8 | |
triton_scaled_mm_device_tma_257 0.5834 ms 99.7% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8 | |
triton_scaled_mm_device_tma_258 0.7820 ms 74.4% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8 | |
triton_scaled_mm_device_tma_261 0.9103 ms 63.9% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=4 | |
triton_scaled_mm_device_tma_259 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8 | |
triton_scaled_mm_device_tma_260 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=4 | |
triton_scaled_mm_device_tma_262 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=4 | |
triton_scaled_mm_device_tma_263 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=8 | |
triton_scaled_mm_device_tma_264 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=70, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=6, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.2965 seconds and 0.5921 seconds precompiling for 9 choices | |
/workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmpqk38a3b3' mode='r' encoding='UTF-8'> | |
for evt in json.load(open(f.name))["traceEvents"] | |
ResourceWarning: Enable tracemalloc to get the object allocation traceback | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 11), ('select_algorithm_num_precompiles', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten._scaled_mm.default_2560_2560_2560', 1)] | |
.E0509 19:03:44.409000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:44.409000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:44.409000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:44.458000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:44.458000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:44.458000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:44.463000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:44.463000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:44.463000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:44.467000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:44.467000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:44.467000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(2560x2560, 2560x2560) | |
triton_mm_persistent_tma_286 1.0383 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_285 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_287 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_288 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4 | |
triton_mm_persistent_tma_289 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=True, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1597 seconds and 0.5892 seconds precompiling for 5 choices | |
/workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmpdxhdeq4z' mode='r' encoding='UTF-8'> | |
for evt in json.load(open(f.name))["traceEvents"] | |
ResourceWarning: Enable tracemalloc to get the object allocation traceback | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 7), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2560_2560_2560', 1)] | |
.E0509 19:03:45.522000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:45.522000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:45.522000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:45.527000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:45.527000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:45.527000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:45.577000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:45.577000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:45.577000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:45.581000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:45.581000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147488, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:45.581000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:45.586000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:45.586000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 180264, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:45.586000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE scaled_mm(2560x2560, 2560x2560, , ) | |
triton_scaled_mm_device_tma_298 0.9646 ms 100.0% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8 | |
triton_scaled_mm_device_tma_290 0.9656 ms 99.9% ACC_TYPE='tl.float32', BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8 | |
triton_scaled_mm_device_tma_291 1.1715 ms 82.3% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=8 | |
triton_scaled_mm_device_tma_294 1.4828 ms 65.1% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=3, num_warps=4 | |
triton_scaled_mm_device_tma_292 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=8 | |
triton_scaled_mm_device_tma_293 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=4, num_warps=4 | |
triton_scaled_mm_device_tma_295 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=4 | |
triton_scaled_mm_device_tma_296 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=5, num_warps=8 | |
triton_scaled_mm_device_tma_297 inf ms 0.0% ACC_TYPE='tl.float32', BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, NUM_SMS=43, SCALING_ROWWISE=False, TMA_SIZE=128, USE_FAST_ACCUM=False, num_stages=6, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.3033 seconds and 0.5743 seconds precompiling for 9 choices | |
/workspace/pytorch/test/inductor/test_max_autotune.py:545: ResourceWarning: unclosed file <_io.TextIOWrapper name='/tmp/tmp_e__0eio' mode='r' encoding='UTF-8'> | |
for evt in json.load(open(f.name))["traceEvents"] | |
ResourceWarning: Enable tracemalloc to get the object allocation traceback | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 11), ('select_algorithm_num_precompiles', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten._scaled_mm.default_2560_2560_2560', 1)] | |
.frames [('total', 1)] | |
stats [('calls_captured', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_16_16_16', 1)] | |
.frames [('total', 1)] | |
stats [('calls_captured', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_16_16_16', 1)] | |
.W0509 19:03:46.433000 434429 torch/_export/__init__.py:67] +============================+ | |
W0509 19:03:46.434000 434429 torch/_export/__init__.py:68] | !!! WARNING !!! | | |
W0509 19:03:46.434000 434429 torch/_export/__init__.py:69] +============================+ | |
W0509 19:03:46.434000 434429 torch/_export/__init__.py:70] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. | |
E0509 19:03:47.184000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:47.184000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196608, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:47.184000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:47.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:47.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:47.242000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:47.302000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:47.302000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:47.302000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:47.381000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:47.381000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:47.381000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:03:47.385000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:03:47.385000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:47.385000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(256x256, 256x256) | |
triton_mm_311 0.0058 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_310 0.0059 ms 98.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_312 0.0059 ms 98.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
mm 0.0076 ms 76.4% | |
triton_mm_316 0.0077 ms 75.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_319 0.0114 ms 50.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_315 0.0116 ms 50.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_320 0.0116 ms 49.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_318 0.0117 ms 49.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_323 0.0117 ms 49.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.3755 seconds and 0.4873 seconds precompiling for 20 choices | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 26), ('async_compile_cache_miss', 23), ('select_algorithm_num_precompiles', 19), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_256_256_256', 1)] | |
.sCompiled module path: /tmp/tmpyumonn_d/kf/ckf24tdynyym2ganzaqomb25tiktyvf3wzxn5e5qf53scj65g2w3.py | |
Compiled module path: /tmp/tmpyumonn_d/7w/c7w2sojsp5z6u6hlqvzlw4ghmkbikuq6sflwfr4myo77rhxygnrz.py | |
frames [('total', 96), ('ok', 92)] | |
inline_call [] | |
unimplemented [('Attempt to trace generator\n Explanation: Generators cannot be compiled directly with `torch.compile`.\n Hint: Call a generator from inside of a non-generator Python function and compile that function instead.\n Hint: This graph break is fundamental - it is unlikely that Dynamo will ever be able to trace through your code. Consider finding a workaround.\n\n Developer debug context: \n', 4)] | |
graph_break [('Tensor.backward', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1), ('autograd_cache_saved', 1)] | |
inductor [('extern_calls', 5), ('fxgraph_cache_miss', 2)] | |
aten_mm_info [('aten.mm_128_128_128', 2)] | |
.E0509 19:03:52.871000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Runtime error during autotuning: | |
E0509 19:03:52.871000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:52.871000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Ignoring this choice. | |
E0509 19:03:52.875000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Runtime error during autotuning: | |
E0509 19:03:52.875000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:52.875000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Ignoring this choice. | |
AUTOTUNE mm(128x128, 128x128) | |
triton_mm_330 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_331 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
mm 0.0058 ms 70.3% | |
triton_mm_335 0.0058 ms 70.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_329 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_332 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_336 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_334 0.0075 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_337 0.0078 ms 52.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_338 0.0079 ms 52.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.4238 seconds and 0.4223 seconds precompiling for 20 choices | |
Compiled module path: /tmp/tmpq56gtiip/zz/czz4ecoozipcry2vnodb4iw5zfsawonb6iza5re7zvzy32bkehs6.py | |
E0509 19:03:53.868000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Runtime error during autotuning: | |
E0509 19:03:53.868000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:53.868000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Ignoring this choice. | |
E0509 19:03:53.872000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Runtime error during autotuning: | |
E0509 19:03:53.872000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:03:53.872000 434429 torch/_inductor/select_algorithm.py:2100] [0/0_1] Ignoring this choice. | |
AUTOTUNE mm(128x128, 128x128) | |
triton_mm_348 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_349 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_350 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
mm 0.0055 ms 74.0% | |
triton_mm_354 0.0056 ms 72.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_351 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_355 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_357 0.0075 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_360 0.0075 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_361 0.0076 ms 54.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.3251 seconds and 0.3140 seconds precompiling for 20 choices | |
Compiled module path: /tmp/tmpq56gtiip/y4/cy4phkkchieez3q5cck2pcmpqw4o3goqhrjlltrh5q2u7ga2cryr.py | |
frames [('total', 5), ('ok', 5)] | |
inline_call [] | |
unimplemented [] | |
graph_break [('Tensor.backward', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1), ('autograd_cache_saved', 1)] | |
inductor [('triton_bundler_save_kernel', 322), ('async_compile_cache_miss', 45), ('benchmarking.InductorBenchmarker.benchmark_gpu', 44), ('select_algorithm_num_precompiles', 38), ('extern_calls', 3), ('fxgraph_cache_miss', 2), ('select_algorithm_precompile', 2), ('benchmarking.InductorBenchmarker.benchmark', 2), ('select_algorithm_autotune', 2), ('async_compile_cache_hit', 2)] | |
aten_mm_info [('aten.mm_128_128_128', 2)] | |
.AUTOTUNE addmm(100x100, 100x10, 10x100) | |
triton_mm_367 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_368 0.0034 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_366 0.0035 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_373 0.0036 ms 92.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_369 0.0037 ms 91.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_371 0.0037 ms 90.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_370 0.0038 ms 89.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_372 0.0041 ms 82.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_375 0.0041 ms 82.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_376 0.0041 ms 82.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
SubProcess AUTOTUNE benchmarking takes 0.8253 seconds and 0.2758 seconds precompiling for 18 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('async_compile_cache_miss', 18), ('select_algorithm_num_precompiles', 16), ('triton_bundler_save_kernel', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('benchmarking.InductorBenchmarker.benchmark', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_100_100_10', 1)] | |
.AUTOTUNE addmm(100x100, 100x10, 10x100) | |
triton_mm_383 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_384 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_386 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_388 0.0049 ms 84.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_387 0.0051 ms 80.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_389 0.0054 ms 75.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_385 0.0055 ms 74.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_391 0.0057 ms 72.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_382 0.0058 ms 70.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_390 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
SubProcess AUTOTUNE benchmarking takes 3.3739 seconds and 0.4662 seconds precompiling for 18 choices | |
frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('async_compile_cache_miss', 18), ('select_algorithm_num_precompiles', 16), ('triton_bundler_save_kernel', 7), ('benchmarking.InductorBenchmarker.benchmark', 2), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_s1_s1_s0', 1)] | |
.E0509 19:04:02.553000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:02.553000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:02.553000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:02.580000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:02.580000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 139792, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:02.580000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:02.585000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:02.585000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 139792, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:02.585000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:02.589000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:02.589000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 107032, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:02.589000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE addmm(168x248, 168x88, 88x248) | |
triton_mm_persistent_tma_418 0.0100 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_417 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_419 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_420 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4 | |
triton_mm_persistent_tma_421 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1399 seconds and 0.5961 seconds precompiling for 5 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 84), ('benchmarking.InductorBenchmarker.benchmark_gpu', 11), ('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_168_248_88', 1)] | |
.E0509 19:04:03.956000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:03.956000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:03.956000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
Eframes [('total', 1)] | |
stats [('calls_captured', 4)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_8*s1_8*s0_8*s2', 1)] | |
E0509 19:04:04.819000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:04.819000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:04.819000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:04.850000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:04.850000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:04.850000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:04.855000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:04.855000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:04.855000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:04.859000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:04.859000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:04.859000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE addmm(168x248, 168x88, 88x248) | |
triton_mm_persistent_tma_466 0.0119 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_465 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_467 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_468 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4 | |
triton_mm_persistent_tma_469 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1379 seconds and 0.6032 seconds precompiling for 5 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 84), ('benchmarking.InductorBenchmarker.benchmark_gpu', 11), ('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_168_248_88', 1)] | |
.E0509 19:04:06.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:06.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:06.158000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
Eframes [('total', 1)] | |
stats [('calls_captured', 5)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_8*s1_8*s0_8*s2', 1)] | |
E0509 19:04:07.131000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:07.131000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:07.131000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:07.162000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:07.162000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:07.162000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:07.167000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:07.167000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:07.167000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:07.171000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:07.171000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:07.171000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE addmm(168x248, 168x88, 88x248) | |
triton_mm_persistent_tma_514 0.0117 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_513 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_515 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_516 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4 | |
triton_mm_persistent_tma_517 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1419 seconds and 0.7292 seconds precompiling for 5 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 84), ('benchmarking.InductorBenchmarker.benchmark_gpu', 11), ('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_168_248_88', 1)] | |
.E0509 19:04:08.692000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:08.692000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:08.692000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
Eframes [('total', 1)] | |
stats [('calls_captured', 5)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_8*s2_8*s0_8*s1', 1)] | |
E0509 19:04:09.642000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:09.642000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147472, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:09.642000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:09.671000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:09.671000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196624, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:09.671000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:09.678000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:09.678000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196624, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:09.678000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:09.682000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:09.682000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131096, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:09.682000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE addmm(168x248, 168x88, 88x248) | |
triton_mm_persistent_tma_562 0.0117 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_561 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_563 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_564 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4 | |
triton_mm_persistent_tma_565 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1404 seconds and 0.6975 seconds precompiling for 5 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 84), ('benchmarking.InductorBenchmarker.benchmark_gpu', 11), ('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_168_248_88', 1)] | |
.E0509 19:04:10.979000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:10.979000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147472, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:10.979000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
Eframes [('total', 1)] | |
stats [('calls_captured', 6)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_8*s2_8*s0_8*s1', 1)] | |
frames [('total', 1)] | |
stats [('calls_captured', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_21_31_11', 1)] | |
.frames [('total', 1)] | |
stats [('calls_captured', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_s2_s1_s0', 1)] | |
.frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_0_100_10', 1)] | |
.frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_0_s1_s0', 1)] | |
.AUTOTUNE mm_plus_mm(2048x64, 64x1536, 2048x64, 64x1536) | |
triton_mm_plus_mm_600 0.0287 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_plus_mm_601 0.0302 ms 94.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_plus_mm_606 0.0304 ms 94.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_plus_mm_602 0.0362 ms 79.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 | |
triton_mm_plus_mm_607 0.0365 ms 78.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_plus_mm_605 0.0407 ms 70.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 | |
triton_mm_plus_mm_604 0.0423 ms 67.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_plus_mm_603 0.0426 ms 67.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
_mm_plus_mm 0.0448 ms 64.0% | |
SingleProcess AUTOTUNE benchmarking takes 0.2765 seconds and 0.1758 seconds precompiling for 9 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 63), ('async_compile_cache_miss', 10), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 8), ('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [] | |
.AUTOTUNE mm_plus_mm(2048x64, 64x1536, 2048x64, 64x1536) | |
triton_mm_plus_mm_608 0.0292 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_plus_mm_609 0.0304 ms 96.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_plus_mm_614 0.0307 ms 95.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_plus_mm_610 0.0365 ms 79.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 | |
triton_mm_plus_mm_615 0.0368 ms 79.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_plus_mm_613 0.0406 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 | |
triton_mm_plus_mm_612 0.0410 ms 71.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_plus_mm_611 0.0427 ms 68.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
_mm_plus_mm 0.0447 ms 65.2% | |
SingleProcess AUTOTUNE benchmarking takes 0.2755 seconds and 0.1820 seconds precompiling for 9 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 63), ('async_compile_cache_miss', 10), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 8), ('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [] | |
.AUTOTUNE mm_plus_mm(2048x64, 64x1536, 2048x64, 64x1536) | |
triton_mm_plus_mm_616 0.0280 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_plus_mm_617 0.0286 ms 97.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_plus_mm_622 0.0297 ms 94.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_plus_mm_618 0.0359 ms 77.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 | |
triton_mm_plus_mm_623 0.0382 ms 73.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_plus_mm_621 0.0409 ms 68.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 | |
triton_mm_plus_mm_620 0.0420 ms 66.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_plus_mm_619 0.0427 ms 65.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
_mm_plus_mm 0.0447 ms 62.5% | |
SubProcess AUTOTUNE benchmarking takes 1.2338 seconds and 0.1758 seconds precompiling for 9 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 8), ('triton_bundler_save_kernel', 7), ('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('benchmarking.InductorBenchmarker.benchmark_gpu', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [] | |
.AUTOTUNE mm_plus_mm(2048x64, 64x1536, 2048x64, 64x1536) | |
triton_mm_plus_mm_624 0.0281 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_plus_mm_625 0.0286 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_plus_mm_630 0.0291 ms 96.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_plus_mm_626 0.0364 ms 77.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=16 | |
triton_mm_plus_mm_631 0.0371 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_plus_mm_629 0.0399 ms 70.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=8 | |
triton_mm_plus_mm_628 0.0423 ms 66.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_plus_mm_627 0.0424 ms 66.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
_mm_plus_mm 0.0450 ms 62.4% | |
SubProcess AUTOTUNE benchmarking takes 0.1845 seconds and 0.1787 seconds precompiling for 9 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 8), ('triton_bundler_save_kernel', 7), ('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('benchmarking.InductorBenchmarker.benchmark_gpu', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [] | |
.frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_0_1536_64', 2)] | |
.frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_0_s1_s0', 2)] | |
.AUTOTUNE mm(100x10, 10x100) | |
triton_mm_633 0.0031 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_636 0.0035 ms 89.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_635 0.0036 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_632 0.0036 ms 86.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_634 0.0036 ms 86.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_638 0.0036 ms 86.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_637 0.0038 ms 83.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_639 0.0038 ms 82.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_640 0.0041 ms 77.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_641 0.0041 ms 76.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
SubProcess AUTOTUNE benchmarking takes 1.9723 seconds and 0.1850 seconds precompiling for 17 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 19), ('select_algorithm_num_precompiles', 16), ('benchmarking.InductorBenchmarker.benchmark_gpu', 7), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_100_100_10', 1)] | |
.AUTOTUNE mm(100x10, 10x100) | |
triton_mm_652 0.0037 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_650 0.0038 ms 97.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_648 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_649 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_651 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_653 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_654 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_655 0.0041 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_658 0.0047 ms 78.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_660 0.0051 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
SubProcess AUTOTUNE benchmarking takes 2.7352 seconds and 0.3131 seconds precompiling for 17 choices | |
frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 19), ('select_algorithm_num_precompiles', 16), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_s0_s0_s1', 1)] | |
.E0509 19:04:22.750000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:22.750000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:22.750000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:22.778000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:22.778000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 139792, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:22.778000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:22.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:22.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 139792, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:22.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:22.790000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:22.790000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 107032, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:22.790000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(168x88, 88x248) | |
triton_mm_persistent_tma_684 0.0097 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_683 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_685 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_686 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4 | |
triton_mm_persistent_tma_687 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1341 seconds and 0.5370 seconds precompiling for 5 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_168_248_88', 1)] | |
.E0509 19:04:24.023000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:24.023000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:24.023000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
Eframes [('total', 1)] | |
stats [('calls_captured', 3)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_8*s0_8*s2_8*s1', 1)] | |
E0509 19:04:24.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:24.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:24.785000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:24.809000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:24.809000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:24.809000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:24.813000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:24.813000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:24.813000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:24.818000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:24.818000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:24.818000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(168x88, 88x248) | |
triton_mm_persistent_tma_732 0.0102 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_731 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_733 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_734 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4 | |
triton_mm_persistent_tma_735 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1279 seconds and 0.5386 seconds precompiling for 5 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_168_248_88', 1)] | |
.E0509 19:04:26.040000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:26.040000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131088, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:26.040000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
Eframes [('total', 1)] | |
stats [('calls_captured', 4)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_8*s0_8*s2_8*s1', 1)] | |
E0509 19:04:26.887000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:26.887000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:26.887000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:26.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:26.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:26.915000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:26.922000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:26.922000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 163856, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:26.922000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:26.926000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:26.926000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114712, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:26.926000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(168x88, 88x248) | |
triton_mm_persistent_tma_780 0.0100 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_779 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_781 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_782 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4 | |
triton_mm_persistent_tma_783 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=True, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1326 seconds and 0.6240 seconds precompiling for 5 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_168_248_88', 1)] | |
.E0509 19:04:28.267000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:28.267000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 115216, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:28.267000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
Eframes [('total', 1)] | |
stats [('calls_captured', 4)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_8*s1_8*s2_8*s0', 1)] | |
E0509 19:04:29.132000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:29.132000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147472, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:29.132000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:29.159000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:29.159000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196624, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:29.159000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:29.164000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:29.164000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196624, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:29.164000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:29.169000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:29.169000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131096, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:29.169000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(168x88, 88x248) | |
triton_mm_persistent_tma_828 0.0116 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_827 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=256, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_829 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=8 | |
triton_mm_persistent_tma_830 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=3, num_warps=4 | |
triton_mm_persistent_tma_831 inf ms 0.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, A_ROW_MAJOR=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_ROW_MAJOR=False, EVEN_K=False, GROUP_M=8, NUM_SMS=43, TMA_SIZE=128, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1342 seconds and 0.6350 seconds precompiling for 5 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 9), ('benchmarking.InductorBenchmarker.benchmark_gpu', 9), ('select_algorithm_num_precompiles', 5), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_168_248_88', 1)] | |
.E0509 19:04:30.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:30.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147472, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:30.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
Eframes [('total', 1)] | |
stats [('calls_captured', 5)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('async_compile_cache_miss', 5), ('select_algorithm_num_precompiles', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_8*s1_8*s2_8*s0', 1)] | |
frames [('total', 1)] | |
stats [('calls_captured', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_21_31_11', 1)] | |
.frames [('total', 1)] | |
stats [('calls_captured', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_s2_s1_s0', 1)] | |
.frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_0_100_10', 1)] | |
.frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_0_s1_s0', 1)] | |
.W0509 19:04:30.598000 434429 torch/_inductor/kernel/mm_common.py:447] [0/0] No choices for GEMM, chose not to fallback to ATen backend. To temporarily change this behavior, set autotune_fallback_to_aten to True via TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN=1, but this knob is being deprecated. The long term fix is to include Aten in max_autotune_gemm_backends. | |
frames [('total', 1)] | |
stats [('calls_captured', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('not_ok', 1)] | |
inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 1)] | |
.E0509 19:04:36.188000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:36.188000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114688, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:36.188000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE addmm(50257x768, 50257x32768, 32768x768) | |
bias_addmm 27.0531 ms 100.0% | |
addmm 27.2312 ms 99.3% | |
triton_mm_882 28.6515 ms 94.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_875 29.2564 ms 92.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_883 31.4952 ms 85.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_872 31.8126 ms 85.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_877 31.9027 ms 84.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_876 34.2344 ms 79.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_871 34.5999 ms 78.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_880 34.7853 ms 77.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 5.9978 seconds and 0.3724 seconds precompiling for 21 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 133), ('benchmarking.InductorBenchmarker.benchmark_gpu', 23), ('async_compile_cache_miss', 19), ('select_algorithm_num_precompiles', 19), ('benchmarking.InductorBenchmarker.benchmark', 2), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_50257_768_32768', 1)] | |
.E0509 19:04:43.404000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:43.404000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114688, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:43.404000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE bmm(1x50257x32768, 1x32768x768) | |
bmm 27.0459 ms 100.0% | |
triton_bmm_901 28.0566 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_bmm_895 28.9188 ms 93.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_bmm_894 29.8875 ms 90.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_bmm_892 30.8091 ms 87.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_bmm_902 31.2515 ms 86.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_bmm_891 32.2224 ms 83.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_bmm_890 32.4854 ms 83.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_bmm_898 33.7879 ms 80.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_bmm_896 34.5201 ms 78.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 5.7565 seconds and 0.3608 seconds precompiling for 20 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 133), ('benchmarking.InductorBenchmarker.benchmark_gpu', 22), ('async_compile_cache_miss', 19), ('select_algorithm_num_precompiles', 19), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.bmm_50257_768_32768', 1)] | |
.E0509 19:04:50.648000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:50.648000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 114688, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:50.648000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(50257x32768, 32768x768) | |
mm 27.0220 ms 100.0% | |
triton_mm_920 28.6218 ms 94.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_913 29.2509 ms 92.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_917 30.7866 ms 87.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_921 31.4286 ms 86.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_910 31.7901 ms 85.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_915 31.8812 ms 84.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_914 34.2467 ms 78.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_918 34.8518 ms 77.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_909 35.6383 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 5.8182 seconds and 0.3117 seconds precompiling for 20 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 133), ('benchmarking.InductorBenchmarker.benchmark_gpu', 22), ('async_compile_cache_miss', 19), ('select_algorithm_num_precompiles', 19), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('benchmarking.InductorBenchmarker.benchmark', 1), ('select_algorithm_autotune', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_50257_768_32768', 1)] | |
.Einductor [('select_algorithm_num_precompiles', 10), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
.E0509 19:04:53.503000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:53.503000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196608, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:53.503000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:53.562000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:53.562000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:53.562000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:53.620000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:53.620000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:53.620000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:53.698000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:53.698000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:53.698000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:53.702000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:53.702000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:53.702000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(256x256, 256x256) | |
triton_mm_944 0.0066 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_943 0.0076 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_945 0.0078 ms 84.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
mm 0.0082 ms 80.9% | |
triton_mm_949 0.0082 ms 80.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_951 0.0119 ms 55.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_952 0.0119 ms 55.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_956 0.0120 ms 55.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_948 0.0123 ms 53.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_953 0.0123 ms 53.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.3827 seconds and 0.3606 seconds precompiling for 20 choices | |
E0509 19:04:54.134000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:54.134000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 196608, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:54.134000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:54.193000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:54.193000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:54.193000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:54.251000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:54.251000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:54.251000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:54.328000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:54.328000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:54.328000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:54.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:54.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:54.332000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(256x256, 256x256) | |
triton_mm_925 0.0075 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_926 0.0075 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_924 0.0076 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
mm 0.0079 ms 94.7% | |
triton_mm_930 0.0082 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_933 0.0118 ms 63.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_932 0.0119 ms 63.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_934 0.0123 ms 61.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_929 0.0123 ms 60.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_936 0.0123 ms 60.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.2869 seconds and 0.0001 seconds precompiling for 20 choices | |
E0509 19:04:54.718000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:54.718000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:54.718000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:54.775000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:54.775000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147456, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:54.775000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:54.869000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:54.869000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:54.869000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(256x256, 256x256) | |
triton_mm_964 0.0055 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_965 0.0056 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_962 0.0057 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_963 0.0059 ms 94.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
mm 0.0061 ms 90.1% | |
triton_mm_968 0.0061 ms 90.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_971 0.0075 ms 73.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_972 0.0076 ms 73.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_975 0.0077 ms 72.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_970 0.0082 ms 67.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.3099 seconds and 0.0001 seconds precompiling for 20 choices | |
E0509 19:04:55.039000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:55.039000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:55.039000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:55.101000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:55.101000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 147456, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:55.101000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
E0509 19:04:55.200000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Runtime error during autotuning: | |
E0509 19:04:55.200000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] No valid triton configs. OutOfResources: out of resource: shared memory, Required: 131072, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.. | |
E0509 19:04:55.200000 434429 torch/_inductor/select_algorithm.py:2100] [0/0] Ignoring this choice. | |
AUTOTUNE mm(256x256, 256x256) | |
triton_mm_982 0.0051 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_981 0.0058 ms 88.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_983 0.0058 ms 87.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_984 0.0059 ms 86.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 | |
mm 0.0061 ms 82.8% | |
triton_mm_987 0.0061 ms 82.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_991 0.0076 ms 67.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_990 0.0077 ms 66.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_994 0.0077 ms 66.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_993 0.0079 ms 64.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.3306 seconds and 0.0001 seconds precompiling for 20 choices | |
frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
unimplemented [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 637), ('benchmarking.InductorBenchmarker.benchmark_gpu', 89), ('async_compile_cache_miss', 49), ('select_algorithm_num_precompiles', 38), ('benchmarking.InductorBenchmarker.benchmark', 4), ('select_algorithm_autotune', 4), ('select_algorithm_precompile', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_256_256_256', 4)] | |
.frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
unimplemented [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_256_256_256', 4)] | |
.AUTOTUNE mm(5x4, 4x3) | |
triton_mm_1075 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1 | |
triton_mm_1076 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1 | |
triton_mm_1077 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1 | |
triton_mm_1079 0.0036 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1 | |
triton_mm_1078 0.0037 ms 91.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1 | |
SubProcess AUTOTUNE benchmarking takes 0.6042 seconds and 0.0867 seconds precompiling for 5 choices | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 8), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('select_algorithm_num_precompiles', 5), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_s1_s2_s0', 1)] | |
.AUTOTUNE mm(100x10, 10x100) | |
triton_mm_1081 0.0031 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1084 0.0033 ms 92.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1080 0.0034 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1082 0.0034 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1083 0.0037 ms 82.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1085 0.0037 ms 82.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1089 0.0038 ms 81.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_1086 0.0038 ms 80.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_1087 0.0038 ms 80.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1088 0.0041 ms 75.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.3161 seconds and 1.0394 seconds precompiling for 17 choices | |
AUTOTUNE mm(100x10, 10x100) | |
triton_mm_1096 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1100 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1097 0.0036 ms 97.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1099 0.0036 ms 96.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1098 0.0036 ms 95.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1101 0.0037 ms 94.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1103 0.0037 ms 94.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1102 0.0038 ms 91.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_1104 0.0041 ms 85.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_1105 0.0041 ms 85.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.3056 seconds and 0.9424 seconds precompiling for 17 choices | |
AUTOTUNE mm(100x10, 10x100) | |
triton_mm_1119 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1116 0.0034 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1115 0.0035 ms 98.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1112 0.0035 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1113 0.0035 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1118 0.0035 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_1114 0.0036 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1117 0.0037 ms 90.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1122 0.0038 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_1120 0.0041 ms 82.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.3135 seconds and 0.7753 seconds precompiling for 17 choices | |
AUTOTUNE mm(100x10, 10x100) | |
triton_mm_1130 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1128 0.0034 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1135 0.0034 ms 99.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1129 0.0035 ms 95.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1132 0.0036 ms 92.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1131 0.0037 ms 90.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1133 0.0038 ms 89.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1137 0.0038 ms 89.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8 | |
triton_mm_1138 0.0038 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_1134 0.0040 ms 84.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.3112 seconds and 1.1342 seconds precompiling for 17 choices | |
AUTOTUNE mm(100x10, 10x100) | |
triton_mm_1144 0.0029 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1147 0.0035 ms 83.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1148 0.0036 ms 82.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1146 0.0036 ms 81.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1145 0.0036 ms 79.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1149 0.0038 ms 77.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1151 0.0038 ms 77.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1152 0.0038 ms 75.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_1154 0.0040 ms 72.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_1150 0.0041 ms 71.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.3073 seconds and 1.0976 seconds precompiling for 17 choices | |
Cache Stats: | |
autotune_local: puts: 0, misses: 0, hits: 0, | |
autotune_remote: puts: 2, misses: 2, hits: 3, | |
bundled_autotune: puts: 0, misses: 0, hits: 0, | |
fx_graph: puts: 0, misses: 0, hits: 0, | |
triton: puts: 0, misses: 0, hits: 0, | |
aot_autograd: puts: 0, misses: 0, hits: 0, | |
dynamo_pgo: puts: 0, misses: 0, hits: 0, | |
Cache Entries: | |
autotune_remote: | |
'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::51358a087e4794b75273376732b5ed9b0fe534dff4bfcda33e6a91b52c4cd428:c1': b'{"XBLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1... | |
'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::eb21653aa258b18c3119d2ff55ce0f124ed954d0df0253f8e37c5c036f80a418:c1': b'{"XBLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1... | |
Cache Stats: | |
autotune_local: puts: 0, misses: 0, hits: 0, | |
autotune_remote: puts: 2, misses: 2, hits: 3, | |
bundled_autotune: puts: 0, misses: 0, hits: 0, | |
fx_graph: puts: 0, misses: 0, hits: 0, | |
triton: puts: 0, misses: 0, hits: 0, | |
aot_autograd: puts: 0, misses: 0, hits: 0, | |
dynamo_pgo: puts: 0, misses: 0, hits: 0, | |
Cache Entries: | |
autotune_remote: | |
'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::2104d7bc884978e3985bdb2c8f01e511e320b9198192788d15ae25e1469d3425:c1': b'{"XBLOCK": 256, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1... | |
'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::e53bbadf30ab7b008055c8eed484fc260c45957d83de810cbd62287008666588:c1': b'{"XBLOCK": 256, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1... | |
frames [('total', 10), ('ok', 10)] | |
stats [('calls_captured', 20), ('unique_graphs', 10)] | |
aot_autograd [('total', 10), ('autograd_cache_bypass', 10), ('ok', 10)] | |
inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 105), ('select_algorithm_num_precompiles', 80), ('async_compile_cache_miss', 15), ('select_algorithm_precompile', 5), ('benchmarking.InductorBenchmarker.benchmark', 5), ('select_algorithm_autotune', 5), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_100_100_10', 6)] | |
inline_call [] | |
.AUTOTUNE mm(100x10, 10x100) | |
triton_mm_1161 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1160 0.0036 ms 96.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1162 0.0036 ms 94.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1164 0.0038 ms 91.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1163 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1165 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1166 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_1167 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1168 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_1170 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.2971 seconds and 1.4003 seconds precompiling for 17 choices | |
AUTOTUNE mm(100x10, 10x100) | |
triton_mm_1177 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1180 0.0037 ms 95.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1179 0.0038 ms 94.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1176 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1178 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1181 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1182 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_1183 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1186 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_1187 0.0054 ms 65.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.2962 seconds and 0.9495 seconds precompiling for 17 choices | |
AUTOTUNE mm(100x10, 10x100) | |
triton_mm_1193 0.0038 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1194 0.0038 ms 99.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1196 0.0038 ms 99.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1195 0.0038 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1192 0.0041 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1197 0.0041 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1198 0.0041 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_1199 0.0041 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1200 0.0041 ms 92.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
triton_mm_1202 0.0041 ms 91.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.2969 seconds and 1.4035 seconds precompiling for 17 choices | |
AUTOTUNE mm(100x10, 10x100) | |
triton_mm_1209 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1210 0.0037 ms 93.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1211 0.0038 ms 91.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1208 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1212 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1213 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1214 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_1215 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1218 0.0041 ms 84.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_1219 0.0054 ms 63.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.3139 seconds and 1.3179 seconds precompiling for 17 choices | |
AUTOTUNE mm(100x10, 10x100) | |
triton_mm_1227 0.0037 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1225 0.0038 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1226 0.0038 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1228 0.0038 ms 97.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1224 0.0041 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1229 0.0041 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1230 0.0041 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 | |
triton_mm_1231 0.0041 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1234 0.0041 ms 91.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=4 | |
triton_mm_1235 0.0054 ms 68.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 | |
SingleProcess AUTOTUNE benchmarking takes 0.2981 seconds and 1.5387 seconds precompiling for 17 choices | |
Cache Stats: | |
autotune_local: puts: 0, misses: 0, hits: 0, | |
autotune_remote: puts: 2, misses: 2, hits: 3, | |
bundled_autotune: puts: 0, misses: 0, hits: 0, | |
fx_graph: puts: 0, misses: 0, hits: 0, | |
triton: puts: 0, misses: 0, hits: 0, | |
aot_autograd: puts: 0, misses: 0, hits: 0, | |
dynamo_pgo: puts: 0, misses: 0, hits: 0, | |
Cache Entries: | |
autotune_remote: | |
'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::103bad2bba162792a246553a9660490b13c42cb9fc556ee026a2ee76225c5e39:c1': b'{"XBLOCK": 256, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1... | |
'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::75d7fc095dca95edbe9b36a582f8a7a52af37ea17ffa0cf4b1f698f3da660974:c1': b'{"XBLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1... | |
Cache Stats: | |
autotune_local: puts: 0, misses: 0, hits: 0, | |
autotune_remote: puts: 2, misses: 2, hits: 3, | |
bundled_autotune: puts: 0, misses: 0, hits: 0, | |
fx_graph: puts: 0, misses: 0, hits: 0, | |
triton: puts: 0, misses: 0, hits: 0, | |
aot_autograd: puts: 0, misses: 0, hits: 0, | |
dynamo_pgo: puts: 0, misses: 0, hits: 0, | |
Cache Entries: | |
autotune_remote: | |
'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::74d8befde4020dffc0fb6b69ebb0e418c3f11ac8bc30ece94f24cd314c0d60da:c1': b'{"XBLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1... | |
'pt2:de08cb2292af23d436d8389fec3ec308647c494f61372e4d803a3cd48bee1827::945b170871dfe60acaf2756015ad6acb1a693ea876400ff39314054d40efc61c:c1': b'{"XBLOCK": 128, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b1... | |
frames [('total', 40), ('ok', 40)] | |
stats [('calls_captured', 20), ('unique_graphs', 10)] | |
aot_autograd [('total', 10), ('autograd_cache_bypass', 10), ('ok', 10)] | |
inductor [('benchmarking.InductorBenchmarker.benchmark_gpu', 93), ('select_algorithm_num_precompiles', 80), ('async_compile_cache_miss', 15), ('select_algorithm_precompile', 5), ('benchmarking.InductorBenchmarker.benchmark', 5), ('select_algorithm_autotune', 5), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_s0_s0_s1', 6)] | |
inline_call [] | |
.AUTOTUNE mm(1x63, 63x128) | |
triton_mm_1243 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=2 | |
triton_mm_1241 0.0051 ms 80.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=2 | |
triton_mm_1242 0.0056 ms 73.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1240 0.0072 ms 57.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1558 seconds and 0.1524 seconds precompiling for 4 choices | |
frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_1_s1_s0', 1)] | |
.AUTOTUNE mm(1x64, 64x128) | |
triton_mm_1245 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=2 | |
triton_mm_1246 0.0057 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 | |
triton_mm_1247 0.0057 ms 71.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=2 | |
triton_mm_1244 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=16, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1575 seconds and 0.1463 seconds precompiling for 4 choices | |
frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_1_s1_s0', 1)] | |
.AUTOTUNE mm(20x1, 1x1) | |
triton_mm_1251 0.0028 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=2 | |
triton_mm_1250 0.0034 ms 84.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=2 | |
triton_mm_1248 0.0035 ms 82.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1249 0.0035 ms 81.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1526 seconds and 0.0779 seconds precompiling for 4 choices | |
frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_s0_1_1', 1)] | |
.AUTOTUNE mm(64x128, 128x256) | |
triton_mm_1254 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1255 0.0038 ms 94.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1253 0.0041 ms 87.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1252 0.0076 ms 47.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1604 seconds and 0.0812 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('select_algorithm_num_precompiles', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_64_256_128', 1)] | |
.sAUTOTUNE mm(128x128, 128x128) | |
triton_mm_1259 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1257 0.0037 ms 94.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1258 0.0041 ms 87.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1256 0.0061 ms 57.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1626 seconds and 0.0937 seconds precompiling for 4 choices | |
Eframes [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 5), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 9), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 2), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_128_128_128', 2)] | |
AUTOTUNE mm(256x256, 256x128) | |
triton_mm_1265 0.0058 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1266 0.0059 ms 98.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1267 0.0061 ms 94.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1264 0.0123 ms 47.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1550 seconds and 0.1321 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('select_algorithm_num_precompiles', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_256_128_256', 1)] | |
.AUTOTUNE mm(64x128, 128x256) | |
triton_mm_1271 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1270 0.0056 ms 73.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1269 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1268 0.0082 ms 50.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1526 seconds and 0.1262 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_64_256_128', 1)] | |
.AUTOTUNE mm(64x64, 64x64) | |
triton_mm_1275 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1274 0.0040 ms 89.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1273 0.0041 ms 86.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1272 0.0058 ms 61.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1591 seconds and 0.1152 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_64_64_64', 1)] | |
.AUTOTUNE mm(64x120, 120x64) | |
triton_mm_1278 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1279 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1277 0.0073 ms 55.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1276 0.0078 ms 52.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1522 seconds and 0.1398 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_64_64_120', 1)] | |
.AUTOTUNE mm(64x128, 128x256) | |
triton_mm_1283 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1282 0.0055 ms 74.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1281 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1280 0.0082 ms 50.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1560 seconds and 0.1241 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_64_256_128', 1)] | |
.AUTOTUNE mm(128x128, 128x128) | |
triton_mm_1287 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1286 0.0055 ms 74.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1285 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1284 0.0079 ms 51.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1569 seconds and 0.1299 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_128_128_128', 1)] | |
.AUTOTUNE mm(63x120, 120x250) | |
triton_mm_1291 0.0050 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1290 0.0054 ms 91.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1289 0.0058 ms 85.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1288 0.0098 ms 51.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1597 seconds and 0.1381 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_63_250_120', 1)] | |
.AUTOTUNE mm(128x128, 128x128) | |
triton_mm_1295 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1293 0.0055 ms 74.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1294 0.0055 ms 74.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1292 0.0075 ms 54.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1517 seconds and 0.1158 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 147), ('benchmarking.InductorBenchmarker.benchmark_gpu', 17), ('async_compile_cache_miss', 12), ('select_algorithm_num_precompiles', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_128_128_128', 1)] | |
.AUTOTUNE mm(128x32, 32x128) | |
triton_mm_1301 0.0034 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1302 0.0038 ms 89.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1300 0.0038 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1303 0.0038 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.1504 seconds and 0.0751 seconds precompiling for 4 choices | |
AUTOTUNE mm(128x16, 16x128) | |
triton_mm_1296 0.0033 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
triton_mm_1297 0.0034 ms 96.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1298 0.0038 ms 86.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1299 0.0038 ms 86.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
SingleProcess AUTOTUNE benchmarking takes 0.0811 seconds and 0.0003 seconds precompiling for 4 choices | |
AUTOTUNE mm(128x128, 128x128) | |
triton_mm_1306 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1307 0.0054 ms 75.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1305 0.0059 ms 69.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1304 0.0076 ms 53.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.0714 seconds and 0.1176 seconds precompiling for 4 choices | |
frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 7), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 203), ('benchmarking.InductorBenchmarker.benchmark_gpu', 25), ('async_compile_cache_miss', 23), ('select_algorithm_num_precompiles', 12), ('select_algorithm_precompile', 3), ('select_algorithm_autotune', 3), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_128_128_16', 1), ('aten.mm_128_128_32', 1), ('aten.mm_128_128_128', 1)] | |
.AUTOTUNE mm(64x127, 127x64) | |
triton_mm_1309 0.0054 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1311 0.0054 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1310 0.0055 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1308 0.0097 ms 56.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1544 seconds and 0.1414 seconds precompiling for 4 choices | |
frames [('total', 3), ('ok', 3)] | |
inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)] | |
inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 10), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 3), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_64_64_127', 3)] | |
.AUTOTUNE mm(252x248, 248x128) | |
triton_mm_1321 0.0059 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1322 0.0059 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1323 0.0059 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1320 0.0140 ms 41.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1544 seconds and 0.1310 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 63), ('async_compile_cache_miss', 8), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('select_algorithm_num_precompiles', 4), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_252_128_248', 1)] | |
.AUTOTUNE mm(252x248, 248x128) | |
triton_mm_1325 0.0056 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1326 0.0058 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1327 0.0058 ms 97.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1324 0.0137 ms 41.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1596 seconds and 0.1340 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 63), ('async_compile_cache_miss', 8), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('select_algorithm_num_precompiles', 4), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_252_128_248', 1)] | |
.AUTOTUNE mm(252x248, 248x128) | |
triton_mm_1329 0.0055 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1330 0.0059 ms 93.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1331 0.0059 ms 93.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1328 0.0123 ms 44.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1566 seconds and 0.1339 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 63), ('async_compile_cache_miss', 8), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('select_algorithm_num_precompiles', 4), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_252_128_248', 1)] | |
.AUTOTUNE mm(64x128, 128x256) | |
triton_mm_1334 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1335 0.0053 ms 77.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1333 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1332 0.0079 ms 52.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1609 seconds and 0.1504 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_64_256_128', 1)] | |
.AUTOTUNE mm(128x128, 128x128) | |
triton_mm_1338 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1339 0.0056 ms 72.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1337 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1336 0.0076 ms 54.2% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1566 seconds and 0.1363 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_128_128_128', 1)] | |
.AUTOTUNE mm(63x120, 120x250) | |
triton_mm_1342 0.0054 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1343 0.0055 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1341 0.0061 ms 88.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1340 0.0100 ms 54.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1582 seconds and 0.1255 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_63_250_120', 1)] | |
.AUTOTUNE mm(256x256, 256x256) | |
triton_mm_1347 0.0059 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1345 0.0061 ms 95.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1346 0.0061 ms 95.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1344 0.0134 ms 43.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1592 seconds and 0.1350 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('select_algorithm_num_precompiles', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_256_256_256', 1)] | |
.AUTOTUNE mm(256x256, 256x256) | |
triton_mm_1350 0.0061 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1349 0.0061 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1351 0.0061 ms 99.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1348 0.0126 ms 48.6% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1628 seconds and 0.1518 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('select_algorithm_num_precompiles', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_256_256_256', 1)] | |
.AUTOTUNE mm(64x64, 64x64) | |
triton_mm_1354 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1353 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1355 0.0041 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1352 0.0056 ms 65.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1525 seconds and 0.0915 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_64_64_64', 1)] | |
.AUTOTUNE mm(64x128, 128x256) | |
triton_mm_1358 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1359 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1357 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1356 0.0079 ms 51.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1556 seconds and 0.1287 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_64_256_128', 1)] | |
.AUTOTUNE mm(128x128, 128x128) | |
triton_mm_1362 0.0041 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1363 0.0055 ms 74.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1361 0.0061 ms 66.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1360 0.0079 ms 51.8% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1567 seconds and 0.1207 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_128_128_128', 1)] | |
.AUTOTUNE mm(63x120, 120x250) | |
triton_mm_1366 0.0053 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1367 0.0055 ms 96.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=8 | |
triton_mm_1365 0.0059 ms 91.3% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 | |
triton_mm_1364 0.0100 ms 53.7% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=2 | |
SingleProcess AUTOTUNE benchmarking takes 0.1642 seconds and 0.1487 seconds precompiling for 4 choices | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 6), ('select_algorithm_num_precompiles', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 1), ('select_algorithm_precompile', 1), ('select_algorithm_autotune', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_63_250_120', 1)] | |
.Exception in TuningProcess | |
Traceback (most recent call last): | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 118, in process_main | |
TuningProcess.workloop(request_queue, response_queue) | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 135, in workloop | |
response_queue.put(obj.benchmark()) | |
^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 1220, in benchmark | |
assert visible_devices == self.parent_visible_devices | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
AssertionError | |
/workspace/pytorch/torch/_inductor/autotune_process.py:382: UserWarning: Failed to benchmark choice 'test'. It will be ignored. Please debug the root cause in case the choice can bring perf gains. | |
warnings.warn( | |
.. | |
====================================================================== | |
ERROR: test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper | |
method(*args, **kwargs) | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test | |
test(self, **param_kwargs) | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 437, in test_max_autotune_addmm_persistent_tma | |
c_actual = torch.compile(addmm, dynamic=dynamic)(x, a, b) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn | |
raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner | |
raise InductorError(e, currentframe()).with_traceback( | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner | |
mb_compiled_graph = fx_codegen_and_compile( | |
^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile | |
return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile | |
compiled_fn = graph.compile_to_module().call | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module | |
return self._compile_to_module() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module | |
self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() | |
^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen | |
self._update_scheduler() | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler | |
self.scheduler = Scheduler(self.operations) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__ | |
self._init(nodes) | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init | |
self.finalize_multi_template_buffers() | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers | |
min_node_unfused, _ = multi_node.get_min_choice() | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice | |
min_choice = min(self.choice_timings, key=self.choice_timings.get) # type: ignore[arg-type] | |
^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings | |
self._choice_timings = self._choice_timings_fn() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings | |
timings = do_autotuning(precompile_fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning | |
timings = self.lookup( | |
^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup | |
timings = benchmark(choices) | |
^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune | |
return make_benchmark_fn()(choices) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process | |
raise e | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process | |
timing = benchmark_choice_in_current_process(choice, inputs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process | |
result = choice.benchmark(*inpts, out=output) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark | |
return self.bmreq.benchmark(*args, output_tensor=out) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark | |
out = self.do_bench(fn, *input_tensors, output_tensor) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench | |
out = benchmarker.benchmark_gpu(fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper | |
return fn(self, *args, **kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu | |
_callable() | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace | |
run_method( | |
File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run | |
return launcher( | |
^^^^^^^^^ | |
File "<string>", line 5, in launcher | |
File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__ | |
self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) | |
torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index | |
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" | |
To execute this test, run the following from the base repo dir: | |
python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True | |
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 | |
====================================================================== | |
ERROR: test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper | |
method(*args, **kwargs) | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test | |
test(self, **param_kwargs) | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 437, in test_max_autotune_addmm_persistent_tma | |
c_actual = torch.compile(addmm, dynamic=dynamic)(x, a, b) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn | |
raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner | |
raise InductorError(e, currentframe()).with_traceback( | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner | |
mb_compiled_graph = fx_codegen_and_compile( | |
^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile | |
return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile | |
compiled_fn = graph.compile_to_module().call | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module | |
return self._compile_to_module() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module | |
self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() | |
^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen | |
self._update_scheduler() | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler | |
self.scheduler = Scheduler(self.operations) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__ | |
self._init(nodes) | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init | |
self.finalize_multi_template_buffers() | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers | |
min_node_unfused, _ = multi_node.get_min_choice() | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice | |
min_choice = min(self.choice_timings, key=self.choice_timings.get) # type: ignore[arg-type] | |
^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings | |
self._choice_timings = self._choice_timings_fn() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings | |
timings = do_autotuning(precompile_fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning | |
timings = self.lookup( | |
^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup | |
timings = benchmark(choices) | |
^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune | |
return make_benchmark_fn()(choices) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process | |
raise e | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process | |
timing = benchmark_choice_in_current_process(choice, inputs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process | |
result = choice.benchmark(*inpts, out=output) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark | |
return self.bmreq.benchmark(*args, output_tensor=out) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark | |
out = self.do_bench(fn, *input_tensors, output_tensor) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench | |
out = benchmarker.benchmark_gpu(fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper | |
return fn(self, *args, **kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu | |
_callable() | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace | |
run_method( | |
File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run | |
return launcher( | |
^^^^^^^^^ | |
File "<string>", line 5, in launcher | |
File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__ | |
self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) | |
torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index | |
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" | |
To execute this test, run the following from the base repo dir: | |
python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True | |
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 | |
====================================================================== | |
ERROR: test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper | |
method(*args, **kwargs) | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test | |
test(self, **param_kwargs) | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 437, in test_max_autotune_addmm_persistent_tma | |
c_actual = torch.compile(addmm, dynamic=dynamic)(x, a, b) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn | |
raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner | |
raise InductorError(e, currentframe()).with_traceback( | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner | |
mb_compiled_graph = fx_codegen_and_compile( | |
^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile | |
return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile | |
compiled_fn = graph.compile_to_module().call | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module | |
return self._compile_to_module() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module | |
self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() | |
^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen | |
self._update_scheduler() | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler | |
self.scheduler = Scheduler(self.operations) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__ | |
self._init(nodes) | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init | |
self.finalize_multi_template_buffers() | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers | |
min_node_unfused, _ = multi_node.get_min_choice() | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice | |
min_choice = min(self.choice_timings, key=self.choice_timings.get) # type: ignore[arg-type] | |
^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings | |
self._choice_timings = self._choice_timings_fn() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings | |
timings = do_autotuning(precompile_fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning | |
timings = self.lookup( | |
^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup | |
timings = benchmark(choices) | |
^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune | |
return make_benchmark_fn()(choices) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process | |
raise e | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process | |
timing = benchmark_choice_in_current_process(choice, inputs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process | |
result = choice.benchmark(*inpts, out=output) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark | |
return self.bmreq.benchmark(*args, output_tensor=out) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark | |
out = self.do_bench(fn, *input_tensors, output_tensor) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench | |
out = benchmarker.benchmark_gpu(fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper | |
return fn(self, *args, **kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu | |
_callable() | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace | |
run_method( | |
File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run | |
return launcher( | |
^^^^^^^^^ | |
File "<string>", line 5, in launcher | |
File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__ | |
self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) | |
torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index | |
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" | |
To execute this test, run the following from the base repo dir: | |
python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True | |
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 | |
====================================================================== | |
ERROR: test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper | |
method(*args, **kwargs) | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test | |
test(self, **param_kwargs) | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 437, in test_max_autotune_addmm_persistent_tma | |
c_actual = torch.compile(addmm, dynamic=dynamic)(x, a, b) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn | |
raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner | |
raise InductorError(e, currentframe()).with_traceback( | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner | |
mb_compiled_graph = fx_codegen_and_compile( | |
^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile | |
return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile | |
compiled_fn = graph.compile_to_module().call | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module | |
return self._compile_to_module() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module | |
self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() | |
^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen | |
self._update_scheduler() | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler | |
self.scheduler = Scheduler(self.operations) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__ | |
self._init(nodes) | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init | |
self.finalize_multi_template_buffers() | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers | |
min_node_unfused, _ = multi_node.get_min_choice() | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice | |
min_choice = min(self.choice_timings, key=self.choice_timings.get) # type: ignore[arg-type] | |
^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings | |
self._choice_timings = self._choice_timings_fn() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings | |
timings = do_autotuning(precompile_fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning | |
timings = self.lookup( | |
^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup | |
timings = benchmark(choices) | |
^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune | |
return make_benchmark_fn()(choices) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process | |
raise e | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process | |
timing = benchmark_choice_in_current_process(choice, inputs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process | |
result = choice.benchmark(*inpts, out=output) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark | |
return self.bmreq.benchmark(*args, output_tensor=out) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark | |
out = self.do_bench(fn, *input_tensors, output_tensor) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench | |
out = benchmarker.benchmark_gpu(fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper | |
return fn(self, *args, **kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu | |
_callable() | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace | |
run_method( | |
File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run | |
return launcher( | |
^^^^^^^^^ | |
File "<string>", line 5, in launcher | |
File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__ | |
self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) | |
torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index | |
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" | |
To execute this test, run the following from the base repo dir: | |
python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_addmm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True | |
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 | |
====================================================================== | |
ERROR: test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper | |
method(*args, **kwargs) | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test | |
test(self, **param_kwargs) | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 262, in test_max_autotune_regular_mm_persistent_tma | |
c_actual = torch.compile(mm, dynamic=dynamic)(a, b) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn | |
raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner | |
raise InductorError(e, currentframe()).with_traceback( | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner | |
mb_compiled_graph = fx_codegen_and_compile( | |
^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile | |
return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile | |
compiled_fn = graph.compile_to_module().call | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module | |
return self._compile_to_module() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module | |
self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() | |
^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen | |
self._update_scheduler() | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler | |
self.scheduler = Scheduler(self.operations) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__ | |
self._init(nodes) | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init | |
self.finalize_multi_template_buffers() | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers | |
min_node_unfused, _ = multi_node.get_min_choice() | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice | |
min_choice = min(self.choice_timings, key=self.choice_timings.get) # type: ignore[arg-type] | |
^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings | |
self._choice_timings = self._choice_timings_fn() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings | |
timings = do_autotuning(precompile_fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning | |
timings = self.lookup( | |
^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup | |
timings = benchmark(choices) | |
^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune | |
return make_benchmark_fn()(choices) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process | |
raise e | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process | |
timing = benchmark_choice_in_current_process(choice, inputs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process | |
result = choice.benchmark(*inpts, out=output) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark | |
return self.bmreq.benchmark(*args, output_tensor=out) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark | |
out = self.do_bench(fn, *input_tensors, output_tensor) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench | |
out = benchmarker.benchmark_gpu(fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper | |
return fn(self, *args, **kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu | |
_callable() | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace | |
run_method( | |
File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run | |
return launcher( | |
^^^^^^^^^ | |
File "<string>", line 5, in launcher | |
File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__ | |
self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) | |
torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index | |
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" | |
To execute this test, run the following from the base repo dir: | |
python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_False_dynamic_True | |
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 | |
====================================================================== | |
ERROR: test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper | |
method(*args, **kwargs) | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test | |
test(self, **param_kwargs) | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 262, in test_max_autotune_regular_mm_persistent_tma | |
c_actual = torch.compile(mm, dynamic=dynamic)(a, b) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn | |
raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner | |
raise InductorError(e, currentframe()).with_traceback( | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner | |
mb_compiled_graph = fx_codegen_and_compile( | |
^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile | |
return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile | |
compiled_fn = graph.compile_to_module().call | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module | |
return self._compile_to_module() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module | |
self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() | |
^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen | |
self._update_scheduler() | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler | |
self.scheduler = Scheduler(self.operations) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__ | |
self._init(nodes) | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init | |
self.finalize_multi_template_buffers() | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers | |
min_node_unfused, _ = multi_node.get_min_choice() | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice | |
min_choice = min(self.choice_timings, key=self.choice_timings.get) # type: ignore[arg-type] | |
^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings | |
self._choice_timings = self._choice_timings_fn() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings | |
timings = do_autotuning(precompile_fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning | |
timings = self.lookup( | |
^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup | |
timings = benchmark(choices) | |
^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune | |
return make_benchmark_fn()(choices) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process | |
raise e | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process | |
timing = benchmark_choice_in_current_process(choice, inputs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process | |
result = choice.benchmark(*inpts, out=output) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark | |
return self.bmreq.benchmark(*args, output_tensor=out) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark | |
out = self.do_bench(fn, *input_tensors, output_tensor) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench | |
out = benchmarker.benchmark_gpu(fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper | |
return fn(self, *args, **kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu | |
_callable() | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace | |
run_method( | |
File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run | |
return launcher( | |
^^^^^^^^^ | |
File "<string>", line 5, in launcher | |
File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__ | |
self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) | |
torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index | |
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" | |
To execute this test, run the following from the base repo dir: | |
python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_False_b_transposed_True_dynamic_True | |
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 | |
====================================================================== | |
ERROR: test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper | |
method(*args, **kwargs) | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test | |
test(self, **param_kwargs) | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 262, in test_max_autotune_regular_mm_persistent_tma | |
c_actual = torch.compile(mm, dynamic=dynamic)(a, b) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn | |
raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner | |
raise InductorError(e, currentframe()).with_traceback( | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner | |
mb_compiled_graph = fx_codegen_and_compile( | |
^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile | |
return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile | |
compiled_fn = graph.compile_to_module().call | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module | |
return self._compile_to_module() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module | |
self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() | |
^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen | |
self._update_scheduler() | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler | |
self.scheduler = Scheduler(self.operations) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__ | |
self._init(nodes) | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init | |
self.finalize_multi_template_buffers() | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers | |
min_node_unfused, _ = multi_node.get_min_choice() | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice | |
min_choice = min(self.choice_timings, key=self.choice_timings.get) # type: ignore[arg-type] | |
^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings | |
self._choice_timings = self._choice_timings_fn() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings | |
timings = do_autotuning(precompile_fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning | |
timings = self.lookup( | |
^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup | |
timings = benchmark(choices) | |
^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune | |
return make_benchmark_fn()(choices) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process | |
raise e | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process | |
timing = benchmark_choice_in_current_process(choice, inputs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process | |
result = choice.benchmark(*inpts, out=output) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark | |
return self.bmreq.benchmark(*args, output_tensor=out) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark | |
out = self.do_bench(fn, *input_tensors, output_tensor) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench | |
out = benchmarker.benchmark_gpu(fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper | |
return fn(self, *args, **kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu | |
_callable() | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace | |
run_method( | |
File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run | |
return launcher( | |
^^^^^^^^^ | |
File "<string>", line 5, in launcher | |
File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__ | |
self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) | |
torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index | |
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" | |
To execute this test, run the following from the base repo dir: | |
python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_False_dynamic_True | |
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 | |
====================================================================== | |
ERROR: test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True (__main__.TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper | |
method(*args, **kwargs) | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 552, in instantiated_test | |
test(self, **param_kwargs) | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 262, in test_max_autotune_regular_mm_persistent_tma | |
c_actual = torch.compile(mm, dynamic=dynamic)(a, b) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_dynamo/eval_frame.py", line 663, in _fn | |
raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 760, in _compile_fx_inner | |
raise InductorError(e, currentframe()).with_traceback( | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 745, in _compile_fx_inner | |
mb_compiled_graph = fx_codegen_and_compile( | |
^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1295, in fx_codegen_and_compile | |
return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/compile_fx.py", line 1197, in codegen_and_compile | |
compiled_fn = graph.compile_to_module().call | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2083, in compile_to_module | |
return self._compile_to_module() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 2091, in _compile_to_module | |
self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() | |
^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1998, in codegen | |
self._update_scheduler() | |
File "/workspace/pytorch/torch/_inductor/graph.py", line 1992, in _update_scheduler | |
self.scheduler = Scheduler(self.operations) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 1985, in __init__ | |
self._init(nodes) | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2065, in _init | |
self.finalize_multi_template_buffers() | |
File "/workspace/pytorch/torch/_inductor/scheduler.py", line 2671, in finalize_multi_template_buffers | |
min_node_unfused, _ = multi_node.get_min_choice() | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4578, in get_min_choice | |
min_choice = min(self.choice_timings, key=self.choice_timings.get) # type: ignore[arg-type] | |
^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/ir.py", line 4556, in choice_timings | |
self._choice_timings = self._choice_timings_fn() | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1949, in get_timings | |
timings = do_autotuning(precompile_fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1913, in do_autotuning | |
timings = self.lookup( | |
^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/codecache.py", line 321, in lookup | |
timings = benchmark(choices) | |
^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1893, in autotune | |
return make_benchmark_fn()(choices) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2117, in benchmark_in_current_process | |
raise e | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2083, in benchmark_in_current_process | |
timing = benchmark_choice_in_current_process(choice, inputs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 2063, in benchmark_choice_in_current_process | |
result = choice.benchmark(*inpts, out=output) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/select_algorithm.py", line 1353, in benchmark | |
return self.bmreq.benchmark(*args, output_tensor=out) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 554, in benchmark | |
out = self.do_bench(fn, *input_tensors, output_tensor) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 615, in do_bench | |
out = benchmarker.benchmark_gpu(fn) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 39, in wrapper | |
return fn(self, *args, **kwargs) | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_inductor/runtime/benchmarking.py", line 243, in benchmark_gpu | |
_callable() | |
File "/workspace/pytorch/torch/_inductor/autotune_process.py", line 709, in run_with_workspace | |
run_method( | |
File "/workspace/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 956, in run | |
return launcher( | |
^^^^^^^^^ | |
File "<string>", line 5, in launcher | |
File "/workspace/triton/python/triton/backends/nvidia/driver.py", line 529, in __call__ | |
self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) | |
torch._inductor.exc.InductorError: TypeError: only integer tensors of a single element can be converted to an index | |
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" | |
To execute this test, run the following from the base repo dir: | |
python test/inductor/test_max_autotune.py TestMaxAutotune.test_max_autotune_regular_mm_persistent_tma_a_transposed_True_b_transposed_True_dynamic_True | |
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 | |
====================================================================== | |
ERROR: test_non_contiguous_input_mm_plus_mm (__main__.TestMaxAutotune.test_non_contiguous_input_mm_plus_mm) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper | |
method(*args, **kwargs) | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 1928, in wrapper | |
return fn(*args, **kwargs) | |
^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 1040, in test_non_contiguous_input_mm_plus_mm | |
x2 = rand_strided((50257, 32768), (1, 50304), device=GPU_TYPE) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/workspace/pytorch/torch/_dynamo/testing.py", line 411, in rand_strided | |
buffer = torch.randn(needed_size, dtype=dtype, device=device) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.14 GiB. GPU 0 has a total capacity of 15.48 GiB of which 4.01 GiB is free. Including non-PyTorch memory, this process has 11.10 GiB memory in use. Process 437762 has 360.00 MiB memory in use. Of the allocated memory 6.23 GiB is allocated by PyTorch, and 2.98 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) | |
To execute this test, run the following from the base repo dir: | |
python test/inductor/test_max_autotune.py TestMaxAutotune.test_non_contiguous_input_mm_plus_mm | |
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 | |
====================================================================== | |
ERROR: test_low_precision (__main__.TestPrologueFusion.test_low_precision) | |
---------------------------------------------------------------------- | |
Traceback (most recent call last): | |
File "/workspace/pytorch/torch/testing/_internal/common_utils.py", line 3153, in wrapper | |
method(*args, **kwargs) | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 1407, in test_low_precision | |
self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=2) | |
File "/workspace/pytorch/test/inductor/test_max_autotune.py", line 1321, in check_code | |
).run(code_str) | |
^^^^^^^^^^^^^ | |
RuntimeError: Expected to not find ".run(" but found it | |
# Topologically Sorted Source Nodes: [to, add, matmul], Original ATen: [aten._to_copy, aten.add, aten.mm] | |
stream0 = get_raw_stream(0) | |
triton_tem_fused__to_copy_add_mm_1.run(buf0, arg1_1, buf1, 8, 1, 1, stream=stream0) | |
~~~~~ <--- HERE | |
del arg1_1 | |
del buf0 | |
From CHECK-NOT: .run( | |
To execute this test, run the following from the base repo dir: | |
python test/inductor/test_max_autotune.py TestPrologueFusion.test_low_precision | |
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 | |
---------------------------------------------------------------------- | |
Ran 102 tests in 152.328s | |
FAILED (errors=10, skipped=3) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
test_config_option_dont_assume_alignment_cudagraphs_cpu (__main__.CpuTests.test_config_option_dont_assume_alignment_cudagraphs_cpu) ... frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
inductor [('fxgraph_cache_miss', 2), ('cudagraph_skips', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_AllenaiLongformerBase_repro_cuda (__main__.GPUTests.test_AllenaiLongformerBase_repro_cuda) ... inline_call [] | |
stats [('calls_captured', 22), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 10), ('pattern_matcher_count', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test__dyn_quant_matmul_4bit_cuda (__main__.GPUTests.test__dyn_quant_matmul_4bit_cuda) ... skipped 'No _dyn_quant_matmul_4bit implementation on CUDA' | |
test__dyn_quant_pack_4bit_weight_cuda (__main__.GPUTests.test__dyn_quant_pack_4bit_weight_cuda) ... skipped 'No _dyn_quant_pack_4bit_weight implementation on CUDA' | |
test__unsafe_masked_index_cuda (__main__.GPUTests.test__unsafe_masked_index_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test__unsafe_masked_index_put_accumulate_cuda (__main__.GPUTests.test__unsafe_masked_index_put_accumulate_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_abs_cuda (__main__.GPUTests.test_abs_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_adaptive_avg_pool1d_argmax_cuda (__main__.GPUTests.test_adaptive_avg_pool1d_argmax_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_adaptive_avg_pool2d1_cuda (__main__.GPUTests.test_adaptive_avg_pool2d1_cuda) ... inline_call [] | |
stats [('calls_captured', 15), ('unique_graphs', 5)] | |
aot_autograd [('total', 5), ('autograd_cache_miss', 5), ('autograd_cache_saved', 5), ('ok', 5)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 10), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 5), ('extern_calls', 2), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_adaptive_avg_pool2d2_cuda (__main__.GPUTests.test_adaptive_avg_pool2d2_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_adaptive_avg_pool2d_low_prec_cuda (__main__.GPUTests.test_adaptive_avg_pool2d_low_prec_cuda) ... frames [('total', 2), ('ok', 2)] | |
inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_adaptive_avg_pool_errors_with_long_cuda (__main__.GPUTests.test_adaptive_avg_pool_errors_with_long_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('not_ok', 2), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 3), ('extern_calls', 2), ('async_compile_cache_miss', 1), ('intermediate_hooks', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_adaptive_avg_pool_with_output_size_0_cuda (__main__.GPUTests.test_adaptive_avg_pool_with_output_size_0_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_adaptive_max_pool2d1_cuda (__main__.GPUTests.test_adaptive_max_pool2d1_cuda) ... inline_call [] | |
stats [('calls_captured', 15), ('unique_graphs', 5)] | |
aot_autograd [('total', 5), ('autograd_cache_miss', 5), ('autograd_cache_saved', 5), ('ok', 5)] | |
inductor [('triton_bundler_save_kernel', 49), ('async_compile_cache_miss', 7), ('async_compile_cache_hit', 7), ('fxgraph_cache_miss', 5)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_adaptive_max_pool2d2_cuda (__main__.GPUTests.test_adaptive_max_pool2d2_cuda) ... inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 3), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_adaptive_max_pool2d3_cuda (__main__.GPUTests.test_adaptive_max_pool2d3_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_adaptive_pool_errors_with_long_cuda (__main__.GPUTests.test_adaptive_pool_errors_with_long_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('not_ok', 2), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 3), ('extern_calls', 2), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_add_complex3_cuda (__main__.GPUTests.test_add_complex3_cuda) ... /workspace/pytorch/torch/_inductor/lowering.py:1917: UserWarning: Torchinductor does not support code generation for complex operators. Performance may be worse than eager. | |
warnings.warn( | |
frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 6), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('intermediate_hooks', 3), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_add_complex4_cuda (__main__.GPUTests.test_add_complex4_cuda) ... /workspace/pytorch/test/inductor/test_torchinductor.py:1365: UserWarning: ComplexHalf support is experimental and many operators don't support it yet. (Triggered internally at /workspace/pytorch/aten/src/ATen/EmptyTensor.cpp:56.) | |
x = torch.tensor( | |
frames [('total', 3), ('ok', 3)] | |
stats [('calls_captured', 9), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)] | |
inductor [('pattern_matcher_nodes', 33), ('pattern_matcher_count', 27), ('triton_bundler_save_kernel', 21), ('extern_calls', 18), ('intermediate_hooks', 9), ('fxgraph_cache_miss', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_add_complex5_cuda (__main__.GPUTests.test_add_complex5_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('extern_calls', 16), ('intermediate_hooks', 8), ('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_add_complex6_cuda (__main__.GPUTests.test_add_complex6_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('extern_calls', 16), ('triton_bundler_save_kernel', 7), ('intermediate_hooks', 6), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_add_complex_cuda (__main__.GPUTests.test_add_complex_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('extern_calls', 16), ('intermediate_hooks', 8), ('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_add_const_float_cuda (__main__.GPUTests.test_add_const_float_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_add_const_int_cuda (__main__.GPUTests.test_add_const_int_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('ok', 6), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 4), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_add_inplace_permuted_cuda (__main__.GPUTests.test_add_inplace_permuted_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_adding_tensor_offsets_cuda (__main__.GPUTests.test_adding_tensor_offsets_cuda) ... stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_addmm_cuda (__main__.GPUTests.test_addmm_cuda) ... /workspace/pytorch/torch/_inductor/compile_fx.py:236: UserWarning: TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance. | |
warnings.warn( | |
inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('extern_calls', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_8_8_8', 2)] | |
ok | |
test_addmv_cuda (__main__.GPUTests.test_addmv_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_alexnet_prefix_cuda (__main__.GPUTests.test_alexnet_prefix_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2), ('extern_calls', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_aliased_buffer_reuse_cuda (__main__.GPUTests.test_aliased_buffer_reuse_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_4_4_4', 1)] | |
ok | |
test_angle_cuda (__main__.GPUTests.test_angle_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('extern_calls', 12), ('intermediate_hooks', 6), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_any_cuda (__main__.GPUTests.test_any_cuda) ... inline_call [] | |
stats [('calls_captured', 32), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_aoti_eager_cache_hit_cuda (__main__.GPUTests.test_aoti_eager_cache_hit_cuda) ... W0509 18:19:03.396000 415353 torch/_export/__init__.py:67] +============================+ | |
W0509 18:19:03.397000 415353 torch/_export/__init__.py:68] | !!! WARNING !!! | | |
W0509 18:19:03.397000 415353 torch/_export/__init__.py:69] +============================+ | |
W0509 18:19:03.397000 415353 torch/_export/__init__.py:70] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. | |
W0509 18:19:03.397000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs | |
W0509 18:19:03.404000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs | |
/workspace/pytorch/torch/library.py:288: UserWarning: Warning only once for all operators, other operators may also be overridden. | |
Overriding a previously registered kernel for the same operator and the same dispatch key | |
operator: aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) | |
registered at /workspace/pytorch/build/aten/src/ATen/RegisterSchema.cpp:6 | |
dispatch key: CUDA | |
previous kernel: registered at /workspace/pytorch/build/aten/src/ATen/RegisterCPU_2.cpp:1215 | |
new kernel: registered at /dev/null:137 (Triggered internally at /workspace/pytorch/aten/src/ATen/core/dispatch/OperatorEntry.cpp:154.) | |
impl_fn(self.ns, name.split("::")[-1], dispatch_key) | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
inductor [('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_aoti_eager_dtype_device_layout_cuda (__main__.GPUTests.test_aoti_eager_dtype_device_layout_cuda) ... W0509 18:19:05.915000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.tril_indices.default | |
W0509 18:19:05.934000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.tril_indices.default | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
inductor [('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_aoti_eager_override_registration_cuda (__main__.GPUTests.test_aoti_eager_override_registration_cuda) ... W0509 18:19:08.680000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:08.686000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:11.034000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.acos.default | |
W0509 18:19:11.043000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.acos.default | |
W0509 18:19:13.407000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:13.419000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:15.796000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:15.803000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:18.121000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:18.128000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:20.467000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:20.478000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:22.880000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:22.891000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:25.336000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:25.348000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.abs.default | |
W0509 18:19:27.735000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor | |
W0509 18:19:27.747000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor | |
W0509 18:19:30.224000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor | |
W0509 18:19:30.239000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor | |
frames [('total', 2), ('ok', 2)] | |
inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 12)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_aoti_eager_support_out_cuda (__main__.GPUTests.test_aoti_eager_support_out_cuda) ... W0509 18:19:32.749000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor_out | |
W0509 18:19:32.766000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor_out | |
W0509 18:19:35.383000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor_out | |
W0509 18:19:35.399000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.clamp.Tensor_out | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_aoti_eager_support_str_cuda (__main__.GPUTests.test_aoti_eager_support_str_cuda) ... W0509 18:19:37.717000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.div.Tensor_mode | |
W0509 18:19:37.730000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.div.Tensor_mode | |
W0509 18:19:40.191000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.div.Tensor_mode | |
W0509 18:19:40.200000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.div.Tensor_mode | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_aoti_eager_with_persistent_cache_cuda (__main__.GPUTests.test_aoti_eager_with_persistent_cache_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)] | |
inductor [('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_aoti_eager_with_scalar_cuda (__main__.GPUTests.test_aoti_eager_with_scalar_cuda) ... W0509 18:19:45.042000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add | |
W0509 18:19:45.049000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add | |
W0509 18:19:47.511000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor | |
W0509 18:19:47.524000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor | |
W0509 18:19:50.205000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor | |
W0509 18:19:50.214000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor | |
W0509 18:19:52.607000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor | |
W0509 18:19:52.615000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Tensor | |
W0509 18:19:54.974000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar | |
W0509 18:19:54.986000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar | |
W0509 18:19:57.475000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar | |
W0509 18:19:57.486000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar | |
W0509 18:19:59.947000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar | |
W0509 18:19:59.955000 415353 torch/_dynamo/eval_frame.py:416] could not determine __code__ for aten.add.Scalar | |
stats [('calls_captured', 7), ('unique_graphs', 7)] | |
inductor [('async_compile_cache_miss', 7), ('async_compile_cache_hit', 7)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_arange1_cuda (__main__.GPUTests.test_arange1_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_arange2_cuda (__main__.GPUTests.test_arange2_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_arange3_cuda (__main__.GPUTests.test_arange3_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_arange4_cuda (__main__.GPUTests.test_arange4_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_arange5_cuda (__main__.GPUTests.test_arange5_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_arange6_cuda (__main__.GPUTests.test_arange6_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_argmax_argmin1_cuda (__main__.GPUTests.test_argmax_argmin1_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_argmax_argmin2_cuda (__main__.GPUTests.test_argmax_argmin2_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_argmax_argmin3_cuda (__main__.GPUTests.test_argmax_argmin3_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_argmax_argmin_with_duplicates_cuda (__main__.GPUTests.test_argmax_argmin_with_duplicates_cuda) ... inline_call [] | |
stats [('calls_captured', 24), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('ok', 6), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 3)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 18), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 3)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_argmax_argmin_with_nan_cuda (__main__.GPUTests.test_argmax_argmin_with_nan_cuda) ... inline_call [] | |
stats [('calls_captured', 24), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)] | |
inductor [('triton_bundler_save_kernel', 84), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 6)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_argmax_min_int32_cuda (__main__.GPUTests.test_argmax_min_int32_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_argmax_to_float_cuda (__main__.GPUTests.test_argmax_to_float_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_as_strided_cuda (__main__.GPUTests.test_as_strided_cuda) ... inline_call [] | |
stats [('calls_captured', 16), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_bypass', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_as_strided_scatter_cuda (__main__.GPUTests.test_as_strided_scatter_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d1_cuda (__main__.GPUTests.test_avg_pool2d1_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d2_cuda (__main__.GPUTests.test_avg_pool2d2_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d3_cuda (__main__.GPUTests.test_avg_pool2d3_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d4_cuda (__main__.GPUTests.test_avg_pool2d4_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d5_cuda (__main__.GPUTests.test_avg_pool2d5_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d6_cuda (__main__.GPUTests.test_avg_pool2d6_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d7_cuda (__main__.GPUTests.test_avg_pool2d7_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d8_cuda (__main__.GPUTests.test_avg_pool2d8_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d_backward2_cuda (__main__.GPUTests.test_avg_pool2d_backward2_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d_backward3_cuda (__main__.GPUTests.test_avg_pool2d_backward3_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d_backward4_cuda (__main__.GPUTests.test_avg_pool2d_backward4_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool2d_backward_cuda (__main__.GPUTests.test_avg_pool2d_backward_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool3d_backward2_cuda (__main__.GPUTests.test_avg_pool3d_backward2_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool3d_backward3_cuda (__main__.GPUTests.test_avg_pool3d_backward3_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool3d_backward4_cuda (__main__.GPUTests.test_avg_pool3d_backward4_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool3d_backward_cuda (__main__.GPUTests.test_avg_pool3d_backward_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_avg_pool_errors_with_uint_cuda (__main__.GPUTests.test_avg_pool_errors_with_uint_cuda) ... E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] failed while attempting to run meta for aten.avg_pool2d.default | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] Traceback (most recent call last): | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] r = func(*args, **kwargs) | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] return self._op(*args, **kwargs) | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] torch._check( | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] raise error_type(message_evaluated) | |
E0509 18:20:48.465000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] RuntimeError: "avg_pool2d" not implemented for 'torch.uint8' | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] failed while attempting to run meta for aten.avg_pool2d.default | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] Traceback (most recent call last): | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] r = func(*args, **kwargs) | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] return self._op(*args, **kwargs) | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] torch._check( | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] raise error_type(message_evaluated) | |
E0509 18:20:48.474000 415353 torch/_subclasses/fake_tensor.py:2431] [0/1] RuntimeError: "avg_pool2d" not implemented for 'torch.uint16' | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] failed while attempting to run meta for aten.avg_pool2d.default | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] Traceback (most recent call last): | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] r = func(*args, **kwargs) | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] return self._op(*args, **kwargs) | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] torch._check( | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] raise error_type(message_evaluated) | |
E0509 18:20:48.478000 415353 torch/_subclasses/fake_tensor.py:2431] [0/2] RuntimeError: "avg_pool2d" not implemented for 'torch.uint32' | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] failed while attempting to run meta for aten.avg_pool2d.default | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] Traceback (most recent call last): | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] r = func(*args, **kwargs) | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] return self._op(*args, **kwargs) | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] torch._check( | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] raise error_type(message_evaluated) | |
E0509 18:20:48.483000 415353 torch/_subclasses/fake_tensor.py:2431] [0/3] RuntimeError: "avg_pool2d" not implemented for 'torch.uint64' | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] failed while attempting to run meta for aten.avg_pool2d.default | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] Traceback (most recent call last): | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] r = func(*args, **kwargs) | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] return self._op(*args, **kwargs) | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] torch._check( | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] raise error_type(message_evaluated) | |
E0509 18:20:48.492000 415353 torch/_subclasses/fake_tensor.py:2431] [0/4] RuntimeError: "avg_pool2d" not implemented for 'torch.uint8' | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] failed while attempting to run meta for aten.avg_pool2d.default | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] Traceback (most recent call last): | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] r = func(*args, **kwargs) | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] return self._op(*args, **kwargs) | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] torch._check( | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] raise error_type(message_evaluated) | |
E0509 18:20:48.497000 415353 torch/_subclasses/fake_tensor.py:2431] [0/5] RuntimeError: "avg_pool2d" not implemented for 'torch.uint16' | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] failed while attempting to run meta for aten.avg_pool2d.default | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] Traceback (most recent call last): | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] r = func(*args, **kwargs) | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] return self._op(*args, **kwargs) | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] torch._check( | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] raise error_type(message_evaluated) | |
E0509 18:20:48.503000 415353 torch/_subclasses/fake_tensor.py:2431] [0/6] RuntimeError: "avg_pool2d" not implemented for 'torch.uint32' | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] failed while attempting to run meta for aten.avg_pool2d.default | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] Traceback (most recent call last): | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] r = func(*args, **kwargs) | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] return self._op(*args, **kwargs) | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] File "/workspace/pytorch/torch/_meta_registrations.py", line 2727, in meta_avg_pool2d | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] torch._check( | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] raise error_type(message_evaluated) | |
E0509 18:20:48.509000 415353 torch/_subclasses/fake_tensor.py:2431] [0/7] RuntimeError: "avg_pool2d" not implemented for 'torch.uint64' | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] failed while attempting to run meta for aten.avg_pool3d.default | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] Traceback (most recent call last): | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] r = func(*args, **kwargs) | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] return self._op(*args, **kwargs) | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] File "/workspace/pytorch/torch/_prims_common/wrappers.py", line 308, in _fn | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] result = fn(*args, **kwargs) | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] ^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] File "/workspace/pytorch/torch/_meta_registrations.py", line 2925, in meta_avg_pool3d | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] torch._check( | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] raise error_type(message_evaluated) | |
E0509 18:20:48.516000 415353 torch/_subclasses/fake_tensor.py:2431] [0/8] RuntimeError: "avg_pool3d" not implemented for 'torch.uint8' | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] failed while attempting to run meta for aten.avg_pool3d.default | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] Traceback (most recent call last): | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] r = func(*args, **kwargs) | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] return self._op(*args, **kwargs) | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] File "/workspace/pytorch/torch/_prims_common/wrappers.py", line 308, in _fn | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] result = fn(*args, **kwargs) | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] ^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] File "/workspace/pytorch/torch/_meta_registrations.py", line 2925, in meta_avg_pool3d | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] torch._check( | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] raise error_type(message_evaluated) | |
E0509 18:20:48.523000 415353 torch/_subclasses/fake_tensor.py:2431] [0/9] RuntimeError: "avg_pool3d" not implemented for 'torch.uint16' | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] failed while attempting to run meta for aten.avg_pool3d.default | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] Traceback (most recent call last): | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] r = func(*args, **kwargs) | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] return self._op(*args, **kwargs) | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] File "/workspace/pytorch/torch/_prims_common/wrappers.py", line 308, in _fn | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] result = fn(*args, **kwargs) | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] ^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] File "/workspace/pytorch/torch/_meta_registrations.py", line 2925, in meta_avg_pool3d | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] torch._check( | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] raise error_type(message_evaluated) | |
E0509 18:20:48.530000 415353 torch/_subclasses/fake_tensor.py:2431] [0/10] RuntimeError: "avg_pool3d" not implemented for 'torch.uint32' | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] failed while attempting to run meta for aten.avg_pool3d.default | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] Traceback (most recent call last): | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] r = func(*args, **kwargs) | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] return self._op(*args, **kwargs) | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] File "/workspace/pytorch/torch/_prims_common/wrappers.py", line 308, in _fn | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] result = fn(*args, **kwargs) | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] ^^^^^^^^^^^^^^^^^^^ | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] File "/workspace/pytorch/torch/_meta_registrations.py", line 2925, in meta_avg_pool3d | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] torch._check( | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] File "/workspace/pytorch/torch/__init__.py", line 1660, in _check | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] _check_with(RuntimeError, cond, message) | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] raise error_type(message_evaluated) | |
E0509 18:20:48.535000 415353 torch/_subclasses/fake_tensor.py:2431] [0/11] RuntimeError: "avg_pool3d" not implemented for 'torch.uint64' | |
frames [('total', 12)] | |
ok | |
test_baddbmm_cuda (__main__.GPUTests.test_baddbmm_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 8)] | |
aot_autograd [('total', 8), ('ok', 8), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('autograd_cache_hit', 4)] | |
inductor [('extern_calls', 8), ('fxgraph_cache_miss', 4), ('fxgraph_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [('aten.baddbmm_128_100_64', 4)] | |
ok | |
test_batch_norm_2d_2_cuda (__main__.GPUTests.test_batch_norm_2d_2_cuda) ... frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_batch_norm_2d_cuda (__main__.GPUTests.test_batch_norm_2d_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bernoulli1_cuda (__main__.GPUTests.test_bernoulli1_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2), ('extern_calls', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bernoulli2_cuda (__main__.GPUTests.test_bernoulli2_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('extern_calls', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bfloat16_to_int16_cuda (__main__.GPUTests.test_bfloat16_to_int16_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bitwise2_cuda (__main__.GPUTests.test_bitwise2_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bitwise3_cuda (__main__.GPUTests.test_bitwise3_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bitwise_cuda (__main__.GPUTests.test_bitwise_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bmm1_cuda (__main__.GPUTests.test_bmm1_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('extern_calls', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [('aten.bmm_8_8_8', 2), ('aten.bmm_16_10_8', 2)] | |
ok | |
test_bmm2_cuda (__main__.GPUTests.test_bmm2_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.bmm_8_8_8', 1)] | |
ok | |
test_bool_cuda (__main__.GPUTests.test_bool_cuda) ... inline_call [] | |
stats [('calls_captured', 18), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 15), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_both_scalars_cuda (__main__.GPUTests.test_both_scalars_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bucketize_add_autotune_cuda (__main__.GPUTests.test_bucketize_add_autotune_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 56), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bucketize_computed_offsets_cuda (__main__.GPUTests.test_bucketize_computed_offsets_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bucketize_default_kwargs_cuda (__main__.GPUTests.test_bucketize_default_kwargs_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bucketize_int_cuda (__main__.GPUTests.test_bucketize_int_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bucketize_nd_tiling_False_cuda (__main__.GPUTests.test_bucketize_nd_tiling_False_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_hit', 3), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1)] | |
inductor [('async_compile_cache_miss', 7), ('triton_bundler_save_kernel', 7), ('async_compile_cache_hit', 4), ('fxgraph_cache_hit', 3), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_bucketize_nd_tiling_True_cuda (__main__.GPUTests.test_bucketize_nd_tiling_True_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_hit', 3), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1)] | |
inductor [('async_compile_cache_miss', 7), ('triton_bundler_save_kernel', 7), ('async_compile_cache_hit', 4), ('fxgraph_cache_hit', 3), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_buffer_batch_norm_cuda (__main__.GPUTests.test_buffer_batch_norm_cuda) ... frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_buffer_copied_in_graph_cuda (__main__.GPUTests.test_buffer_copied_in_graph_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_buffer_copied_in_graph_with_different_shapes_cuda (__main__.GPUTests.test_buffer_copied_in_graph_with_different_shapes_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_4_4_2', 1)] | |
ok | |
test_buffer_use_after_remove_cuda (__main__.GPUTests.test_buffer_use_after_remove_cuda) ... frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
stats [('calls_captured', 42), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1), ('autograd_cache_saved', 1)] | |
inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_nodes', 16), ('pattern_matcher_count', 13), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 2), ('extern_calls', 2)] | |
graph_break [] | |
aten_mm_info [('aten.bmm_6_2_3', 1), ('aten.bmm_6_3_2', 1)] | |
ok | |
test_builtins_round_cuda (__main__.GPUTests.test_builtins_round_cuda) ... stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_builtins_round_float_ndigits_neg_cuda (__main__.GPUTests.test_builtins_round_float_ndigits_neg_cuda) ... stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_builtins_round_float_ndigits_pos_cuda (__main__.GPUTests.test_builtins_round_float_ndigits_pos_cuda) ... stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_builtins_round_float_ndigits_zero_cuda (__main__.GPUTests.test_builtins_round_float_ndigits_zero_cuda) ... stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_builtins_round_int_ndigits_pos_cuda (__main__.GPUTests.test_builtins_round_int_ndigits_pos_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_builtins_round_int_ndigits_zero_cuda (__main__.GPUTests.test_builtins_round_int_ndigits_zero_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cat_cuda (__main__.GPUTests.test_cat_cuda) ... inline_call [] | |
stats [('calls_captured', 32), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 10), ('async_compile_cache_hit', 10), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cat_empty_cuda (__main__.GPUTests.test_cat_empty_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)] | |
inductor [('triton_bundler_save_kernel', 42), ('fxgraph_cache_miss', 6), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cat_empty_index_cuda (__main__.GPUTests.test_cat_empty_index_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cat_extern_kernel_cuda (__main__.GPUTests.test_cat_extern_kernel_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_256_1600_1024', 1), ('aten.mm_256_256_100', 1)] | |
ok | |
test_cat_inplace_cuda (__main__.GPUTests.test_cat_inplace_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cat_negative_dim_cuda (__main__.GPUTests.test_cat_negative_dim_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)] | |
inductor [('triton_bundler_save_kernel', 42), ('fxgraph_cache_miss', 6), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cat_of_loops_and_extern_kernel_cuda (__main__.GPUTests.test_cat_of_loops_and_extern_kernel_cuda) ... frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cat_single_empty_cuda (__main__.GPUTests.test_cat_single_empty_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cat_uint8_cuda (__main__.GPUTests.test_cat_uint8_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cat_unbacked_2d_cuda (__main__.GPUTests.test_cat_unbacked_2d_cuda) ... inline_call [] | |
stats [('calls_captured', 24), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 8), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cat_unbacked_empty_1d_cuda (__main__.GPUTests.test_cat_unbacked_empty_1d_cuda) ... inline_call [] | |
stats [('calls_captured', 24), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 8), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cat_unbacked_legacy_empty_cuda (__main__.GPUTests.test_cat_unbacked_legacy_empty_cuda) ... inline_call [] | |
ok | |
test_cat_upcasting_cuda (__main__.GPUTests.test_cat_upcasting_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cauchy_cuda (__main__.GPUTests.test_cauchy_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_check_stack_no_cycles_cuda (__main__.GPUTests.test_check_stack_no_cycles_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_chunk_recompiles_cuda (__main__.GPUTests.test_chunk_recompiles_cuda) ... stats [('calls_captured', 19), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_bypass', 4), ('ok', 4)] | |
inductor [('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_clamp_cuda (__main__.GPUTests.test_clamp_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_clamp_type_promotion_cuda (__main__.GPUTests.test_clamp_type_promotion_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_clone_cuda (__main__.GPUTests.test_clone_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_compar_cuda (__main__.GPUTests.test_compar_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 18), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_complex_fallback_cuda (__main__.GPUTests.test_complex_fallback_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)] | |
inductor [('extern_calls', 9), ('triton_bundler_save_kernel', 7), ('intermediate_hooks', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_complex_memory_overlap_cuda (__main__.GPUTests.test_complex_memory_overlap_cuda) ... ok | |
test_computed_buffer_inlining_cuda (__main__.GPUTests.test_computed_buffer_inlining_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_concat_add_inplace_cuda (__main__.GPUTests.test_concat_add_inplace_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_config_option_dont_assume_alignment_cuda (__main__.GPUTests.test_config_option_dont_assume_alignment_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 15), ('unique_graphs', 5)] | |
aot_autograd [('total', 5), ('ok', 5), ('autograd_cache_hit', 4), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1)] | |
inductor [('async_compile_cache_miss', 9), ('triton_bundler_save_kernel', 7), ('async_compile_cache_hit', 5), ('fxgraph_cache_hit', 4), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_config_option_dont_assume_alignment_cudagraphs_cuda (__main__.GPUTests.test_config_option_dont_assume_alignment_cudagraphs_cuda) ... frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_config_option_dont_assume_alignment_recompiles_cuda (__main__.GPUTests.test_config_option_dont_assume_alignment_recompiles_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_consecutive_split_cumprod_cuda (__main__.GPUTests.test_consecutive_split_cumprod_cuda) ... inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_consecutive_split_cumsum_cuda (__main__.GPUTests.test_consecutive_split_cumsum_cuda) ... inline_call [] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_const_int32_to_float_cuda (__main__.GPUTests.test_const_int32_to_float_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_constant_pad_1d_cuda (__main__.GPUTests.test_constant_pad_1d_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_constant_pad_2d_cuda (__main__.GPUTests.test_constant_pad_2d_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_constant_pad_3d_cuda (__main__.GPUTests.test_constant_pad_3d_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_constant_pad_fill_dtype_cuda (__main__.GPUTests.test_constant_pad_fill_dtype_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_constant_pad_float64_cuda (__main__.GPUTests.test_constant_pad_float64_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_constant_pad_nd_inplace_cuda (__main__.GPUTests.test_constant_pad_nd_inplace_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_conv2d_backward_channels_last_cuda (__main__.GPUTests.test_conv2d_backward_channels_last_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 3), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_conv2d_channels_last_cuda (__main__.GPUTests.test_conv2d_channels_last_cuda) ... skipped 'only support cpu conv2d channels_last' | |
test_conv3d_channels_last_use_block_ptr_False_cuda (__main__.GPUTests.test_conv3d_channels_last_use_block_ptr_False_cuda) ... skipped 'only support cpu conv3d channels_last' | |
test_conv3d_channels_last_use_block_ptr_True_cuda (__main__.GPUTests.test_conv3d_channels_last_use_block_ptr_True_cuda) ... skipped 'only support cpu conv3d channels_last' | |
test_conv3d_cuda (__main__.GPUTests.test_conv3d_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_conv_backward_cuda (__main__.GPUTests.test_conv_backward_cuda) ... inline_call [] | |
stats [('calls_captured', 28), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('extern_calls', 28), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_conv_bn_fuse_cuda (__main__.GPUTests.test_conv_bn_fuse_cuda) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test' | |
test_conv_functional_bn_fuse_cuda (__main__.GPUTests.test_conv_functional_bn_fuse_cuda) ... skipped 'only support cpu conv bn test' | |
test_conv_inference_heuristics_cuda (__main__.GPUTests.test_conv_inference_heuristics_cuda) ... frames [('total', 2), ('ok', 2)] | |
inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('extern_calls', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_conv_shape_check_cuda (__main__.GPUTests.test_conv_shape_check_cuda) ... frames [('total', 3)] | |
inline_call [] | |
ok | |
test_conv_with_as_strided_cuda (__main__.GPUTests.test_conv_with_as_strided_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_count', 14), ('pattern_matcher_nodes', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_convolution1_cuda (__main__.GPUTests.test_convolution1_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_convolution2_cuda (__main__.GPUTests.test_convolution2_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_convolution3_cuda (__main__.GPUTests.test_convolution3_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_convolution4_cuda (__main__.GPUTests.test_convolution4_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_convolution5_cuda (__main__.GPUTests.test_convolution5_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('intermediate_hooks', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cos_cuda (__main__.GPUTests.test_cos_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cudnn_rnn_cuda (__main__.GPUTests.test_cudnn_rnn_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 6), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cumprod_zero_dim_cuda (__main__.GPUTests.test_cumprod_zero_dim_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cumsum_cuda (__main__.GPUTests.test_cumsum_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)] | |
inductor [('triton_bundler_save_kernel', 84), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 6)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cumsum_inf_cuda (__main__.GPUTests.test_cumsum_inf_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cumsum_no_mask_cuda (__main__.GPUTests.test_cumsum_no_mask_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cumsum_pattern_matcher_issue_cuda (__main__.GPUTests.test_cumsum_pattern_matcher_issue_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cumsum_zero_dim_cuda (__main__.GPUTests.test_cumsum_zero_dim_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_custom_op_1_cuda (__main__.GPUTests.test_custom_op_1_cuda) ... inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_custom_op_2_cuda (__main__.GPUTests.test_custom_op_2_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 3), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_custom_op_3_cuda (__main__.GPUTests.test_custom_op_3_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_custom_op_default_layout_constraint_cuda (__main__.GPUTests.test_custom_op_default_layout_constraint_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_bypass', 1), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_custom_op_fixed_layout_channels_last_cuda (__main__.GPUTests.test_custom_op_fixed_layout_channels_last_cuda) ... frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 3), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_custom_op_fixed_layout_sequential_cuda (__main__.GPUTests.test_custom_op_fixed_layout_sequential_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('extern_calls', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_custom_op_unbacked_symints_cuda (__main__.GPUTests.test_custom_op_unbacked_symints_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 7), ('intermediate_hooks', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_custom_scan_op_compiled_cuda (__main__.GPUTests.test_custom_scan_op_compiled_cuda) ... inline_call [] | |
stats [('calls_captured', 51), ('unique_graphs', 6)] | |
aot_autograd [('total', 3), ('autograd_cache_bypass', 3), ('ok', 3)] | |
inductor [('fxgraph_cache_bypass', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3)] | |
graph_break [] | |
aten_mm_info [] | |
frames [('total', 1), ('ok', 1)] | |
ok | |
test_custom_scan_op_cuda (__main__.GPUTests.test_custom_scan_op_cuda) ... inline_call [] | |
stats [('calls_captured', 24), ('unique_graphs', 2)] | |
ok | |
test_custom_scan_op_multi_input_cuda (__main__.GPUTests.test_custom_scan_op_multi_input_cuda) ... inline_call [] | |
stats [('calls_captured', 18), ('unique_graphs', 1)] | |
ok | |
test_custom_scan_would_split_cuda (__main__.GPUTests.test_custom_scan_would_split_cuda) ... frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_bypass', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_bypass', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_data_type_propogation_cuda (__main__.GPUTests.test_data_type_propogation_cuda) ... skipped 'triton not supported' | |
test_dense_mask_index_cuda (__main__.GPUTests.test_dense_mask_index_cuda) | |
There will be a little difference for reduce order between aten and inductor ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_deterministic_codegen_cuda (__main__.GPUTests.test_deterministic_codegen_cuda) ... /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
/workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
/workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
/workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
/workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
/workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
/workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
/workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
/workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
stats [('calls_captured', 27), ('unique_graphs', 9)] | |
aot_autograd [('total', 9), ('ok', 9)] | |
inductor [('pattern_matcher_nodes', 36), ('pattern_matcher_count', 9), ('async_compile_cache_miss', 9), ('async_compile_cache_hit', 9)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_deterministic_codegen_on_graph_break_cuda (__main__.GPUTests.test_deterministic_codegen_on_graph_break_cuda) ... /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
frames [('total', 2), ('ok', 2)] | |
inline_call [] | |
unimplemented [] | |
graph_break [('Call to `torch._dynamo.graph_break()`\n Explanation: User-inserted graph break. Message: None\n Hint: Remove the `torch._dynamo.graph_break()` call.\n\n Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}`\n', 1)] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2)] | |
inductor [('pattern_matcher_nodes', 8), ('pattern_matcher_count', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
aten_mm_info [] | |
ok | |
test_deterministic_codegen_with_suffix_cuda (__main__.GPUTests.test_deterministic_codegen_with_suffix_cuda) ... /workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
/workspace/pytorch/torch/_dynamo/pgo.py:465: UserWarning: dynamo_pgo force disabled by torch._inductor.config.force_disable_caches | |
warn_once( | |
stats [('calls_captured', 7), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2)] | |
inductor [('pattern_matcher_nodes', 8), ('pattern_matcher_count', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_16_256_256', 1)] | |
ok | |
test_device_assert_cuda (__main__.GPUTests.test_device_assert_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_diagonal_copy_cuda (__main__.GPUTests.test_diagonal_copy_cuda) ... /workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead. | |
check( | |
/workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead. | |
check( | |
/workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead. | |
check( | |
/workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead. | |
check( | |
/workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead. | |
check( | |
/workspace/pytorch/torch/_inductor/lowering.py:1725: FutureWarning: `torch._prims_common.check` is deprecated and will be removed in the future. Please use `torch._check*` functions instead. | |
check( | |
inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)] | |
inductor [('triton_bundler_save_kernel', 42), ('fxgraph_cache_miss', 6), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_dist_bf16_cuda (__main__.GPUTests.test_dist_bf16_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_dist_cuda (__main__.GPUTests.test_dist_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div1_cuda (__main__.GPUTests.test_div1_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div2_cuda (__main__.GPUTests.test_div2_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div3_cuda (__main__.GPUTests.test_div3_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div4_cuda (__main__.GPUTests.test_div4_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div5_cuda (__main__.GPUTests.test_div5_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div6_cuda (__main__.GPUTests.test_div6_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div7_cuda (__main__.GPUTests.test_div7_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div8_cuda (__main__.GPUTests.test_div8_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div9_cuda (__main__.GPUTests.test_div9_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div_by_zero_cuda (__main__.GPUTests.test_div_by_zero_cuda) ... inline_call [] | |
stats [('calls_captured', 22), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div_precision_cuda (__main__.GPUTests.test_div_precision_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_nodes', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('pattern_matcher_count', 4), ('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div_prim_cuda (__main__.GPUTests.test_div_prim_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div_softmax_symfloat_cuda (__main__.GPUTests.test_div_softmax_symfloat_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 11), ('unique_graphs', 2)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('pattern_matcher_nodes', 6), ('pattern_matcher_count', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_div_zero_dim_cuda (__main__.GPUTests.test_div_zero_dim_cuda) ... inline_call [] | |
stats [('calls_captured', 40), ('unique_graphs', 8)] | |
aot_autograd [('total', 8), ('ok', 8), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 10), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 6), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_dropout2_cuda (__main__.GPUTests.test_dropout2_cuda) ... stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2), ('autograd_cache_saved', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_dropout3_cuda (__main__.GPUTests.test_dropout3_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1), ('autograd_cache_saved', 1)] | |
inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 6), ('extern_calls', 6), ('pattern_matcher_count', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_8_32_32', 3), ('aten.mm_32_32_8', 2)] | |
ok | |
test_dropout_cuda (__main__.GPUTests.test_dropout_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_dropout_deterministic_cuda (__main__.GPUTests.test_dropout_deterministic_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('cudagraph_recorded_non_static_inputs', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_dropout_trivial_0_cuda (__main__.GPUTests.test_dropout_trivial_0_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_dropout_trivial_1_cuda (__main__.GPUTests.test_dropout_trivial_1_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_dtype_mismatch_issue_cuda (__main__.GPUTests.test_dtype_mismatch_issue_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_nodes', 8), ('fxgraph_cache_miss', 2), ('pattern_matcher_count', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_dtype_sympy_expr_cuda (__main__.GPUTests.test_dtype_sympy_expr_cuda) ... stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_dtypeview_bfloat16_bfloat16_cuda (__main__.GPUTests.test_dtypeview_bfloat16_bfloat16_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('extern_calls', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 4)] | |
ok | |
test_dtypeview_bfloat16_float16_cuda (__main__.GPUTests.test_dtypeview_bfloat16_float16_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 4)] | |
ok | |
test_dtypeview_bfloat16_float32_cuda (__main__.GPUTests.test_dtypeview_bfloat16_float32_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_bfloat16_float64_cuda (__main__.GPUTests.test_dtypeview_bfloat16_float64_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_bfloat16_int16_cuda (__main__.GPUTests.test_dtypeview_bfloat16_int16_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 4)] | |
ok | |
test_dtypeview_bfloat16_int32_cuda (__main__.GPUTests.test_dtypeview_bfloat16_int32_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_bfloat16_int64_cuda (__main__.GPUTests.test_dtypeview_bfloat16_int64_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_bfloat16_int8_cuda (__main__.GPUTests.test_dtypeview_bfloat16_int8_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_bfloat16_uint8_cuda (__main__.GPUTests.test_dtypeview_bfloat16_uint8_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_float16_bfloat16_cuda (__main__.GPUTests.test_dtypeview_float16_bfloat16_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 4)] | |
ok | |
test_dtypeview_float16_float16_cuda (__main__.GPUTests.test_dtypeview_float16_float16_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('extern_calls', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 4)] | |
ok | |
test_dtypeview_float16_float32_cuda (__main__.GPUTests.test_dtypeview_float16_float32_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_float16_float64_cuda (__main__.GPUTests.test_dtypeview_float16_float64_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_float16_int16_cuda (__main__.GPUTests.test_dtypeview_float16_int16_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 4)] | |
ok | |
test_dtypeview_float16_int32_cuda (__main__.GPUTests.test_dtypeview_float16_int32_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_float16_int64_cuda (__main__.GPUTests.test_dtypeview_float16_int64_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_float16_int8_cuda (__main__.GPUTests.test_dtypeview_float16_int8_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_float16_uint8_cuda (__main__.GPUTests.test_dtypeview_float16_uint8_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_float32_bfloat16_cuda (__main__.GPUTests.test_dtypeview_float32_bfloat16_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_float32_float16_cuda (__main__.GPUTests.test_dtypeview_float32_float16_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_float32_float32_cuda (__main__.GPUTests.test_dtypeview_float32_float32_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_float32_float64_cuda (__main__.GPUTests.test_dtypeview_float32_float64_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_4_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_float32_int16_cuda (__main__.GPUTests.test_dtypeview_float32_int16_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_float32_int32_cuda (__main__.GPUTests.test_dtypeview_float32_int32_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_float32_int64_cuda (__main__.GPUTests.test_dtypeview_float32_int64_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_4_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_float32_int8_cuda (__main__.GPUTests.test_dtypeview_float32_int8_cuda) ... ok | |
test_dtypeview_float32_uint8_cuda (__main__.GPUTests.test_dtypeview_float32_uint8_cuda) ... ok | |
test_dtypeview_float64_bfloat16_cuda (__main__.GPUTests.test_dtypeview_float64_bfloat16_cuda) ... ok | |
test_dtypeview_float64_float16_cuda (__main__.GPUTests.test_dtypeview_float64_float16_cuda) ... ok | |
test_dtypeview_float64_float32_cuda (__main__.GPUTests.test_dtypeview_float64_float32_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_float64_float64_cuda (__main__.GPUTests.test_dtypeview_float64_float64_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_float64_int16_cuda (__main__.GPUTests.test_dtypeview_float64_int16_cuda) ... ok | |
test_dtypeview_float64_int32_cuda (__main__.GPUTests.test_dtypeview_float64_int32_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_float64_int64_cuda (__main__.GPUTests.test_dtypeview_float64_int64_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_float64_int8_cuda (__main__.GPUTests.test_dtypeview_float64_int8_cuda) ... ok | |
test_dtypeview_float64_uint8_cuda (__main__.GPUTests.test_dtypeview_float64_uint8_cuda) ... ok | |
test_dtypeview_fusion_cuda (__main__.GPUTests.test_dtypeview_fusion_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 12), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('async_compile_cache_miss', 7), ('triton_bundler_save_kernel', 7), ('async_compile_cache_hit', 4), ('fxgraph_cache_hit', 3), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
inline_call [] | |
ok | |
test_dtypeview_int16_bfloat16_cuda (__main__.GPUTests.test_dtypeview_int16_bfloat16_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 4)] | |
ok | |
test_dtypeview_int16_float16_cuda (__main__.GPUTests.test_dtypeview_int16_float16_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 4)] | |
ok | |
test_dtypeview_int16_float32_cuda (__main__.GPUTests.test_dtypeview_int16_float32_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_int16_float64_cuda (__main__.GPUTests.test_dtypeview_int16_float64_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_int16_int16_cuda (__main__.GPUTests.test_dtypeview_int16_int16_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('extern_calls', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 4)] | |
ok | |
test_dtypeview_int16_int32_cuda (__main__.GPUTests.test_dtypeview_int16_int32_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_4_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_int16_int64_cuda (__main__.GPUTests.test_dtypeview_int16_int64_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_8_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_int16_int8_cuda (__main__.GPUTests.test_dtypeview_int16_int8_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_int16_uint8_cuda (__main__.GPUTests.test_dtypeview_int16_uint8_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 2), ('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_int32_bfloat16_cuda (__main__.GPUTests.test_dtypeview_int32_bfloat16_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_int32_float16_cuda (__main__.GPUTests.test_dtypeview_int32_float16_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_int32_float32_cuda (__main__.GPUTests.test_dtypeview_int32_float32_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_int32_float64_cuda (__main__.GPUTests.test_dtypeview_int32_float64_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_4_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_int32_int16_cuda (__main__.GPUTests.test_dtypeview_int32_int16_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_int32_int32_cuda (__main__.GPUTests.test_dtypeview_int32_int32_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_int32_int64_cuda (__main__.GPUTests.test_dtypeview_int32_int64_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_4_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_int32_int8_cuda (__main__.GPUTests.test_dtypeview_int32_int8_cuda) ... ok | |
test_dtypeview_int32_uint8_cuda (__main__.GPUTests.test_dtypeview_int32_uint8_cuda) ... ok | |
test_dtypeview_int64_bfloat16_cuda (__main__.GPUTests.test_dtypeview_int64_bfloat16_cuda) ... ok | |
test_dtypeview_int64_float16_cuda (__main__.GPUTests.test_dtypeview_int64_float16_cuda) ... ok | |
test_dtypeview_int64_float32_cuda (__main__.GPUTests.test_dtypeview_int64_float32_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_int64_float64_cuda (__main__.GPUTests.test_dtypeview_int64_float64_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_int64_int16_cuda (__main__.GPUTests.test_dtypeview_int64_int16_cuda) ... ok | |
test_dtypeview_int64_int32_cuda (__main__.GPUTests.test_dtypeview_int64_int32_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_1_2', 1), ('aten.mm_2_2_2', 1)] | |
ok | |
test_dtypeview_int64_int64_cuda (__main__.GPUTests.test_dtypeview_int64_int64_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_2_2', 2)] | |
ok | |
test_dtypeview_int64_int8_cuda (__main__.GPUTests.test_dtypeview_int64_int8_cuda) ... ok | |
test_dtypeview_int64_uint8_cuda (__main__.GPUTests.test_dtypeview_int64_uint8_cuda) ... ok | |
test_dtypeview_int8_bfloat16_cuda (__main__.GPUTests.test_dtypeview_int8_bfloat16_cuda) ... ok | |
test_dtypeview_int8_float16_cuda (__main__.GPUTests.test_dtypeview_int8_float16_cuda) ... ok | |
test_dtypeview_int8_float32_cuda (__main__.GPUTests.test_dtypeview_int8_float32_cuda) ... ok | |
test_dtypeview_int8_float64_cuda (__main__.GPUTests.test_dtypeview_int8_float64_cuda) ... ok | |
test_dtypeview_int8_int16_cuda (__main__.GPUTests.test_dtypeview_int8_int16_cuda) ... ok | |
test_dtypeview_int8_int32_cuda (__main__.GPUTests.test_dtypeview_int8_int32_cuda) ... ok | |
test_dtypeview_int8_int64_cuda (__main__.GPUTests.test_dtypeview_int8_int64_cuda) ... ok | |
test_dtypeview_int8_int8_cuda (__main__.GPUTests.test_dtypeview_int8_int8_cuda) ... ok | |
test_dtypeview_int8_uint8_cuda (__main__.GPUTests.test_dtypeview_int8_uint8_cuda) ... ok | |
test_dtypeview_uint8_bfloat16_cuda (__main__.GPUTests.test_dtypeview_uint8_bfloat16_cuda) ... ok | |
test_dtypeview_uint8_float16_cuda (__main__.GPUTests.test_dtypeview_uint8_float16_cuda) ... ok | |
test_dtypeview_uint8_float32_cuda (__main__.GPUTests.test_dtypeview_uint8_float32_cuda) ... ok | |
test_dtypeview_uint8_float64_cuda (__main__.GPUTests.test_dtypeview_uint8_float64_cuda) ... ok | |
test_dtypeview_uint8_int16_cuda (__main__.GPUTests.test_dtypeview_uint8_int16_cuda) ... ok | |
test_dtypeview_uint8_int32_cuda (__main__.GPUTests.test_dtypeview_uint8_int32_cuda) ... ok | |
test_dtypeview_uint8_int64_cuda (__main__.GPUTests.test_dtypeview_uint8_int64_cuda) ... ok | |
test_dtypeview_uint8_int8_cuda (__main__.GPUTests.test_dtypeview_uint8_int8_cuda) ... ok | |
test_dtypeview_uint8_uint8_cuda (__main__.GPUTests.test_dtypeview_uint8_uint8_cuda) ... ok | |
test_elu_cuda (__main__.GPUTests.test_elu_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_embedding_bag_byte_unpack_cuda (__main__.GPUTests.test_embedding_bag_byte_unpack_cuda) ... skipped 'No cuda implementation (it returns empty)' | |
test_embedding_bag_cuda (__main__.GPUTests.test_embedding_bag_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 10), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_embedding_cuda (__main__.GPUTests.test_embedding_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_empty1_cuda (__main__.GPUTests.test_empty1_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_empty2_cuda (__main__.GPUTests.test_empty2_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_empty_strided_cuda (__main__.GPUTests.test_empty_strided_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_erfc_cuda (__main__.GPUTests.test_erfc_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_erfinv_cuda (__main__.GPUTests.test_erfinv_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_exact_stride_cuda (__main__.GPUTests.test_exact_stride_cuda) ... inline_call [] | |
stats [('calls_captured', 9), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
frames [('total', 1), ('ok', 1)] | |
ok | |
test_exp2_cuda (__main__.GPUTests.test_exp2_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_exp_cuda (__main__.GPUTests.test_exp_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_expand_as_cuda (__main__.GPUTests.test_expand_as_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_expand_cuda (__main__.GPUTests.test_expand_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_expanded_reduction_cuda (__main__.GPUTests.test_expanded_reduction_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_expm1_cuda (__main__.GPUTests.test_expm1_cuda) ... inline_call [] | |
stats [('calls_captured', 60), ('unique_graphs', 20)] | |
aot_autograd [('total', 20), ('ok', 20), ('autograd_cache_miss', 10), ('autograd_cache_saved', 10), ('autograd_cache_hit', 10)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 30), ('async_compile_cache_hit', 20), ('fxgraph_cache_miss', 10), ('fxgraph_cache_hit', 10)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fallback_mutable_op_basic_cuda (__main__.GPUTests.test_fallback_mutable_op_basic_cuda) ... inductor [('fxgraph_cache_bypass', 1), ('extern_calls', 1)] | |
aten_mm_info [] | |
ok | |
test_fallback_mutable_op_list_cuda (__main__.GPUTests.test_fallback_mutable_op_list_cuda) ... inductor [('fxgraph_cache_bypass', 2), ('extern_calls', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1)] | |
aten_mm_info [] | |
frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
graph_break [] | |
ok | |
test_fallback_mutable_op_list_tensor_cuda (__main__.GPUTests.test_fallback_mutable_op_list_tensor_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_bypass', 1), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fallback_mutable_op_no_mutated_tensors_cuda (__main__.GPUTests.test_fallback_mutable_op_no_mutated_tensors_cuda) ... inductor [('fxgraph_cache_bypass', 1), ('extern_calls', 1)] | |
aten_mm_info [] | |
ok | |
test_fallback_mutable_op_with_return_cuda (__main__.GPUTests.test_fallback_mutable_op_with_return_cuda) ... inductor [('extern_calls', 2), ('fxgraph_cache_bypass', 1), ('intermediate_hooks', 1)] | |
aten_mm_info [] | |
ok | |
test_fft_real_input_cuda (__main__.GPUTests.test_fft_real_input_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 3), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fft_real_input_real_output_cuda (__main__.GPUTests.test_fft_real_input_real_output_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 5), ('intermediate_hooks', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fill1_cuda (__main__.GPUTests.test_fill1_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fill2_cuda (__main__.GPUTests.test_fill2_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_flip_cat_cuda (__main__.GPUTests.test_flip_cat_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_flip_cuda (__main__.GPUTests.test_flip_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_float16_to_int16_cuda (__main__.GPUTests.test_float16_to_int16_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_float32_to_int32_cuda (__main__.GPUTests.test_float32_to_int32_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_float_index_expression_cuda (__main__.GPUTests.test_float_index_expression_cuda) ... ok | |
test_float_index_expression_type_promotion_cuda (__main__.GPUTests.test_float_index_expression_type_promotion_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_floordiv_cuda (__main__.GPUTests.test_floordiv_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fmin_fmax_cuda (__main__.GPUTests.test_fmin_fmax_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fmod_cuda (__main__.GPUTests.test_fmod_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fmod_zero_dim_cuda (__main__.GPUTests.test_fmod_zero_dim_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_forced_buffer_realize_cuda (__main__.GPUTests.test_forced_buffer_realize_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fractional_max_pool2d1_cuda (__main__.GPUTests.test_fractional_max_pool2d1_cuda) ... inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fractional_max_pool2d2_cuda (__main__.GPUTests.test_fractional_max_pool2d2_cuda) ... inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 3), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fractional_max_pool2d3_cuda (__main__.GPUTests.test_fractional_max_pool2d3_cuda) ... inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fractional_max_pool2d4_cuda (__main__.GPUTests.test_fractional_max_pool2d4_cuda) ... inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_full_boolean_cuda (__main__.GPUTests.test_full_boolean_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_full_like_cuda (__main__.GPUTests.test_full_like_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_full_truncation_cuda (__main__.GPUTests.test_full_truncation_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 7)] | |
aot_autograd [('total', 7), ('autograd_cache_miss', 7), ('autograd_cache_saved', 7), ('ok', 7)] | |
inductor [('triton_bundler_save_kernel', 49), ('fxgraph_cache_miss', 7), ('async_compile_cache_miss', 7), ('async_compile_cache_hit', 7)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_functionalize_rng_wrappers_cuda (__main__.GPUTests.test_functionalize_rng_wrappers_cuda) ... inductor [('extern_calls', 10), ('intermediate_hooks', 2), ('fxgraph_cache_bypass', 1)] | |
aten_mm_info [] | |
ok | |
test_fuse_large_params_cuda (__main__.GPUTests.test_fuse_large_params_cuda) ... skipped 'test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test' | |
test_fuse_tiled_cuda (__main__.GPUTests.test_fuse_tiled_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_fusing_write_into_disjoint_read_cuda (__main__.GPUTests.test_fusing_write_into_disjoint_read_cuda) ... inline_call [] | |
stats [('calls_captured', 16), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_gather1_cuda (__main__.GPUTests.test_gather1_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_gather2_cuda (__main__.GPUTests.test_gather2_cuda) ... ok | |
test_gather3_cuda (__main__.GPUTests.test_gather3_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('pattern_matcher_count', 16), ('pattern_matcher_nodes', 16), ('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_gather_scatter_cuda (__main__.GPUTests.test_gather_scatter_cuda) ... inline_call [] | |
stats [('calls_captured', 11), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_gelu_cuda (__main__.GPUTests.test_gelu_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_generate_rand_fp8_cuda (__main__.GPUTests.test_generate_rand_fp8_cuda) | |
PyTorch can not generate fp8 tensors with a normal distribution because of ... ok | |
test_getitem_cuda (__main__.GPUTests.test_getitem_cuda) ... frames [('total', 1), ('ok', 1)] | |
inline_call [] | |
ok | |
test_glu_cuda (__main__.GPUTests.test_glu_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_graph_partition_arange1_cuda (__main__.GPUTests.test_graph_partition_arange1_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_graph_partition_arange2_cuda (__main__.GPUTests.test_graph_partition_arange2_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_graph_partition_argmax_cuda (__main__.GPUTests.test_graph_partition_argmax_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_graph_partition_both_scalars_cuda (__main__.GPUTests.test_graph_partition_both_scalars_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_graph_partition_constant_tensor1_cuda (__main__.GPUTests.test_graph_partition_constant_tensor1_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_graph_partition_constant_tensor2_cuda (__main__.GPUTests.test_graph_partition_constant_tensor2_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_graph_partition_misaligned_input_cuda (__main__.GPUTests.test_graph_partition_misaligned_input_cuda) ... frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_graph_partition_no_inputs_cuda (__main__.GPUTests.test_graph_partition_no_inputs_cuda) ... frames [('total', 2), ('ok', 2)] | |
unimplemented [] | |
graph_break [('Attempted to call function marked as skipped\n Explanation: Dynamo developers have intentionally marked that the function `manual_seed` in file `/workspace/pytorch/torch/_compile.py` should not be traced.\n Hint: Avoid calling the function `manual_seed`.\n Hint: Remove the function `manual_seed` or the file `/workspace/pytorch/torch/_compile.py` from torch/_dynamo/trace_rules.py. More graph breaks may occur as a result of attempting to trace into the function.\n Hint: Please file an issue to PyTorch.\n\n Developer debug context: module: torch.random, qualname: manual_seed, skip reason: <missing reason>\n', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
aten_mm_info [] | |
ok | |
test_graph_partition_refcount_cuda (__main__.GPUTests.test_graph_partition_refcount_cuda) ... inductor [('fxgraph_cache_bypass', 2), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('cudagraph_recorded_non_static_inputs', 2)] | |
aten_mm_info [('aten.mm_5_5_5', 2)] | |
ok | |
test_graph_partition_scalar_inputs_cuda (__main__.GPUTests.test_graph_partition_scalar_inputs_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_graph_partition_unbacked_symint_as_output_cuda (__main__.GPUTests.test_graph_partition_unbacked_symint_as_output_cuda) ... frames [('total', 5), ('ok', 5)] | |
unimplemented [] | |
graph_break [("Dynamic shape operator\n Explanation: Operator `aten.repeat_interleave.Tensor`'s output shape depends on input Tensor data.\n Hint: Enable tracing of dynamic shape operators with `torch._dynamo.config.capture_dynamic_output_shape_ops = True`\n\n Developer debug context: aten.repeat_interleave.Tensor\n", 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1)] | |
aten_mm_info [] | |
ok | |
test_grid_sampler_2d_cuda (__main__.GPUTests.test_grid_sampler_2d_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('pattern_matcher_count', 34), ('pattern_matcher_nodes', 34), ('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_hardsigmoid_cuda (__main__.GPUTests.test_hardsigmoid_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_hardswish_cuda (__main__.GPUTests.test_hardswish_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_hardtanh_cuda (__main__.GPUTests.test_hardtanh_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_horizonal_fusion1_cuda (__main__.GPUTests.test_horizonal_fusion1_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_horizonal_fusion2_cuda (__main__.GPUTests.test_horizonal_fusion2_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index1_cuda (__main__.GPUTests.test_index1_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index2_cuda (__main__.GPUTests.test_index2_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index3_cuda (__main__.GPUTests.test_index3_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_dynamic_shapes_cuda (__main__.GPUTests.test_index_dynamic_shapes_cuda) ... inline_call [] | |
stats [('calls_captured', 64), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_propagation_abs_cuda (__main__.GPUTests.test_index_propagation_abs_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_propagation_cuda (__main__.GPUTests.test_index_propagation_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_propagation_device_assert_masked_cuda (__main__.GPUTests.test_index_propagation_device_assert_masked_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_propagation_flip_cuda (__main__.GPUTests.test_index_propagation_flip_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_propagation_floordiv_cuda (__main__.GPUTests.test_index_propagation_floordiv_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_propagation_nested_indirect_indexing_cuda (__main__.GPUTests.test_index_propagation_nested_indirect_indexing_cuda) ... frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 3), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_propagation_remainder_cuda (__main__.GPUTests.test_index_propagation_remainder_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_put1_cuda (__main__.GPUTests.test_index_put1_cuda) ... inline_call [] | |
stats [('calls_captured', 24), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 98), ('async_compile_cache_miss', 14), ('async_compile_cache_hit', 14), ('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_put2_cuda (__main__.GPUTests.test_index_put2_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_put3_cuda (__main__.GPUTests.test_index_put3_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_put4_cuda (__main__.GPUTests.test_index_put4_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_put_as_masked_fill_cuda (__main__.GPUTests.test_index_put_as_masked_fill_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_put_deterministic_fallback_cuda (__main__.GPUTests.test_index_put_deterministic_fallback_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_put_failed_reinplace_cuda (__main__.GPUTests.test_index_put_failed_reinplace_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_put_fallback1_cuda (__main__.GPUTests.test_index_put_fallback1_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_put_fallback2_cuda (__main__.GPUTests.test_index_put_fallback2_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('extern_calls', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_put_index_cuda (__main__.GPUTests.test_index_put_index_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_put_reinplace_cuda (__main__.GPUTests.test_index_put_reinplace_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_select_cuda (__main__.GPUTests.test_index_select_cuda) ... inline_call [] | |
stats [('calls_captured', 16), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 84), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_index_tensor_cuda (__main__.GPUTests.test_index_tensor_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_indirect_load_broadcast_cuda (__main__.GPUTests.test_indirect_load_broadcast_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_inductor_assert_cuda (__main__.GPUTests.test_inductor_assert_cuda) ... frames [('total', 4), ('ok', 4)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_inductor_layout_optimization_input_mutations_cuda (__main__.GPUTests.test_inductor_layout_optimization_input_mutations_cuda) ... frames [('total', 2), ('ok', 2)] | |
inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('extern_calls', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_inf_cuda (__main__.GPUTests.test_inf_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_inner_fn_str_and_stride_cuda (__main__.GPUTests.test_inner_fn_str_and_stride_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_bypass', 1), ('ok', 1)] | |
inductor [('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_inplace_activations_cuda (__main__.GPUTests.test_inplace_activations_cuda) ... inline_call [] | |
stats [('calls_captured', 32), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_inplace_add_cuda (__main__.GPUTests.test_inplace_add_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_inplace_mixed_dtype_ops_cuda (__main__.GPUTests.test_inplace_mixed_dtype_ops_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_inplace_resize_as_cuda (__main__.GPUTests.test_inplace_resize_as_cuda) ... E0509 18:22:34.546000 415353 torch/_dynamo/utils.py:2906] Accuracy failed: allclose not within tol=0.0001 | |
frames [('total', 2), ('ok', 2)] | |
unimplemented [] | |
graph_break [('Unsupported function call\n Explanation: Dynamo does not know how to trace the function `DelayGraphBreakVariable()`\n Hint: Avoid calling `DelayGraphBreakVariable()` in your code.\n Hint: Please report an issue to PyTorch.\n\n Developer debug context: call_function DelayGraphBreakVariable() [LazyVariableTracker()] {}\n', 1)] | |
ok | |
test_inplace_where_pointwise_cuda (__main__.GPUTests.test_inplace_where_pointwise_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_input_mutation1_cuda (__main__.GPUTests.test_input_mutation1_cuda) ... skipping cudagraphs due to mutated inputs (1 instances). Found from : | |
File "/workspace/pytorch/test/inductor/test_torchinductor.py", line 7321, in fn | |
a.copy_(b) | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1), ('cudagraph_skips', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_input_mutation2_cuda (__main__.GPUTests.test_input_mutation2_cuda) ... stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_input_mutation3_cuda (__main__.GPUTests.test_input_mutation3_cuda) ... stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_bypass', 1), ('ok', 1)] | |
inductor [('pattern_matcher_nodes', 9), ('pattern_matcher_count', 7), ('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_input_mutation4_cuda (__main__.GPUTests.test_input_mutation4_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_input_mutation5_cuda (__main__.GPUTests.test_input_mutation5_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_insignificant_strides_cuda (__main__.GPUTests.test_insignificant_strides_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_int8_weight_only_quant_cuda (__main__.GPUTests.test_int8_weight_only_quant_cuda) ... skipped 'No _weight_int8pack_mm implementation on CUDA' | |
test_int_input_dynamic_shapes_cuda (__main__.GPUTests.test_int_input_dynamic_shapes_cuda) ... frames [('total', 9), ('ok', 9)] | |
stats [('calls_captured', 5), ('unique_graphs', 5)] | |
aot_autograd [('total', 5), ('ok', 5), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 5), ('fxgraph_cache_miss', 4), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
inline_call [] | |
ok | |
test_invalid_operand_issue1_cuda (__main__.GPUTests.test_invalid_operand_issue1_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_isin_tensor_scalar_cuda (__main__.GPUTests.test_isin_tensor_scalar_cuda) ... stats [('calls_captured', 8), ('unique_graphs', 8)] | |
aot_autograd [('total', 8), ('ok', 8), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('autograd_cache_hit', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4), ('fxgraph_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_isinf2_cuda (__main__.GPUTests.test_isinf2_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_isinf_cuda (__main__.GPUTests.test_isinf_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_issue102546_cuda (__main__.GPUTests.test_issue102546_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_kernel_names_cuda (__main__.GPUTests.test_kernel_names_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_kwargs_cuda (__main__.GPUTests.test_kwargs_cuda) ... skipped 'histogramdd only supports cpu' | |
test_l1_loss_cuda (__main__.GPUTests.test_l1_loss_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_large_block_sizes_cuda (__main__.GPUTests.test_large_block_sizes_cuda) | |
Inductor will try triton configs like x = 64 and y = 1024 which will ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 35), ('benchmarking.InductorBenchmarker.benchmark_gpu', 3), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_large_broadcast_reduction_cuda (__main__.GPUTests.test_large_broadcast_reduction_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_large_grid_cuda (__main__.GPUTests.test_large_grid_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_large_offset_pointwise_cuda (__main__.GPUTests.test_large_offset_pointwise_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_large_pointwise_cuda (__main__.GPUTests.test_large_pointwise_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_large_strided_reduction_cuda (__main__.GPUTests.test_large_strided_reduction_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_large_tensor_reduction_cuda (__main__.GPUTests.test_large_tensor_reduction_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_layer_norm_cuda (__main__.GPUTests.test_layer_norm_cuda) ... W0509 18:22:45.117000 415353 torch/_inductor/debug.py:454] [0/0] model__752_inference_718 debug trace: /workspace/pytorch/torch_compile_debug/run_2025_05_09_18_22_44_935431-pid_415353/torchinductor/model__752_inference_718.0 | |
inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_leaky_relu_cuda (__main__.GPUTests.test_leaky_relu_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_lerp_cuda (__main__.GPUTests.test_lerp_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_lgamma_cuda (__main__.GPUTests.test_lgamma_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_like_channels_last_cuda (__main__.GPUTests.test_like_channels_last_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 4), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('extern_calls', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_like_rands2_cuda (__main__.GPUTests.test_like_rands2_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_like_rands3_cuda (__main__.GPUTests.test_like_rands3_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_like_rands_cuda (__main__.GPUTests.test_like_rands_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 8), ('intermediate_hooks', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_linear1_cuda (__main__.GPUTests.test_linear1_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_nodes', 8), ('benchmarking.InductorBenchmarker.benchmark_gpu', 8), ('pattern_matcher_count', 6), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_16_8', 2)] | |
ok | |
test_linear2_cuda (__main__.GPUTests.test_linear2_cuda) ... inline_call [] | |
stats [('calls_captured', 16), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('pattern_matcher_nodes', 32), ('triton_bundler_save_kernel', 28), ('pattern_matcher_count', 24), ('benchmarking.InductorBenchmarker.benchmark_gpu', 16), ('extern_calls', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_8_8', 8)] | |
ok | |
test_linear_dynamic_maxautotune_cuda (__main__.GPUTests.test_linear_dynamic_maxautotune_cuda) ... AUTOTUNE addmm(10x1, 10x1, 1x1) | |
triton_mm_1 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1 | |
triton_mm_0 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1 | |
triton_mm_2 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1 | |
triton_mm_3 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1 | |
triton_mm_4 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1 | |
SingleProcess AUTOTUNE benchmarking takes 0.1773 seconds and 0.1201 seconds precompiling for 5 choices | |
AUTOTUNE mm(1x10, 10x1) | |
triton_mm_5 0.0038 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1 | |
triton_mm_7 0.0038 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1 | |
triton_mm_9 0.0038 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1 | |
triton_mm_6 0.0041 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1 | |
triton_mm_8 0.0041 ms 93.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1 | |
SingleProcess AUTOTUNE benchmarking takes 0.1778 seconds and 0.1066 seconds precompiling for 5 choices | |
AUTOTUNE addmm(10x1, 10x1, 1x1) | |
triton_mm_12 0.0036 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1 | |
triton_mm_14 0.0037 ms 98.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1 | |
triton_mm_11 0.0038 ms 96.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1 | |
triton_mm_13 0.0038 ms 95.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1 | |
triton_mm_10 0.0041 ms 89.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1 | |
SingleProcess AUTOTUNE benchmarking takes 0.1805 seconds and 0.1140 seconds precompiling for 5 choices | |
AUTOTUNE addmm(10x1, 10x1, 1x1) | |
triton_mm_17 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1 | |
triton_mm_18 0.0036 ms 98.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1 | |
triton_mm_16 0.0036 ms 97.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1 | |
triton_mm_15 0.0038 ms 92.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1 | |
triton_mm_19 0.0038 ms 92.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1 | |
SingleProcess AUTOTUNE benchmarking takes 0.0914 seconds and 0.0863 seconds precompiling for 5 choices | |
AUTOTUNE mm(1x10, 10x1) | |
triton_mm_21 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1 | |
triton_mm_20 0.0038 ms 91.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1 | |
triton_mm_23 0.0038 ms 91.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1 | |
triton_mm_24 0.0038 ms 91.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1 | |
triton_mm_22 0.0038 ms 90.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1 | |
SingleProcess AUTOTUNE benchmarking takes 0.0891 seconds and 0.0764 seconds precompiling for 5 choices | |
AUTOTUNE addmm(10x1, 10x1, 1x1) | |
triton_mm_26 0.0035 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=1 | |
triton_mm_28 0.0037 ms 93.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=1 | |
triton_mm_25 0.0038 ms 90.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=1, num_warps=1 | |
triton_mm_27 0.0038 ms 90.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=1 | |
triton_mm_29 0.0038 ms 90.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=16, BLOCK_N=16, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=1 | |
SingleProcess AUTOTUNE benchmarking takes 0.0900 seconds and 0.0725 seconds precompiling for 5 choices | |
frames [('total', 9), ('ok', 9)] | |
inline_call [] | |
stats [('calls_captured', 5), ('unique_graphs', 5)] | |
aot_autograd [('total', 5), ('ok', 5), ('autograd_cache_miss', 4), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 294), ('async_compile_cache_miss', 68), ('benchmarking.InductorBenchmarker.benchmark_gpu', 48), ('select_algorithm_num_precompiles', 40), ('pattern_matcher_nodes', 12), ('pattern_matcher_count', 8), ('select_algorithm_precompile', 8), ('select_algorithm_autotune', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 6), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_s0_1_1', 2), ('aten.mm_1_1_s0', 2), ('aten.addmm_10_1_1', 2)] | |
ok | |
test_linear_float64_cuda (__main__.GPUTests.test_linear_float64_cuda) ... skipped 'cuda failed for float64 linear' | |
test_linear_mixed_dtype_cuda (__main__.GPUTests.test_linear_mixed_dtype_cuda) ... frames [('total', 2), ('ok', 1)] | |
inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_9_3_3', 1)] | |
ok | |
test_linspace1_cuda (__main__.GPUTests.test_linspace1_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_linspace2_cuda (__main__.GPUTests.test_linspace2_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_linspace3_cuda (__main__.GPUTests.test_linspace3_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_linspace4_cuda (__main__.GPUTests.test_linspace4_cuda) ... skipped 'requires multiple cuda devices' | |
test_list_clearing_cuda (__main__.GPUTests.test_list_clearing_cuda) ... inductor [('fxgraph_cache_bypass', 2), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('cudagraph_recorded_non_static_inputs', 2)] | |
aten_mm_info [('aten.mm_5_5_5', 2)] | |
ok | |
test_log1p_cuda (__main__.GPUTests.test_log1p_cuda) ... inline_call [] | |
stats [('calls_captured', 60), ('unique_graphs', 20)] | |
aot_autograd [('total', 20), ('ok', 20), ('autograd_cache_miss', 10), ('autograd_cache_saved', 10), ('autograd_cache_hit', 10)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 30), ('async_compile_cache_hit', 20), ('fxgraph_cache_miss', 10), ('fxgraph_cache_hit', 10)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_log2_cuda (__main__.GPUTests.test_log2_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_log_fp64_cuda (__main__.GPUTests.test_log_fp64_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_log_softmax_cuda (__main__.GPUTests.test_log_softmax_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 24), ('pattern_matcher_count', 6), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_logaddexp_cuda (__main__.GPUTests.test_logaddexp_cuda) ... skipped 'Not implemented for CUDA' | |
test_logcumsumexp_cuda (__main__.GPUTests.test_logcumsumexp_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)] | |
inductor [('triton_bundler_save_kernel', 84), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 6)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_logcumsumexp_zero_dim_cuda (__main__.GPUTests.test_logcumsumexp_zero_dim_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_logsumexp_cuda (__main__.GPUTests.test_logsumexp_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_long_tensor_cuda (__main__.GPUTests.test_long_tensor_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_low_memory_max_pool_cuda (__main__.GPUTests.test_low_memory_max_pool_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_masked_fill_cuda (__main__.GPUTests.test_masked_fill_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_masked_fill_promotion_cuda (__main__.GPUTests.test_masked_fill_promotion_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_masked_scatter_cuda (__main__.GPUTests.test_masked_scatter_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_matmul_layer_norm_cuda (__main__.GPUTests.test_matmul_layer_norm_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_1600_256_256', 1)] | |
ok | |
test_max_min_cuda (__main__.GPUTests.test_max_min_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d1_cuda (__main__.GPUTests.test_max_pool2d1_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d2_cuda (__main__.GPUTests.test_max_pool2d2_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d3_cuda (__main__.GPUTests.test_max_pool2d3_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d4_cuda (__main__.GPUTests.test_max_pool2d4_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d5_cuda (__main__.GPUTests.test_max_pool2d5_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d6_cuda (__main__.GPUTests.test_max_pool2d6_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('benchmarking.InductorBenchmarker.benchmark_gpu', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d7_cuda (__main__.GPUTests.test_max_pool2d7_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d8_cuda (__main__.GPUTests.test_max_pool2d8_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d_with_indices_backward2_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward2_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d_with_indices_backward3_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward3_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d_with_indices_backward4_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward4_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d_with_indices_backward5_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward5_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d_with_indices_backward6_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward6_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_max_pool2d_with_indices_backward_cuda (__main__.GPUTests.test_max_pool2d_with_indices_backward_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_mean_cuda (__main__.GPUTests.test_mean_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_min_max_reduction_cuda (__main__.GPUTests.test_min_max_reduction_cuda) ... inline_call [] | |
stats [('calls_captured', 48), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('ok', 6), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 3)] | |
inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 9), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 3)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_min_max_reduction_nan_cuda (__main__.GPUTests.test_min_max_reduction_nan_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_misaligned_address_issue1_cuda (__main__.GPUTests.test_misaligned_address_issue1_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_mix_device_index_cuda (__main__.GPUTests.test_mix_device_index_cuda) | |
A tiny repro for this meta internal issue: https://fb.workplace.com/groups/1075192433118967/posts/1567334737238065 ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 6), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('intermediate_hooks', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_mixed_mm2_cuda (__main__.GPUTests.test_mixed_mm2_cuda) ... skipped 'Not supported in Python 3.12+' | |
test_mixed_mm3_cuda (__main__.GPUTests.test_mixed_mm3_cuda) ... skipped 'Not supported in Python 3.12+' | |
test_mixed_mm_cuda (__main__.GPUTests.test_mixed_mm_cuda) ... skipped 'Not supported in Python 3.12+' | |
test_mm_mixed_dtype_cuda (__main__.GPUTests.test_mm_mixed_dtype_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_2_3_3', 1)] | |
ok | |
test_mm_views_cuda (__main__.GPUTests.test_mm_views_cuda) ... inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_32_32_32', 1)] | |
ok | |
test_move_arange_cuda (__main__.GPUTests.test_move_arange_cuda) ... inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_mul_index_expr_cuda (__main__.GPUTests.test_mul_index_expr_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_mul_softmax_symfloat_cuda (__main__.GPUTests.test_mul_softmax_symfloat_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 11), ('unique_graphs', 2)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('pattern_matcher_nodes', 6), ('pattern_matcher_count', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_multi_device_cuda (__main__.GPUTests.test_multi_device_cuda) ... W0509 18:23:21.436000 415353 torch/_inductor/utils.py:1762] [0/0] DeviceCopy in input program | |
W0509 18:23:21.438000 415353 torch/_inductor/utils.py:1762] [0/0] DeviceCopy in input program | |
W0509 18:23:21.438000 415353 torch/_inductor/utils.py:1762] [0/0] DeviceCopy in input program | |
inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 3), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_multi_gpu_device_cuda (__main__.GPUTests.test_multi_gpu_device_cuda) ... skipped 'requires multiple cuda devices' | |
test_multi_gpu_recompile_on_index_cuda (__main__.GPUTests.test_multi_gpu_recompile_on_index_cuda) ... skipped 'requires multiple cuda devices' | |
test_multi_threading_cuda (__main__.GPUTests.test_multi_threading_cuda) ... frames [('total', 2), ('ok', 2)] | |
inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_4_3_2', 1)] | |
ok | |
test_multilayer_any_cuda (__main__.GPUTests.test_multilayer_any_cuda) ... inline_call [] | |
stats [('calls_captured', 16), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 18), ('async_compile_cache_hit', 12), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_multilayer_prime_size_cuda (__main__.GPUTests.test_multilayer_prime_size_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 9), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_multilayer_sum_low_prec_cuda (__main__.GPUTests.test_multilayer_sum_low_prec_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_multilayer_var_cuda (__main__.GPUTests.test_multilayer_var_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_multilayer_var_lowp_cuda (__main__.GPUTests.test_multilayer_var_lowp_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_mutable_custom_op_fixed_layout2_cuda (__main__.GPUTests.test_mutable_custom_op_fixed_layout2_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('extern_calls', 8), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('intermediate_hooks', 4), ('fxgraph_cache_bypass', 2), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_mutable_custom_op_fixed_layout_cuda (__main__.GPUTests.test_mutable_custom_op_fixed_layout_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_bypass', 1), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_mutations_loop_fusion_cuda (__main__.GPUTests.test_mutations_loop_fusion_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 9), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_nan_to_num_cuda (__main__.GPUTests.test_nan_to_num_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_narrow_cuda (__main__.GPUTests.test_narrow_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_neg_index_cuda (__main__.GPUTests.test_neg_index_cuda) ... frames [('total', 9), ('ok', 9)] | |
stats [('calls_captured', 17), ('unique_graphs', 9)] | |
aot_autograd [('total', 9), ('autograd_cache_miss', 9), ('ok', 9), ('autograd_cache_saved', 8), ('autograd_cache_bypass', 1)] | |
inductor [('triton_bundler_save_kernel', 56), ('fxgraph_cache_miss', 9), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_neg_max_uint8_cuda (__main__.GPUTests.test_neg_max_uint8_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_new_empty_cuda (__main__.GPUTests.test_new_empty_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_new_empty_strided_cuda (__main__.GPUTests.test_new_empty_strided_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_new_ones_cuda (__main__.GPUTests.test_new_ones_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_nll_loss_backward_cuda (__main__.GPUTests.test_nll_loss_backward_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 6), ('pattern_matcher_count', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_nll_loss_forward_cuda (__main__.GPUTests.test_nll_loss_forward_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_no_mega_fusion_during_lowering_cuda (__main__.GPUTests.test_no_mega_fusion_during_lowering_cuda) ... --> 7 | |
inline_call [] | |
stats [('calls_captured', 50), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_no_op_reduction_cuda (__main__.GPUTests.test_no_op_reduction_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_no_specization_over_symbolic_value_cuda (__main__.GPUTests.test_no_specization_over_symbolic_value_cuda) ... stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_nonzero_unbacked_refinement_cuda (__main__.GPUTests.test_nonzero_unbacked_refinement_cuda) ... inline_call [] | |
stats [('calls_captured', 30), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)] | |
inductor [('extern_calls', 12), ('triton_bundler_save_kernel', 7), ('intermediate_hooks', 3), ('async_compile_cache_miss', 3), ('fxgraph_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
frames [('total', 1), ('ok', 1)] | |
ok | |
test_one_hot_cuda (__main__.GPUTests.test_one_hot_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_output_strides_cuda (__main__.GPUTests.test_output_strides_cuda) ... /workspace/pytorch/torch/_dynamo/utils.py:3284: UserWarning: The use of `x.T` on tensors of dimension other than 2 to reverse their shape is deprecated and it will throw an error in a future release. Consider `x.mT` to transpose batches of matrices or `x.permute(*torch.arange(x.ndim - 1, -1, -1))` to reverse the dimensions of a tensor. (Triggered internally at /workspace/pytorch/aten/src/ATen/native/TensorShape.cpp:4413.) | |
return node.target(*args, **kwargs) | |
/workspace/pytorch/test/inductor/test_torchinductor.py:6789: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage() | |
self.assertEqual(inp.storage(), out.storage()) | |
frames [('total', 3), ('ok', 3)] | |
unimplemented [] | |
graph_break [('Call to `torch._dynamo.graph_break()`\n Explanation: User-inserted graph break. Message: None\n Hint: Remove the `torch._dynamo.graph_break()` call.\n\n Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}`\n', 1)] | |
stats [('calls_captured', 7), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('ok', 3), ('autograd_cache_bypass', 2), ('autograd_cache_saved', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 3), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1)] | |
aten_mm_info [] | |
ok | |
test_pad_cast_cuda (__main__.GPUTests.test_pad_cast_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pad_single_cuda (__main__.GPUTests.test_pad_single_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pad_view_cuda (__main__.GPUTests.test_pad_view_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pattern_matcher_multi_user_cuda (__main__.GPUTests.test_pattern_matcher_multi_user_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 7), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_permute1_cuda (__main__.GPUTests.test_permute1_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_permute2_cuda (__main__.GPUTests.test_permute2_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_philox_rand_cuda (__main__.GPUTests.test_philox_rand_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pixel_shuffle_channels_last_cuda (__main__.GPUTests.test_pixel_shuffle_channels_last_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_airy_ai_cuda (__main__.GPUTests.test_pointwise_airy_ai_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_bessel_j0_cuda (__main__.GPUTests.test_pointwise_bessel_j0_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_bessel_j1_cuda (__main__.GPUTests.test_pointwise_bessel_j1_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_bessel_y0_cuda (__main__.GPUTests.test_pointwise_bessel_y0_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_bessel_y1_cuda (__main__.GPUTests.test_pointwise_bessel_y1_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_chebyshev_polynomial_t_cuda (__main__.GPUTests.test_pointwise_chebyshev_polynomial_t_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_chebyshev_polynomial_u_cuda (__main__.GPUTests.test_pointwise_chebyshev_polynomial_u_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_chebyshev_polynomial_v_cuda (__main__.GPUTests.test_pointwise_chebyshev_polynomial_v_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_chebyshev_polynomial_w_cuda (__main__.GPUTests.test_pointwise_chebyshev_polynomial_w_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_digamma_cuda (__main__.GPUTests.test_pointwise_digamma_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_entr_cuda (__main__.GPUTests.test_pointwise_entr_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_erf_cuda (__main__.GPUTests.test_pointwise_erf_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_erfc_cuda (__main__.GPUTests.test_pointwise_erfc_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_erfcx_cuda (__main__.GPUTests.test_pointwise_erfcx_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_erfinv_cuda (__main__.GPUTests.test_pointwise_erfinv_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_exp2_cuda (__main__.GPUTests.test_pointwise_exp2_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_expit_cuda (__main__.GPUTests.test_pointwise_expit_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_expm1_cuda (__main__.GPUTests.test_pointwise_expm1_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_gammainc_cuda (__main__.GPUTests.test_pointwise_gammainc_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_gammaincc_cuda (__main__.GPUTests.test_pointwise_gammaincc_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_gammaln_cuda (__main__.GPUTests.test_pointwise_gammaln_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_hermite_polynomial_h_cuda (__main__.GPUTests.test_pointwise_hermite_polynomial_h_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_hermite_polynomial_he_cuda (__main__.GPUTests.test_pointwise_hermite_polynomial_he_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_i0_cuda (__main__.GPUTests.test_pointwise_i0_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_i0e_cuda (__main__.GPUTests.test_pointwise_i0e_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_i1_cuda (__main__.GPUTests.test_pointwise_i1_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_i1e_cuda (__main__.GPUTests.test_pointwise_i1e_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_laguerre_polynomial_l_cuda (__main__.GPUTests.test_pointwise_laguerre_polynomial_l_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_legendre_polynomial_p_cuda (__main__.GPUTests.test_pointwise_legendre_polynomial_p_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_log1p_cuda (__main__.GPUTests.test_pointwise_log1p_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_log_ndtr_cuda (__main__.GPUTests.test_pointwise_log_ndtr_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_logit_cuda (__main__.GPUTests.test_pointwise_logit_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_modified_bessel_i0_cuda (__main__.GPUTests.test_pointwise_modified_bessel_i0_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_modified_bessel_i1_cuda (__main__.GPUTests.test_pointwise_modified_bessel_i1_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_modified_bessel_k0_cuda (__main__.GPUTests.test_pointwise_modified_bessel_k0_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_modified_bessel_k1_cuda (__main__.GPUTests.test_pointwise_modified_bessel_k1_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_multigammaln_cuda (__main__.GPUTests.test_pointwise_multigammaln_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_ndtr_cuda (__main__.GPUTests.test_pointwise_ndtr_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_ndtri_cuda (__main__.GPUTests.test_pointwise_ndtri_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_polygamma_cuda (__main__.GPUTests.test_pointwise_polygamma_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_psi_cuda (__main__.GPUTests.test_pointwise_psi_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_round_cuda (__main__.GPUTests.test_pointwise_round_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_scaled_modified_bessel_k0_cuda (__main__.GPUTests.test_pointwise_scaled_modified_bessel_k0_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_scaled_modified_bessel_k1_cuda (__main__.GPUTests.test_pointwise_scaled_modified_bessel_k1_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_shifted_chebyshev_polynomial_t_cuda (__main__.GPUTests.test_pointwise_shifted_chebyshev_polynomial_t_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_shifted_chebyshev_polynomial_u_cuda (__main__.GPUTests.test_pointwise_shifted_chebyshev_polynomial_u_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_shifted_chebyshev_polynomial_v_cuda (__main__.GPUTests.test_pointwise_shifted_chebyshev_polynomial_v_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_shifted_chebyshev_polynomial_w_cuda (__main__.GPUTests.test_pointwise_shifted_chebyshev_polynomial_w_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_sinc_cuda (__main__.GPUTests.test_pointwise_sinc_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_spherical_bessel_j0_cuda (__main__.GPUTests.test_pointwise_spherical_bessel_j0_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_xlog1py_cuda (__main__.GPUTests.test_pointwise_xlog1py_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_xlogy_cuda (__main__.GPUTests.test_pointwise_xlogy_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pointwise_zeta_cuda (__main__.GPUTests.test_pointwise_zeta_cuda) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1), ('intermediate_hooks', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_polar_cuda (__main__.GPUTests.test_polar_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('extern_calls', 4), ('intermediate_hooks', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pow1_cuda (__main__.GPUTests.test_pow1_cuda) ... inline_call [] | |
stats [('calls_captured', 34), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pow2_cuda (__main__.GPUTests.test_pow2_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pow3_cuda (__main__.GPUTests.test_pow3_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pow_int_cuda (__main__.GPUTests.test_pow_int_cuda) ... inline_call [] | |
stats [('calls_captured', 20), ('unique_graphs', 10)] | |
aot_autograd [('total', 10), ('ok', 10), ('autograd_cache_miss', 5), ('autograd_cache_saved', 5), ('autograd_cache_hit', 5)] | |
inductor [('triton_bundler_save_kernel', 35), ('extern_calls', 20), ('async_compile_cache_miss', 15), ('intermediate_hooks', 10), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 5), ('fxgraph_cache_hit', 5)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_pow_symfloat_cuda (__main__.GPUTests.test_pow_symfloat_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_prepare_softmax_with_fast_math_cuda (__main__.GPUTests.test_prepare_softmax_with_fast_math_cuda) | |
Measure on a A100, perf is 3.487ms v.s. 3.358ms without or with flushing to zero. A 4% speedup. ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_prod_cuda (__main__.GPUTests.test_prod_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_profiler_mark_wrapper_call_cuda (__main__.GPUTests.test_profiler_mark_wrapper_call_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_rand_like_deterministic_cuda (__main__.GPUTests.test_rand_like_deterministic_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_randint_cuda (__main__.GPUTests.test_randint_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 21), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_randint_distribution_cuda (__main__.GPUTests.test_randint_distribution_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_randint_int64_mod_cuda (__main__.GPUTests.test_randint_int64_mod_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_randint_kernel_count_cuda (__main__.GPUTests.test_randint_kernel_count_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 3), ('pattern_matcher_nodes', 3), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_randn_generator_cuda (__main__.GPUTests.test_randn_generator_cuda) ... inline_call [("Failed to convert args/kwargs to proxy\n Explanation: Missing `as_proxy()` implementation for some arg/kwarg.\n\n\n Developer debug context: call_function args: ListVariable(length=2) UserDefinedObjectVariable(Generator) ConstantVariable(device: device(type='cuda', index=0))\n", 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_randn_like_empty_cuda (__main__.GPUTests.test_randn_like_empty_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_randn_with_dtype_and_device_cuda (__main__.GPUTests.test_randn_with_dtype_and_device_cuda) ... skipped 'only support cpu randn_with_dtype_and_device test' | |
test_reduction1_cuda (__main__.GPUTests.test_reduction1_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_reduction2_cuda (__main__.GPUTests.test_reduction2_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 5), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_reduction3_cuda (__main__.GPUTests.test_reduction3_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 5), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_reduction4_cuda (__main__.GPUTests.test_reduction4_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_reduction5_cuda (__main__.GPUTests.test_reduction5_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 35), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 5), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_reduction_config_limit_cuda (__main__.GPUTests.test_reduction_config_limit_cuda) | |
This unit-test tests whether we exceed cudaDeviceProperties.maxGridSize in ... ok | |
test_reflection_pad2d_backward_cuda (__main__.GPUTests.test_reflection_pad2d_backward_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 14)] | |
aot_autograd [('total', 14), ('autograd_cache_miss', 14), ('autograd_cache_saved', 14), ('ok', 14)] | |
inductor [('pattern_matcher_count', 112), ('pattern_matcher_nodes', 112), ('triton_bundler_save_kernel', 98), ('fxgraph_cache_miss', 14), ('async_compile_cache_miss', 14), ('async_compile_cache_hit', 14)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_reflection_pad2d_cuda (__main__.GPUTests.test_reflection_pad2d_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_reinterpret_dtypeview_cuda (__main__.GPUTests.test_reinterpret_dtypeview_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 12), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_bypass', 3), ('ok', 3)] | |
inductor [('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('fxgraph_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
inline_call [] | |
ok | |
test_relu_cuda (__main__.GPUTests.test_relu_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_remainder_cuda (__main__.GPUTests.test_remainder_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_remove_no_ops_cuda (__main__.GPUTests.test_remove_no_ops_cuda) ... frames [('total', 14), ('ok', 14)] | |
inline_call [] | |
stats [('calls_captured', 42), ('unique_graphs', 14)] | |
aot_autograd [('total', 14), ('autograd_cache_miss', 14), ('autograd_cache_saved', 14), ('ok', 14)] | |
inductor [('triton_bundler_save_kernel', 42), ('extern_calls', 14), ('async_compile_cache_miss', 14), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 8), ('fxgraph_cache_hit', 6)] | |
graph_break [] | |
aten_mm_info [('aten.mm_256_256_256', 8)] | |
ok | |
test_remove_noop_clone_cuda (__main__.GPUTests.test_remove_noop_clone_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_nodes', 12), ('pattern_matcher_count', 10), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_remove_noop_copy_cuda (__main__.GPUTests.test_remove_noop_copy_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_repeat_as_strided_cuda (__main__.GPUTests.test_repeat_as_strided_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_repeat_cuda (__main__.GPUTests.test_repeat_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_repeat_interleave_2_cuda (__main__.GPUTests.test_repeat_interleave_2_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('extern_calls', 4), ('intermediate_hooks', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_repeat_interleave_cuda (__main__.GPUTests.test_repeat_interleave_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_require_stride_expanded_cuda (__main__.GPUTests.test_require_stride_expanded_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2), ('extern_calls', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_resize_as_cuda (__main__.GPUTests.test_resize_as_cuda) ... stats [('calls_captured', 102), ('unique_graphs', 102)] | |
aot_autograd [('total', 102), ('ok', 102), ('autograd_cache_miss', 86), ('autograd_cache_saved', 86), ('autograd_cache_hit', 16)] | |
inductor [('triton_bundler_save_kernel', 476), ('async_compile_cache_miss', 136), ('async_compile_cache_hit', 102), ('fxgraph_cache_miss', 68), ('fxgraph_cache_hit', 34)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_resize_cuda (__main__.GPUTests.test_resize_cuda) ... stats [('calls_captured', 34), ('unique_graphs', 34)] | |
aot_autograd [('total', 34), ('autograd_cache_miss', 34), ('autograd_cache_saved', 34), ('ok', 34)] | |
inductor [('triton_bundler_save_kernel', 238), ('fxgraph_cache_miss', 34), ('async_compile_cache_miss', 34), ('async_compile_cache_hit', 34)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_reuse_buffers_with_aliasing_cuda (__main__.GPUTests.test_reuse_buffers_with_aliasing_cuda) ... inline_call [] | |
stats [('calls_captured', 30), ('unique_graphs', 5)] | |
aot_autograd [('total', 5), ('ok', 5), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 2)] | |
inductor [('extern_calls', 40), ('intermediate_hooks', 20), ('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
frames [('total', 1), ('ok', 1)] | |
ok | |
test_roi_align_cuda (__main__.GPUTests.test_roi_align_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_roll_cuda (__main__.GPUTests.test_roll_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_count', 10), ('pattern_matcher_nodes', 10), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_round_correctness_cuda (__main__.GPUTests.test_round_correctness_cuda) ... skipped 'need to debug tl.libdevice on A100/V100' | |
test_round_cuda (__main__.GPUTests.test_round_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_rsqrt_cuda (__main__.GPUTests.test_rsqrt_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_rsqrt_dynamic_shapes_cuda (__main__.GPUTests.test_rsqrt_dynamic_shapes_cuda) ... frames [('total', 9), ('ok', 9)] | |
stats [('calls_captured', 16), ('unique_graphs', 5)] | |
aot_autograd [('total', 5), ('autograd_cache_miss', 5), ('autograd_cache_saved', 5), ('ok', 5)] | |
inductor [('triton_bundler_save_kernel', 35), ('fxgraph_cache_miss', 5), ('extern_calls', 5), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2)] | |
graph_break [] | |
aten_mm_info [('aten.bmm_s2_s2_s2', 2), ('aten.bmm_4_4_4', 2), ('aten.bmm_s1_s1_s1', 1)] | |
inline_call [] | |
ok | |
test_scalar_cpu_tensor_arg_cuda (__main__.GPUTests.test_scalar_cpu_tensor_arg_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scalar_input_cuda (__main__.GPUTests.test_scalar_input_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scalar_output_cuda (__main__.GPUTests.test_scalar_output_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scaled_dot_product_attention_cuda (__main__.GPUTests.test_scaled_dot_product_attention_cuda) ... /workspace/pytorch/torch/_inductor/lowering.py:7007: UserWarning: | |
Online softmax is disabled on the fly since Inductor decides to | |
split the reduction. Cut an issue to PyTorch if this is an | |
important use case and you want to speed it up with online | |
softmax. | |
warnings.warn( | |
inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 35), ('pattern_matcher_nodes', 12), ('pattern_matcher_count', 8), ('async_compile_cache_miss', 5), ('async_compile_cache_hit', 5), ('extern_calls', 4), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [('aten.bmm_2_2_2', 2)] | |
ok | |
test_scaled_dot_product_efficient_attention_cuda (__main__.GPUTests.test_scaled_dot_product_efficient_attention_cuda) ... inline_call [] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 3), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scatter1_cuda (__main__.GPUTests.test_scatter1_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scatter2_cuda (__main__.GPUTests.test_scatter2_cuda) ... skipped 'unstable on sm86' | |
test_scatter3_cuda (__main__.GPUTests.test_scatter3_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scatter4_cuda (__main__.GPUTests.test_scatter4_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 4), ('extern_calls', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scatter5_cuda (__main__.GPUTests.test_scatter5_cuda) ... /workspace/pytorch/test/inductor/test_torchinductor.py:8207: UserWarning: The reduce argument of torch.scatter with Tensor src is deprecated and will be removed in a future PyTorch release. Use torch.scatter_reduce instead for more reduction options. (Triggered internally at /workspace/pytorch/aten/src/ATen/native/TensorAdvancedIndexing.cpp:232.) | |
a.scatter_(dim, index, b, reduce=reduce) | |
inline_call [] | |
stats [('calls_captured', 16), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 10), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 4), ('extern_calls', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scatter6_cuda (__main__.GPUTests.test_scatter6_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 4), ('extern_calls', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scatter_add1_cuda (__main__.GPUTests.test_scatter_add1_cuda) ... skipped 'Flaky test, needs debugging' | |
test_scatter_add2_cuda (__main__.GPUTests.test_scatter_add2_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scatter_add3_cuda (__main__.GPUTests.test_scatter_add3_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 4), ('extern_calls', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scatter_bf16_cuda (__main__.GPUTests.test_scatter_bf16_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('ok', 6), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 3)] | |
inductor [('triton_bundler_save_kernel', 21), ('async_compile_cache_miss', 9), ('extern_calls', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 3), ('fxgraph_cache_hit', 3)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scatter_reduce1_cuda (__main__.GPUTests.test_scatter_reduce1_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scatter_reduce2_cuda (__main__.GPUTests.test_scatter_reduce2_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4), ('extern_calls', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scatter_reduce3_cuda (__main__.GPUTests.test_scatter_reduce3_cuda) ... inline_call [] | |
stats [('calls_captured', 16), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 70), ('async_compile_cache_miss', 10), ('async_compile_cache_hit', 10), ('fxgraph_cache_miss', 4), ('extern_calls', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_scheduler_vertical_fusion1_cuda (__main__.GPUTests.test_scheduler_vertical_fusion1_cuda) ... inline_call [] | |
stats [('calls_captured', 34), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sdpa_prefer_nd_tiling_False_use_block_ptr_False_cuda (__main__.GPUTests.test_sdpa_prefer_nd_tiling_False_use_block_ptr_False_cuda) ... inline_call [] | |
stats [('calls_captured', 30), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 9), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_23760_8_128', 2)] | |
frames [('total', 1), ('ok', 1)] | |
ok | |
test_sdpa_prefer_nd_tiling_False_use_block_ptr_True_cuda (__main__.GPUTests.test_sdpa_prefer_nd_tiling_False_use_block_ptr_True_cuda) ... inline_call [] | |
stats [('calls_captured', 30), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 9), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_23760_8_128', 2)] | |
frames [('total', 1), ('ok', 1)] | |
ok | |
test_sdpa_prefer_nd_tiling_True_use_block_ptr_False_cuda (__main__.GPUTests.test_sdpa_prefer_nd_tiling_True_use_block_ptr_False_cuda) ... inline_call [] | |
stats [('calls_captured', 30), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 9), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_23760_8_128', 2)] | |
frames [('total', 1), ('ok', 1)] | |
ok | |
test_sdpa_prefer_nd_tiling_True_use_block_ptr_True_cuda (__main__.GPUTests.test_sdpa_prefer_nd_tiling_True_use_block_ptr_True_cuda) ... inline_call [] | |
stats [('calls_captured', 30), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('ok', 3), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 9), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 3), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_23760_8_128', 2)] | |
frames [('total', 1), ('ok', 1)] | |
ok | |
test_sdpa_unaligned_mask_cuda (__main__.GPUTests.test_sdpa_unaligned_mask_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sdpa_unaligned_mask_freezing_cuda (__main__.GPUTests.test_sdpa_unaligned_mask_freezing_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_bypass', 1), ('ok', 1)] | |
inductor [('extern_calls', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_searchsorted_cuda (__main__.GPUTests.test_searchsorted_cuda) ... inline_call [] | |
stats [('calls_captured', 40), ('unique_graphs', 40)] | |
aot_autograd [('total', 40), ('autograd_cache_miss', 40), ('autograd_cache_saved', 40), ('ok', 40)] | |
inductor [('triton_bundler_save_kernel', 280), ('fxgraph_cache_miss', 40), ('async_compile_cache_miss', 40), ('async_compile_cache_hit', 40)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_select_scatter_cuda (__main__.GPUTests.test_select_scatter_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_setitem_with_int_parameter_cuda (__main__.GPUTests.test_setitem_with_int_parameter_cuda) ... stats [('calls_captured', 3), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3), ('autograd_cache_guard_miss', 1)] | |
inductor [('triton_bundler_save_kernel', 21), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sgn_cuda (__main__.GPUTests.test_sgn_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sgn_extremal_cuda (__main__.GPUTests.test_sgn_extremal_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_shape_padding_cuda (__main__.GPUTests.test_shape_padding_cuda) ... inline_call [] | |
stats [('calls_captured', 24), ('unique_graphs', 24)] | |
aot_autograd [('total', 24), ('ok', 24), ('autograd_cache_miss', 12), ('autograd_cache_saved', 12), ('autograd_cache_hit', 12)] | |
inductor [('extern_calls', 24), ('fxgraph_cache_hit', 14), ('benchmarking.InductorBenchmarker.benchmark_gpu', 12), ('fxgraph_cache_miss', 10), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6)] | |
graph_break [] | |
aten_mm_info [('aten.bmm_11_13_15', 4), ('aten.mm_11_13_15', 2), ('aten.addmm_11_13_15', 2), ('aten.baddbmm_11_13_15', 2)] | |
ok | |
test_shape_prop_torch_ones_cuda (__main__.GPUTests.test_shape_prop_torch_ones_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_should_pad_bench_for_bmm_cuda (__main__.GPUTests.test_should_pad_bench_for_bmm_cuda) ... ok | |
test_sigmoid_cuda (__main__.GPUTests.test_sigmoid_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sign_dtype_cuda (__main__.GPUTests.test_sign_dtype_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_signbit_cuda (__main__.GPUTests.test_signbit_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_silu_cuda (__main__.GPUTests.test_silu_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_simplify_loops_cuda (__main__.GPUTests.test_simplify_loops_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sin_cuda (__main__.GPUTests.test_sin_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_single_elem_cuda (__main__.GPUTests.test_single_elem_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_single_elem_indirect_cuda (__main__.GPUTests.test_single_elem_indirect_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('extern_calls', 4), ('fxgraph_cache_miss', 2), ('intermediate_hooks', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_size_asserts_for_multi_output_fallback_cuda (__main__.GPUTests.test_size_asserts_for_multi_output_fallback_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 3), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('extern_calls', 3), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sizehint_issue1_cuda (__main__.GPUTests.test_sizehint_issue1_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 10), ('pattern_matcher_nodes', 10), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice1_cuda (__main__.GPUTests.test_slice1_cuda) ... inline_call [] | |
stats [('calls_captured', 20), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice2_cuda (__main__.GPUTests.test_slice2_cuda) ... inline_call [] | |
stats [('calls_captured', 16), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice3_cuda (__main__.GPUTests.test_slice3_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice4_cuda (__main__.GPUTests.test_slice4_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice_mutation1_cuda (__main__.GPUTests.test_slice_mutation1_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice_mutation2_cuda (__main__.GPUTests.test_slice_mutation2_cuda) ... stats [('calls_captured', 6), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice_mutation3_cuda (__main__.GPUTests.test_slice_mutation3_cuda) ... stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice_scatter2_cuda (__main__.GPUTests.test_slice_scatter2_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice_scatter3_cuda (__main__.GPUTests.test_slice_scatter3_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice_scatter4_cuda (__main__.GPUTests.test_slice_scatter4_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice_scatter5_cuda (__main__.GPUTests.test_slice_scatter5_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice_scatter_cuda (__main__.GPUTests.test_slice_scatter_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_slice_scatter_reinplace_cuda (__main__.GPUTests.test_slice_scatter_reinplace_cuda) ... inline_call [] | |
stats [('calls_captured', 7), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('pattern_matcher_nodes', 7), ('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 6), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.mm_256_64_64', 1), ('aten.bmm_32_33_64', 1)] | |
ok | |
test_slice_view_with_graph_break_cuda (__main__.GPUTests.test_slice_view_with_graph_break_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 7), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_softmax_backward_data_cuda (__main__.GPUTests.test_softmax_backward_data_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_softmax_cuda (__main__.GPUTests.test_softmax_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 24), ('pattern_matcher_count', 6), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_softmax_one_kernel_loop_cuda (__main__.GPUTests.test_softmax_one_kernel_loop_cuda) ... inline_call [] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_softmax_one_kernel_persist_cuda (__main__.GPUTests.test_softmax_one_kernel_persist_cuda) ... inline_call [] | |
stats [('calls_captured', 5), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('pattern_matcher_count', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sort_bool_cuda (__main__.GPUTests.test_sort_bool_cuda) ... inline_call [] | |
stats [('calls_captured', 16), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sort_cuda (__main__.GPUTests.test_sort_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('autograd_cache_hit', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sort_stable_cuda (__main__.GPUTests.test_sort_stable_cuda) ... inline_call [] | |
stats [('calls_captured', 24), ('unique_graphs', 8)] | |
aot_autograd [('total', 8), ('autograd_cache_miss', 8), ('autograd_cache_saved', 8), ('ok', 8)] | |
inductor [('triton_bundler_save_kernel', 56), ('fxgraph_cache_miss', 8), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sort_transpose_cuda (__main__.GPUTests.test_sort_transpose_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_split_cuda (__main__.GPUTests.test_split_cuda) ... inline_call [] | |
stats [('calls_captured', 22), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('ok', 4), ('autograd_cache_bypass', 2), ('autograd_cache_saved', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_split_cumprod_cuda (__main__.GPUTests.test_split_cumprod_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_split_cumprod_low_prec_cuda (__main__.GPUTests.test_split_cumprod_low_prec_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_split_cumsum_cuda (__main__.GPUTests.test_split_cumsum_cuda) ... inline_call [] | |
stats [('calls_captured', 16), ('unique_graphs', 16)] | |
aot_autograd [('total', 16), ('autograd_cache_miss', 16), ('autograd_cache_saved', 16), ('ok', 16)] | |
inductor [('triton_bundler_save_kernel', 112), ('fxgraph_cache_miss', 16), ('async_compile_cache_miss', 16), ('async_compile_cache_hit', 16)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_split_cumsum_index_cuda (__main__.GPUTests.test_split_cumsum_index_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_split_cumsum_low_prec_cuda (__main__.GPUTests.test_split_cumsum_low_prec_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_split_failed_cuda (__main__.GPUTests.test_split_failed_cuda) ... E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] failed while attempting to run meta for aten.split_with_sizes.default | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] Traceback (most recent call last): | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] File "/workspace/pytorch/torch/_subclasses/fake_tensor.py", line 2427, in _dispatch_impl | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] r = func(*args, **kwargs) | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] ^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] File "/workspace/pytorch/torch/_ops.py", line 756, in __call__ | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] return self._op(*args, **kwargs) | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] ^^^^^^^^^^^^^^^^^^^^^^^^^ | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] File "/workspace/pytorch/torch/_refs/__init__.py", line 4167, in split_with_sizes | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] torch._check_with( | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] File "/workspace/pytorch/torch/__init__.py", line 1642, in _check_with | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] raise error_type(message_evaluated) | |
E0509 18:24:53.529000 415353 torch/_subclasses/fake_tensor.py:2431] [0/0] ValueError: Split sizes add up to 4 but got the tensor's size of 5 | |
frames [('total', 1)] | |
ok | |
test_split_with_integer_cuda (__main__.GPUTests.test_split_with_integer_cuda) ... frames [('total', 6), ('ok', 6)] | |
stats [('calls_captured', 12), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_bypass', 3), ('ok', 3)] | |
inductor [('fxgraph_cache_miss', 3)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_split_with_list_cuda (__main__.GPUTests.test_split_with_list_cuda) ... inline_call [] | |
stats [('calls_captured', 52), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('autograd_cache_saved', 6), ('ok', 6)] | |
inductor [('triton_bundler_save_kernel', 140), ('async_compile_cache_miss', 20), ('async_compile_cache_hit', 20), ('fxgraph_cache_miss', 6)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_split_with_sizes_with_unbacked_symints_cuda (__main__.GPUTests.test_split_with_sizes_with_unbacked_symints_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 35), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_bypass', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1)] | |
inductor [('extern_calls', 9), ('fxgraph_cache_miss', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_split_with_unbacked_symints_cuda (__main__.GPUTests.test_split_with_unbacked_symints_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 13), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sqrt_dynamic_shapes_cuda (__main__.GPUTests.test_sqrt_dynamic_shapes_cuda) ... skipped 'sqrt dynamic shapes only supports cpu' | |
test_squeeze1_cuda (__main__.GPUTests.test_squeeze1_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_squeeze2_cuda (__main__.GPUTests.test_squeeze2_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_squeeze_varargs_cuda (__main__.GPUTests.test_squeeze_varargs_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_stack_cuda (__main__.GPUTests.test_stack_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_std_cuda (__main__.GPUTests.test_std_cuda) ... inline_call [] | |
stats [('calls_captured', 16), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_stride_preservation_with_stride_modifying_fx_pass_cuda (__main__.GPUTests.test_stride_preservation_with_stride_modifying_fx_pass_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_bypass', 1), ('ok', 1)] | |
inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_bypass', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_strided_inputs_cuda (__main__.GPUTests.test_strided_inputs_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sum1_cuda (__main__.GPUTests.test_sum1_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sum2_cuda (__main__.GPUTests.test_sum2_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sum3_cuda (__main__.GPUTests.test_sum3_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sum4_cuda (__main__.GPUTests.test_sum4_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sum5_cuda (__main__.GPUTests.test_sum5_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sum_dtype_cuda (__main__.GPUTests.test_sum_dtype_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sum_int_cuda (__main__.GPUTests.test_sum_int_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 3)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('ok', 3)] | |
inductor [('triton_bundler_save_kernel', 21), ('fxgraph_cache_miss', 3), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 3)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_sum_keepdims_cuda (__main__.GPUTests.test_sum_keepdims_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_tan_cuda (__main__.GPUTests.test_tan_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_tanh_cuda (__main__.GPUTests.test_tanh_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_tensor1_cuda (__main__.GPUTests.test_tensor1_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_tensor2_cuda (__main__.GPUTests.test_tensor2_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_tensor3_cuda (__main__.GPUTests.test_tensor3_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_tensor_index_put_slice_cuda (__main__.GPUTests.test_tensor_index_put_slice_cuda) ... frames [('total', 10), ('ok', 10)] | |
stats [('calls_captured', 90), ('unique_graphs', 10)] | |
aot_autograd [('total', 10), ('autograd_cache_miss', 10), ('autograd_cache_saved', 10), ('ok', 10)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 10), ('pattern_matcher_count', 10), ('pattern_matcher_nodes', 10), ('fxgraph_cache_miss', 8), ('fxgraph_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_tensor_index_slice_cuda (__main__.GPUTests.test_tensor_index_slice_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 16), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_tmp_not_defined_issue1_use_block_ptr_False_cuda (__main__.GPUTests.test_tmp_not_defined_issue1_use_block_ptr_False_cuda) ... inline_call [] | |
stats [('calls_captured', 22), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_tmp_not_defined_issue1_use_block_ptr_True_cuda (__main__.GPUTests.test_tmp_not_defined_issue1_use_block_ptr_True_cuda) ... inline_call [] | |
stats [('calls_captured', 22), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_tmp_not_defined_issue2_cuda (__main__.GPUTests.test_tmp_not_defined_issue2_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_tmp_not_defined_issue3_cuda (__main__.GPUTests.test_tmp_not_defined_issue3_cuda) ... inline_call [] | |
stats [('calls_captured', 66), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('pattern_matcher_nodes', 16), ('async_compile_cache_miss', 15), ('extern_calls', 12), ('pattern_matcher_count', 10), ('async_compile_cache_hit', 10), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [('aten.addmm_6144_1001_6', 1)] | |
ok | |
test_to_device_constant_cuda (__main__.GPUTests.test_to_device_constant_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_to_device_cuda (__main__.GPUTests.test_to_device_cuda) ... W0509 18:25:06.079000 415353 torch/_inductor/utils.py:1762] [0/0] DeviceCopy in input program | |
W0509 18:25:06.104000 415353 torch/_inductor/utils.py:1762] [0/0] DeviceCopy in input program | |
inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_to_dtype_cuda (__main__.GPUTests.test_to_dtype_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_to_memory_format_cuda (__main__.GPUTests.test_to_memory_format_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_topk_cuda (__main__.GPUTests.test_topk_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('extern_calls', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_transpose_add_cuda (__main__.GPUTests.test_transpose_add_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_transpose_cuda (__main__.GPUTests.test_transpose_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_transposed_propagates_cuda (__main__.GPUTests.test_transposed_propagates_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_triu_cuda (__main__.GPUTests.test_triu_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 12), ('pattern_matcher_nodes', 12), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_uint4x2_mixed_mm_cuda (__main__.GPUTests.test_uint4x2_mixed_mm_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('extern_calls', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [('aten.mm_8_8_8', 2)] | |
ok | |
test_uint_cuda (__main__.GPUTests.test_uint_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('async_compile_cache_miss', 3), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unbacked_floordiv_simplify_cuda (__main__.GPUTests.test_unbacked_floordiv_simplify_cuda) ... inline_call [] | |
stats [('calls_captured', 52), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unbacked_floordiv_simplify_errors_cuda (__main__.GPUTests.test_unbacked_floordiv_simplify_errors_cuda) ... frames [('total', 1)] | |
ok | |
test_unbind_cuda (__main__.GPUTests.test_unbind_cuda) ... inline_call [] | |
stats [('calls_captured', 20), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unfold_zero_dimension_tensor_cuda (__main__.GPUTests.test_unfold_zero_dimension_tensor_cuda) ... stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('fxgraph_cache_miss', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unroll_small_reduction_cuda (__main__.GPUTests.test_unroll_small_reduction_cuda) ... inline_call [] | |
stats [('calls_captured', 72), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unspec_inputs_bfloat16_cuda (__main__.GPUTests.test_unspec_inputs_bfloat16_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unspec_inputs_float16_cuda (__main__.GPUTests.test_unspec_inputs_float16_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unspec_inputs_float32_cuda (__main__.GPUTests.test_unspec_inputs_float32_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unspec_inputs_float64_cuda (__main__.GPUTests.test_unspec_inputs_float64_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 9), ('unique_graphs', 2)] | |
aot_autograd [('total', 3), ('autograd_cache_miss', 3), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unspec_inputs_int16_cuda (__main__.GPUTests.test_unspec_inputs_int16_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unspec_inputs_int32_cuda (__main__.GPUTests.test_unspec_inputs_int32_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unspec_inputs_int64_cuda (__main__.GPUTests.test_unspec_inputs_int64_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unspec_inputs_int8_cuda (__main__.GPUTests.test_unspec_inputs_int8_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unspec_inputs_uint8_cuda (__main__.GPUTests.test_unspec_inputs_uint8_cuda) ... frames [('total', 5), ('ok', 5)] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unsqueeze_cuda (__main__.GPUTests.test_unsqueeze_cuda) ... inline_call [] | |
stats [('calls_captured', 20), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_unsqueeze_inplace_cuda (__main__.GPUTests.test_unsqueeze_inplace_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_upsample_bicubic2d_cuda (__main__.GPUTests.test_upsample_bicubic2d_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_upsample_bilinear2d_a_cuda (__main__.GPUTests.test_upsample_bilinear2d_a_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_count', 16), ('pattern_matcher_nodes', 16), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_upsample_bilinear2d_b_cuda (__main__.GPUTests.test_upsample_bilinear2d_b_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 8), ('pattern_matcher_nodes', 8), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_upsample_cat_conv_cuda (__main__.GPUTests.test_upsample_cat_conv_cuda) ... skipped 'only support cpu upsample_cat_conv test' | |
test_upsample_nearest1d_cuda (__main__.GPUTests.test_upsample_nearest1d_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 56), ('pattern_matcher_count', 10), ('pattern_matcher_nodes', 10), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_upsample_nearest2d_backward_cuda (__main__.GPUTests.test_upsample_nearest2d_backward_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 56), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_upsample_nearest2d_cuda (__main__.GPUTests.test_upsample_nearest2d_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 56), ('pattern_matcher_count', 20), ('pattern_matcher_nodes', 20), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_upsample_nearest3d_cuda (__main__.GPUTests.test_upsample_nearest3d_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 56), ('pattern_matcher_count', 30), ('pattern_matcher_nodes', 30), ('async_compile_cache_miss', 8), ('async_compile_cache_hit', 8), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_var_correction_cuda (__main__.GPUTests.test_var_correction_cuda) ... /workspace/pytorch/test/inductor/test_torchinductor.py:5381: UserWarning: var(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /workspace/pytorch/aten/src/ATen/native/ReduceOps.cpp:1839.) | |
torch.var(x, dim=dim, correction=10), | |
/workspace/pytorch/test/inductor/test_torchinductor.py:5381: UserWarning: var(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /workspace/pytorch/aten/src/ATen/native/ReduceOps.cpp:1839.) | |
torch.var(x, dim=dim, correction=10), | |
/workspace/pytorch/test/inductor/test_torchinductor.py:5381: UserWarning: var(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /workspace/pytorch/aten/src/ATen/native/ReduceOps.cpp:1839.) | |
torch.var(x, dim=dim, correction=10), | |
/workspace/pytorch/test/inductor/test_torchinductor.py:5381: UserWarning: var(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /workspace/pytorch/aten/src/ATen/native/ReduceOps.cpp:1839.) | |
torch.var(x, dim=dim, correction=10), | |
inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_var_mean_tile_reduction_False_cuda (__main__.GPUTests.test_var_mean_tile_reduction_False_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_var_mean_tile_reduction_True_cuda (__main__.GPUTests.test_var_mean_tile_reduction_True_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_vdd_clamp_cuda (__main__.GPUTests.test_vdd_clamp_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_vectorized_ops_masked_cuda (__main__.GPUTests.test_vectorized_ops_masked_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_vectorized_ops_masked_var_novec_cuda (__main__.GPUTests.test_vectorized_ops_masked_var_novec_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_vertical_fusion1_cuda (__main__.GPUTests.test_vertical_fusion1_cuda) ... inline_call [] | |
stats [('calls_captured', 18), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_view_as_complex_cuda (__main__.GPUTests.test_view_as_complex_cuda) ... frames [('total', 1), ('ok', 1)] | |
stats [('calls_captured', 2), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('intermediate_hooks', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_view_as_real_cuda (__main__.GPUTests.test_view_as_real_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('extern_calls', 4), ('async_compile_cache_miss', 3), ('intermediate_hooks', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_view_detach_cuda (__main__.GPUTests.test_view_detach_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_view_on_aliased_cuda (__main__.GPUTests.test_view_on_aliased_cuda) ... inline_call [] | |
stats [('calls_captured', 20), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('ok', 4), ('autograd_cache_miss', 3), ('autograd_cache_saved', 3), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 3), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_view_uint8_through_differing_bitwidths_cuda (__main__.GPUTests.test_view_uint8_through_differing_bitwidths_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 6)] | |
aot_autograd [('total', 6), ('autograd_cache_miss', 6), ('ok', 6)] | |
ok | |
test_views1_cuda (__main__.GPUTests.test_views1_cuda) ... inline_call [] | |
stats [('calls_captured', 140), ('unique_graphs', 56)] | |
aot_autograd [('total', 56), ('autograd_cache_miss', 56), ('autograd_cache_saved', 56), ('ok', 56)] | |
inductor [('triton_bundler_save_kernel', 392), ('pattern_matcher_count', 56), ('pattern_matcher_nodes', 56), ('fxgraph_cache_miss', 56), ('async_compile_cache_miss', 56), ('async_compile_cache_hit', 56)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_views2_cuda (__main__.GPUTests.test_views2_cuda) ... inline_call [] | |
stats [('calls_captured', 30), ('unique_graphs', 12)] | |
aot_autograd [('total', 12), ('autograd_cache_miss', 12), ('autograd_cache_saved', 12), ('ok', 12)] | |
inductor [('triton_bundler_save_kernel', 84), ('pattern_matcher_count', 12), ('pattern_matcher_nodes', 12), ('fxgraph_cache_miss', 12), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 12)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_views3_cuda (__main__.GPUTests.test_views3_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_nodes', 8), ('pattern_matcher_count', 6), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_views4_cuda (__main__.GPUTests.test_views4_cuda) ... inline_call [] | |
stats [('calls_captured', 6), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('pattern_matcher_nodes', 8), ('pattern_matcher_count', 6), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_views5_cuda (__main__.GPUTests.test_views5_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_bypass', 2), ('ok', 2)] | |
inductor [('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_views6_cuda (__main__.GPUTests.test_views6_cuda) ... inline_call [] | |
stats [('calls_captured', 10), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_views7_cuda (__main__.GPUTests.test_views7_cuda) ... inline_call [] | |
stats [('calls_captured', 12), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 28), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_weight_norm_bwd_cuda (__main__.GPUTests.test_weight_norm_bwd_cuda) | |
Weight norm backward eager kernel does not support non-contiguous ... frames [('total', 9), ('ok', 9)] | |
inline_call [] | |
unimplemented [] | |
graph_break [('Tensor.backward', 1)] | |
stats [('calls_captured', 8), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('ok', 2), ('autograd_cache_saved', 1)] | |
inductor [('triton_bundler_save_kernel', 63), ('benchmarking.InductorBenchmarker.benchmark_gpu', 12), ('async_compile_cache_miss', 9), ('extern_calls', 9), ('async_compile_cache_hit', 9), ('pattern_matcher_nodes', 6), ('pattern_matcher_count', 4), ('fxgraph_cache_miss', 2)] | |
aten_mm_info [('aten.mm_2_1025_2', 2), ('aten.addmm_2_2_1025', 1), ('aten.addmm_2_1_2', 1), ('aten.mm_2_2_1', 1), ('aten.mm_1_2_2', 1)] | |
ok | |
test_where_broadcast_cuda (__main__.GPUTests.test_where_broadcast_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_bypass', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_where_with_logical_op_cuda (__main__.GPUTests.test_where_with_logical_op_cuda) ... inline_call [] | |
stats [('calls_captured', 8), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_xblock_divides_xnumel_cuda (__main__.GPUTests.test_xblock_divides_xnumel_cuda) ... inline_call [] | |
stats [('calls_captured', 4), ('unique_graphs', 4)] | |
aot_autograd [('total', 4), ('autograd_cache_miss', 4), ('autograd_cache_saved', 4), ('ok', 4)] | |
inductor [('triton_bundler_save_kernel', 28), ('fxgraph_cache_miss', 4), ('async_compile_cache_miss', 4), ('async_compile_cache_hit', 4)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_zero_dim_reductions_cuda (__main__.GPUTests.test_zero_dim_reductions_cuda) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 14), ('fxgraph_cache_miss', 2), ('async_compile_cache_miss', 2), ('async_compile_cache_hit', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_zero_element_mutation_cuda (__main__.GPUTests.test_zero_element_mutation_cuda) ... inline_call [] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_zeros_cuda (__main__.GPUTests.test_zeros_cuda) ... inline_call [] | |
stats [('calls_captured', 14), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] | |
inductor [('triton_bundler_save_kernel', 42), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 6), ('fxgraph_cache_miss', 2)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast1_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_broadcast1_broadcast1) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast1_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_broadcast1_broadcast2) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast1_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_broadcast1_broadcast3) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast1_dense (__main__.SweepInputsGPUTest.test_cuda_broadcast1_dense) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast1_double (__main__.SweepInputsGPUTest.test_cuda_broadcast1_double) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast1_int (__main__.SweepInputsGPUTest.test_cuda_broadcast1_int) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast1_strided (__main__.SweepInputsGPUTest.test_cuda_broadcast1_strided) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast1_transposed (__main__.SweepInputsGPUTest.test_cuda_broadcast1_transposed) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast2_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_broadcast2_broadcast1) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast2_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_broadcast2_broadcast2) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast2_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_broadcast2_broadcast3) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast2_dense (__main__.SweepInputsGPUTest.test_cuda_broadcast2_dense) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast2_double (__main__.SweepInputsGPUTest.test_cuda_broadcast2_double) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast2_int (__main__.SweepInputsGPUTest.test_cuda_broadcast2_int) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast2_strided (__main__.SweepInputsGPUTest.test_cuda_broadcast2_strided) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast2_transposed (__main__.SweepInputsGPUTest.test_cuda_broadcast2_transposed) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast3_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_broadcast3_broadcast1) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast3_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_broadcast3_broadcast2) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast3_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_broadcast3_broadcast3) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast3_dense (__main__.SweepInputsGPUTest.test_cuda_broadcast3_dense) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast3_double (__main__.SweepInputsGPUTest.test_cuda_broadcast3_double) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast3_int (__main__.SweepInputsGPUTest.test_cuda_broadcast3_int) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast3_strided (__main__.SweepInputsGPUTest.test_cuda_broadcast3_strided) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_broadcast3_transposed (__main__.SweepInputsGPUTest.test_cuda_broadcast3_transposed) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_dense_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_dense_broadcast1) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_dense_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_dense_broadcast2) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_dense_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_dense_broadcast3) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_dense_dense (__main__.SweepInputsGPUTest.test_cuda_dense_dense) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_dense_double (__main__.SweepInputsGPUTest.test_cuda_dense_double) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_dense_int (__main__.SweepInputsGPUTest.test_cuda_dense_int) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_dense_strided (__main__.SweepInputsGPUTest.test_cuda_dense_strided) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_dense_transposed (__main__.SweepInputsGPUTest.test_cuda_dense_transposed) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_double_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_double_broadcast1) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_double_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_double_broadcast2) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_double_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_double_broadcast3) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_double_dense (__main__.SweepInputsGPUTest.test_cuda_double_dense) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_double_double (__main__.SweepInputsGPUTest.test_cuda_double_double) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_double_int (__main__.SweepInputsGPUTest.test_cuda_double_int) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_double_strided (__main__.SweepInputsGPUTest.test_cuda_double_strided) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_double_transposed (__main__.SweepInputsGPUTest.test_cuda_double_transposed) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_int_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_int_broadcast1) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_int_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_int_broadcast2) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_int_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_int_broadcast3) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_int_dense (__main__.SweepInputsGPUTest.test_cuda_int_dense) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_int_double (__main__.SweepInputsGPUTest.test_cuda_int_double) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_int_int (__main__.SweepInputsGPUTest.test_cuda_int_int) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_int_strided (__main__.SweepInputsGPUTest.test_cuda_int_strided) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_int_transposed (__main__.SweepInputsGPUTest.test_cuda_int_transposed) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_strided_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_strided_broadcast1) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_strided_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_strided_broadcast2) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_strided_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_strided_broadcast3) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_strided_dense (__main__.SweepInputsGPUTest.test_cuda_strided_dense) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_strided_double (__main__.SweepInputsGPUTest.test_cuda_strided_double) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_strided_int (__main__.SweepInputsGPUTest.test_cuda_strided_int) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_strided_strided (__main__.SweepInputsGPUTest.test_cuda_strided_strided) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_strided_transposed (__main__.SweepInputsGPUTest.test_cuda_strided_transposed) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_transposed_broadcast1 (__main__.SweepInputsGPUTest.test_cuda_transposed_broadcast1) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_transposed_broadcast2 (__main__.SweepInputsGPUTest.test_cuda_transposed_broadcast2) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_transposed_broadcast3 (__main__.SweepInputsGPUTest.test_cuda_transposed_broadcast3) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_transposed_dense (__main__.SweepInputsGPUTest.test_cuda_transposed_dense) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_transposed_double (__main__.SweepInputsGPUTest.test_cuda_transposed_double) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_transposed_int (__main__.SweepInputsGPUTest.test_cuda_transposed_int) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_transposed_strided (__main__.SweepInputsGPUTest.test_cuda_transposed_strided) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_cuda_transposed_transposed (__main__.SweepInputsGPUTest.test_cuda_transposed_transposed) ... inline_call [] | |
stats [('calls_captured', 1), ('unique_graphs', 1)] | |
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] | |
inductor [('triton_bundler_save_kernel', 7), ('fxgraph_cache_miss', 1), ('async_compile_cache_miss', 1), ('async_compile_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
test_ctr_not_moved_to_cuda_when_used_in_index_put (__main__.TritonCodeGenTests.test_ctr_not_moved_to_cuda_when_used_in_index_put) ... frames [('total', 2), ('ok', 2)] | |
stats [('calls_captured', 2), ('unique_graphs', 2)] | |
aot_autograd [('total', 2), ('ok', 2), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('autograd_cache_hit', 1)] | |
inductor [('triton_bundler_save_kernel', 14), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 4), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('fxgraph_cache_hit', 1)] | |
graph_break [] | |
aten_mm_info [] | |
ok | |
---------------------------------------------------------------------- | |
Ran 835 tests in 418.647s | |
OK (skipped=26) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment