jerryzh168 · September 9, 2024 18:31
diff --git a/gistfile1.txt b/gistfile1.txt
 ...........frames [('total', 1), ('ok', 1)]
 inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('ok', 1)]
 .frames [('total', 1), ('ok', 1)]
 inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('ok', 1)]
 .frames [('total', 1), ('ok', 1)]
 inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('ok', 1)]
 .frames [('total', 1), ('ok', 1)]
 inductor [('benchmarking.TritonBenchmarker.benchmark_gpu', 4), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('benchmarking.TritonBenchmarker.triton_do_bench', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('ok', 1)]
 .frames [('total', 1), ('ok', 1)]
 inductor [('benchmarking.TritonBenchmarker.benchmark_gpu', 4), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('ok', 1)]
 ./home/jerryzh/anaconda3/envs/ao_new/lib/python3.9/site-packages/torch/_inductor/compile_fx.py:167: UserWarning: TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.
  warnings.warn(
 frames [('total', 1), ('ok', 1)]
 inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('benchmarking.TritonBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
 inline_call []
 stats [('calls_captured', 1), ('unique_graphs', 1)]
 aot_autograd [('total', 1), ('ok', 1)]
 .............
	...........frames [('total', 1), ('ok', 1)]
	inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
	inline_call []
	stats [('calls_captured', 1), ('unique_graphs', 1)]
	aot_autograd [('total', 1), ('ok', 1)]
	.frames [('total', 1), ('ok', 1)]
	inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
	inline_call []
	stats [('calls_captured', 1), ('unique_graphs', 1)]
	aot_autograd [('total', 1), ('ok', 1)]
	.frames [('total', 1), ('ok', 1)]
	inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
	inline_call []
	stats [('calls_captured', 1), ('unique_graphs', 1)]
	aot_autograd [('total', 1), ('ok', 1)]
	.frames [('total', 1), ('ok', 1)]
	inductor [('benchmarking.TritonBenchmarker.benchmark_gpu', 4), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('benchmarking.TritonBenchmarker.triton_do_bench', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
	inline_call []
	stats [('calls_captured', 1), ('unique_graphs', 1)]
	aot_autograd [('total', 1), ('ok', 1)]
	.frames [('total', 1), ('ok', 1)]
	inductor [('benchmarking.TritonBenchmarker.benchmark_gpu', 4), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
	inline_call []
	stats [('calls_captured', 1), ('unique_graphs', 1)]
	aot_autograd [('total', 1), ('ok', 1)]
	./home/jerryzh/anaconda3/envs/ao_new/lib/python3.9/site-packages/torch/_inductor/compile_fx.py:167: UserWarning: TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.
	warnings.warn(
	frames [('total', 1), ('ok', 1)]
	inductor [('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('benchmarking.TritonBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1)]
	inline_call []
	stats [('calls_captured', 1), ('unique_graphs', 1)]
	aot_autograd [('total', 1), ('ok', 1)]
	.............