test__softmax_function (__main__.TestCudaFuser) ... ok
test__softmax_function_half_to_float (__main__.TestCudaFuser) ... ok
test_addcmul_ops (__main__.TestCudaFuser) ... ok
test_alias_pass_fix (__main__.TestCudaFuser) ... ERROR
test_autocast_1 (__main__.TestCudaFuser) ... skipped 'Failing windows test - see 73620'
test_autocast_1_bfloat (__main__.TestCudaFuser) ... skipped 'device does not support BFloat16'
test_autocast_2 (__main__.TestCudaFuser) ... skipped 'Failing windows test - see 73620'
test_autocast_2_bfloat (__main__.TestCudaFuser) ... skipped 'device does not support BFloat16'
test_backward_type (__main__.TestCudaFuser) ... ERROR
test_batch_norm_half (__main__.TestCudaFuser) ... C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\module.py:1383: UserWarning: positional arguments and argument "destination" are deprecated. nn.Module.state_dict will not accept them in the future. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
warnings.warn(
FAIL
test_batch_norm_impl_index_correctness (__main__.TestCudaFuser) ... FAIL
test_bfloat (__main__.TestCudaFuser) ... skipped 'device does not support BFloat16'
test_binary_bitwise (__main__.TestCudaFuser) ... ok
test_binary_ops (__main__.TestCudaFuser) ... ok
test_binary_ops_channels_last_with_bcast (__main__.TestCudaFuser) ... ok
test_binary_ops_permutation (__main__.TestCudaFuser) ... ERROR
test_branches (__main__.TestCudaFuser) ... ok
test_broadcasting_0 (__main__.TestCudaFuser) ... ok
test_broadcasting_1 (__main__.TestCudaFuser) ... ok
test_broadcasting_2 (__main__.TestCudaFuser) ... ERROR
test_broadcasting_3 (__main__.TestCudaFuser) ... ok
test_broadcasting_multiple_output (__main__.TestCudaFuser) ... skipped "broadcast on branches can't be resolved yet"
test_broadcasting_multiple_output_shape (__main__.TestCudaFuser) ... skipped 'Broadcast with different output not supported yet'
test_broadcasting_partition_logic_0 (__main__.TestCudaFuser) ... ERROR
test_broadcasting_partition_logic_1 (__main__.TestCudaFuser) ... ERROR
test_build_shape_expression_native_dropout (__main__.TestCudaFuser) ... FAIL
test_category_rule (__main__.TestCudaFuser) ... expected failure
test_channels_last_with_broadcast (__main__.TestCudaFuser) ... ERROR
test_chunk (__main__.TestCudaFuser) ... ok
test_clean_profile_ivalue (__main__.TestCudaFuser) ... ok
test_const (__main__.TestCudaFuser) ... ok
test_conv2d_bias (__main__.TestCudaFuser) ... ERROR
test_cpu_scalar (__main__.TestCudaFuser) ... ok
test_disable_sibling_fuse (__main__.TestCudaFuser) ... ok
test_dropout_inference_fusion (__main__.TestCudaFuser) ... FAIL
test_dropout_train_nograd_fusion (__main__.TestCudaFuser) ... FAIL
test_dropout_train_nograd_prob_check (__main__.TestCudaFuser) ... FAIL
test_dropout_training_fusion (__main__.TestCudaFuser) ... skipped 'Failing windows test - see 73620'
test_dropout_training_prob_check (__main__.TestCudaFuser) ... FAIL
test_dynamic_size (__main__.TestCudaFuser) ... [W C:\actions-runner\_work\pytorch\pytorch\torch\csrc\jit\python\init.cpp:650] Warning: Use _jit_set_fusion_strategy, bailout depth is deprecated. Setting to (STATIC, 20) (function operator ())
ok
test_fix_shape_expression_bn (__main__.TestCudaFuser) ... ok
test_gelu (__main__.TestCudaFuser) ... skipped 'Failing windows test - see 73620'
test_grad_sum_to_size (__main__.TestCudaFuser) ... skipped 'Failing windows test - see 73620'
test_graph_for_with_missing_optimized_engine (__main__.TestCudaFuser) ... ok
test_graph_rng (__main__.TestCudaFuser) ... FAIL
test_half (__main__.TestCudaFuser) ... ok
test_inplace_removal (__main__.TestCudaFuser) ... FAIL
test_layer_norm_autodiff (__main__.TestCudaFuser) ... skipped 'Failing windows test - see 73620'
test_layer_norm_parser (__main__.TestCudaFuser) ... ok
test_linear (__main__.TestCudaFuser) ... FAIL
test_linear_1d_weight_mismatch_bias_dtype (__main__.TestCudaFuser) ... FAIL
test_multiple_device_pw (__main__.TestCudaFuser) ... skipped 'requires multiple CUDA device'
test_native_layer_norm (__main__.TestCudaFuser) ... FAIL
test_native_layer_norm_bfloat (__main__.TestCudaFuser) ... skipped 'device does not support BFloat16'
test_native_layer_norm_half (__main__.TestCudaFuser) ... FAIL
test_norm (__main__.TestCudaFuser) ... FAIL
test_norm_bfloat (__main__.TestCudaFuser) ... skipped 'device does not support BFloat16'
test_norm_channels_last (__main__.TestCudaFuser) ... FAIL
test_norm_half (__main__.TestCudaFuser) ... FAIL
test_norm_large (__main__.TestCudaFuser) ... FAIL
test_normalization_partition (__main__.TestCudaFuser) ... ok
test_permutation_preservation (__main__.TestCudaFuser) ... ok
test_profile_ivalue (__main__.TestCudaFuser) ... FAIL
test_profiling_node (__main__.TestCudaFuser) ... FAIL
test_pw_single_reduction_partition (__main__.TestCudaFuser) ... ok
test_random_topo (__main__.TestCudaFuser) ... ok
test_reduction (__main__.TestCudaFuser) ... ok
test_reduction_dtypes_axis (__main__.TestCudaFuser) ... ok
test_reduction_multiple_output (__main__.TestCudaFuser) ... [W C:\actions-runner\_work\pytorch\pytorch\torch\csrc\jit\python\init.cpp:650] Warning: Use _jit_set_fusion_strategy, bailout depth is deprecated. Setting to (STATIC, 20) (function operator ())
ok
test_reduction_permutation (__main__.TestCudaFuser) ... ok
test_reduction_sizes_op (__main__.TestCudaFuser) ... FAIL
test_remove_output_used_only_in_dtype (__main__.TestCudaFuser) ... ok
test_rsub (__main__.TestCudaFuser) ... ok
test_scalar_input (__main__.TestCudaFuser) ... ok
test_scalar_tensor (__main__.TestCudaFuser) ... ok
test_scalar_tensor_permuted (__main__.TestCudaFuser) ... ok
test_shape_expression (__main__.TestCudaFuser) ... ok
test_sibling_fusion (__main__.TestCudaFuser) ... FAIL
test_sibling_fusion_no_scalar_inputs (__main__.TestCudaFuser) ... ok
test_single_reduction_broadcast (__main__.TestCudaFuser) ... ok
test_singleton_fusion (__main__.TestCudaFuser) ... ok
test_softmax (__main__.TestCudaFuser) ... ERROR
test_softmax_bfloat (__main__.TestCudaFuser) ... skipped 'device does not support BFloat16'
test_softmax_dtype (__main__.TestCudaFuser) ... skipped 'Failing windows test - see 73620'
test_softmax_half (__main__.TestCudaFuser) ... ERROR
test_softplus_fuser (__main__.TestCudaFuser) ... skipped 'Failing windows test - see 73620'
test_squeeze (__main__.TestCudaFuser) ... ok
test_squeeze_negative_dim (__main__.TestCudaFuser) ... ok
test_sum_to_one (__main__.TestCudaFuser) ... ok
test_sum_to_size (__main__.TestCudaFuser) ... FAIL
test_ternary_ops (__main__.TestCudaFuser) ... ok
test_ternary_ops_integer_compatibility (__main__.TestCudaFuser) ... ok
test_ternary_ops_type_promotion (__main__.TestCudaFuser) ... ok
test_to_dtype_bf16_to_bf16 (__main__.TestCudaFuser) ... skipped 'device does not support BFloat16'
test_to_dtype_bf16_to_fp32 (__main__.TestCudaFuser) ... skipped 'device does not support BFloat16'
test_to_dtype_fp16_to_fp16 (__main__.TestCudaFuser) ... ok
test_to_dtype_fp16_to_fp32 (__main__.TestCudaFuser) ... ok
test_to_dtype_fp32_to_bf16 (__main__.TestCudaFuser) ... skipped 'device does not support BFloat16'
test_to_dtype_fp32_to_fp16 (__main__.TestCudaFuser) ... ok
test_trivial_reduction (__main__.TestCudaFuser) ... ok
test_type_as_op (__main__.TestCudaFuser) ... ok
test_unary_bitwise (__main__.TestCudaFuser) ... ok
test_unary_ops (__main__.TestCudaFuser) ... skipped 'Failing windows test - see 73620'
test_unsqueeze (__main__.TestCudaFuser) ... ok
test_view (__main__.TestCudaFuser) ... ERROR
test_autodiff_fallback (jit.test_fuser_common.TestFuserCommon) ... ok
test_context_manager_test (__main__.TestPassManagerCudaFuser) ... ok
test_register_fuser (__main__.TestPassManagerCudaFuser) ... ok
======================================================================
ERROR: test_alias_pass_fix (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 3501, in test_alias_pass_fix
self._run_helper(t_jit, t, x, w, b)
File "../../../test/test_jit_cuda_fuser.py", line 149, in _run_helper
jit_o = jit_op(*args)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 428, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 425, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: index_map.find(root_dom[i]) != index_map.end() INTERNAL ASSERT FAILED at "C:\\actions-runner\\_work\\pytorch\\pytorch\\torch\\csrc\\jit\\codegen\\cuda\\index_compute.cpp":1980, please report a bug to PyTorch. Couldn't find root mapping for TV8 dim: 0 id: iS127{T0.size[1]}
======================================================================
ERROR: test_backward_type (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2347, in test_backward_type
FileCheck().check(FUSION_GROUP).run(bwd_graph)
RuntimeError: Expected to find "prim::CudaFusionGroup" but did not find it
Searched string:
graph(%0 : Tensor,
~~~~~~~~~~~~~~~~~~ <--- HERE
%self_size.23 : int[]?,
%other_size.23 : int[]?,
%self_size.25 : int[]?,
From CHECK: prim::CudaFusionGroup
======================================================================
ERROR: test_binary_ops_permutation (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 1119, in test_binary_ops_permutation
self._permutation_helper(x, b_axis, torch.float32, "cuda", perm0, perm1)
File "../../../test/test_jit_cuda_fuser.py", line 1096, in _permutation_helper
jit_o = t_jit(x, y)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 428, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 425, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: index_map.find(root_dom[i]) != index_map.end() INTERNAL ASSERT FAILED at "C:\\actions-runner\\_work\\pytorch\\pytorch\\torch\\csrc\\jit\\codegen\\cuda\\index_compute.cpp":1980, please report a bug to PyTorch. Couldn't find root mapping for TV6 dim: 1 id: iS105{T0.size[0]}
======================================================================
ERROR: test_broadcasting_2 (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 364, in test_broadcasting_2
jit_o = t_jit(x, y, 2.0)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 428, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 425, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: index_map.find(root_dom[i]) != index_map.end() INTERNAL ASSERT FAILED at "C:\\actions-runner\\_work\\pytorch\\pytorch\\torch\\csrc\\jit\\codegen\\cuda\\index_compute.cpp":1980, please report a bug to PyTorch. Couldn't find root mapping for TV5 dim: 2 id: iS102{T0.size[2]}
======================================================================
ERROR: test_broadcasting_partition_logic_0 (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 408, in test_broadcasting_partition_logic_0
jit_o = t_jit(x, y, z)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 428, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 425, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: index_map.find(root_dom[i]) != index_map.end() INTERNAL ASSERT FAILED at "C:\\actions-runner\\_work\\pytorch\\pytorch\\torch\\csrc\\jit\\codegen\\cuda\\index_compute.cpp":1980, please report a bug to PyTorch. Couldn't find root mapping for TV9 dim: 0 id: iS179{T0.size[0]}
======================================================================
ERROR: test_broadcasting_partition_logic_1 (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 430, in test_broadcasting_partition_logic_1
jit_o = t_jit(x, y, z)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 428, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 425, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: index_map.find(root_dom[i]) != index_map.end() INTERNAL ASSERT FAILED at "C:\\actions-runner\\_work\\pytorch\\pytorch\\torch\\csrc\\jit\\codegen\\cuda\\index_compute.cpp":1980, please report a bug to PyTorch. Couldn't find root mapping for TV9 dim: 2 id: iS186{T0.size[2]}
======================================================================
ERROR: test_channels_last_with_broadcast (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 1736, in test_channels_last_with_broadcast
jit_o = t_jit(x, y)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 428, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 425, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: index_map.find(root_dom[i]) != index_map.end() INTERNAL ASSERT FAILED at "C:\\actions-runner\\_work\\pytorch\\pytorch\\torch\\csrc\\jit\\codegen\\cuda\\index_compute.cpp":1980, please report a bug to PyTorch. Couldn't find root mapping for TV5 dim: 0 id: iS111{T0.size[0]}
======================================================================
ERROR: test_conv2d_bias (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2981, in test_conv2d_bias
jit_o = jitted(inp, weight, bias)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 428, in prof_func_call
return prof_callable(func_call, *args, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 425, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: index_map.find(root_dom[i]) != index_map.end() INTERNAL ASSERT FAILED at "C:\\actions-runner\\_work\\pytorch\\pytorch\\torch\\csrc\\jit\\codegen\\cuda\\index_compute.cpp":1980, please report a bug to PyTorch. Couldn't find root mapping for TV6 dim: 0 id: iS113{T0.size[1]}
======================================================================
ERROR: test_softmax (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 1624, in test_softmax
self._softmax_helper(x, reduction_dim, torch.float32, "cuda", 1e-4)
File "../../../test/test_jit_cuda_fuser.py", line 1511, in _softmax_helper
jit_o = t_jit(x, y)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\module.py", line 1111, in _call_impl
return forward_call(*input, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 431, in prof_meth_call
return prof_callable(meth_call, *args, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 425, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: index_map.find(root_dom[i]) != index_map.end() INTERNAL ASSERT FAILED at "C:\\actions-runner\\_work\\pytorch\\pytorch\\torch\\csrc\\jit\\codegen\\cuda\\index_compute.cpp":1980, please report a bug to PyTorch. Couldn't find root mapping for TV11 dim: 0 id: iS126{T9.size[0]}
======================================================================
ERROR: test_softmax_half (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 1640, in test_softmax_half
self._softmax_helper(x, reduction_dim, torch.float16, "cuda", 5e-3)
File "../../../test/test_jit_cuda_fuser.py", line 1511, in _softmax_helper
jit_o = t_jit(x, y)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\module.py", line 1111, in _call_impl
return forward_call(*input, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 431, in prof_meth_call
return prof_callable(meth_call, *args, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 425, in prof_callable
return callable(*args, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: index_map.find(root_dom[i]) != index_map.end() INTERNAL ASSERT FAILED at "C:\\actions-runner\\_work\\pytorch\\pytorch\\torch\\csrc\\jit\\codegen\\cuda\\index_compute.cpp":1980, please report a bug to PyTorch. Couldn't find root mapping for TV22 dim: 0 id: iS208{T14.size[0]}
======================================================================
ERROR: test_view (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 3363, in test_view
self._bias_view_relu_helper([2, 3, 4, 5], [-1, 4, 5], torch.float, 'cuda', 1e-6)
File "../../../test/test_jit_cuda_fuser.py", line 3201, in _bias_view_relu_helper
jit_o = t_jit(x, output_shape)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\module.py", line 1111, in _call_impl
return forward_call(*input, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 431, in prof_meth_call
return prof_callable(meth_call, *args, **kwargs)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 425, in prof_callable
return callable(*args, **kwargs)
RuntimeError: view_sizes.has_value() INTERNAL ASSERT FAILED at "C:\\actions-runner\\_work\\pytorch\\pytorch\\torch\\csrc\\jit\\codegen\\cuda\\parser.cpp":2463, please report a bug to PyTorch. The size parameter is required.
======================================================================
FAIL: test_batch_norm_half (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2893, in test_batch_norm_half
self._test_batch_norm_impl_index_helper(4, 8, 5, affine, track_running_stats, training, torch.half)
File "../../../test/test_jit_cuda_fuser.py", line 2845, in _test_batch_norm_impl_index_helper
self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1, consider_subgraphs=True)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 325, in assertGraphContainsExactly
perform_assert(graph, kind, count, num_kind_nodes,
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 318, in perform_assert
raise AssertionError(
AssertionError: graph(%self : __torch__.MyModule,
%x.1 : Tensor):
%2 : float = prim::Constant[value=1.0000000000000001e-05]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:179:12
%3 : float = prim::Constant[value=0.10000000000000001]()
%4 : int = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:148:46
%5 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2423:81
%6 : float = prim::Constant[value=2.]() # ../../../test/test_jit_cuda_fuser.py:2792:24
%7 : int = prim::BailoutTemplate_0()
%8 : Half(4, 8, 5, 5, strides=[200, 25, 5, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=0](%7, %x.1, %self)
%bn : __torch__.torch.nn.modules.batchnorm.BatchNorm2d = prim::GetAttr[name="bn"](%self)
%training.1 : bool = prim::GetAttr[name="training"](%bn)
= prim::If(%training.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:145:11
block0():
%num_batches_tracked.7 : Tensor = prim::GetAttr[name="num_batches_tracked"](%bn)
%12 : Tensor = aten::add_(%num_batches_tracked.7, %4, %4) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:148:16
-> ()
block1():
-> ()
%running_mean : Tensor = prim::GetAttr[name="running_mean"](%bn)
%14 : Half(8, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=2](%7, %running_mean, %x.1, %bn, %training.1)
%running_var : Tensor = prim::GetAttr[name="running_var"](%bn)
%16 : Half(8, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=3](%7, %running_var, %x.1, %bn, %training.1, %14)
%weight : Tensor = prim::GetAttr[name="weight"](%bn)
%18 : Half(8, strides=[1], requires_grad=1, device=cuda:0) = prim::BailOut[index=4](%7, %weight, %x.1, %bn, %training.1, %14, %16)
%bias : Tensor = prim::GetAttr[name="bias"](%bn)
%20 : Half(8, strides=[1], requires_grad=1, device=cuda:0) = prim::BailOut[index=5](%7, %bias, %x.1, %training.1, %18, %14, %16)
= prim::If(%training.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2419:4
block0():
%21 : Half(4, 8, 5, 5, strides=[200, 25, 5, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=6](%7, %x.1, %training.1, %18, %20, %14, %16)
-> ()
block1():
-> ()
%22 : Half(4, 8, 5, 5, strides=[200, 25, 5, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=7](%7, %x.1, %training.1, %18, %20, %14, %16)
%o.1 : Tensor = aten::batch_norm(%22, %18, %20, %14, %16, %training.1, %3, %2, %5) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2422:11
%24 : Half(4, 8, 5, 5, strides=[200, 25, 5, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=8](%7, %o.1)
%o.5 : Half(4, 8, 5, 5, strides=[200, 25, 5, 1], requires_grad=1, device=cuda:0) = aten::mul(%24, %6) # ../../../test/test_jit_cuda_fuser.py:2792:20
return (%o.5)
with prim::BailoutTemplate_0 = graph(%self : __torch__.MyModule,
%x.1 : Tensor):
%2 : Half(4, 8, 5, 5, strides=[200, 25, 5, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=0](%x.1, %self)
%3 : float = prim::Constant[value=2.]() # ../../../test/test_jit_cuda_fuser.py:2792:24
%4 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2423:81
%5 : int = prim::Constant[value=2]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2385:31
%6 : int = prim::Constant[value=0]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2384:22
%7 : str = prim::Constant[value="Expected more than 1 value per channel when training, got input size {}"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2388:25
%8 : str = prim::Constant[value="builtins.ValueError"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:407:18
%9 : str = prim::Constant[value="expected 4D input (got {}D input)"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:407:29
%10 : int = prim::Constant[value=4]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:406:26
%11 : int = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:148:46
%12 : float = prim::Constant[value=0.10000000000000001]()
%13 : float = prim::Constant[value=1.0000000000000001e-05]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:179:12
%bn : __torch__.torch.nn.modules.batchnorm.BatchNorm2d = prim::GetAttr[name="bn"](%self)
%15 : int = aten::dim(%2) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:406:11
%16 : bool = aten::ne(%15, %10) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:406:11
= prim::If(%16) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:406:8
block0():
%17 : Tensor = prim::BailOut[index=1](%x.1, %bn)
%18 : int = aten::dim(%17) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:407:72
%19 : str = aten::format(%9, %18) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:407:29
= prim::RaiseException(%19, %8) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:407:12
-> ()
block1():
-> ()
%training.1 : bool = prim::GetAttr[name="training"](%bn)
= prim::If(%training.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:145:11
block0():
%num_batches_tracked.7 : Tensor = prim::GetAttr[name="num_batches_tracked"](%bn)
%22 : Tensor = aten::add_(%num_batches_tracked.7, %11, %11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:148:16
-> ()
block1():
-> ()
%running_mean : Tensor = prim::GetAttr[name="running_mean"](%bn)
%24 : Half(8, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=2](%running_mean, %x.1, %bn, %training.1)
%running_var : Tensor = prim::GetAttr[name="running_var"](%bn)
%26 : Half(8, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=3](%running_var, %x.1, %bn, %training.1, %24)
%weight : Tensor = prim::GetAttr[name="weight"](%bn)
%28 : Half(8, strides=[1], requires_grad=1, device=cuda:0) = prim::BailOut[index=4](%weight, %x.1, %bn, %training.1, %24, %26)
%bias : Tensor = prim::GetAttr[name="bias"](%bn)
%30 : Half(8, strides=[1], requires_grad=1, device=cuda:0) = prim::BailOut[index=5](%bias, %x.1, %training.1, %28, %24, %26)
= prim::If(%training.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2419:4
block0():
%31 : Half(4, 8, 5, 5, strides=[200, 25, 5, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=6](%x.1, %training.1, %28, %30, %24, %26)
%32 : int[] = aten::size(%31) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2420:27
%size_prods.1 : int = aten::__getitem__(%32, %6) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2384:17
%34 : int = aten::len(%32) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2385:19
%35 : int = aten::sub(%34, %5) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2385:19
%size_prods : int = prim::Loop(%35, %4, %size_prods.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2385:4
block0(%i.1 : int, %size_prods.11 : int):
%39 : int = aten::add(%i.1, %5) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2386:27
%40 : int = aten::__getitem__(%32, %39) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2386:22
%size_prods.5 : int = aten::mul(%size_prods.11, %40) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2386:8
-> (%4, %size_prods.5)
%42 : bool = aten::eq(%size_prods, %11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2387:7
= prim::If(%42) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2387:4
block0():
%43 : str = aten::format(%7, %32) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2388:25
= prim::RaiseException(%43, %8) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2388:8
-> ()
block1():
-> ()
-> ()
block1():
-> ()
%44 : Half(4, 8, 5, 5, strides=[200, 25, 5, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=7](%x.1, %training.1, %28, %30, %24, %26)
%o.1 : Tensor = aten::batch_norm(%44, %28, %30, %24, %26, %training.1, %12, %13, %4) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2422:11
%46 : Half(4, 8, 5, 5, strides=[200, 25, 5, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=8](%o.1)
%o.5 : Half(4, 8, 5, 5, strides=[200, 25, 5, 1], requires_grad=1, device=cuda:0) = aten::mul(%46, %3) # ../../../test/test_jit_cuda_fuser.py:2792:20
return (%o.5)
Error: graph contains 0 prim::CudaFusionGuard nodes (including subgraphs) but expected 1
======================================================================
FAIL: test_batch_norm_impl_index_correctness (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2918, in test_batch_norm_impl_index_correctness
self._test_batch_norm_impl_index_helper(b, c, hw, affine, track_running_stats, training)
File "../../../test/test_jit_cuda_fuser.py", line 2845, in _test_batch_norm_impl_index_helper
self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1, consider_subgraphs=True)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 325, in assertGraphContainsExactly
perform_assert(graph, kind, count, num_kind_nodes,
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 318, in perform_assert
raise AssertionError(
AssertionError: graph(%self : __torch__.MyModule,
%x.1 : Tensor):
%2 : float = prim::Constant[value=1.0000000000000001e-05]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:179:12
%3 : float = prim::Constant[value=0.10000000000000001]()
%4 : int = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:148:46
%5 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2423:81
%6 : float = prim::Constant[value=2.]() # ../../../test/test_jit_cuda_fuser.py:2792:24
%7 : int = prim::BailoutTemplate_0()
%8 : Float(2, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=0](%7, %x.1, %self)
%bn : __torch__.torch.nn.modules.batchnorm.BatchNorm2d = prim::GetAttr[name="bn"](%self)
%training.1 : bool = prim::GetAttr[name="training"](%bn)
= prim::If(%training.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:145:11
block0():
%num_batches_tracked.7 : Tensor = prim::GetAttr[name="num_batches_tracked"](%bn)
%12 : Tensor = aten::add_(%num_batches_tracked.7, %4, %4) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:148:16
-> ()
block1():
-> ()
%running_mean : Tensor = prim::GetAttr[name="running_mean"](%bn)
%14 : Float(4, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=2](%7, %running_mean, %x.1, %bn, %training.1)
%running_var : Tensor = prim::GetAttr[name="running_var"](%bn)
%16 : Float(4, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=3](%7, %running_var, %x.1, %bn, %training.1, %14)
%weight : Tensor = prim::GetAttr[name="weight"](%bn)
%18 : Float(4, strides=[1], requires_grad=1, device=cuda:0) = prim::BailOut[index=4](%7, %weight, %x.1, %bn, %training.1, %14, %16)
%bias : Tensor = prim::GetAttr[name="bias"](%bn)
%20 : Float(4, strides=[1], requires_grad=1, device=cuda:0) = prim::BailOut[index=5](%7, %bias, %x.1, %training.1, %18, %14, %16)
= prim::If(%training.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2419:4
block0():
%21 : Float(2, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=6](%7, %x.1, %training.1, %18, %20, %14, %16)
-> ()
block1():
-> ()
%22 : Float(2, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=7](%7, %x.1, %training.1, %18, %20, %14, %16)
%o.1 : Tensor = aten::batch_norm(%22, %18, %20, %14, %16, %training.1, %3, %2, %5) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2422:11
%24 : Float(2, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=8](%7, %o.1)
%o.5 : Float(2, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = aten::mul(%24, %6) # ../../../test/test_jit_cuda_fuser.py:2792:20
return (%o.5)
with prim::BailoutTemplate_0 = graph(%self : __torch__.MyModule,
%x.1 : Tensor):
%2 : Float(2, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=0](%x.1, %self)
%3 : float = prim::Constant[value=2.]() # ../../../test/test_jit_cuda_fuser.py:2792:24
%4 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2423:81
%5 : int = prim::Constant[value=2]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2385:31
%6 : int = prim::Constant[value=0]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2384:22
%7 : str = prim::Constant[value="Expected more than 1 value per channel when training, got input size {}"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2388:25
%8 : str = prim::Constant[value="builtins.ValueError"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:407:18
%9 : str = prim::Constant[value="expected 4D input (got {}D input)"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:407:29
%10 : int = prim::Constant[value=4]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:406:26
%11 : int = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:148:46
%12 : float = prim::Constant[value=0.10000000000000001]()
%13 : float = prim::Constant[value=1.0000000000000001e-05]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:179:12
%bn : __torch__.torch.nn.modules.batchnorm.BatchNorm2d = prim::GetAttr[name="bn"](%self)
%15 : int = aten::dim(%2) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:406:11
%16 : bool = aten::ne(%15, %10) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:406:11
= prim::If(%16) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:406:8
block0():
%17 : Tensor = prim::BailOut[index=1](%x.1, %bn)
%18 : int = aten::dim(%17) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:407:72
%19 : str = aten::format(%9, %18) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:407:29
= prim::RaiseException(%19, %8) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:407:12
-> ()
block1():
-> ()
%training.1 : bool = prim::GetAttr[name="training"](%bn)
= prim::If(%training.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:145:11
block0():
%num_batches_tracked.7 : Tensor = prim::GetAttr[name="num_batches_tracked"](%bn)
%22 : Tensor = aten::add_(%num_batches_tracked.7, %11, %11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\modules\batchnorm.py:148:16
-> ()
block1():
-> ()
%running_mean : Tensor = prim::GetAttr[name="running_mean"](%bn)
%24 : Float(4, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=2](%running_mean, %x.1, %bn, %training.1)
%running_var : Tensor = prim::GetAttr[name="running_var"](%bn)
%26 : Float(4, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=3](%running_var, %x.1, %bn, %training.1, %24)
%weight : Tensor = prim::GetAttr[name="weight"](%bn)
%28 : Float(4, strides=[1], requires_grad=1, device=cuda:0) = prim::BailOut[index=4](%weight, %x.1, %bn, %training.1, %24, %26)
%bias : Tensor = prim::GetAttr[name="bias"](%bn)
%30 : Float(4, strides=[1], requires_grad=1, device=cuda:0) = prim::BailOut[index=5](%bias, %x.1, %training.1, %28, %24, %26)
= prim::If(%training.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2419:4
block0():
%31 : Float(2, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=6](%x.1, %training.1, %28, %30, %24, %26)
%32 : int[] = aten::size(%31) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2420:27
%size_prods.1 : int = aten::__getitem__(%32, %6) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2384:17
%34 : int = aten::len(%32) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2385:19
%35 : int = aten::sub(%34, %5) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2385:19
%size_prods : int = prim::Loop(%35, %4, %size_prods.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2385:4
block0(%i.1 : int, %size_prods.11 : int):
%39 : int = aten::add(%i.1, %5) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2386:27
%40 : int = aten::__getitem__(%32, %39) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2386:22
%size_prods.5 : int = aten::mul(%size_prods.11, %40) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2386:8
-> (%4, %size_prods.5)
%42 : bool = aten::eq(%size_prods, %11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2387:7
= prim::If(%42) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2387:4
block0():
%43 : str = aten::format(%7, %32) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2388:25
= prim::RaiseException(%43, %8) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2388:8
-> ()
block1():
-> ()
-> ()
block1():
-> ()
%44 : Float(2, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=7](%x.1, %training.1, %28, %30, %24, %26)
%o.1 : Tensor = aten::batch_norm(%44, %28, %30, %24, %26, %training.1, %12, %13, %4) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:2422:11
%46 : Float(2, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=8](%o.1)
%o.5 : Float(2, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = aten::mul(%46, %3) # ../../../test/test_jit_cuda_fuser.py:2792:20
return (%o.5)
Error: graph contains 0 prim::CudaFusionGuard nodes (including subgraphs) but expected 1
======================================================================
FAIL: test_build_shape_expression_native_dropout (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 3572, in test_build_shape_expression_native_dropout
self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 311, in assertGraphContains
self.assertTrue(len(out_nodes) > 0)
AssertionError: False is not true
======================================================================
FAIL: test_dropout_inference_fusion (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2148, in test_dropout_inference_fusion
self._run_helper(t_jit, t, x, 0.15, False)
File "../../../test/test_jit_cuda_fuser.py", line 154, in _run_helper
self.assertGraphContainsExactly(jit_op.graph_for(*args), FUSION_GUARD, 1, consider_subgraphs=True)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 325, in assertGraphContainsExactly
perform_assert(graph, kind, count, num_kind_nodes,
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 318, in perform_assert
raise AssertionError(
AssertionError: graph(%x.1 : Tensor,
%p.1 : float,
%train.1 : bool):
%3 : str = prim::Constant[value="builtins.ValueError"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:14
%4 : str = prim::Constant[value="dropout probability has to be between 0 and 1, but got {}"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
%5 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%6 : float = prim::Constant[value=0.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:11
%7 : float = prim::Constant[value=1.]() # ../../../test/test_jit_cuda_fuser.py:2143:20
%8 : int = prim::Constant[value=1]()
%9 : int = prim::BailoutTemplate_0()
%10 : Tensor = prim::BailOut[index=0](%9, %x.1, %p.1, %train.1)
%11 : bool = aten::lt(%p.1, %6) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%12 : bool = prim::If(%11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
block0():
-> (%5)
block1():
%13 : bool = aten::gt(%p.1, %7) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:18
-> (%13)
= prim::If(%12) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:4
block0():
%14 : str = aten::format(%4, %p.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
= prim::RaiseException(%14, %3) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:8
-> ()
block1():
-> ()
%15 : Tensor = aten::dropout(%10, %p.1, %train.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1279:60
%16 : Tensor = prim::BailOut[index=1](%9, %15)
%o.5 : Tensor = aten::add(%16, %7, %8) # ../../../test/test_jit_cuda_fuser.py:2143:16
return (%o.5)
with prim::BailoutTemplate_0 = graph(%x.1 : Tensor,
%p.1 : float,
%train.1 : bool):
%3 : Float(10, 4, 8, strides=[32, 8, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=0](%x.1, %p.1, %train.1)
%4 : int = prim::Constant[value=1]()
%5 : float = prim::Constant[value=1.]() # ../../../test/test_jit_cuda_fuser.py:2143:20
%6 : float = prim::Constant[value=0.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:11
%7 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%8 : str = prim::Constant[value="dropout probability has to be between 0 and 1, but got {}"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
%9 : str = prim::Constant[value="builtins.ValueError"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:14
%10 : bool = aten::lt(%p.1, %6) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%11 : bool = prim::If(%10) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
block0():
-> (%7)
block1():
%12 : bool = aten::gt(%p.1, %5) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:18
-> (%12)
= prim::If(%11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:4
block0():
%13 : str = aten::format(%8, %p.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
= prim::RaiseException(%13, %9) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:8
-> ()
block1():
-> ()
%14 : Tensor = aten::dropout(%3, %p.1, %train.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1279:60
%15 : Float(10, 4, 8, strides=[32, 8, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=1](%14)
%o.5 : Float(10, 4, 8, strides=[32, 8, 1], requires_grad=0, device=cuda:0) = aten::add(%15, %5, %4) # ../../../test/test_jit_cuda_fuser.py:2143:16
return (%o.5)
Error: graph contains 0 prim::CudaFusionGuard nodes (including subgraphs) but expected 1
======================================================================
FAIL: test_dropout_train_nograd_fusion (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2165, in test_dropout_train_nograd_fusion
self._run_helper(t_jit, t, x, 0.0, True)
File "../../../test/test_jit_cuda_fuser.py", line 154, in _run_helper
self.assertGraphContainsExactly(jit_op.graph_for(*args), FUSION_GUARD, 1, consider_subgraphs=True)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 325, in assertGraphContainsExactly
perform_assert(graph, kind, count, num_kind_nodes,
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 318, in perform_assert
raise AssertionError(
AssertionError: graph(%x.1 : Tensor,
%p.1 : float,
%train.1 : bool):
%3 : str = prim::Constant[value="builtins.ValueError"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:14
%4 : str = prim::Constant[value="dropout probability has to be between 0 and 1, but got {}"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
%5 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%6 : float = prim::Constant[value=0.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:11
%7 : float = prim::Constant[value=1.]() # ../../../test/test_jit_cuda_fuser.py:2160:20
%8 : int = prim::Constant[value=1]()
%9 : int = prim::BailoutTemplate_0()
%10 : Tensor = prim::BailOut[index=0](%9, %x.1, %p.1, %train.1)
%11 : bool = aten::lt(%p.1, %6) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%12 : bool = prim::If(%11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
block0():
-> (%5)
block1():
%13 : bool = aten::gt(%p.1, %7) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:18
-> (%13)
= prim::If(%12) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:4
block0():
%14 : str = aten::format(%4, %p.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
= prim::RaiseException(%14, %3) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:8
-> ()
block1():
-> ()
%15 : Tensor = aten::dropout(%10, %p.1, %train.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1279:60
%16 : Tensor = prim::BailOut[index=1](%9, %15)
%o.5 : Tensor = aten::add(%16, %7, %8) # ../../../test/test_jit_cuda_fuser.py:2160:16
return (%o.5)
with prim::BailoutTemplate_0 = graph(%x.1 : Tensor,
%p.1 : float,
%train.1 : bool):
%3 : Float(10, 4, 8, strides=[32, 8, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=0](%x.1, %p.1, %train.1)
%4 : int = prim::Constant[value=1]()
%5 : float = prim::Constant[value=1.]() # ../../../test/test_jit_cuda_fuser.py:2160:20
%6 : float = prim::Constant[value=0.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:11
%7 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%8 : str = prim::Constant[value="dropout probability has to be between 0 and 1, but got {}"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
%9 : str = prim::Constant[value="builtins.ValueError"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:14
%10 : bool = aten::lt(%p.1, %6) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%11 : bool = prim::If(%10) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
block0():
-> (%7)
block1():
%12 : bool = aten::gt(%p.1, %5) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:18
-> (%12)
= prim::If(%11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:4
block0():
%13 : str = aten::format(%8, %p.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
= prim::RaiseException(%13, %9) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:8
-> ()
block1():
-> ()
%14 : Tensor = aten::dropout(%3, %p.1, %train.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1279:60
%15 : Float(10, 4, 8, strides=[32, 8, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=1](%14)
%o.5 : Float(10, 4, 8, strides=[32, 8, 1], requires_grad=0, device=cuda:0) = aten::add(%15, %5, %4) # ../../../test/test_jit_cuda_fuser.py:2160:16
return (%o.5)
Error: graph contains 0 prim::CudaFusionGuard nodes (including subgraphs) but expected 1
======================================================================
FAIL: test_dropout_train_nograd_prob_check (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2195, in test_dropout_train_nograd_prob_check
self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 325, in assertGraphContainsExactly
perform_assert(graph, kind, count, num_kind_nodes,
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 318, in perform_assert
raise AssertionError(
AssertionError: graph(%x.1 : Tensor,
%p.1 : float,
%train.1 : bool):
%3 : str = prim::Constant[value="builtins.ValueError"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:14
%4 : str = prim::Constant[value="dropout probability has to be between 0 and 1, but got {}"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
%5 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%6 : float = prim::Constant[value=0.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:11
%7 : float = prim::Constant[value=1.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:22
%8 : float = prim::Constant[value=2.]() # ../../../test/test_jit_cuda_fuser.py:2177:20
%9 : int = prim::BailoutTemplate_0()
%10 : Tensor = prim::BailOut[index=0](%9, %x.1, %p.1, %train.1)
%11 : bool = aten::lt(%p.1, %6) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%12 : bool = prim::If(%11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
block0():
-> (%5)
block1():
%13 : bool = aten::gt(%p.1, %7) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:18
-> (%13)
= prim::If(%12) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:4
block0():
%14 : str = aten::format(%4, %p.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
= prim::RaiseException(%14, %3) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:8
-> ()
block1():
-> ()
%15 : Tensor = aten::dropout(%10, %p.1, %train.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1279:60
%16 : Tensor = prim::BailOut[index=1](%9, %15)
%o.5 : Tensor = aten::mul(%16, %8) # ../../../test/test_jit_cuda_fuser.py:2177:16
return (%o.5)
with prim::BailoutTemplate_0 = graph(%x.1 : Tensor,
%p.1 : float,
%train.1 : bool):
%3 : Float(1024, 1024, strides=[1024, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=0](%x.1, %p.1, %train.1)
%4 : float = prim::Constant[value=2.]() # ../../../test/test_jit_cuda_fuser.py:2177:20
%5 : float = prim::Constant[value=1.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:22
%6 : float = prim::Constant[value=0.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:11
%7 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%8 : str = prim::Constant[value="dropout probability has to be between 0 and 1, but got {}"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
%9 : str = prim::Constant[value="builtins.ValueError"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:14
%10 : bool = aten::lt(%p.1, %6) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%11 : bool = prim::If(%10) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
block0():
-> (%7)
block1():
%12 : bool = aten::gt(%p.1, %5) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:18
-> (%12)
= prim::If(%11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:4
block0():
%13 : str = aten::format(%8, %p.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
= prim::RaiseException(%13, %9) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:8
-> ()
block1():
-> ()
%14 : Tensor = aten::dropout(%3, %p.1, %train.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1279:60
%15 : Float(1024, 1024, strides=[1024, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=1](%14)
%o.5 : Float(1024, 1024, strides=[1024, 1], requires_grad=0, device=cuda:0) = aten::mul(%15, %4) # ../../../test/test_jit_cuda_fuser.py:2177:16
return (%o.5)
Error: graph contains 0 prim::CudaFusionGuard nodes (including subgraphs) but expected 1
======================================================================
FAIL: test_dropout_training_prob_check (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2282, in test_dropout_training_prob_check
self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 325, in assertGraphContainsExactly
perform_assert(graph, kind, count, num_kind_nodes,
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 318, in perform_assert
raise AssertionError(
AssertionError: graph(%x.1 : Tensor,
%p.1 : float,
%train.1 : bool):
%3 : str = prim::Constant[value="builtins.ValueError"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:14
%4 : str = prim::Constant[value="dropout probability has to be between 0 and 1, but got {}"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
%5 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%6 : float = prim::Constant[value=0.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:11
%7 : float = prim::Constant[value=1.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:22
%8 : int = prim::BailoutTemplate_0()
%9 : Float(1024, 1024, strides=[1024, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=0](%8, %x.1, %p.1, %train.1)
%10 : bool = aten::lt(%p.1, %6) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%11 : bool = prim::If(%10) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
block0():
-> (%5)
block1():
%12 : bool = aten::gt(%p.1, %7) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:18
-> (%12)
= prim::If(%11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:4
block0():
%13 : str = aten::format(%4, %p.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
= prim::RaiseException(%13, %3) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:8
-> ()
block1():
-> ()
%14 : Float(1024, 1024, strides=[1024, 1], requires_grad=1, device=cuda:0), %15 : bool = prim::RequiresGradCheck[types=[Tensor(requires_grad=1)]](%9)
%16 : Tensor = prim::If(%15)
block0():
%17 : Tensor = prim::DifferentiableGraph_1(%14, %p.1, %train.1)
-> (%17)
block1():
%18 : Function = prim::Constant[name="fallback_function", fallback=1]()
%19 : (Tensor) = prim::CallFunction(%18, %9, %p.1, %train.1)
%20 : Tensor = prim::TupleUnpack(%19)
-> (%20)
%21 : Float(1024, 1024, strides=[1024, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=1](%8, %16)
%22 : Float(1024, 1024, strides=[1024, 1], requires_grad=1, device=cuda:0), %23 : bool = prim::RequiresGradCheck[types=[Tensor(requires_grad=1)]](%21)
%24 : Float(1024, 1024, strides=[1024, 1], requires_grad=1, device=cuda:0) = prim::If(%23)
block0():
%o.10 : Float(1024, 1024, strides=[1024, 1], requires_grad=1, device=cuda:0) = prim::DifferentiableGraph_2(%22)
-> (%o.10)
block1():
%26 : Function = prim::Constant[name="fallback_function", fallback=1]()
%27 : (Tensor) = prim::CallFunction(%26, %21)
%28 : Tensor = prim::TupleUnpack(%27)
-> (%28)
return (%24)
with prim::BailoutTemplate_0 = graph(%x.1 : Tensor,
%p.1 : float,
%train.1 : bool):
%3 : Float(1024, 1024, strides=[1024, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=0](%x.1, %p.1, %train.1)
%4 : float = prim::Constant[value=2.]() # ../../../test/test_jit_cuda_fuser.py:2262:20
%5 : float = prim::Constant[value=1.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:22
%6 : float = prim::Constant[value=0.]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:11
%7 : bool = prim::Constant[value=1]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%8 : str = prim::Constant[value="dropout probability has to be between 0 and 1, but got {}"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
%9 : str = prim::Constant[value="builtins.ValueError"]() # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:14
%10 : bool = aten::lt(%p.1, %6) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
%11 : bool = prim::If(%10) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:7
block0():
-> (%7)
block1():
%12 : bool = aten::gt(%p.1, %5) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:18
-> (%12)
= prim::If(%11) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1277:4
block0():
%13 : str = aten::format(%8, %p.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:25
= prim::RaiseException(%13, %9) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1278:8
-> ()
block1():
-> ()
%14 : Tensor = aten::dropout(%3, %p.1, %train.1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1279:60
%15 : Float(1024, 1024, strides=[1024, 1], requires_grad=1, device=cuda:0) = prim::BailOut[index=1](%14)
%o.5 : Float(1024, 1024, strides=[1024, 1], requires_grad=1, device=cuda:0) = aten::mul(%15, %4) # ../../../test/test_jit_cuda_fuser.py:2262:16
return (%o.5)
with prim::DifferentiableGraph_1 = graph(%0 : Tensor,
%p.1 : float,
%train.1 : bool):
%4 : float = prim::Constant[value=1.]() # <string>:45:19
%3 : float = prim::Constant[value=0.]() # <string>:46:39
%20 : int = prim::BailoutTemplate_0()
%5 : float = aten::sub(%4, %p.1) # <string>:45:19
%6 : float = aten::Float(%train.1) # <string>:45:29
%p1m.1 : float = aten::mul(%5, %6) # <string>:45:19
%8 : bool = aten::eq(%p1m.1, %3) # <string>:46:32
%9 : float = aten::Float(%8) # <string>:46:26
%10 : float = aten::add(%9, %p1m.1) # <string>:46:26
%scale.4 : float = aten::div(%4, %10) # <string>:46:20
%res.1 : Tensor, %mask.4 : Tensor = aten::native_dropout(%0, %p.1, %train.1) # <string>:47:23
%18 : Tensor = prim::BailOut[index=0](%20, %mask.4, %scale.4, %res.1)
%19 : Tensor = prim::BailOut[index=1](%20, %res.1, %scale.4, %18)
return (%19, %scale.4, %18)
with prim::BailoutTemplate_0 = graph(%0 : Tensor,
%p.1 : float,
%train.1 : bool):
%3 : float = prim::Constant[value=0.]() # <string>:46:39
%4 : float = prim::Constant[value=1.]() # <string>:45:19
%5 : float = aten::sub(%4, %p.1) # <string>:45:19
%6 : float = aten::Float(%train.1) # <string>:45:29
%p1m.1 : float = aten::mul(%5, %6) # <string>:45:19
%8 : bool = aten::eq(%p1m.1, %3) # <string>:46:32
%9 : float = aten::Float(%8) # <string>:46:26
%10 : float = aten::add(%9, %p1m.1) # <string>:46:26
%scale.4 : float = aten::div(%4, %10) # <string>:46:20
%res.1 : Tensor, %mask.4 : Tensor = aten::native_dropout(%0, %p.1, %train.1) # <string>:47:23
%14 : Bool(1024, 1024, strides=[1024, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=0](%mask.4, %scale.4, %res.1)
%15 : Float(1024, 1024, strides=[1024, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=1](%res.1, %scale.4, %14)
return (%15, %scale.4, %14)
with prim::DifferentiableGraph_2 = graph(%0 : Tensor):
%1 : float = prim::Constant[value=2.]()
%8 : int = prim::BailoutTemplate_0()
%7 : Tensor = prim::BailOut[index=0](%8, %0)
%o.6 : Tensor = aten::mul(%7, %1) # ../../../test/test_jit_cuda_fuser.py:2262:16
return (%o.6)
with prim::BailoutTemplate_0 = graph(%0 : Tensor):
%1 : Float(1024, 1024, strides=[1024, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=0](%0)
%2 : float = prim::Constant[value=2.]()
%o.6 : Float(1024, 1024, strides=[1024, 1], requires_grad=0, device=cuda:0) = aten::mul(%1, %2) # ../../../test/test_jit_cuda_fuser.py:2262:16
return (%o.6)
Error: graph contains 0 prim::CudaFusionGuard nodes (including subgraphs) but expected 1
======================================================================
FAIL: test_graph_rng (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2745, in test_graph_rng
self.assertGraphContainsExactly(t_jit.graph_for(a), FUSION_GUARD, 1)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 339, in assertGraphContainsExactly
perform_assert(graph, kind, len(out_nodes), num_kind_nodes,
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 318, in perform_assert
raise AssertionError(
AssertionError: graph(%x.1 : Tensor):
%1 : int = prim::BailoutTemplate_0()
%2 : Tensor = prim::BailOut[index=0](%1, %x.1)
%3 : bool = prim::CudaFusionGuard[types=[Float(10000, strides=[1], requires_grad=0, device=cuda:0)]](%2)
%4 : Tensor = prim::If(%3)
block0():
%5 : Tensor = prim::CudaFusionGroup_1[cache_id=251](%2)
-> (%5)
block1():
%6 : Function = prim::Constant[name="fallback_function", fallback=1]()
%7 : (Tensor) = prim::CallFunction(%6, %2)
%8 : Tensor = prim::TupleUnpack(%7)
-> (%8)
%9 : Tensor = prim::BailOut[index=1](%1, %4)
%10 : bool = prim::CudaFusionGuard[types=[Float(10000, strides=[1], requires_grad=0, device=cuda:0)]](%9)
%11 : Tensor = prim::If(%10)
block0():
%12 : Tensor = prim::CudaFusionGroup_2[cache_id=251](%9)
-> (%12)
block1():
%13 : Function = prim::Constant[name="fallback_function", fallback=1]()
%14 : (Tensor) = prim::CallFunction(%13, %9)
%15 : Tensor = prim::TupleUnpack(%14)
-> (%15)
%16 : Tensor = prim::BailOut[index=2](%1, %11)
return (%16)
with prim::BailoutTemplate_0 = graph(%x.1 : Tensor):
%1 : Float(10000, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=0](%x.1)
%2 : bool = prim::Constant[value=1]()
%3 : int = prim::Constant[value=1]()
%4 : float = prim::Constant[value=1.]() # ../../../test/test_jit_cuda_fuser.py:2734:20
%5 : float = prim::Constant[value=0.10000000000000001]() # ../../../test/test_jit_cuda_fuser.py:2735:49
%o.1 : Float(10000, strides=[1], requires_grad=0, device=cuda:0) = aten::add(%1, %4, %3) # ../../../test/test_jit_cuda_fuser.py:2734:16
%7 : Tensor = aten::dropout(%o.1, %5, %2) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1279:60
%8 : Float(10000, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=1](%7)
%o.9 : Float(10000, strides=[1], requires_grad=0, device=cuda:0) = aten::add(%8, %4, %3) # ../../../test/test_jit_cuda_fuser.py:2736:16
%10 : Tensor = aten::dropout(%o.9, %5, %2) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1279:60
%11 : Float(10000, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=2](%10)
return (%11)
with prim::CudaFusionGroup_1 = graph(%0 : Float(10000, strides=[1], requires_grad=0, device=cuda:0)):
%1 : bool = prim::Constant[value=1]()
%2 : float = prim::Constant[value=0.10000000000000001]() # ../../../test/test_jit_cuda_fuser.py:2735:49
%3 : int = prim::Constant[value=1]()
%4 : float = prim::Constant[value=1.]() # ../../../test/test_jit_cuda_fuser.py:2734:20
%o.1 : Float(10000, strides=[1], requires_grad=0, device=cuda:0) = aten::add(%0, %4, %3) # ../../../test/test_jit_cuda_fuser.py:2734:16
%6 : Tensor = aten::dropout(%o.1, %2, %1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1279:60
return (%6)
with prim::CudaFusionGroup_2 = graph(%0 : Float(10000, strides=[1], requires_grad=0, device=cuda:0)):
%1 : bool = prim::Constant[value=1]()
%2 : float = prim::Constant[value=0.10000000000000001]() # ../../../test/test_jit_cuda_fuser.py:2735:49
%3 : int = prim::Constant[value=1]()
%4 : float = prim::Constant[value=1.]() # ../../../test/test_jit_cuda_fuser.py:2734:20
%o.9 : Float(10000, strides=[1], requires_grad=0, device=cuda:0) = aten::add(%0, %4, %3) # ../../../test/test_jit_cuda_fuser.py:2736:16
%6 : Tensor = aten::dropout(%o.9, %2, %1) # C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\nn\functional.py:1279:60
return (%6)
Error: graph contains 2 prim::CudaFusionGuard nodes (excluding subgraphs) but expected 1
======================================================================
FAIL: test_inplace_removal (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2963, in test_inplace_removal
self.assertGraphContains(graph, FUSION_GROUP, True)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 298, in assertGraphContains
self.assertTrue(count > 0)
AssertionError: False is not true
======================================================================
FAIL: test_linear (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2307, in test_linear
self.assertGraphContainsExactly(t_jit.graph_for(x, weight, bias), FUSION_GUARD, 1)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 339, in assertGraphContainsExactly
perform_assert(graph, kind, len(out_nodes), num_kind_nodes,
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 318, in perform_assert
raise AssertionError(
AssertionError: graph(%x.1 : Tensor,
%weight.1 : Tensor,
%bias.1 : Tensor):
%3 : int = prim::BailoutTemplate_0()
%4 : Tensor = prim::BailOut[index=0](%3, %bias.1, %x.1, %weight.1)
%5 : Tensor = prim::BailOut[index=1](%3, %weight.1, %x.1, %4)
%6 : Tensor = prim::BailOut[index=2](%3, %x.1, %5, %4)
%7 : Tensor = aten::t(%5)
%8 : Tensor = aten::matmul(%6, %7)
%9 : Tensor = prim::add_optional(%8, %4)
%10 : Tensor = prim::BailOut[index=3](%3, %9)
%o.5 : Tensor = aten::relu(%10) # ../../../test/test_jit_cuda_fuser.py:2296:16
return (%o.5)
with prim::BailoutTemplate_0 = graph(%x.1 : Tensor,
%weight.1 : Tensor,
%bias.1 : Tensor):
%3 : Float(8, strides=[1], requires_grad=0, device=cuda:0) = prim::BailOut[index=0](%bias.1, %x.1, %weight.1)
%4 : Float(8, 2, strides=[2, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=1](%weight.1, %x.1, %3)
%5 : Float(4, 2, strides=[2, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=2](%x.1, %4, %3)
%o.1 : Tensor = aten::linear(%5, %4, %3) # ../../../test/test_jit_cuda_fuser.py:2295:16
%7 : Float(4, 8, strides=[8, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=3](%o.1)
%o.5 : Float(4, 8, strides=[8, 1], requires_grad=0, device=cuda:0) = aten::relu(%7) # ../../../test/test_jit_cuda_fuser.py:2296:16
return (%o.5)
Error: graph contains 0 prim::CudaFusionGuard nodes (excluding subgraphs) but expected 1
======================================================================
FAIL: test_linear_1d_weight_mismatch_bias_dtype (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 3095, in test_linear_1d_weight_mismatch_bias_dtype
self.assertEqual(o, jit_o)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\common_utils.py", line 2121, in assertEqual
assert_equal(
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_comparison.py", line 1074, in assert_equal
raise error_metas[0].to_error()
AssertionError: The values for attribute 'dtype' do not match: torch.float16 != torch.float32.
======================================================================
FAIL: test_native_layer_norm (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 1336, in test_native_layer_norm
self._native_layer_norm_helper(input_shape, norm_shape, torch.float32, "cuda", 1e-4, affine)
File "../../../test/test_jit_cuda_fuser.py", line 1322, in _native_layer_norm_helper
self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 311, in assertGraphContains
self.assertTrue(len(out_nodes) > 0)
AssertionError: False is not true
======================================================================
FAIL: test_native_layer_norm_half (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 1349, in test_native_layer_norm_half
self._native_layer_norm_helper(input_shape, norm_shape, torch.float16, "cuda", 5e-3)
File "../../../test/test_jit_cuda_fuser.py", line 1322, in _native_layer_norm_helper
self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 311, in assertGraphContains
self.assertTrue(len(out_nodes) > 0)
AssertionError: False is not true
======================================================================
FAIL: test_norm (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 1439, in test_norm
self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm)
File "../../../test/test_jit_cuda_fuser.py", line 1410, in _norm_helper
self.assertGraphContains(t_jit.graph_for(x, running_mean, running_var), FUSION_GUARD)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 311, in assertGraphContains
self.assertTrue(len(out_nodes) > 0)
AssertionError: False is not true
======================================================================
FAIL: test_norm_channels_last (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 1422, in test_norm_channels_last
self._norm_helper(size, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm, memory_format=mf)
File "../../../test/test_jit_cuda_fuser.py", line 1410, in _norm_helper
self.assertGraphContains(t_jit.graph_for(x, running_mean, running_var), FUSION_GUARD)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 311, in assertGraphContains
self.assertTrue(len(out_nodes) > 0)
AssertionError: False is not true
======================================================================
FAIL: test_norm_half (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 1472, in test_norm_half
self._norm_helper(x, torch.float16, "cuda", 5e-3, is_batch_norm_else_instance_norm)
File "../../../test/test_jit_cuda_fuser.py", line 1410, in _norm_helper
self.assertGraphContains(t_jit.graph_for(x, running_mean, running_var), FUSION_GUARD)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 311, in assertGraphContains
self.assertTrue(len(out_nodes) > 0)
AssertionError: False is not true
======================================================================
FAIL: test_norm_large (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 1455, in test_norm_large
self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm)
File "../../../test/test_jit_cuda_fuser.py", line 1410, in _norm_helper
self.assertGraphContains(t_jit.graph_for(x, running_mean, running_var), FUSION_GUARD)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 311, in assertGraphContains
self.assertTrue(len(out_nodes) > 0)
AssertionError: False is not true
======================================================================
FAIL: test_profile_ivalue (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2036, in test_profile_ivalue
self.assertGraphContains(t_jit.graph_for(x, y, (0, 1), False), FUSION_GUARD)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 311, in assertGraphContains
self.assertTrue(len(out_nodes) > 0)
AssertionError: False is not true
======================================================================
FAIL: test_profiling_node (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 1989, in test_profiling_node
self._run_helper(repro_jit, repro, x, 0.6)
File "../../../test/test_jit_cuda_fuser.py", line 154, in _run_helper
self.assertGraphContainsExactly(jit_op.graph_for(*args), FUSION_GUARD, 1, consider_subgraphs=True)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 325, in assertGraphContainsExactly
perform_assert(graph, kind, count, num_kind_nodes,
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 318, in perform_assert
raise AssertionError(
AssertionError: graph(%x.1 : Tensor,
%alpha.1 : float):
%2 : NoneType = prim::Constant()
%3 : int = prim::Constant[value=1]()
%4 : int = prim::BailoutTemplate_0()
%o.1 : Tensor = aten::rand_like(%x.1, %2, %2, %2, %2, %2) # ../../../test/test_jit_cuda_fuser.py:1985:16
%6 : Tensor = prim::BailOut[index=0](%4, %o.1, %alpha.1)
%o.5 : Tensor = aten::add(%6, %alpha.1, %3) # ../../../test/test_jit_cuda_fuser.py:1986:16
return (%o.5)
with prim::BailoutTemplate_0 = graph(%x.1 : Tensor,
%alpha.1 : float):
%2 : int = prim::Constant[value=1]()
%3 : NoneType = prim::Constant()
%o.1 : Tensor = aten::rand_like(%x.1, %3, %3, %3, %3, %3) # ../../../test/test_jit_cuda_fuser.py:1985:16
%5 : Float(4, 8, 8, 8, strides=[512, 64, 8, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=0](%o.1, %alpha.1)
%o.5 : Float(4, 8, 8, 8, strides=[512, 64, 8, 1], requires_grad=0, device=cuda:0) = aten::add(%5, %alpha.1, %2) # ../../../test/test_jit_cuda_fuser.py:1986:16
return (%o.5)
Error: graph contains 0 prim::CudaFusionGuard nodes (including subgraphs) but expected 1
======================================================================
FAIL: test_reduction_sizes_op (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2013, in test_reduction_sizes_op
self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 0)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 339, in assertGraphContainsExactly
perform_assert(graph, kind, len(out_nodes), num_kind_nodes,
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 318, in perform_assert
raise AssertionError(
AssertionError: graph(%x.1 : Tensor,
%y.1 : Tensor):
%2 : int[] = prim::Constant[value=[2, 4]]()
%3 : int = prim::BailoutTemplate_0()
%4 : Tensor = prim::BailOut[index=0](%3, %y.1, %x.1)
%5 : Tensor = prim::BailOut[index=1](%3, %x.1, %4)
%6 : bool = prim::CudaFusionGuard[types=[Float(2, 3, 4, 5, strides=[60, 20, 5, 1], requires_grad=0, device=cuda:0), Float(2, 3, 4, 5, strides=[60, 20, 5, 1], requires_grad=0, device=cuda:0)]](%5, %4)
%7 : Tensor = prim::If(%6)
block0():
%o.12 : Tensor = prim::CudaFusionGroup_1[cache_id=590](%5, %4)
-> (%o.12)
block1():
%9 : Function = prim::Constant[name="fallback_function", fallback=1]()
%10 : (Tensor) = prim::CallFunction(%9, %5, %4)
%11 : Tensor = prim::TupleUnpack(%10)
-> (%11)
%12 : Tensor = prim::BailOut[index=2](%3, %7)
return (%2)
with prim::BailoutTemplate_0 = graph(%x.1 : Tensor,
%y.1 : Tensor):
%2 : Float(2, 3, 4, 5, strides=[60, 20, 5, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=0](%y.1, %x.1)
%3 : Float(2, 3, 4, 5, strides=[60, 20, 5, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=1](%x.1, %2)
%4 : int[] = prim::Constant[value=[1, 3]]()
%5 : NoneType = prim::Constant()
%6 : bool = prim::Constant[value=0]()
%7 : int = prim::Constant[value=1]() # ../../../test/test_jit_cuda_fuser.py:2004:23
%o.1 : Float(2, 3, 4, 5, strides=[60, 20, 5, 1], requires_grad=0, device=cuda:0) = aten::add(%3, %2, %7) # ../../../test/test_jit_cuda_fuser.py:2002:16
%o.5 : Float(2, 3, 4, 5, strides=[60, 20, 5, 1], requires_grad=0, device=cuda:0) = aten::relu(%o.1) # ../../../test/test_jit_cuda_fuser.py:2003:16
%o.9 : Tensor = aten::sum(%o.5, %4, %6, %5) # ../../../test/test_jit_cuda_fuser.py:2004:16
%11 : Float(2, 4, strides=[4, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=2](%o.9)
%12 : int[] = aten::size(%11) # ../../../test/test_jit_cuda_fuser.py:2005:19
return (%12)
with prim::CudaFusionGroup_1 = graph(%0 : Float(2, 3, 4, 5, strides=[60, 20, 5, 1], requires_grad=0, device=cuda:0),
%1 : Float(2, 3, 4, 5, strides=[60, 20, 5, 1], requires_grad=0, device=cuda:0)):
%2 : NoneType = prim::Constant()
%3 : bool = prim::Constant[value=0]()
%4 : int[] = prim::Constant[value=[1, 3]]()
%5 : int = prim::Constant[value=1]() # ../../../test/test_jit_cuda_fuser.py:2004:23
%o.1 : Float(2, 3, 4, 5, strides=[60, 20, 5, 1], requires_grad=0, device=cuda:0) = aten::add(%0, %1, %5) # ../../../test/test_jit_cuda_fuser.py:2002:16
%o.5 : Float(2, 3, 4, 5, strides=[60, 20, 5, 1], requires_grad=0, device=cuda:0) = aten::relu(%o.1) # ../../../test/test_jit_cuda_fuser.py:2003:16
%o.9 : Tensor = aten::sum(%o.5, %4, %3, %2) # ../../../test/test_jit_cuda_fuser.py:2004:16
return (%o.9)
Error: graph contains 1 prim::CudaFusionGuard nodes (excluding subgraphs) but expected 0
======================================================================
FAIL: test_sibling_fusion (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 3136, in test_sibling_fusion
self._run_fwd_helper(t2, ['aten::sum', 'aten::mul'], x, y)
File "../../../test/test_jit_cuda_fuser.py", line 3114, in _run_fwd_helper
self.assertGraphContainsExactly(graph, op, 0)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 339, in assertGraphContainsExactly
perform_assert(graph, kind, len(out_nodes), num_kind_nodes,
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 318, in perform_assert
raise AssertionError(
AssertionError: graph(%x.1 : Tensor,
%y.1 : Tensor):
%2 : bool = prim::Constant[value=0]()
%3 : NoneType = prim::Constant()
%4 : int[] = prim::Constant[value=[0]]()
%5 : int = prim::BailoutTemplate_0()
%6 : Tensor = prim::BailOut[index=0](%5, %y.1, %x.1)
%7 : Tensor = prim::BailOut[index=1](%5, %x.1, %6)
%o1.1 : Tensor = aten::sum(%x.1, %4, %2, %3) # ../../../test/test_jit_cuda_fuser.py:3133:17
%9 : bool = prim::CudaFusionGuard[types=[Float(2, 5, strides=[5, 1], requires_grad=0, device=cuda:0), Float(2, 5, strides=[5, 1], requires_grad=0, device=cuda:0)]](%7, %6)
%10 : Tensor = prim::If(%9)
block0():
%o2.4 : Tensor = prim::CudaFusionGroup_1[cache_id=600](%7, %6)
-> (%o2.4)
block1():
%12 : Function = prim::Constant[name="fallback_function", fallback=1]()
%13 : (Tensor) = prim::CallFunction(%12, %7, %6)
%14 : Tensor = prim::TupleUnpack(%13)
-> (%14)
%15 : (Tensor, Tensor) = prim::TupleConstruct(%o1.1, %10)
return (%15)
with prim::BailoutTemplate_0 = graph(%x.1 : Tensor,
%y.1 : Tensor):
%2 : Float(2, 5, strides=[5, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=0](%y.1, %x.1)
%3 : Float(2, 5, strides=[5, 1], requires_grad=0, device=cuda:0) = prim::BailOut[index=1](%x.1, %2)
%4 : int[] = prim::Constant[value=[0]]()
%5 : NoneType = prim::Constant()
%6 : bool = prim::Constant[value=0]()
%o1.1 : Tensor = aten::sum(%x.1, %4, %6, %5) # ../../../test/test_jit_cuda_fuser.py:3133:17
%8 : Float(2, 5, strides=[5, 1], requires_grad=0, device=cuda:0) = aten::mul(%3, %2) # ../../../test/test_jit_cuda_fuser.py:3134:18
%o2.1 : Tensor = aten::sum(%8, %4, %6, %5) # ../../../test/test_jit_cuda_fuser.py:3134:18
%10 : (Tensor, Tensor) = prim::TupleConstruct(%o1.1, %o2.1)
return (%10)
with prim::CudaFusionGroup_1 = graph(%0 : Float(2, 5, strides=[5, 1], requires_grad=0, device=cuda:0),
%1 : Float(2, 5, strides=[5, 1], requires_grad=0, device=cuda:0)):
%2 : NoneType = prim::Constant()
%3 : bool = prim::Constant[value=0]()
%4 : int[] = prim::Constant[value=[0]]()
%5 : Float(2, 5, strides=[5, 1], requires_grad=0, device=cuda:0) = aten::mul(%0, %1) # ../../../test/test_jit_cuda_fuser.py:3134:18
%o2.1 : Tensor = aten::sum(%5, %4, %3, %2) # ../../../test/test_jit_cuda_fuser.py:3134:18
return (%o2.1)
Error: graph contains 1 aten::sum nodes (excluding subgraphs) but expected 0
======================================================================
FAIL: test_sum_to_size (__main__.TestCudaFuser)
----------------------------------------------------------------------
Traceback (most recent call last):
File "../../../test/test_jit_cuda_fuser.py", line 2059, in test_sum_to_size
self.assertGraphContains(t_jit.graph_for(x, y, (4, 1)), FUSION_GUARD)
File "C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\testing\_internal\jit_utils.py", line 311, in assertGraphContains
self.assertTrue(len(out_nodes) > 0)
AssertionError: False is not true
----------------------------------------------------------------------
Ran 108 tests in 314.187s
FAILED (failures=22, errors=11, skipped=21, expected failures=1)
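For reference, a minimal, hypothetical sketch (not part of the CI output above) of how the recurring failure pattern can be reproduced by hand: the tests script a small function, run it a few times so the profiling executor specializes the graph, then count prim::CudaFusionGuard nodes in graph_for(...). The function t, shapes, and warm-up count below are illustrative assumptions; the real checks live in test_jit_cuda_fuser.py via assertGraphContainsExactly, which (unlike this sketch) can also descend into subgraphs.

import torch

FUSION_GUARD = 'prim::CudaFusionGuard'           # node kind the failing assertions count
torch._C._jit_set_nvfuser_enabled(True)          # enable the CUDA fuser (nvfuser)

@torch.jit.script
def t(x: torch.Tensor) -> torch.Tensor:
    # simple pointwise chain that nvfuser is normally expected to fuse
    return (x + 1.0).relu() * 2.0

x = torch.randn(1024, 1024, device='cuda')
for _ in range(3):                               # warm-up runs so profiling/bailout nodes settle
    t(x)

graph = t.graph_for(x)                           # optimized graph for these input types
guards = [n for n in graph.nodes() if n.kind() == FUSION_GUARD]
print(len(guards), FUSION_GUARD, 'nodes')        # the failing tests expect exactly 1 here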