@davidberard98
Last active March 8, 2023 23:40
cuda train hf_Longformer [2023-03-08 23:40:23,242] torch._dynamo.debug_utils: [WARNING] Compiled Fx GraphModule failed. Creating script to minify the error.
[2023-03-08 23:40:23,244] torch._dynamo.debug_utils: [WARNING] Writing minified repro to /scratch/dberard/bisectdynamo/pytorch/torch_compile_debug/run_2023_03_08_23_40_23_244562-pid_3089959/minifier/minifier_launcher.py
ERROR:common:inductor raised Exception: Please convert all Tensors to FakeTensors first or instantiate FakeTensorMode with 'allow_non_fake_inputs'. Found in aten.copy_.default(*(tensor([[[[0., 0., 0., ..., 0., 0., 0.]]],
[[[0., 0., 0., ..., 0., 0., 0.]]],
[[[0., 0., 0., ..., 0., 0., 0.]]],
[[[0., 0., 0., ..., 0., 0., 0.]]]], device='cuda:0'), FakeTensor(FakeTensor(..., device='meta', size=(4, 1, 1, 1024)), cuda:0)), **{})
Set torch._dynamo.config.verbose=True for more information
Minifier script written to /scratch/dberard/bisectdynamo/pytorch/torch_compile_debug/run_2023_03_08_23_40_23_244562-pid_3089959/minifier/minifier_launcher.py. Run this script to find the smallest traced graph which reproduces this error.
You can suppress this exception and fall back to eager by setting:
torch._dynamo.config.suppress_errors = True
Traceback (most recent call last):
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/output_graph.py", line 708, in call_user_compiler
compiled_fn = compiler_fn(gm, self.fake_example_inputs())
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/debug_utils.py", line 1032, in debug_wrapper
run_fwd_maybe_bwd(compiled_gm, example_inputs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/debug_utils.py", line 633, in run_fwd_maybe_bwd
out = gm(args)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 1222, in g
return f(*args)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/eval_frame.py", line 231, in _fn
return fn(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 2853, in forward
return compiled_fn(full_args)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 1222, in g
return f(*args)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 2407, in debug_compiled_function
return compiled_function(*args)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 1909, in runtime_wrapper
all_outs = call_func_with_args(
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 1247, in call_func_with_args
out = normalize_as_list(f(args))
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 1222, in g
return f(*args)
File "/scratch/dberard/bisectdynamo/pytorch/torch/autograd/function.py", line 506, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 2162, in forward
fw_outs = call_func_with_args(
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 1247, in call_func_with_args
out = normalize_as_list(f(args))
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/compile_fx.py", line 252, in run
return model(new_inputs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/compile_fx.py", line 271, in run
return compiled_fn(new_inputs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/compile_fx.py", line 351, in run
dst.copy_(src)
File "/scratch/dberard/bisectdynamo/pytorch/torch/utils/_stats.py", line 20, in wrapper
return fn(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_subclasses/fake_tensor.py", line 944, in __torch_dispatch__
return func(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_ops.py", line 284, in __call__
return self._op(*args, **kwargs or {})
File "/scratch/dberard/bisectdynamo/pytorch/torch/utils/_stats.py", line 20, in wrapper
return fn(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_subclasses/fake_tensor.py", line 1057, in __torch_dispatch__
return self.dispatch(func, types, args, kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_subclasses/fake_tensor.py", line 1136, in dispatch
args, kwargs = self.validate_and_convert_non_fake_tensors(
File "/scratch/dberard/bisectdynamo/pytorch/torch/_subclasses/fake_tensor.py", line 1290, in validate_and_convert_non_fake_tensors
return tree_map_only(
File "/scratch/dberard/bisectdynamo/pytorch/torch/utils/_pytree.py", line 266, in tree_map_only
return tree_map(map_only(ty)(fn), pytree)
File "/scratch/dberard/bisectdynamo/pytorch/torch/utils/_pytree.py", line 196, in tree_map
return tree_unflatten([fn(i) for i in flat_args], spec)
File "/scratch/dberard/bisectdynamo/pytorch/torch/utils/_pytree.py", line 196, in <listcomp>
return tree_unflatten([fn(i) for i in flat_args], spec)
File "/scratch/dberard/bisectdynamo/pytorch/torch/utils/_pytree.py", line 247, in inner
return f(x)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_subclasses/fake_tensor.py", line 1282, in validate
raise Exception(
Exception: Please convert all Tensors to FakeTensors first or instantiate FakeTensorMode with 'allow_non_fake_inputs'. Found in aten.copy_.default(*(tensor([[[[0., 0., 0., ..., 0., 0., 0.]]],
[[[0., 0., 0., ..., 0., 0., 0.]]],
[[[0., 0., 0., ..., 0., 0., 0.]]],
[[[0., 0., 0., ..., 0., 0., 0.]]]], device='cuda:0'), FakeTensor(FakeTensor(..., device='meta', size=(4, 1, 1, 1024)), cuda:0)), **{})
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/scratch/dberard/bisectdynamo/pytorch/benchmarks/dynamo/common.py", line 1293, in check_accuracy
new_result = optimized_model_iter_fn(model_copy, example_inputs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/eval_frame.py", line 231, in _fn
return fn(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/benchmarks/dynamo/common.py", line 1160, in run_n_iterations
self.model_iter_fn(mod, inputs, collect_outputs=False)
File "benchmarks/dynamo/torchbench.py", line 365, in forward_and_backward_pass
cloned_inputs = clone_inputs(inputs)
File "benchmarks/dynamo/torchbench.py", line 366, in <graph break in forward_and_backward_pass>
self.optimizer_zero_grad(mod)
File "benchmarks/dynamo/torchbench.py", line 368, in <graph break in forward_and_backward_pass>
pred = mod(*cloned_inputs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/nn/modules/module.py", line 1533, in _call_impl
return forward_call(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 1848, in forward
outputs = self.longformer(
File "/scratch/dberard/bisectdynamo/pytorch/torch/nn/modules/module.py", line 1533, in _call_impl
return forward_call(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 1742, in forward
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)[
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/eval_frame.py", line 368, in catch_errors
return callback(frame, cache_size, hooks)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/convert_frame.py", line 404, in _convert_frame
result = inner_convert(frame, cache_size, hooks)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/convert_frame.py", line 104, in _fn
return fn(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/convert_frame.py", line 262, in _convert_frame_assert
return _compile(
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/utils.py", line 164, in time_wrapper
r = func(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/convert_frame.py", line 324, in _compile
out_code = transform_code_object(code, transform)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/bytecode_transformation.py", line 530, in transform_code_object
transformations(instructions, code_options)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/convert_frame.py", line 311, in transform
tracer.run()
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/symbolic_convert.py", line 1862, in run
super().run()
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/symbolic_convert.py", line 619, in run
and self.step()
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/symbolic_convert.py", line 583, in step
getattr(self, inst.opname)(inst)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/symbolic_convert.py", line 379, in wrapper
self.output.compile_subgraph(self, reason=reason)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/output_graph.py", line 579, in compile_subgraph
self.compile_and_call_fx_graph(tx, pass2.graph_output_vars(), root)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/output_graph.py", line 626, in compile_and_call_fx_graph
compiled_fn = self.call_user_compiler(gm)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/utils.py", line 164, in time_wrapper
r = func(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/output_graph.py", line 713, in call_user_compiler
raise BackendCompilerFailed(self.compiler_fn, e) from e
torch._dynamo.exc.BackendCompilerFailed: inductor raised Exception: Please convert all Tensors to FakeTensors first or instantiate FakeTensorMode with 'allow_non_fake_inputs'. Found in aten.copy_.default(*(tensor([[[[0., 0., 0., ..., 0., 0., 0.]]],
[[[0., 0., 0., ..., 0., 0., 0.]]],
[[[0., 0., 0., ..., 0., 0., 0.]]],
[[[0., 0., 0., ..., 0., 0., 0.]]]], device='cuda:0'), FakeTensor(FakeTensor(..., device='meta', size=(4, 1, 1, 1024)), cuda:0)), **{})
Set torch._dynamo.config.verbose=True for more information
Minifier script written to /scratch/dberard/bisectdynamo/pytorch/torch_compile_debug/run_2023_03_08_23_40_23_244562-pid_3089959/minifier/minifier_launcher.py. Run this script to find the smallest traced graph which reproduces this error.
You can suppress this exception and fall back to eager by setting:
torch._dynamo.config.suppress_errors = True
TorchDynamo optimized model failed to run because of following error
FAIL
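
The first failure above is the FakeTensor mixing error: while the minifier re-runs the compiled graph, a real CUDA tensor and a FakeTensor meet in aten.copy_.default, and FakeTensorMode rejects the non-fake input. Below is a minimal sketch of that error class, assuming the torch._subclasses.fake_tensor API from this PyTorch build (CPU tensors here so it runs without a GPU; not taken from the gist):

import torch
from torch._subclasses.fake_tensor import FakeTensorMode

real = torch.zeros(4, 1, 1, 1024)                 # a plain (non-fake) tensor, created outside any fake mode

strict = FakeTensorMode()                         # default: allow_non_fake_inputs=False
fake = strict.from_tensor(torch.empty(4, 1, 1, 1024))
with strict:
    try:
        torch.ops.aten.copy_.default(real, fake)  # real dst + fake src, mirroring the log's copy_
    except Exception as e:
        print("raises:", e)                       # "Please convert all Tensors to FakeTensors first ..."

lenient = FakeTensorMode(allow_non_fake_inputs=True)
fake2 = lenient.from_tensor(torch.empty(4, 1, 1, 1024))
with lenient:
    torch.ops.aten.copy_.default(real, fake2)     # the real input is faked on the fly instead of erroring
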
cuda train hf_Longformer [2023-03-08 23:20:43,889] torch._inductor.utils: [WARNING] skipping cudagraphs due to multiple devices
[2023-03-08 23:20:49,843] torch._inductor.utils: [WARNING] skipping cudagraphs due to multiple devices
[2023-03-08 23:20:55,239] torch._inductor.utils: [WARNING] skipping cudagraphs due to multiple devices
[2023-03-08 23:20:55,280] torch._inductor.graph: [ERROR] Error from lowering
Traceback (most recent call last):
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/graph.py", line 354, in call_function
out = lowerings[target](*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/lowering.py", line 229, in wrapped
validate_ir(out)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/ir.py", line 103, in validate_ir
_check_tensorbox(node_or_nodes)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/ir.py", line 88, in _check_tensorbox
assert isinstance(
AssertionError: Found <class 'torch._inductor.ir.DynamicScalar'>, which is not a supported top level IR node. See [Note: Inductor IR]
ERROR:common:inductor raised LoweringException: AssertionError: Found <class 'torch._inductor.ir.DynamicScalar'>, which is not a supported top level IR node. See [Note: Inductor IR]
target: aten._local_scalar_dense.default
args[0]: TensorBox(StorageBox(
Pointwise(
'cpu',
torch.int64,
tmp0 = constant(1024, torch.int64)
tmp1 = constant(512, torch.int64)
tmp2 = truncdiv(tmp0, tmp1)
return tmp2
,
ranges=(),
origins={div}
)
))
While executing %_local_scalar_dense : [#users=0] = call_function[target=torch.ops.aten._local_scalar_dense.default](args = (%div,), kwargs = {})
Original traceback:
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 839, in <graph break in _sliding_chunks_query_key_matmul>
query = self._chunk(query, window_overlap, self.config.__dict__.get("onnx_export", False))
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 769, in _chunk
hidden_states = hidden_states.view(
Set torch._dynamo.config.verbose=True for more information
You can suppress this exception and fall back to eager by setting:
torch._dynamo.config.suppress_errors = True
Traceback (most recent call last):
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/graph.py", line 354, in call_function
out = lowerings[target](*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/lowering.py", line 229, in wrapped
validate_ir(out)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/ir.py", line 103, in validate_ir
_check_tensorbox(node_or_nodes)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/ir.py", line 88, in _check_tensorbox
assert isinstance(
AssertionError: Found <class 'torch._inductor.ir.DynamicScalar'>, which is not a supported top level IR node. See [Note: Inductor IR]
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/output_graph.py", line 708, in call_user_compiler
compiled_fn = compiler_fn(gm, self.fake_example_inputs())
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/debug_utils.py", line 1055, in debug_wrapper
compiled_gm = compiler_fn(gm, example_inputs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/backends/inductor.py", line 9, in inductor
return compile_fx(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/compile_fx.py", line 488, in compile_fx
return aot_autograd(
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/backends/common.py", line 48, in compiler_fn
cg = aot_module_simplified(gm, example_inputs, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 2839, in aot_module_simplified
compiled_fn = create_aot_dispatcher_function(
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/utils.py", line 164, in time_wrapper
r = func(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 2522, in create_aot_dispatcher_function
compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 1724, in aot_wrapper_dedupe
return compiler_fn(flat_fn, leaf_flat_args, aot_config)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_functorch/aot_autograd.py", line 1335, in aot_dispatch_base
compiled_fw = aot_config.fw_compiler(fw_module, flat_args_with_views_handled)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/utils.py", line 164, in time_wrapper
r = func(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/compile_fx.py", line 462, in fw_compiler
return inner_compile(
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/debug_utils.py", line 595, in debug_wrapper
compiled_fn = compiler_fn(gm, example_inputs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/debug.py", line 239, in inner
return fn(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/contextlib.py", line 75, in inner
return func(*args, **kwds)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/compile_fx.py", line 179, in compile_fx_inner
graph.run(*example_inputs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/utils.py", line 164, in time_wrapper
r = func(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/graph.py", line 211, in run
return super().run(*args)
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/interpreter.py", line 136, in run
self.env[node] = self.run_node(node)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/graph.py", line 436, in run_node
result = super().run_node(n)
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/interpreter.py", line 177, in run_node
return getattr(self, n.op)(n.target, args, kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_inductor/graph.py", line 358, in call_function
raise LoweringException(e, target, args, kwargs) from e
torch._inductor.exc.LoweringException: AssertionError: Found <class 'torch._inductor.ir.DynamicScalar'>, which is not a supported top level IR node. See [Note: Inductor IR]
target: aten._local_scalar_dense.default
args[0]: TensorBox(StorageBox(
Pointwise(
'cpu',
torch.int64,
tmp0 = constant(1024, torch.int64)
tmp1 = constant(512, torch.int64)
tmp2 = truncdiv(tmp0, tmp1)
return tmp2
,
ranges=(),
origins={div}
)
))
While executing %_local_scalar_dense : [#users=0] = call_function[target=torch.ops.aten._local_scalar_dense.default](args = (%div,), kwargs = {})
Original traceback:
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 839, in <graph break in _sliding_chunks_query_key_matmul>
query = self._chunk(query, window_overlap, self.config.__dict__.get("onnx_export", False))
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 769, in _chunk
hidden_states = hidden_states.view(
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/scratch/dberard/bisectdynamo/pytorch/benchmarks/dynamo/common.py", line 1293, in check_accuracy
new_result = optimized_model_iter_fn(model_copy, example_inputs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/eval_frame.py", line 231, in _fn
return fn(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/benchmarks/dynamo/common.py", line 1160, in run_n_iterations
self.model_iter_fn(mod, inputs, collect_outputs=False)
File "benchmarks/dynamo/torchbench.py", line 365, in forward_and_backward_pass
cloned_inputs = clone_inputs(inputs)
File "benchmarks/dynamo/torchbench.py", line 366, in <graph break in forward_and_backward_pass>
self.optimizer_zero_grad(mod)
File "benchmarks/dynamo/torchbench.py", line 368, in <graph break in forward_and_backward_pass>
pred = mod(*cloned_inputs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/nn/modules/module.py", line 1533, in _call_impl
return forward_call(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 1848, in forward
outputs = self.longformer(
File "/scratch/dberard/bisectdynamo/pytorch/torch/nn/modules/module.py", line 1533, in _call_impl
return forward_call(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 1742, in forward
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)[
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 1750, in <graph break in forward>
encoder_outputs = self.encoder(
File "/scratch/dberard/bisectdynamo/pytorch/torch/nn/modules/module.py", line 1533, in _call_impl
return forward_call(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 1294, in forward
is_global_attn = is_index_global_attn.flatten().any().item()
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 1326, in <graph break in forward>
layer_outputs = layer_module(
File "/scratch/dberard/bisectdynamo/pytorch/torch/nn/modules/module.py", line 1533, in _call_impl
return forward_call(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 1249, in forward
self_attn_outputs = self.attention(
File "/scratch/dberard/bisectdynamo/pytorch/torch/nn/modules/module.py", line 1533, in _call_impl
return forward_call(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 1185, in forward
self_outputs = self.self(
File "/scratch/dberard/bisectdynamo/pytorch/torch/nn/modules/module.py", line 1533, in _call_impl
return forward_call(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 574, in forward
attn_scores = self._sliding_chunks_query_key_matmul(
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 586, in <graph break in forward>
diagonal_mask = self._sliding_chunks_query_key_matmul(
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 839, in _sliding_chunks_query_key_matmul
query = self._chunk(query, window_overlap, self.config.__dict__.get("onnx_export", False))
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/eval_frame.py", line 368, in catch_errors
return callback(frame, cache_size, hooks)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/convert_frame.py", line 404, in _convert_frame
result = inner_convert(frame, cache_size, hooks)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/convert_frame.py", line 104, in _fn
return fn(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/convert_frame.py", line 262, in _convert_frame_assert
return _compile(
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/utils.py", line 164, in time_wrapper
r = func(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/convert_frame.py", line 324, in _compile
out_code = transform_code_object(code, transform)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/bytecode_transformation.py", line 530, in transform_code_object
transformations(instructions, code_options)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/convert_frame.py", line 311, in transform
tracer.run()
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/symbolic_convert.py", line 1862, in run
super().run()
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/symbolic_convert.py", line 619, in run
and self.step()
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/symbolic_convert.py", line 583, in step
getattr(self, inst.opname)(inst)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/symbolic_convert.py", line 379, in wrapper
self.output.compile_subgraph(self, reason=reason)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/output_graph.py", line 579, in compile_subgraph
self.compile_and_call_fx_graph(tx, pass2.graph_output_vars(), root)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/output_graph.py", line 626, in compile_and_call_fx_graph
compiled_fn = self.call_user_compiler(gm)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/utils.py", line 164, in time_wrapper
r = func(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/_dynamo/output_graph.py", line 713, in call_user_compiler
raise BackendCompilerFailed(self.compiler_fn, e) from e
torch._dynamo.exc.BackendCompilerFailed: inductor raised LoweringException: AssertionError: Found <class 'torch._inductor.ir.DynamicScalar'>, which is not a supported top level IR node. See [Note: Inductor IR]
target: aten._local_scalar_dense.default
args[0]: TensorBox(StorageBox(
Pointwise(
'cpu',
torch.int64,
tmp0 = constant(1024, torch.int64)
tmp1 = constant(512, torch.int64)
tmp2 = truncdiv(tmp0, tmp1)
return tmp2
,
ranges=(),
origins={div}
)
))
While executing %_local_scalar_dense : [#users=0] = call_function[target=torch.ops.aten._local_scalar_dense.default](args = (%div,), kwargs = {})
Original traceback:
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 839, in <graph break in _sliding_chunks_query_key_matmul>
query = self._chunk(query, window_overlap, self.config.__dict__.get("onnx_export", False))
File "/data/home/dberard/miniconda/envs/bisectdynamo/lib/python3.8/site-packages/transformers/models/longformer/modeling_longformer.py", line 769, in _chunk
hidden_states = hidden_states.view(
Set torch._dynamo.config.verbose=True for more information
You can suppress this exception and fall back to eager by setting:
torch._dynamo.config.suppress_errors = True
TorchDynamo optimized model failed to run because of following error
FAIL
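
The second failure is Inductor refusing to lower aten._local_scalar_dense, i.e. a tensor-to-Python-scalar conversion (.item()), produced after the truncdiv(1024, 512) shown in the IR dump when Longformer's _chunk builds view() sizes. A rough, hypothetical sketch of that pattern under torch.compile (only analogous to the Longformer code, not taken from it; whether it lowers cleanly, graph-breaks, or fails like the log depends on the build and dynamo config):

import torch
import torch._dynamo

torch._dynamo.config.capture_scalar_outputs = True   # keep .item() inside the captured graph

def chunk_like(x, window_overlap):
    # tensor division followed by .item(), mirroring the shape arithmetic in _chunk
    n_chunks = torch.div(torch.tensor(x.size(1)), torch.tensor(window_overlap), rounding_mode="trunc")
    return x.reshape(x.size(0), int(n_chunks.item()), window_overlap, x.size(2))

compiled = torch.compile(chunk_like, backend="inductor")
out = compiled(torch.randn(2, 1024, 64), 512)         # 1024 // 512 = 2 chunks of width 512
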
Traceback (most recent call last):
File "torch_compile_debug/run_2023_03_08_23_32_30_111734-pid_3083040/minifier/minifier_launcher.py", line 55, in <module>
mod = make_fx(Repro(), tracing_mode='real')(*args)
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/experimental/proxy_tensor.py", line 714, in wrapped
t = dispatch_trace(wrap_key(func, args, fx_tracer), tracer=fx_tracer, concrete_args=tuple(phs))
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/experimental/proxy_tensor.py", line 443, in dispatch_trace
graph = tracer.trace(root, concrete_args)
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/_symbolic_trace.py", line 778, in trace
(self.create_arg(fn(*args)),),
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/experimental/proxy_tensor.py", line 459, in wrapped
out = f(*tensors)
File "<string>", line 1, in <lambda>
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/_symbolic_trace.py", line 756, in module_call_wrapper
return self.call_module(mod, forward, args, kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/experimental/proxy_tensor.py", line 409, in call_module
return forward(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/_symbolic_trace.py", line 749, in forward
return _orig_module_call(mod, *args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/nn/modules/module.py", line 1533, in _call_impl
return forward_call(*args, **kwargs)
File "torch_compile_debug/run_2023_03_08_23_32_30_111734-pid_3083040/minifier/minifier_launcher.py", line 48, in forward
_local_scalar_dense = torch.ops.aten._local_scalar_dense.default(div); div = None
File "/scratch/dberard/bisectdynamo/pytorch/torch/_ops.py", line 284, in __call__
return self._op(*args, **kwargs or {})
File "/scratch/dberard/bisectdynamo/pytorch/torch/utils/_stats.py", line 20, in wrapper
return fn(*args, **kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/experimental/proxy_tensor.py", line 487, in __torch_dispatch__
return self.inner_torch_dispatch(func, types, args, kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/experimental/proxy_tensor.py", line 512, in inner_torch_dispatch
out = proxy_call(self, func, args, kwargs)
File "/scratch/dberard/bisectdynamo/pytorch/torch/fx/experimental/proxy_tensor.py", line 282, in proxy_call
raise RuntimeError(
RuntimeError: It appears that you're trying to get value out of a tracing tensor with aten._local_scalar_dense.default - erroring out! It's likely that this is caused by data-dependent control flow or similar. It may be possible to trace this with dynamic shapes; try setting tracing_mode='symbolic' in your make_fx call.
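
This last traceback comes from running the generated minifier script with tracing_mode='real'; the RuntimeError itself points at the fix. A hedged one-line edit to that generated file (Repro and args are defined earlier in torch_compile_debug/run_2023_03_08_23_32_30_111734-pid_3083040/minifier/minifier_launcher.py, so this snippet is not self-contained):

from torch.fx.experimental.proxy_tensor import make_fx

# line 55 of the generated script, with tracing_mode switched from 'real' to 'symbolic'
# as the error message suggests, so aten._local_scalar_dense can be traced with dynamic shapes
mod = make_fx(Repro(), tracing_mode='symbolic')(*args)
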