float8 tp stack trace
[rank0]: Traceback (most recent call last):
[rank0]:   File "/home/andrewor/local/torchtune/recipes/full_finetune_distributed.py", line 982, in <module>
[rank0]:     sys.exit(recipe_main())
[rank0]:   File "/home/andrewor/local/torchtune/torchtune/config/_parse.py", line 99, in wrapper
[rank0]:     sys.exit(recipe_main(conf))
[rank0]:   File "/home/andrewor/local/torchtune/recipes/full_finetune_distributed.py", line 977, in recipe_main
[rank0]:     recipe.train()
[rank0]:   File "/home/andrewor/local/torchtune/recipes/full_finetune_distributed.py", line 810, in train
[rank0]:     logits = self._model(**batch)
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]:     return inner()
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]:     result = forward_call(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/torchtune/torchtune/modules/transformer.py", line 648, in forward
[rank0]:     h = layer(
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]:     return inner()
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]:     result = forward_call(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/pytorch/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py", line 171, in forward
[rank0]:     return self.checkpoint_fn(  # type: ignore[misc]
[rank0]:   File "/home/andrewor/local/pytorch/torch/_compile.py", line 51, in inner
[rank0]:     return disable_fn(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/pytorch/torch/_dynamo/eval_frame.py", line 850, in _fn
[rank0]:     return fn(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/pytorch/torch/utils/checkpoint.py", line 495, in checkpoint
[rank0]:     ret = function(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1749, in _wrapped_call_impl
[rank0]:     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
[rank0]:   File "/home/andrewor/local/pytorch/torch/_dynamo/eval_frame.py", line 658, in _fn
[rank0]:     return fn(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1762, in _call_impl
[rank0]:     return forward_call(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/torchtune/torchtune/modules/transformer.py", line 122, in forward
[rank0]:     attn_out = self.attn(h, h, mask=mask, input_pos=input_pos)
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]:     return inner()
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]:     result = forward_call(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/torchtune/torchtune/modules/attention.py", line 263, in forward
[rank0]:     k = self.k_proj(y)
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]:     return inner()
[rank0]:   File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]:     result = forward_call(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/ao/torchao/float8/float8_linear.py", line 352, in forward
[rank0]:     output = matmul_with_hp_or_float8_args.apply(
[rank0]:   File "/home/andrewor/local/pytorch/torch/autograd/function.py", line 575, in apply
[rank0]:     return super().apply(*args, **kwargs)  # type: ignore[misc]
[rank0]:   File "/home/andrewor/local/ao/torchao/float8/float8_linear.py", line 120, in forward
[rank0]:     input_maybe_fp8_reshaped = input_maybe_fp8.reshape(-1, orig_shape[-1])
[rank0]:   File "/home/andrewor/local/pytorch/torch/_compile.py", line 51, in inner
[rank0]:     return disable_fn(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/pytorch/torch/_dynamo/eval_frame.py", line 850, in _fn
[rank0]:     return fn(*args, **kwargs)
[rank0]:   File "/home/andrewor/local/pytorch/torch/distributed/tensor/_api.py", line 350, in __torch_dispatch__
[rank0]:     return DTensor._op_dispatcher.dispatch(
[rank0]:   File "/home/andrewor/local/pytorch/torch/distributed/tensor/_dispatch.py", line 172, in dispatch
[rank0]:     self.redistribute_local_args(
[rank0]:   File "/home/andrewor/local/pytorch/torch/distributed/tensor/_dispatch.py", line 309, in redistribute_local_args
[rank0]:     resharded_local_tensor = redistribute_local_tensor(
[rank0]:   File "/home/andrewor/local/pytorch/torch/distributed/tensor/_redistribute.py", line 213, in redistribute_local_tensor
[rank0]:     new_local_tensor = current_placement._to_replicate_tensor(
[rank0]:   File "/home/andrewor/local/pytorch/torch/distributed/tensor/placement_types.py", line 260, in _to_replicate_tensor
[rank0]:     result = funcol.all_gather_tensor(
[rank0]:   File "/home/andrewor/local/pytorch/torch/distributed/_functional_collectives.py", line 215, in all_gather_tensor
[rank0]:     res = torch.cat(torch.chunk(res, group_size, dim=0), dim=gather_dim)
[rank0]:   File "/home/andrewor/local/ao/torchao/float8/float8_tensor.py", line 375, in __torch_dispatch__
[rank0]:     raise NotImplementedError(f"attempting to run {func}, this is not supported")
[rank0]: NotImplementedError: attempting to run aten.chunk.default, this is not supported
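
What the trace shows: with float8 training (torchao) and tensor parallelism enabled, the input reaching the float8 k_proj linear is a DTensor. The reshape inside matmul_with_hp_or_float8_args.forward forces a redistribution of that DTensor to Replicate, which all-gathers the local tensor via funcol.all_gather_tensor; that helper reassembles the gathered result with torch.cat(torch.chunk(...)). The local tensor at that point is a torchao Float8Tensor, whose __torch_dispatch__ has no handler for aten.chunk.default, hence the NotImplementedError.

Below is a minimal sketch that should hit the same dispatch path without any distributed setup, by calling torch.chunk directly on a Float8Tensor. It assumes the internal helper hp_tensor_and_scale_to_float8 and the LinearMMConfig type from the torchao version in the trace; both are internal APIs and may differ across releases.

import torch
from torchao.float8.float8_tensor import (
    LinearMMConfig,
    hp_tensor_and_scale_to_float8,
)

x = torch.randn(16, 32)
scale = torch.tensor(1.0)

# Wrap a high-precision tensor into a Float8Tensor, roughly as
# Float8Linear does for its inputs during float8 training.
# (Helper name/signature assumed from the torchao version in the trace.)
x_fp8 = hp_tensor_and_scale_to_float8(
    x, scale, torch.float8_e4m3fn, LinearMMConfig()
)

# funcol.all_gather_tensor reassembles its result with
# torch.cat(torch.chunk(...)); Float8Tensor's __torch_dispatch__ has no
# kernel for aten.chunk.default, so this should raise the same
# NotImplementedError seen in the trace above.
torch.chunk(x_fp8, 2, dim=0)

If the repro holds, the options are either to add aten.chunk.default (and possibly aten.cat.default) to Float8Tensor's op dispatch table, or to avoid triggering the DTensor redistribution while the local tensor is still in float8.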