@andrewor14 · Created April 11, 2025 19:27
float8 tp stack trace
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/andrewor/local/torchtune/recipes/full_finetune_distributed.py", line 982, in <module>
[rank0]: sys.exit(recipe_main())
[rank0]: File "/home/andrewor/local/torchtune/torchtune/config/_parse.py", line 99, in wrapper
[rank0]: sys.exit(recipe_main(conf))
[rank0]: File "/home/andrewor/local/torchtune/recipes/full_finetune_distributed.py", line 977, in recipe_main
[rank0]: recipe.train()
[rank0]: File "/home/andrewor/local/torchtune/recipes/full_finetune_distributed.py", line 810, in train
[rank0]: logits = self._model(**batch)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]: return inner()
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: File "/home/andrewor/local/torchtune/torchtune/modules/transformer.py", line 648, in forward
[rank0]: h = layer(
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]: return inner()
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py", line 171, in forward
[rank0]: return self.checkpoint_fn( # type: ignore[misc]
[rank0]: File "/home/andrewor/local/pytorch/torch/_compile.py", line 51, in inner
[rank0]: return disable_fn(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/_dynamo/eval_frame.py", line 850, in _fn
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/utils/checkpoint.py", line 495, in checkpoint
[rank0]: ret = function(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1749, in _wrapped_call_impl
[rank0]: return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
[rank0]: File "/home/andrewor/local/pytorch/torch/_dynamo/eval_frame.py", line 658, in _fn
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1762, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/home/andrewor/local/torchtune/torchtune/modules/transformer.py", line 122, in forward
[rank0]: attn_out = self.attn(h, h, mask=mask, input_pos=input_pos)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]: return inner()
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: File "/home/andrewor/local/torchtune/torchtune/modules/attention.py", line 263, in forward
[rank0]: k = self.k_proj(y)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]: return inner()
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: File "/home/andrewor/local/ao/torchao/float8/float8_linear.py", line 352, in forward
[rank0]: output = matmul_with_hp_or_float8_args.apply(
[rank0]: File "/home/andrewor/local/pytorch/torch/autograd/function.py", line 575, in apply
[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc]
[rank0]: File "/home/andrewor/local/ao/torchao/float8/float8_linear.py", line 120, in forward
[rank0]: input_maybe_fp8_reshaped = input_maybe_fp8.reshape(-1, orig_shape[-1])
[rank0]: File "/home/andrewor/local/pytorch/torch/_compile.py", line 51, in inner
[rank0]: return disable_fn(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/_dynamo/eval_frame.py", line 850, in _fn
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/tensor/_api.py", line 350, in __torch_dispatch__
[rank0]: return DTensor._op_dispatcher.dispatch(
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/tensor/_dispatch.py", line 172, in dispatch
[rank0]: self.redistribute_local_args(
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/tensor/_dispatch.py", line 309, in redistribute_local_args
[rank0]: resharded_local_tensor = redistribute_local_tensor(
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/tensor/_redistribute.py", line 213, in redistribute_local_tensor
[rank0]: new_local_tensor = current_placement._to_replicate_tensor(
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/tensor/placement_types.py", line 260, in _to_replicate_tensor
[rank0]: result = funcol.all_gather_tensor(
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/_functional_collectives.py", line 215, in all_gather_tensor
[rank0]: res = torch.cat(torch.chunk(res, group_size, dim=0), dim=gather_dim)
[rank0]: File "/home/andrewor/local/ao/torchao/float8/float8_tensor.py", line 375, in __torch_dispatch__
[rank0]: raise NotImplementedError(f"attempting to run {func}, this is not supported")
[rank0]: NotImplementedError: attempting to run aten.chunk.default, this is not supported
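For reference, the failure point is Float8Tensor's __torch_dispatch__ (torchao/float8/float8_tensor.py in the frames above): it handles only a fixed set of aten ops and raises NotImplementedError for anything else, and aten.chunk.default, invoked by funcol.all_gather_tensor's torch.cat(torch.chunk(...)) while DTensor redistributes the sharded float8 input of k_proj to replicate, is not one of them. The snippet below is a minimal sketch of that failure mode, not the torchao implementation; the class name, the whitelist contents, and the example ops are illustrative assumptions.

import torch

# Hypothetical whitelist; the real Float8Tensor supports a different (larger) set of ops.
SUPPORTED_OPS = {torch.ops.aten.detach.default}

class WhitelistedTensor(torch.Tensor):
    """Toy wrapper subclass that only dispatches a whitelist of aten ops."""

    @staticmethod
    def __new__(cls, data: torch.Tensor):
        # Wrapper subclass: the wrapper carries metadata, the plain tensor lives in ._data.
        return torch.Tensor._make_wrapper_subclass(
            cls, data.shape, dtype=data.dtype, device=data.device
        )

    def __init__(self, data: torch.Tensor):
        self._data = data

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        if func not in SUPPORTED_OPS:
            # Same style of error as torchao/float8/float8_tensor.py raises in the trace above.
            raise NotImplementedError(f"attempting to run {func}, this is not supported")
        # Trivial fallback for supported ops: unwrap, run on the plain tensor, rewrap.
        unwrapped = [a._data if isinstance(a, WhitelistedTensor) else a for a in args]
        return cls(func(*unwrapped, **kwargs))

t = WhitelistedTensor(torch.randn(8, 4))
t.detach()       # whitelisted op, dispatches fine
torch.mul(t, 2)  # not whitelisted -> NotImplementedError, analogous to aten.chunk.default above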