@andrewor14 · Created April 11, 2025 19:27
float8 tp stack trace
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/andrewor/local/torchtune/recipes/full_finetune_distributed.py", line 982, in <module>
[rank0]: sys.exit(recipe_main())
[rank0]: File "/home/andrewor/local/torchtune/torchtune/config/_parse.py", line 99, in wrapper
[rank0]: sys.exit(recipe_main(conf))
[rank0]: File "/home/andrewor/local/torchtune/recipes/full_finetune_distributed.py", line 977, in recipe_main
[rank0]: recipe.train()
[rank0]: File "/home/andrewor/local/torchtune/recipes/full_finetune_distributed.py", line 810, in train
[rank0]: logits = self._model(**batch)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]: return inner()
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: File "/home/andrewor/local/torchtune/torchtune/modules/transformer.py", line 648, in forward
[rank0]: h = layer(
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]: return inner()
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py", line 171, in forward
[rank0]: return self.checkpoint_fn( # type: ignore[misc]
[rank0]: File "/home/andrewor/local/pytorch/torch/_compile.py", line 51, in inner
[rank0]: return disable_fn(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/_dynamo/eval_frame.py", line 850, in _fn
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/utils/checkpoint.py", line 495, in checkpoint
[rank0]: ret = function(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1749, in _wrapped_call_impl
[rank0]: return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
[rank0]: File "/home/andrewor/local/pytorch/torch/_dynamo/eval_frame.py", line 658, in _fn
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1762, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/home/andrewor/local/torchtune/torchtune/modules/transformer.py", line 122, in forward
[rank0]: attn_out = self.attn(h, h, mask=mask, input_pos=input_pos)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]: return inner()
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: File "/home/andrewor/local/torchtune/torchtune/modules/attention.py", line 263, in forward
[rank0]: k = self.k_proj(y)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]: return inner()
[rank0]: File "/home/andrewor/local/pytorch/torch/nn/modules/module.py", line 1805, in inner
[rank0]: result = forward_call(*args, **kwargs)
[rank0]: File "/home/andrewor/local/ao/torchao/float8/float8_linear.py", line 352, in forward
[rank0]: output = matmul_with_hp_or_float8_args.apply(
[rank0]: File "/home/andrewor/local/pytorch/torch/autograd/function.py", line 575, in apply
[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc]
[rank0]: File "/home/andrewor/local/ao/torchao/float8/float8_linear.py", line 120, in forward
[rank0]: input_maybe_fp8_reshaped = input_maybe_fp8.reshape(-1, orig_shape[-1])
[rank0]: File "/home/andrewor/local/pytorch/torch/_compile.py", line 51, in inner
[rank0]: return disable_fn(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/_dynamo/eval_frame.py", line 850, in _fn
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/tensor/_api.py", line 350, in __torch_dispatch__
[rank0]: return DTensor._op_dispatcher.dispatch(
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/tensor/_dispatch.py", line 172, in dispatch
[rank0]: self.redistribute_local_args(
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/tensor/_dispatch.py", line 309, in redistribute_local_args
[rank0]: resharded_local_tensor = redistribute_local_tensor(
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/tensor/_redistribute.py", line 213, in redistribute_local_tensor
[rank0]: new_local_tensor = current_placement._to_replicate_tensor(
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/tensor/placement_types.py", line 260, in _to_replicate_tensor
[rank0]: result = funcol.all_gather_tensor(
[rank0]: File "/home/andrewor/local/pytorch/torch/distributed/_functional_collectives.py", line 215, in all_gather_tensor
[rank0]: res = torch.cat(torch.chunk(res, group_size, dim=0), dim=gather_dim)
[rank0]: File "/home/andrewor/local/ao/torchao/float8/float8_tensor.py", line 375, in __torch_dispatch__
[rank0]: raise NotImplementedError(f"attempting to run {func}, this is not supported")
[rank0]: NotImplementedError: attempting to run aten.chunk.default, this is not supported
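For reference, the failure point is Float8Tensor's __torch_dispatch__ (torchao/float8/float8_tensor.py in the frames above): it handles only a fixed set of aten ops and raises NotImplementedError for anything else, and aten.chunk.default, invoked by funcol.all_gather_tensor's torch.cat(torch.chunk(...)) while DTensor redistributes the sharded float8 input of k_proj to replicate, is not one of them. The snippet below is a minimal sketch of that failure mode, not the torchao implementation; the class name, the whitelist contents, and the example ops are illustrative assumptions.

import torch

# Hypothetical whitelist; the real Float8Tensor supports a different (larger) set of ops.
SUPPORTED_OPS = {torch.ops.aten.detach.default}

class WhitelistedTensor(torch.Tensor):
    """Toy wrapper subclass that only dispatches a whitelist of aten ops."""

    @staticmethod
    def __new__(cls, data: torch.Tensor):
        # Wrapper subclass: the wrapper carries metadata, the plain tensor lives in ._data.
        return torch.Tensor._make_wrapper_subclass(
            cls, data.shape, dtype=data.dtype, device=data.device
        )

    def __init__(self, data: torch.Tensor):
        self._data = data

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        if func not in SUPPORTED_OPS:
            # Same style of error as torchao/float8/float8_tensor.py raises in the trace above.
            raise NotImplementedError(f"attempting to run {func}, this is not supported")
        # Trivial fallback for supported ops: unwrap, run on the plain tensor, rewrap.
        unwrapped = [a._data if isinstance(a, WhitelistedTensor) else a for a in args]
        return cls(func(*unwrapped, **kwargs))

t = WhitelistedTensor(torch.randn(8, 4))
t.detach()       # whitelisted op, dispatches fine
torch.mul(t, 2)  # not whitelisted -> NotImplementedError, analogous to aten.chunk.default above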