Will Feng (yf225), Meta
This file has been truncated, but you can view the full file.
W2024-03-21 23:13:59,343.343000 140436118718272 torch/distributed/run.py:757]
W2024-03-21 23:13:59,343.343000 140436118718272 torch/distributed/run.py:757] *****************************************
W2024-03-21 23:13:59,343.343000 140436118718272 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W2024-03-21 23:13:59,343.343000 140436118718272 torch/distributed/run.py:757] *****************************************
Starting snapshot record_memory_history
STAGE:2024-03-21 23:14:03 585465:585465 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-03-21 23:14:07 585466:585466 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
[rank0]:W2024-03-21 23:14:07,880.880000 139776655075136 test_dynamo_fsdp.py:207] FORWARD
NCCL version 2.19.3+cuda12.1
[rank0]:W2024-03-21 23:14:09,967.967000 139776655075136 t
This file has been truncated, but you can view the full file.
W2024-03-21 22:47:19,103.103000 140068881299264 torch/distributed/run.py:757]
W2024-03-21 22:47:19,103.103000 140068881299264 torch/distributed/run.py:757] *****************************************
W2024-03-21 22:47:19,103.103000 140068881299264 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W2024-03-21 22:47:19,103.103000 140068881299264 torch/distributed/run.py:757] *****************************************
Starting snapshot record_memory_history
STAGE:2024-03-21 22:47:24 187402:187402 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-03-21 22:47:26 187404:187404 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
[rank0]:W2024-03-21 22:47:26,526.526000 140112140277568 test_dynamo_fsdp.py:207] FORWARD
NCCL version 2.19.3+cuda12.1
[rank0]:W2024-03-21 22:47:28,726.726000 140112140277568 t
TRACED GRAPH
===== AFTER POST GRAD =====
/data/users/willfeng/pytorch_yf225/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[]", arg1_1: "f32[2, 1234]", arg2_1: "f32[2, 1234]", arg3_1: "f32[2, 1234]", arg4_1: "f32[1234, 1234]", arg5_1: "f32[1234, 1234]", arg6_1: "f32[1234]", arg7_1: "f32[1234]", arg8_1: "f32[1234]", arg9_1: "f32[1234, 1234]", arg10_1: "f32[761378]", arg11_1: "f32[617]", arg12_1: "f32[761378]", arg13_1: "f32[617]", arg14_1: "f32[761378]", arg15_1: "f32[617]"):
# File: /data/users/willfeng/pytorch_yf225/torch/distributed/_composable/fsdp/_fsdp_collectives.py:47 in foreach_all_gather, code: all_gather_output = torch.empty(
empty: "f32[4571970]" = torch.ops.aten.empty.memory_format([4571970], dtype = torch.float32, device = device(type='cuda', index=0), pin_memory = False)
# File: /data/users/willfeng/pytorch_yf225/torch/distributed/_composable/fsdp/_fsdp_collectives.py:50 in foreach_all_gather, code: all_gather_
TRACED GRAPH
===== AFTER POST GRAD =====
/data/users/willfeng/pytorch_yf225/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module):
def forward(self, primals_1: "f32[2, 1234]", primals_2: "f32[761378]", primals_3: "f32[617]", primals_4: "f32[761378]", primals_5: "f32[617]", primals_6: "f32[761378]", primals_7: "f32[617]", primals_8: "f32[1234, 1234]", primals_9: "f32[1234]", primals_10: "f32[1234, 1234]", primals_11: "f32[1234]", primals_12: "f32[1234, 1234]", primals_13: "f32[1234]", primals_14):
# File: /data/users/willfeng/pytorch_yf225/torch/distributed/_composable/fsdp/_fsdp_collectives.py:47 in foreach_all_gather, code: all_gather_output = torch.empty(
empty: "f32[4571970]" = torch.ops.aten.empty.memory_format([4571970], dtype = torch.float32, device = device(type='cuda', index=0), pin_memory = False)
# File: /data/users/willfeng/pytorch_yf225/torch/distributed/_composable/fsdp/_fsdp_collectives.py:50 in foreach_all_gather, code: all_gather_input = all_g
[rank0]:E2024-03-21 18:04:21,941.941000 140554485671744 torch/testing/_internal/common_distributed.py:663] Caught exception:
[rank0]:E2024-03-21 18:04:21,941.941000 140554485671744 torch/testing/_internal/common_distributed.py:663] Traceback (most recent call last):
[rank0]:E2024-03-21 18:04:21,941.941000 140554485671744 torch/testing/_internal/common_distributed.py:663] File "/data/users/willfeng/pytorch_yf225/torch/testing/_internal/common_distributed.py", line 656, in run_test
[rank0]:E2024-03-21 18:04:21,941.941000 140554485671744 torch/testing/_internal/common_distributed.py:663] getattr(self, test_name)()
[rank0]:E2024-03-21 18:04:21,941.941000 140554485671744 torch/testing/_internal/common_distributed.py:663] File "/data/users/willfeng/pytorch_yf225/torch/testing/_internal/common_distributed.py", line 542, in wrapper
[rank0]:E2024-03-21 18:04:21,941.941000 140554485671744 torch/testing/_internal/common_distributed.py:663] fn()
[rank0]:E2024-03-21 18:04:21,941.941000 140554485671744 torch/t
===== AFTER POST GRAD =====
/data/users/willfeng/pytorch_yf225/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module):
def forward(self, primals_1: "f32[2, 12340]", primals_2: "f32[76137800]", primals_3: "f32[6170]", primals_4: "f32[76137800]", primals_5: "f32[6170]", primals_6: "f32[76137800]", primals_7: "f32[6170]", primals_8: "f32[12340, 12340]", primals_9: "f32[12340]", primals_10: "f32[12340, 12340]", primals_11: "f32[12340]", primals_12: "f32[12340, 12340]", primals_13: "f32[12340]", primals_14):
# File: /data/users/willfeng/pytorch_yf225/torch/distributed/_composable/fsdp/_fsdp_collectives.py:46 in foreach_all_gather, code: all_gather_output = torch.empty(
empty: "f32[456863820]" = torch.ops.aten.empty.memory_format([456863820], dtype = torch.float32, device = device(type='cuda', index=0), pin_memory = False)
# File: /data/users/willfeng/pytorch_yf225/torch/distributed/_composable/fsdp/_fsdp_collectives.py:49 in foreach_all_gather, code: all_gather_inp
===== AFTER POST GRAD =====
/data/users/willfeng/pytorch_yf225/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[]", arg1_1: "f32[2, 12340]", arg2_1: "f32[12340, 12340]", arg3_1: "f32[12340, 12340]", arg4_1: "f32[2, 12340]", arg5_1: "f32[2, 12340]", arg6_1: "f32[12340]", arg7_1: "f32[12340]", arg8_1: "f32[12340]", arg9_1: "f32[12340, 12340]", arg10_1: "f32[76137800]", arg11_1: "f32[6170]", arg12_1: "f32[76137800]", arg13_1: "f32[6170]", arg14_1: "f32[76137800]", arg15_1: "f32[6170]"):
# File: <eval_with_key>.37:15 in forward, code: expand = torch.ops.aten.expand.default(getitem, [2, 12340]); getitem = None
expand: "f32[2, 12340]" = torch.ops.aten.expand.default(arg0_1, [2, 12340]); arg0_1 = None
# File: /data/users/willfeng/pytorch_yf225/torch/distributed/_composable/fsdp/_fsdp_collectives.py:46 in foreach_all_gather, code: all_gather_output = torch.empty(
empty_1: "f32[456863820]" = torch.ops.aten.empty.memory_forma
TRACED GRAPH
===== Forward graph 4 =====
/data/users/willfeng/pytorch_yf225/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
def forward(self, arg0_1: "f32[]", arg1_1: "f32[2, 12340]", arg2_1: "f32[12340, 12340]", arg3_1: "f32[2, 12340]", arg4_1: "f32[12340]", arg5_1: "f32[12340]", arg6_1: "f32[12340, 12340]", arg7_1: "f32[76137800]", arg8_1: "f32[6170]", arg9_1: "f32[76137800]", arg10_1: "f32[6170]"):
# File: <eval_with_key>.96:12 in forward, code: expand = torch.ops.aten.expand.default(getitem, [2, 12340]); getitem = None
expand: "f32[2, 12340]" = torch.ops.aten.expand.default(arg0_1, [2, 12340]); arg0_1 = None
# File: <eval_with_key>.96:14 in forward, code: clone = torch.ops.aten.clone.default(expand, memory_format = torch.contiguous_format); expand = None
clone: "f32[2, 12340]" = torch.ops.aten.clone.default(expand, memory_format = torch.contiguous_format); expand = None
TRACED GRAPH
===== Forward graph 0 =====
/data/users/willfeng/pytorch_yf225/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.Module):
def forward(self, primals_1: "f32[2, 12340]", primals_2: "f32[76137800]", primals_3: "f32[6170]", primals_4: "f32[76137800]", primals_5: "f32[6170]", primals_6: "f32[12340, 12340]", primals_7: "f32[12340]", primals_8: "f32[12340, 12340]", primals_9: "f32[12340]", primals_10):
# File: /data/users/willfeng/pytorch_yf225/torch/distributed/_composable/fsdp/_fsdp_collectives.py:46 in foreach_all_gather, code: all_gather_output = torch.empty(
empty: "f32[304575880]" = torch.ops.aten.empty.memory_format([304575880], dtype = torch.float32, device = device(type='cuda', index=0), pin_memory = False)
# File: /data/users/willfeng/pytorch_yf225/torch/distributed/_composable/fsdp/_fsdp_collectives.py:49 in foreach_all_gather, code: all_gather_input = all_gather_output.narrow(
slice_1: "f32[152287940]" = torch.ops.aten.slice.Tensor(empty
# NOTE: This is the biggest difference between the eager and compile code paths.
# In eager, we copy directly from `all_gather_output` into `fsdp_param.all_gather_output`
# (`fsdp_param._unsharded_param` is updated as well because of shared storage),
# but in the compile path we copy from `as_strided(all_gather_output)` into
# `fsdp_param._unsharded_param`, to avoid having `fsdp_param.all_gather_output`
# as a graph input. The two are equivalent and must produce the same result.
if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
    out = [
        fsdp_param.all_gather_output.view(world_size, -1) for fsdp_param in fsdp_params
    ]
    torch.split_with_sizes_copy(
        all_gather_output, all_gather_input_numels, dim=1, out=out
    )