pacman100 · August 25, 2023 10:18
diff --git a/fsdp_mem_usage_logs b/fsdp_mem_usage_logs
 accelerator.process_index=0 GPU Memory before entering the loading : 0
 accelerator.process_index=0 GPU Memory consumed at the end of the loading (end-begin): 0
 accelerator.process_index=0 GPU Peak Memory consumed during the loading (max-begin): 0
 accelerator.process_index=0 GPU Total Peak Memory consumed during the loading (max): 0
 accelerator.process_index=0 CPU Memory before entering the loading : 926
 accelerator.process_index=0 CPU Memory consumed at the end of the loading (end-begin): 26415
 accelerator.process_index=0 CPU Peak Memory consumed during the loading (max-begin): 31818
 accelerator.process_index=0 CPU Total Peak Memory consumed during the loading (max): 32744
 accelerator.process_index=0 model.lm_head.weight=Parameter containing:
 tensor([[-0.0179,  0.0201, -0.0273,  ..., -0.0275, -0.0396, -0.0131],
        [-0.0510, -0.0079, -0.0383,  ..., -0.0481,  0.0581,  0.0282],
        [-0.0217, -0.0216, -0.0064,  ..., -0.0508,  0.0554, -0.0013],
        ...,
        [ 0.0425,  0.0452, -0.0131,  ...,  0.0019,  0.0476,  0.0342],
        [-0.0170, -0.0085,  0.0449,  ..., -0.0074,  0.0178,  0.0043],
        [-0.0439, -0.0859, -0.0820,  ...,  0.0130,  0.0669,  0.0884]],
       requires_grad=True)
 accelerator.process_index=1 GPU Memory before entering the loading : 0
 accelerator.process_index=1 GPU Memory consumed at the end of the loading (end-begin): 0
 accelerator.process_index=1 GPU Peak Memory consumed during the loading (max-begin): 0
 accelerator.process_index=1 GPU Total Peak Memory consumed during the loading (max): 0
 accelerator.process_index=1 CPU Memory before entering the loading : 933
 accelerator.process_index=1 CPU Memory consumed at the end of the loading (end-begin): 10
 accelerator.process_index=1 CPU Peak Memory consumed during the loading (max-begin): 573
 accelerator.process_index=1 CPU Total Peak Memory consumed during the loading (max): 1506
 accelerator.process_index=1 model.lm_head.weight=Parameter containing:
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)
 accelerator.process_index=0 GPU Memory before entering the prepare : 0
 accelerator.process_index=0 GPU Memory consumed at the end of the prepare (end-begin): 13202
 accelerator.process_index=0 GPU Peak Memory consumed during the prepare (max-begin): 15458
 accelerator.process_index=0 GPU Total Peak Memory consumed during the prepare (max): 15458
 accelerator.process_index=0 CPU Memory before entering the prepare : 27345
 accelerator.process_index=0 CPU Memory consumed at the end of the prepare (end-begin): -26394
 accelerator.process_index=0 CPU Peak Memory consumed during the prepare (max-begin): 0
 accelerator.process_index=0 CPU Total Peak Memory consumed during the prepare (max): 27345
 FullyShardedDataParallel(
  (_fsdp_wrapped_module): RWForCausalLM(
    (transformer): RWModel(
      (word_embeddings): Embedding(65024, 4544)
      (h): ModuleList(
        (0-31): 32 x FullyShardedDataParallel(
          (_fsdp_wrapped_module): DecoderLayer(
            (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
            (self_attention): Attention(
              (maybe_rotary): RotaryEmbedding()
              (query_key_value): Linear(in_features=4544, out_features=4672, bias=False)
              (dense): Linear(in_features=4544, out_features=4544, bias=False)
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (mlp): MLP(
              (dense_h_to_4h): Linear(in_features=4544, out_features=18176, bias=False)
              (act): GELU(approximate='none')
              (dense_4h_to_h): Linear(in_features=18176, out_features=4544, bias=False)
            )
          )
        )
      )
      (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=4544, out_features=65024, bias=False)
  )
 )
 accelerator.process_index=1 GPU Memory before entering the prepare : 0
 accelerator.process_index=1 GPU Memory consumed at the end of the prepare (end-begin): 13202
 accelerator.process_index=1 GPU Peak Memory consumed during the prepare (max-begin): 15458
 accelerator.process_index=1 GPU Total Peak Memory consumed during the prepare (max): 15458
 accelerator.process_index=1 CPU Memory before entering the prepare : 945
 accelerator.process_index=1 CPU Memory consumed at the end of the prepare (end-begin): 4
 accelerator.process_index=1 CPU Peak Memory consumed during the prepare (max-begin): 4
 accelerator.process_index=1 CPU Total Peak Memory consumed during the prepare (max): 949
 accelerator.process_index=1 model.lm_head.weight=Parameter containing:
 tensor([[-0.0179,  0.0201, -0.0273,  ..., -0.0275, -0.0396, -0.0131],
        [-0.0510, -0.0079, -0.0383,  ..., -0.0481,  0.0581,  0.0282],
        [-0.0217, -0.0216, -0.0064,  ..., -0.0508,  0.0554, -0.0013],
        ...,
        [ 0.0425,  0.0452, -0.0131,  ...,  0.0019,  0.0476,  0.0342],
        [-0.0170, -0.0085,  0.0449,  ..., -0.0074,  0.0178,  0.0043],
        [-0.0439, -0.0859, -0.0820,  ...,  0.0130,  0.0669,  0.0884]],
       device='cuda:1', requires_grad=True)
 accelerator.process_index=0 model.lm_head.weight=Parameter containing:
 tensor([[-0.0179,  0.0201, -0.0273,  ..., -0.0275, -0.0396, -0.0131],
        [-0.0510, -0.0079, -0.0383,  ..., -0.0481,  0.0581,  0.0282],
        [-0.0217, -0.0216, -0.0064,  ..., -0.0508,  0.0554, -0.0013],
        ...,
        [ 0.0425,  0.0452, -0.0131,  ...,  0.0019,  0.0476,  0.0342],
        [-0.0170, -0.0085,  0.0449,  ..., -0.0074,  0.0178,  0.0043],
        [-0.0439, -0.0859, -0.0820,  ...,  0.0130,  0.0669,  0.0884]],
       device='cuda:0', requires_grad=True)
	accelerator.process_index=0 GPU Memory before entering the loading : 0
	accelerator.process_index=0 GPU Memory consumed at the end of the loading (end-begin): 0
	accelerator.process_index=0 GPU Peak Memory consumed during the loading (max-begin): 0
	accelerator.process_index=0 GPU Total Peak Memory consumed during the loading (max): 0
	accelerator.process_index=0 CPU Memory before entering the loading : 926
	accelerator.process_index=0 CPU Memory consumed at the end of the loading (end-begin): 26415
	accelerator.process_index=0 CPU Peak Memory consumed during the loading (max-begin): 31818
	accelerator.process_index=0 CPU Total Peak Memory consumed during the loading (max): 32744
	accelerator.process_index=0 model.lm_head.weight=Parameter containing:
	tensor([[-0.0179, 0.0201, -0.0273, ..., -0.0275, -0.0396, -0.0131],
	[-0.0510, -0.0079, -0.0383, ..., -0.0481, 0.0581, 0.0282],
	[-0.0217, -0.0216, -0.0064, ..., -0.0508, 0.0554, -0.0013],
	...,
	[ 0.0425, 0.0452, -0.0131, ..., 0.0019, 0.0476, 0.0342],
	[-0.0170, -0.0085, 0.0449, ..., -0.0074, 0.0178, 0.0043],
	[-0.0439, -0.0859, -0.0820, ..., 0.0130, 0.0669, 0.0884]],
	requires_grad=True)
	accelerator.process_index=1 GPU Memory before entering the loading : 0
	accelerator.process_index=1 GPU Memory consumed at the end of the loading (end-begin): 0
	accelerator.process_index=1 GPU Peak Memory consumed during the loading (max-begin): 0
	accelerator.process_index=1 GPU Total Peak Memory consumed during the loading (max): 0
	accelerator.process_index=1 CPU Memory before entering the loading : 933
	accelerator.process_index=1 CPU Memory consumed at the end of the loading (end-begin): 10
	accelerator.process_index=1 CPU Peak Memory consumed during the loading (max-begin): 573
	accelerator.process_index=1 CPU Total Peak Memory consumed during the loading (max): 1506
	accelerator.process_index=1 model.lm_head.weight=Parameter containing:
	tensor([[0., 0., 0., ..., 0., 0., 0.],
	[0., 0., 0., ..., 0., 0., 0.],
	[0., 0., 0., ..., 0., 0., 0.],
	...,
	[0., 0., 0., ..., 0., 0., 0.],
	[0., 0., 0., ..., 0., 0., 0.],
	[0., 0., 0., ..., 0., 0., 0.]], requires_grad=True)
	accelerator.process_index=0 GPU Memory before entering the prepare : 0
	accelerator.process_index=0 GPU Memory consumed at the end of the prepare (end-begin): 13202
	accelerator.process_index=0 GPU Peak Memory consumed during the prepare (max-begin): 15458
	accelerator.process_index=0 GPU Total Peak Memory consumed during the prepare (max): 15458
	accelerator.process_index=0 CPU Memory before entering the prepare : 27345
	accelerator.process_index=0 CPU Memory consumed at the end of the prepare (end-begin): -26394
	accelerator.process_index=0 CPU Peak Memory consumed during the prepare (max-begin): 0
	accelerator.process_index=0 CPU Total Peak Memory consumed during the prepare (max): 27345
	FullyShardedDataParallel(
	(_fsdp_wrapped_module): RWForCausalLM(
	(transformer): RWModel(
	(word_embeddings): Embedding(65024, 4544)
	(h): ModuleList(
	(0-31): 32 x FullyShardedDataParallel(
	(_fsdp_wrapped_module): DecoderLayer(
	(input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
	(self_attention): Attention(
	(maybe_rotary): RotaryEmbedding()
	(query_key_value): Linear(in_features=4544, out_features=4672, bias=False)
	(dense): Linear(in_features=4544, out_features=4544, bias=False)
	(attention_dropout): Dropout(p=0.0, inplace=False)
	)
	(mlp): MLP(
	(dense_h_to_4h): Linear(in_features=4544, out_features=18176, bias=False)
	(act): GELU(approximate='none')
	(dense_4h_to_h): Linear(in_features=18176, out_features=4544, bias=False)
	)
	)
	)
	)
	(ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
	)
	(lm_head): Linear(in_features=4544, out_features=65024, bias=False)
	)
	)
	accelerator.process_index=1 GPU Memory before entering the prepare : 0
	accelerator.process_index=1 GPU Memory consumed at the end of the prepare (end-begin): 13202
	accelerator.process_index=1 GPU Peak Memory consumed during the prepare (max-begin): 15458
	accelerator.process_index=1 GPU Total Peak Memory consumed during the prepare (max): 15458
	accelerator.process_index=1 CPU Memory before entering the prepare : 945
	accelerator.process_index=1 CPU Memory consumed at the end of the prepare (end-begin): 4
	accelerator.process_index=1 CPU Peak Memory consumed during the prepare (max-begin): 4
	accelerator.process_index=1 CPU Total Peak Memory consumed during the prepare (max): 949
	accelerator.process_index=1 model.lm_head.weight=Parameter containing:
	tensor([[-0.0179, 0.0201, -0.0273, ..., -0.0275, -0.0396, -0.0131],
	[-0.0510, -0.0079, -0.0383, ..., -0.0481, 0.0581, 0.0282],
	[-0.0217, -0.0216, -0.0064, ..., -0.0508, 0.0554, -0.0013],
	...,
	[ 0.0425, 0.0452, -0.0131, ..., 0.0019, 0.0476, 0.0342],
	[-0.0170, -0.0085, 0.0449, ..., -0.0074, 0.0178, 0.0043],
	[-0.0439, -0.0859, -0.0820, ..., 0.0130, 0.0669, 0.0884]],
	device='cuda:1', requires_grad=True)
	accelerator.process_index=0 model.lm_head.weight=Parameter containing:
	tensor([[-0.0179, 0.0201, -0.0273, ..., -0.0275, -0.0396, -0.0131],
	[-0.0510, -0.0079, -0.0383, ..., -0.0481, 0.0581, 0.0282],
	[-0.0217, -0.0216, -0.0064, ..., -0.0508, 0.0554, -0.0013],
	...,
	[ 0.0425, 0.0452, -0.0131, ..., 0.0019, 0.0476, 0.0342],
	[-0.0170, -0.0085, 0.0449, ..., -0.0074, 0.0178, 0.0043],
	[-0.0439, -0.0859, -0.0820, ..., 0.0130, 0.0669, 0.0884]],
	device='cuda:0', requires_grad=True)