---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[4], line 92
90 with torch.no_grad():
91 deepspeed_hybrid_engine.eval()
---> 92 print(deepspeed_hybrid_engine.generate(
93 torch.tensor([[1]], dtype=torch.int, device=deepspeed_hybrid_engine.device),
94 synced_gpus=True,
95 generation_config=GenerationConfig(max_new_tokens=20),
96 ))
File ~/peftai/.venv/lib/python3.11/site-packages/deepspeed/runtime/hybrid_engine.py:254, in DeepSpeedHybridEngine.generate(self, *inputs, **kwargs)
251 self.fuse_lora_weight()
253 self.retake_inference_cache()
--> 254 generate_ret_vals = self._generate(*inputs, **kwargs)
256 if len(self.all_lora_params) > 0:
257 self.unfuse_lora_weight()
File ~/peftai/.venv/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/peftai/.venv/lib/python3.11/site-packages/transformers/generation/utils.py:1576, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
1559 result = self._assisted_decoding(
1560 input_ids,
1561 candidate_generator=candidate_generator,
(...)
1572 **model_kwargs,
1573 )
1574 if generation_mode == GenerationMode.GREEDY_SEARCH:
1575 # 11. run greedy search
-> 1576 result = self._greedy_search(
1577 input_ids,
1578 logits_processor=prepared_logits_processor,
1579 stopping_criteria=prepared_stopping_criteria,
1580 pad_token_id=generation_config.pad_token_id,
1581 output_scores=generation_config.output_scores,
1582 output_logits=generation_config.output_logits,
1583 return_dict_in_generate=generation_config.return_dict_in_generate,
1584 synced_gpus=synced_gpus,
1585 streamer=streamer,
1586 **model_kwargs,
1587 )
1589 elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH:
1590 if not model_kwargs["use_cache"]:
File ~/peftai/.venv/lib/python3.11/site-packages/transformers/generation/utils.py:2494, in GenerationMixin._greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, output_logits, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2491 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2493 # forward pass to get next token
-> 2494 outputs = self(
2495 **model_inputs,
2496 return_dict=True,
2497 output_attentions=output_attentions,
2498 output_hidden_states=output_hidden_states,
2499 )
2501 if synced_gpus and this_peer_finished:
2502 continue # don't waste resources running the code we don't need
File ~/peftai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/peftai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1582, in Module._call_impl(self, *args, **kwargs)
1579 bw_hook = hooks.BackwardHook(self, full_backward_hooks, backward_pre_hooks)
1580 args = bw_hook.setup_input_hook(args)
-> 1582 result = forward_call(*args, **kwargs)
1583 if _global_forward_hooks or self._forward_hooks:
1584 for hook_id, hook in (
1585 *_global_forward_hooks.items(),
1586 *self._forward_hooks.items(),
1587 ):
1588 # mark that always called hook is run
File ~/peftai/.venv/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py:1208, in LlamaForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
1205 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1207 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1208 outputs = self.model(
1209 input_ids=input_ids,
1210 attention_mask=attention_mask,
1211 position_ids=position_ids,
1212 past_key_values=past_key_values,
1213 inputs_embeds=inputs_embeds,
1214 use_cache=use_cache,
1215 output_attentions=output_attentions,
1216 output_hidden_states=output_hidden_states,
1217 return_dict=return_dict,
1218 cache_position=cache_position,
1219 )
1221 hidden_states = outputs[0]
1222 if self.config.pretraining_tp > 1:
File ~/peftai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/peftai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1582, in Module._call_impl(self, *args, **kwargs)
1579 bw_hook = hooks.BackwardHook(self, full_backward_hooks, backward_pre_hooks)
1580 args = bw_hook.setup_input_hook(args)
-> 1582 result = forward_call(*args, **kwargs)
1583 if _global_forward_hooks or self._forward_hooks:
1584 for hook_id, hook in (
1585 *_global_forward_hooks.items(),
1586 *self._forward_hooks.items(),
1587 ):
1588 # mark that always called hook is run
File ~/peftai/.venv/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py:1018, in LlamaModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
1007 layer_outputs = self._gradient_checkpointing_func(
1008 decoder_layer.__call__,
1009 hidden_states,
(...)
1015 cache_position,
1016 )
1017 else:
-> 1018 layer_outputs = decoder_layer(
1019 hidden_states,
1020 attention_mask=causal_mask,
1021 position_ids=position_ids,
1022 past_key_value=past_key_values,
1023 output_attentions=output_attentions,
1024 use_cache=use_cache,
1025 cache_position=cache_position,
1026 )
1028 hidden_states = layer_outputs[0]
1030 if use_cache:
File ~/peftai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/peftai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1582, in Module._call_impl(self, *args, **kwargs)
1579 bw_hook = hooks.BackwardHook(self, full_backward_hooks, backward_pre_hooks)
1580 args = bw_hook.setup_input_hook(args)
-> 1582 result = forward_call(*args, **kwargs)
1583 if _global_forward_hooks or self._forward_hooks:
1584 for hook_id, hook in (
1585 *_global_forward_hooks.items(),
1586 *self._forward_hooks.items(),
1587 ):
1588 # mark that always called hook is run
File ~/peftai/.venv/lib/python3.11/site-packages/deepspeed/model_implementations/transformers/ds_transformer.py:171, in DeepSpeedTransformerInference.forward(self, input, input_mask, attention_mask, attn_mask, head_mask, layer_past, get_key_value, get_present, encoder_output, enc_dec_attn_mask, x, encoder_hidden_states, encoder_attention_mask, use_cache, alibi, output_attentions, layer_head_mask, past_key_value, **kwargs)
167 input = input.to(target_dtype)
169 with torch.no_grad():
170 attention_output, key, value, context_outputtn_ctx, inp_norm = \
--> 171 self.attention(input,
172 input_mask,
173 head_mask,
174 layer_past,
175 get_present,
176 encoder_hidden_states,
177 encoder_attention_mask,
178 output_attentions,
179 self.norm_w,
180 self.norm_b,
181 alibi)
183 presents = (key, value)
184 self.layer_past = presents if layer_past is None else None
File ~/peftai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/peftai/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/peftai/.venv/lib/python3.11/site-packages/deepspeed/ops/transformer/inference/ds_attention.py:141, in DeepSpeedSelfAttention.forward(self, input, input_mask, head_mask, layer_past, get_present, encoder_hidden_states, encoder_attention_mask, output_attentions, norm_w, norm_b, alibi)
128 def forward(self,
129 input,
130 input_mask,
(...)
138 norm_b=None,
139 alibi=None):
140 if self.attn_qkvw is None:
--> 141 self._attn_qkvw, self._attn_qkvb = self._merge_qkv()
142 else:
143 self._attn_qkvw = self.attn_qkvw
File ~/peftai/.venv/lib/python3.11/site-packages/deepspeed/ops/transformer/inference/ds_attention.py:118, in DeepSpeedSelfAttention._merge_qkv(self)
116 def _merge_qkv(self):
117 qvkw = DeepSpeedSelfAttention._qkv_buffers[0]
--> 118 qvkw[:self.hidden_size_per_partition, :] = self.attn_qw # type: ignore
119 qvkw[self.hidden_size_per_partition:2 * self.hidden_size_per_partition, :] = self.attn_kw # type: ignore
120 qvkw[2 * self.hidden_size_per_partition:, :] = self.attn_vw # type: ignore
RuntimeError: The expanded size of the tensor (2048) must match the existing size (1179648) at non-singleton dimension 1. Target sizes: [2048, 2048]. Tensor sizes: [1179648]
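
The final frame copies self.attn_qw into a [2048, 2048] slice of DeepSpeed's fused QKV buffer, but attn_qw arrives as a flat tensor of 1,179,648 elements, far fewer than the 2048 x 2048 = 4,194,304 a full-precision Q projection would hold, which is consistent with the parameter still being in its packed, quantized form (an interpretation, not something the traceback itself confirms). A minimal sketch of the same shape mismatch in plain PyTorch, independent of DeepSpeed:

# Reproduces only the shape error from _merge_qkv() above, with plain tensors.
import torch

qkv_buffer = torch.empty(3 * 2048, 2048)  # stand-in for DeepSpeedSelfAttention._qkv_buffers[0]
attn_qw = torch.empty(1179648)            # flat size reported in the RuntimeError
qkv_buffer[:2048, :] = attn_qw            # raises the same "expanded size ... must match" RuntimeError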
Using quantizer for weights: CUDAQuantizer
[2024-06-08 00:54:02,799] [INFO] [partition_parameters.py:562:patch_init_and_builtins] Enable Zero3 engine with INT4 quantization.
[2024-06-08 00:54:03,256] [INFO] [partition_parameters.py:345:__exit__] finished initializing model - num_params = 603, num_elems = 3.30B
[2024-06-08 00:54:08,102] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2024-06-08 00:54:08,103] [INFO] [logging.py:96:log_dist] [Rank 0] Creating ZeRO Offload
[2024-06-08 00:54:08,290] [INFO] [utils.py:779:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
[2024-06-08 00:54:08,291] [INFO] [utils.py:780:see_memory_usage] MA 1.84 GB Max_MA 2.21 GB CA 2.38 GB Max_CA 2 GB
[2024-06-08 00:54:08,292] [INFO] [utils.py:787:see_memory_usage] CPU Virtual Memory: used = 7.96 GB, percent = 25.4%
Parameter Offload: Total persistent parameters: 92160 in 45 params
[2024-06-08 00:54:08,478] [INFO] [utils.py:779:see_memory_usage] DeepSpeedZeRoOffload initialize [end]
[2024-06-08 00:54:08,479] [INFO] [utils.py:780:see_memory_usage] MA 1.84 GB Max_MA 1.84 GB CA 2.38 GB Max_CA 2 GB
[2024-06-08 00:54:08,480] [INFO] [utils.py:787:see_memory_usage] CPU Virtual Memory: used = 7.96 GB, percent = 25.4%
[2024-06-08 00:54:08,481] [INFO] [config.py:996:print] DeepSpeedEngine configuration:
[2024-06-08 00:54:08,482] [INFO] [config.py:1000:print] activation_checkpointing_config {
"partition_activations": false,
"contiguous_memory_optimization": false,
"cpu_checkpointing": false,
"number_checkpoints": null,
"synchronize_checkpoint_boundary": false,
"profile": false
}
[2024-06-08 00:54:08,482] [INFO] [config.py:1000:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
[2024-06-08 00:54:08,483] [INFO] [config.py:1000:print] amp_enabled .................. False
[2024-06-08 00:54:08,484] [INFO] [config.py:1000:print] amp_params ................... False
[2024-06-08 00:54:08,485] [INFO] [config.py:1000:print] autotuning_config ............ {
"enabled": false,
"start_step": null,
"end_step": null,
"metric_path": null,
"arg_mappings": null,
"metric": "throughput",
"model_info": null,
"results_dir": "autotuning_results",
"exps_dir": "autotuning_exps",
"overwrite": true,
"fast": true,
"start_profile_step": 3,
"end_profile_step": 5,
"tuner_type": "gridsearch",
"tuner_early_stopping": 5,
"tuner_num_trials": 50,
"model_info_path": null,
"mp_size": 1,
"max_train_batch_size": null,
"min_train_batch_size": 1,
"max_train_micro_batch_size_per_gpu": 1.024000e+03,
"min_train_micro_batch_size_per_gpu": 1,
"num_tuning_micro_batch_sizes": 3
}
[2024-06-08 00:54:08,485] [INFO] [config.py:1000:print] bfloat16_enabled ............. True
[2024-06-08 00:54:08,485] [INFO] [config.py:1000:print] bfloat16_immediate_grad_update False
[2024-06-08 00:54:08,486] [INFO] [config.py:1000:print] checkpoint_parallel_write_pipeline False
[2024-06-08 00:54:08,486] [INFO] [config.py:1000:print] checkpoint_tag_validation_enabled True
[2024-06-08 00:54:08,487] [INFO] [config.py:1000:print] checkpoint_tag_validation_fail False
[2024-06-08 00:54:08,487] [INFO] [config.py:1000:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f9f266ea990>
[2024-06-08 00:54:08,488] [INFO] [config.py:1000:print] communication_data_type ...... None
[2024-06-08 00:54:08,488] [INFO] [config.py:1000:print] compile_config ............... enabled=False backend='inductor' kwargs={}
[2024-06-08 00:54:08,489] [INFO] [config.py:1000:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
[2024-06-08 00:54:08,489] [INFO] [config.py:1000:print] curriculum_enabled_legacy .... False
[2024-06-08 00:54:08,490] [INFO] [config.py:1000:print] curriculum_params_legacy ..... False
[2024-06-08 00:54:08,490] [INFO] [config.py:1000:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
[2024-06-08 00:54:08,491] [INFO] [config.py:1000:print] data_efficiency_enabled ...... False
[2024-06-08 00:54:08,491] [INFO] [config.py:1000:print] dataloader_drop_last ......... False
[2024-06-08 00:54:08,491] [INFO] [config.py:1000:print] disable_allgather ............ False
[2024-06-08 00:54:08,492] [INFO] [config.py:1000:print] dump_state ................... False
[2024-06-08 00:54:08,492] [INFO] [config.py:1000:print] dynamic_loss_scale_args ...... None
[2024-06-08 00:54:08,493] [INFO] [config.py:1000:print] eigenvalue_enabled ........... False
[2024-06-08 00:54:08,493] [INFO] [config.py:1000:print] eigenvalue_gas_boundary_resolution 1
[2024-06-08 00:54:08,494] [INFO] [config.py:1000:print] eigenvalue_layer_name ........ bert.encoder.layer
[2024-06-08 00:54:08,494] [INFO] [config.py:1000:print] eigenvalue_layer_num ......... 0
[2024-06-08 00:54:08,495] [INFO] [config.py:1000:print] eigenvalue_max_iter .......... 100
[2024-06-08 00:54:08,495] [INFO] [config.py:1000:print] eigenvalue_stability ......... 1e-06
[2024-06-08 00:54:08,496] [INFO] [config.py:1000:print] eigenvalue_tol ............... 0.01
[2024-06-08 00:54:08,496] [INFO] [config.py:1000:print] eigenvalue_verbose ........... False
[2024-06-08 00:54:08,496] [INFO] [config.py:1000:print] elasticity_enabled ........... False
[2024-06-08 00:54:08,497] [INFO] [config.py:1000:print] flops_profiler_config ........ {
"enabled": false,
"recompute_fwd_factor": 0.0,
"profile_step": 1,
"module_depth": -1,
"top_modules": 1,
"detailed": true,
"output_file": null
}
[2024-06-08 00:54:08,497] [INFO] [config.py:1000:print] fp16_auto_cast ............... None
[2024-06-08 00:54:08,498] [INFO] [config.py:1000:print] fp16_enabled ................. False
[2024-06-08 00:54:08,498] [INFO] [config.py:1000:print] fp16_master_weights_and_gradients False
[2024-06-08 00:54:08,499] [INFO] [config.py:1000:print] global_rank .................. 0
[2024-06-08 00:54:08,499] [INFO] [config.py:1000:print] grad_accum_dtype ............. None
[2024-06-08 00:54:08,500] [INFO] [config.py:1000:print] gradient_accumulation_steps .. 1
[2024-06-08 00:54:08,500] [INFO] [config.py:1000:print] gradient_clipping ............ 0.0
[2024-06-08 00:54:08,501] [INFO] [config.py:1000:print] gradient_predivide_factor .... 1.0
[2024-06-08 00:54:08,501] [INFO] [config.py:1000:print] graph_harvesting ............. False
[2024-06-08 00:54:08,504] [INFO] [config.py:1000:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
[2024-06-08 00:54:08,504] [INFO] [config.py:1000:print] initial_dynamic_scale ........ 1
[2024-06-08 00:54:08,505] [INFO] [config.py:1000:print] load_universal_checkpoint .... False
[2024-06-08 00:54:08,508] [INFO] [config.py:1000:print] loss_scale ................... 1.0
[2024-06-08 00:54:08,508] [INFO] [config.py:1000:print] memory_breakdown ............. False
[2024-06-08 00:54:08,509] [INFO] [config.py:1000:print] mics_hierarchial_params_gather False
[2024-06-08 00:54:08,509] [INFO] [config.py:1000:print] mics_shard_size .............. -1
[2024-06-08 00:54:08,510] [INFO] [config.py:1000:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
[2024-06-08 00:54:08,510] [INFO] [config.py:1000:print] nebula_config ................ {
"enabled": false,
"persistent_storage_path": null,
"persistent_time_interval": 100,
"num_of_version_in_retention": 2,
"enable_nebula_load": true,
"load_path": null
}
[2024-06-08 00:54:08,511] [INFO] [config.py:1000:print] optimizer_legacy_fusion ...... False
[2024-06-08 00:54:08,511] [INFO] [config.py:1000:print] optimizer_name ............... None
[2024-06-08 00:54:08,512] [INFO] [config.py:1000:print] optimizer_params ............. None
[2024-06-08 00:54:08,512] [INFO] [config.py:1000:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
[2024-06-08 00:54:08,512] [INFO] [config.py:1000:print] pld_enabled .................. False
[2024-06-08 00:54:08,513] [INFO] [config.py:1000:print] pld_params ................... False
[2024-06-08 00:54:08,513] [INFO] [config.py:1000:print] prescale_gradients ........... False
[2024-06-08 00:54:08,514] [INFO] [config.py:1000:print] scheduler_name ............... None
[2024-06-08 00:54:08,514] [INFO] [config.py:1000:print] scheduler_params ............. None
[2024-06-08 00:54:08,515] [INFO] [config.py:1000:print] seq_parallel_communication_data_type torch.float32
[2024-06-08 00:54:08,515] [INFO] [config.py:1000:print] sparse_attention ............. None
[2024-06-08 00:54:08,515] [INFO] [config.py:1000:print] sparse_gradients_enabled ..... False
[2024-06-08 00:54:08,516] [INFO] [config.py:1000:print] steps_per_print .............. 10
[2024-06-08 00:54:08,516] [INFO] [config.py:1000:print] train_batch_size ............. 1
[2024-06-08 00:54:08,517] [INFO] [config.py:1000:print] train_micro_batch_size_per_gpu 1
[2024-06-08 00:54:08,517] [INFO] [config.py:1000:print] use_data_before_expert_parallel_ False
[2024-06-08 00:54:08,517] [INFO] [config.py:1000:print] use_node_local_storage ....... False
[2024-06-08 00:54:08,519] [INFO] [config.py:1000:print] wall_clock_breakdown ......... False
[2024-06-08 00:54:08,520] [INFO] [config.py:1000:print] weight_quantization_config ... q_type='symmetric' q_groups=1 enabled=True num_bits=8 quantized_initialization={'num_bits': 4, 'group_size': 64, 'group_dim': 1, 'symmetric': False} post_init_quant={}
[2024-06-08 00:54:08,520] [INFO] [config.py:1000:print] world_size ................... 1
[2024-06-08 00:54:08,521] [INFO] [config.py:1000:print] zero_allow_untested_optimizer False
[2024-06-08 00:54:08,521] [INFO] [config.py:1000:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=False elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=True zero_quantized_nontrainable_weights=True zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
[2024-06-08 00:54:08,522] [INFO] [config.py:1000:print] zero_enabled ................. True
[2024-06-08 00:54:08,522] [INFO] [config.py:1000:print] zero_force_ds_cpu_optimizer .. True
[2024-06-08 00:54:08,522] [INFO] [config.py:1000:print] zero_optimization_stage ...... 3
[2024-06-08 00:54:08,523] [INFO] [config.py:986:print_user_config] json = {
"zero_optimization": {
"load_from_fp32_weights": false,
"stage": 3,
"zero_quantized_weights": true,
"zero_quantized_nontrainable_weights": true
},
"train_micro_batch_size_per_gpu": 1,
"bf16": {
"enabled": true
},
"weight_quantization": {
"quantized_initialization": {
"num_bits": 4,
"group_size": 64,
"group_dim": 1,
"symmetric": false
}
}
}
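
Taken together with the traceback above, here is a minimal, hypothetical repro sketch of this setup. The checkpoint name and the use of HfDeepSpeedConfig are assumptions (neither appears in the gist); the ds_config dict mirrors the user config JSON printed just above, and the hybrid_engine section is added only because the traceback runs through DeepSpeedHybridEngine.generate, whereas this particular config dump reports hybrid_engine enabled=False.

# Hypothetical sketch, not the gist author's actual script.
# Typically launched with the DeepSpeed launcher, e.g. `deepspeed repro.py`.
import torch
import deepspeed
from transformers import AutoModelForCausalLM, GenerationConfig
from transformers.integrations import HfDeepSpeedConfig

ds_config = {
    # Mirrors the user config JSON printed above.
    "zero_optimization": {
        "load_from_fp32_weights": False,
        "stage": 3,
        "zero_quantized_weights": True,
        "zero_quantized_nontrainable_weights": True,
    },
    "train_micro_batch_size_per_gpu": 1,
    "bf16": {"enabled": True},
    "weight_quantization": {
        "quantized_initialization": {
            "num_bits": 4,
            "group_size": 64,
            "group_dim": 1,
            "symmetric": False,
        }
    },
    # Assumption: enabling this makes deepspeed.initialize() return the
    # DeepSpeedHybridEngine seen in the traceback; the dump above has it disabled.
    "hybrid_engine": {"enabled": True},
}

# Keeping this object alive makes from_pretrained() load under deepspeed.zero.Init(),
# which is what prints "Enable Zero3 engine with INT4 quantization" at the top of this log.
dschf = HfDeepSpeedConfig(ds_config)  # noqa: F841

# Stand-in Llama-family checkpoint; the gist does not name the actual ~3.3B model.
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

deepspeed_hybrid_engine, *_ = deepspeed.initialize(model=model, config=ds_config)

# The call from Cell In[4] in the traceback; this is where the RuntimeError surfaces
# inside DeepSpeedSelfAttention._merge_qkv().
with torch.no_grad():
    deepspeed_hybrid_engine.eval()
    print(deepspeed_hybrid_engine.generate(
        torch.tensor([[1]], dtype=torch.int, device=deepspeed_hybrid_engine.device),
        synced_gpus=True,
        generation_config=GenerationConfig(max_new_tokens=20),
    ))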