Created
June 23, 2025 16:48
-
-
Save ashvinnihalani/596c8442937bd0ec9a02c944ab3f345a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ------------------------ arguments ------------------------ | |
| account_for_embedding_in_pipeline_split ......... False | |
| account_for_loss_in_pipeline_split .............. False | |
| accumulate_allreduce_grads_in_fp32 .............. True # Double Check: Good | |
| adam_beta1 ...................................... 0.9 | |
| adam_beta2 ...................................... 0.95 | |
| adam_eps ........................................ 1e-08 | |
| add_bias_linear ................................. False | |
| add_position_embedding .......................... True # Double Check | |
| add_qkv_bias .................................... False | |
| adlr_autoresume ................................. False | |
| adlr_autoresume_interval ........................ 1000 | |
| align_grad_reduce ............................... True # Double Check | |
| align_param_gather .............................. True # Double Check | |
| app_tag_run_name ................................ None | |
| app_tag_run_version ............................. 0.0.0 | |
| apply_layernorm_1p .............................. False # Double Check. Should be set to False if the normalization is not LayerNorm | |
| apply_query_key_layer_scaling ................... False | |
| apply_residual_connection_post_layernorm ........ False | |
| apply_rope_fusion ............................... False | |
| async_save ...................................... None | |
| async_tensor_model_parallel_allreduce ........... True | |
| attention_backend ............................... AttnBackend.auto | |
| attention_dropout ............................... 0.0 | |
| attention_softmax_in_fp32 ....................... False | |
| auto_detect_ckpt_format ......................... False | |
| barrier_with_L1_time ............................ True | |
| bert_binary_head ................................ True # Double Check | |
| bert_embedder_type .............................. megatron | |
| bert_load ....................................... None | |
| bf16 ............................................ True | |
| bias_dropout_fusion ............................. True # Different | |
| bias_gelu_fusion ................................ False | |
| bias_swiglu_fusion .............................. False | |
| biencoder_projection_dim ........................ 0 | |
| biencoder_shared_query_context_model ............ False | |
| block_data_path ................................. None | |
| calc_ft_timeouts ................................ False | |
| calculate_per_token_loss ........................ False | |
| check_for_large_grads ........................... False | |
| check_for_nan_in_loss_and_grad .................. False | |
| check_for_spiky_loss ............................ False | |
| check_weight_hash_across_dp_replicas_interval ... None | |
| ckpt_assume_constant_structure .................. False | |
| ckpt_convert_format ............................. None | |
| ckpt_convert_save ............................... None | |
| ckpt_convert_update_legacy_dist_opt_format ...... False | |
| ckpt_format ..................................... torch_dist | |
| ckpt_fully_parallel_load ........................ True | |
| ckpt_fully_parallel_save ........................ True | |
| ckpt_fully_parallel_save_deprecated ............. True | |
| ckpt_step ....................................... None | |
| classes_fraction ................................ 1.0 | |
| clip_grad ....................................... 1.0 | |
| clone_scatter_output_in_embedding ............... True | |
| config_logger_dir ............................... | |
| consumed_train_samples .......................... 0 | |
| consumed_valid_samples .......................... 0 | |
| context_parallel_size ........................... 1 | |
| cp_comm_type .................................... ['p2p'] | |
| create_attention_mask_in_dataloader ............. True # Different | |
| cross_entropy_fusion_impl ....................... native | |
| cross_entropy_loss_fusion ....................... False # Different | |
| cuda_graph_scope ................................ full | |
| cuda_graph_warmup_steps ......................... 3 | |
| data_args_path .................................. None | |
| data_cache_path ................................. None | |
| data_parallel_random_init ....................... False | |
| data_parallel_sharding_strategy ................. no_shard | |
| data_parallel_size .............................. 16 | |
| data_path ....................................... ['/shared/phoenix-pre-training-production-dataset/virtual_stage_1_250506_500B/training_mixed_data_prepacked/packed_8193_00003'] | |
| data_per_class_fraction ......................... 1.0 | |
| data_sharding ................................... True | |
| dataloader_type ................................. single | |
| ddp_average_in_collective ....................... False # Different | |
| ddp_bucket_size ................................. None | |
| ddp_num_buckets ................................. None | |
| ddp_pad_buckets_for_high_nccl_busbw ............. False # Different | |
| decoder_first_pipeline_num_layers ............... None | |
| decoder_last_pipeline_num_layers ................ None | |
| decoder_num_layers .............................. None | |
| decoder_seq_length .............................. None | |
| decoupled_lr .................................... None | |
| decoupled_min_lr ................................ None | |
| decrease_batch_size_if_needed ................... False | |
| defer_embedding_wgrad_compute ................... False # Double Check Set t | |
| deprecated_use_mcore_models ..................... True | |
| deterministic_mode .............................. False | |
| dino_bottleneck_size ............................ 256 | |
| dino_freeze_last_layer .......................... 1 | |
| dino_head_hidden_size ........................... 2048 | |
| dino_local_crops_number ......................... 10 | |
| dino_local_img_size ............................. 96 | |
| dino_norm_last_layer ............................ False | |
| dino_teacher_temp ............................... 0.07 | |
| dino_warmup_teacher_temp ........................ 0.04 | |
| dino_warmup_teacher_temp_epochs ................. 30 | |
| disable_bf16_reduced_precision_matmul ........... False | |
| disable_mamba_mem_eff_path ...................... False | |
| disable_straggler_on_startup .................... False | |
| dist_ckpt_format_deprecated ..................... None | |
| dist_ckpt_strictness ............................ log_all | |
| distribute_saved_activations .................... False | |
| distributed_backend ............................. nccl | |
| distributed_timeout_minutes ..................... 60 | |
| embedding_path .................................. None | |
| empty_unused_memory_level ....................... 0 | |
| enable_cuda_graph ............................... False | |
| enable_ft_package ............................... False | |
| enable_gloo_process_groups ...................... True | |
| enable_msc ...................................... True | |
| enable_one_logger ............................... True | |
| encoder_num_layers .............................. 48 | |
| encoder_pipeline_model_parallel_size ............ 0 | |
| encoder_seq_length .............................. 8192 | |
| encoder_tensor_model_parallel_size .............. 0 | |
| end_weight_decay ................................ 0.1 | |
| eod_mask_loss ................................... False | |
| error_injection_rate ............................ 0 | |
| error_injection_type ............................ transient_error | |
| eval_interval ................................... 1000 | |
| eval_iters ...................................... 0 | |
| evidence_data_path .............................. None | |
| exit_duration_in_mins ........................... 220 | |
| exit_interval ................................... None | |
| exit_on_missing_checkpoint ...................... False | |
| exit_signal_handler ............................. False | |
| exp_avg_dtype ................................... torch.float32 # Double Check not even used | |
| exp_avg_sq_dtype ................................ torch.float32 # Double Check not even used | |
| expert_model_parallel_size ...................... 4 | |
| expert_tensor_parallel_size ..................... 2 | |
| external_cuda_graph ............................. False | |
| ffn_hidden_size ................................. 1344 | |
| finetune ........................................ False | |
| first_last_layers_bf16 .......................... False | |
| flash_decode .................................... False | |
| fp16 ............................................ False | |
| fp16_lm_cross_entropy ........................... False | |
| fp32_residual_connection ........................ False | |
| fp8 ............................................. None | |
| fp8_amax_compute_algo ........................... most_recent | |
| fp8_amax_history_len ............................ 1 | |
| fp8_interval .................................... 1 | |
| fp8_margin ...................................... 0 | |
| fp8_param_gather ................................ False | |
| fp8_recipe ...................................... delayed | |
| fp8_wgrad ....................................... True | |
| global_batch_size ............................... 112 | |
| grad_reduce_in_bf16 ............................. False | |
| gradient_accumulation_fusion .................... True | |
| gradient_reduce_div_fusion ...................... True | |
| group_query_attention ........................... False | |
| head_lr_mult .................................... 1.0 | |
| heterogeneous_layers_config_encoded_json ........ None | |
| heterogeneous_layers_config_path ................ None | |
| hidden_dropout .................................. 0.0 | |
| hidden_size ..................................... 384 | |
| hierarchical_context_parallel_sizes ............. None | |
| hybrid_attention_ratio .......................... 0.0 | |
| hybrid_mlp_ratio ................................ 0.0 | |
| hybrid_override_pattern ......................... None | |
| hysteresis ...................................... 2 | |
| ict_head_size ................................... None | |
| ict_load ........................................ None | |
| img_h ........................................... 224 | |
| img_w ........................................... 224 | |
| indexer_batch_size .............................. 128 | |
| indexer_log_interval ............................ 1000 | |
| inference_batch_times_seqlen_threshold .......... -1 | |
| inference_dynamic_batching ...................... False | |
| inference_dynamic_batching_buffer_guaranteed_fraction 0.2 | |
| inference_dynamic_batching_buffer_overflow_factor None | |
| inference_dynamic_batching_buffer_size_gb ....... 40.0 | |
| inference_dynamic_batching_chunk_size ........... 256 | |
| inference_dynamic_batching_max_requests_override None | |
| inference_dynamic_batching_max_tokens_override .. None | |
| inference_max_batch_size ........................ 8 | |
| inference_max_seq_length ........................ 2560 | |
| inference_rng_tracker ........................... False | |
| init_method_std ................................. 0.006 | |
| init_method_xavier_uniform ...................... False | |
| init_model_with_meta_device ..................... False | |
| initial_loss_scale .............................. 4294967296 | |
| is_hybrid_model ................................. False | |
| iter_per_epoch .................................. 1250 | |
| iterations_to_skip .............................. [] | |
| keep_fp8_transpose_cache_when_using_custom_fsdp . False | |
| kv_channels ..................................... 128 | |
| kv_lora_rank .................................... 128 | |
| lazy_mpu_init ................................... None | |
| load ............................................ /shared/checkpoints/ashvinn/mlm_expreproduction-8-node | |
| local_rank ...................................... 0 | |
| log_interval .................................... 1 | |
| log_loss_scale_to_tensorboard ................... True | |
| log_memory_to_tensorboard ....................... True | |
| log_num_zeros_in_grad ........................... False | |
| log_params_norm ................................. False | |
| log_progress .................................... False | |
| log_straggler ................................... False | |
| log_throughput .................................. True | |
| log_timers_to_tensorboard ....................... False | |
| log_validation_ppl_to_tensorboard ............... True | |
| log_world_size_to_tensorboard ................... False | |
| logging_level ................................... 40 | |
| loss_scale ...................................... None | |
| loss_scale_window ............................... 1000 | |
| lr .............................................. 0.00024 | |
| lr_decay_iters .................................. None | |
| lr_decay_samples ................................ None | |
| lr_decay_style .................................. cosine # Different | |
| lr_warmup_fraction .............................. None | |
| lr_warmup_init .................................. 0.00024 | |
| lr_warmup_iters ................................. 0 | |
| lr_warmup_samples ............................... 0 | |
| lr_wsd_decay_iters .............................. None | |
| lr_wsd_decay_samples ............................ None | |
| lr_wsd_decay_style .............................. exponential | |
| main_grads_dtype ................................ torch.float32 | |
| main_params_dtype ............................... torch.float32 | |
| make_vocab_size_divisible_by .................... 256 | |
| mamba_head_dim .................................. 64 | |
| mamba_num_groups ................................ 8 | |
| mamba_num_heads ................................. None | |
| mamba_state_dim ................................. 128 | |
| manual_gc ....................................... True | |
| manual_gc_eval .................................. True | |
| manual_gc_interval .............................. 20 | |
| mask_factor ..................................... 1.0 | |
| mask_prob ....................................... 0.15 | |
| mask_type ....................................... random | |
| masked_softmax_fusion ........................... True | |
| max_position_embeddings ......................... 8192 | |
| max_tokens_to_oom ............................... 12000 | |
| memory_snapshot_path ............................ snapshot.pickle | |
| merge_file ...................................... None | |
| micro_batch_size ................................ 1 | |
| microbatch_group_size_per_vp_stage .............. 4 | |
| mid_level_dataset_surplus ....................... 0.005 | |
| min_loss_scale .................................. 1.0 | |
| min_lr .......................................... 0.0 | |
| mlp_chunks_for_prefill .......................... 1 | |
| mmap_bin_files .................................. True | |
| mock_data ....................................... False | |
| moe_aux_loss_coeff .............................. 0.0 # Different | |
| moe_enable_deepep ............................... False # Different | |
| moe_expert_capacity_factor ...................... None | |
| moe_extended_tp ................................. False | |
| moe_ffn_hidden_size ............................. 576 | |
| moe_grouped_gemm ................................ True | |
| moe_input_jitter_eps ............................ None | |
| moe_layer_freq .................................. [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] | |
| moe_layer_recompute ............................. False | |
| moe_pad_expert_input_to_capacity ................ False | |
| moe_per_layer_logging ........................... False | |
| moe_permute_fusion .............................. False | |
| moe_router_bias_update_rate ..................... 0.001 | |
| moe_router_dtype ................................ None # Double Check | |
| moe_router_enable_expert_bias ................... False | |
| moe_router_group_topk ........................... None | |
| moe_router_load_balancing_type .................. aux_loss | |
| moe_router_num_groups ........................... None | |
| moe_router_pre_softmax .......................... False | |
| moe_router_score_function ....................... softmax | |
| moe_router_topk ................................. 2 # Different | |
| moe_router_topk_scaling_factor .................. None | |
| moe_shared_expert_intermediate_size ............. None | |
| moe_shared_expert_overlap ....................... False | |
| moe_token_dispatcher_type ....................... allgather # Change | |
| moe_token_drop_policy ........................... probs | |
| moe_use_legacy_grouped_gemm ..................... False | |
| moe_use_upcycling ............................... False | |
| moe_z_loss_coeff ................................ None | |
| mrope_section ................................... None | |
| mscale .......................................... 1.0 | |
| mscale_all_dim .................................. 1.0 | |
| mtp_loss_scaling_factor ......................... 0.1 | |
| mtp_num_layers .................................. None | |
| multi_latent_attention .......................... True | |
| nccl_communicator_config_path ................... None | |
| no_load_optim ................................... None | |
| no_load_rng ..................................... None # Double Check | |
| no_persist_layer_norm ........................... False # Same inverse | |
| no_rope_freq .................................... None | |
| no_save_optim ................................... None | |
| no_save_rng ..................................... None | |
| non_persistent_ckpt_type ........................ None | |
| non_persistent_global_ckpt_dir .................. None | |
| non_persistent_local_ckpt_algo .................. fully_parallel | |
| non_persistent_local_ckpt_dir ................... None | |
| non_persistent_save_interval .................... None | |
| norm_epsilon .................................... 1e-05 | |
| normalization ................................... RMSNorm | |
| num_attention_heads ............................. 64 | |
| num_channels .................................... 3 | |
| num_classes ..................................... 1000 | |
| num_dataset_builder_threads ..................... 1 | |
| num_distributed_optimizer_instances ............. 1 | |
| num_experts ..................................... 576 | |
| num_layers ...................................... 48 | |
| num_layers_at_end_in_bf16 ....................... 1 | |
| num_layers_at_start_in_bf16 ..................... 1 | |
| num_layers_per_virtual_pipeline_stage ........... None | |
| num_query_groups ................................ 1 | |
| num_virtual_stages_per_pipeline_rank ............ 4 | |
| num_workers ..................................... 2 | |
| object_storage_cache_path ....................... None | |
| one_logger_async ................................ False | |
| one_logger_project .............................. megatron-lm | |
| one_logger_run_name ............................. None | |
| onnx_safe ....................................... None | |
| openai_gelu ..................................... False | |
| optimizer ....................................... adam | |
| optimizer_cpu_offload ........................... False | |
| optimizer_offload_fraction ...................... 1.0 | |
| output_bert_embeddings .......................... False | |
| overlap_cpu_optimizer_d2h_h2d ................... False | |
| overlap_grad_reduce ............................. True # Double Check | |
| overlap_p2p_comm ................................ True # Double Check | |
| overlap_p2p_comm_warmup_flush ................... False # Double Check | |
| overlap_param_gather ............................ True # Double Check | |
| overlap_param_gather_with_optimizer_step ........ False # Double Check | |
| override_opt_param_scheduler .................... False | |
| params_dtype .................................... torch.bfloat16 | |
| patch_dim ....................................... 16 | |
| per_split_data_args_path ........................ None | |
| perform_initialization .......................... True | |
| pin_cpu_grads ................................... True # Double Check | |
| pin_cpu_params .................................. True # Double Check | |
| pipeline_model_parallel_comm_backend ............ None | |
| pipeline_model_parallel_size .................... 2 | |
| pipeline_model_parallel_split_rank .............. None | |
| position_embedding_type ......................... rope | |
| pretrained_checkpoint ........................... None | |
| profile ......................................... False | |
| profile_ranks ................................... [0] | |
| profile_step_end ................................ 12 | |
| profile_step_start .............................. 10 | |
| q_lora_rank ..................................... 284 | |
| qk_head_dim ..................................... 128 | |
| qk_layernorm .................................... False | |
| qk_pos_emb_head_dim ............................. 64 | |
| query_in_block_prob ............................. 0.1 | |
| rampup_batch_size ............................... None | |
| rank ............................................ 0 | |
| recompute_granularity ........................... None | |
| recompute_method ................................ None | |
| recompute_modules ............................... None | |
| recompute_num_layers ............................ None | |
| record_memory_history ........................... False | |
| relative_attention_max_distance ................. 128 | |
| relative_attention_num_buckets .................. 32 | |
| replication ..................................... False | |
| replication_factor .............................. 2 | |
| replication_jump ................................ None | |
| rerun_mode ...................................... disabled | |
| reset_attention_mask ............................ False | |
| reset_position_ids .............................. False | |
| result_rejected_tracker_filename ................ None | |
| retriever_report_topk_accuracies ................ [] | |
| retriever_score_scaling ......................... False | |
| retriever_seq_length ............................ 256 | |
| retro_add_retriever ............................. False | |
| retro_attention_gate ............................ 1 | |
| retro_cyclic_train_iters ........................ None | |
| retro_encoder_attention_dropout ................. 0.1 | |
| retro_encoder_hidden_dropout .................... 0.1 | |
| retro_encoder_layers ............................ 2 | |
| retro_num_neighbors ............................. 2 | |
| retro_num_retrieved_chunks ...................... 2 | |
| retro_project_dir ............................... None | |
| retro_verify_neighbor_count ..................... True | |
| rope_scaling_factor ............................. 8.0 # Different | |
| rotary_base ..................................... 500000 | |
| rotary_interleaved .............................. False | |
| rotary_percent .................................. 1.0 | |
| rotary_scaling_factor ........................... 1.0 | |
| rotary_seq_len_interpolation_factor ............. None | |
| run_workload_inspector_server ................... False | |
| sample_rate ..................................... 1.0 | |
| save ............................................ /shared/checkpoints/ashvinn/mlm_expreproduction-8-node | |
| save_interval ................................... 1000 | |
| scatter_gather_tensors_in_pipeline .............. True | |
| seed ............................................ 1234 | |
| seq_length ...................................... 8192 | |
| sequence_parallel ............................... True | |
| sgd_momentum .................................... 0.9 | |
| short_seq_prob .................................. 0.1 | |
| skip_train ...................................... False | |
| skipped_train_samples ........................... 0 | |
| spec ............................................ None | |
| split ........................................... 100,0,0 | |
| squared_relu .................................... False | |
| start_weight_decay .............................. 0.1 | |
| straggler_ctrlr_port ............................ 65535 | |
| straggler_minmax_count .......................... 1 | |
| suggested_communication_unit_size ............... None | |
| swiglu .......................................... True | |
| swin_backbone_type .............................. tiny | |
| te_rng_tracker .................................. False | |
| tensor_model_parallel_size ...................... 2 | |
| tensorboard_dir ................................. /shared/shared_experiments/ashvinn/mlm_expreproduction-8-node | |
| tensorboard_log_interval ........................ 1 | |
| tensorboard_queue_size .......................... 1000 | |
| test_data_path .................................. None | |
| test_mode ....................................... False | |
| tiktoken_num_special_tokens ..................... 1000 | |
| tiktoken_pattern ................................ None | |
| tiktoken_special_tokens ......................... None | |
| timing_log_level ................................ 0 | |
| timing_log_option ............................... minmax | |
| titles_data_path ................................ None | |
| tokenizer_model ................................. /workspace/src/tokenizers/utm-2 | |
| tokenizer_type .................................. HuggingFaceTokenizer | |
| torch_fsdp2_reshard_after_forward ............... True | |
| tp_comm_bootstrap_backend ....................... nccl | |
| tp_comm_bulk_dgrad .............................. True # Double Check | |
| tp_comm_bulk_wgrad .............................. True # Double Check | |
| tp_comm_overlap ................................. False | |
| tp_comm_overlap_ag .............................. True | |
| tp_comm_overlap_cfg ............................. None | |
| tp_comm_overlap_rs .............................. True | |
| tp_comm_overlap_rs_dgrad ........................ False | |
| tp_comm_split_ag ................................ True | |
| tp_comm_split_rs ................................ True | |
| train_data_path ................................. None | |
| train_iters ..................................... 20000 | |
| train_samples ................................... None | |
| train_sync_interval ............................. None | |
| transformer_impl ................................ transformer_engine | |
| transformer_pipeline_model_parallel_size ........ 2 | |
| untie_embeddings_and_output_weights ............. True | |
| use_checkpoint_args ............................. False | |
| use_checkpoint_opt_param_scheduler .............. False | |
| use_cpu_initialization .......................... None | |
| use_custom_fsdp ................................. False | |
| use_dist_ckpt ................................... True | |
| use_dist_ckpt_deprecated ........................ False | |
| use_distributed_optimizer ....................... True | |
| use_flash_attn .................................. False | |
| use_legacy_models ............................... False | |
| use_mp_args_from_checkpoint_args ................ False | |
| use_one_sent_docs ............................... False | |
| use_persistent_ckpt_worker ...................... False | |
| use_precision_aware_optimizer ................... False | |
| use_pytorch_profiler ............................ False | |
| use_ring_exchange_p2p ........................... False | |
| use_rope_scaling ................................ False | |
| use_rotary_position_embeddings .................. False | |
| use_tokenizer_model_from_checkpoint_args ........ True | |
| use_torch_fsdp2 ................................. False | |
| use_torch_optimizer_for_cpu_offload ............. False | |
| use_tp_pp_dp_mapping ............................ False | |
| v_head_dim ...................................... 128 | |
| valid_data_path ................................. None | |
| variable_seq_lengths ............................ False | |
| virtual_pipeline_model_parallel_size ............ 4 | |
| vision_backbone_type ............................ vit | |
| vision_pretraining .............................. False | |
| vision_pretraining_type ......................... classify | |
| vocab_extra_ids ................................. 0 | |
| vocab_file ...................................... None | |
| vocab_size ...................................... None | |
| wandb_exp_name .................................. reproduction-8-node | |
| wandb_project ................................... nemo-rufus | |
| wandb_save_dir .................................. | |
| weight_decay .................................... 0.1 | |
| weight_decay_incr_style ......................... constant | |
| wgrad_deferral_limit ............................ 0 | |
| world_size ...................................... 64 | |
| yaml_cfg ........................................ None |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment