@ashvinnihalani · Created June 23, 2025 16:48
------------------------ arguments ------------------------
account_for_embedding_in_pipeline_split ......... False
account_for_loss_in_pipeline_split .............. False
accumulate_allreduce_grads_in_fp32 .............. True # Double Check: good
adam_beta1 ...................................... 0.9
adam_beta2 ...................................... 0.95
adam_eps ........................................ 1e-08
add_bias_linear ................................. False
add_position_embedding .......................... True # Double Check
add_qkv_bias .................................... False
adlr_autoresume ................................. False
adlr_autoresume_interval ........................ 1000
align_grad_reduce ............................... True # Double Check
align_param_gather .............................. True # Double Check
app_tag_run_name ................................ None
app_tag_run_version ............................. 0.0.0
apply_layernorm_1p .............................. False # Double Check. Should be false when the model does not use LayerNorm
apply_query_key_layer_scaling ................... False
apply_residual_connection_post_layernorm ........ False
apply_rope_fusion ............................... False
async_save ...................................... None
async_tensor_model_parallel_allreduce ........... True
attention_backend ............................... AttnBackend.auto
attention_dropout ............................... 0.0
attention_softmax_in_fp32 ....................... False
auto_detect_ckpt_format ......................... False
barrier_with_L1_time ............................ True
bert_binary_head ................................ True # Double Check
bert_embedder_type .............................. megatron
bert_load ....................................... None
bf16 ............................................ True
bias_dropout_fusion ............................. True # Different
bias_gelu_fusion ................................ False
bias_swiglu_fusion .............................. False
biencoder_projection_dim ........................ 0
biencoder_shared_query_context_model ............ False
block_data_path ................................. None
calc_ft_timeouts ................................ False
calculate_per_token_loss ........................ False
check_for_large_grads ........................... False
check_for_nan_in_loss_and_grad .................. False
check_for_spiky_loss ............................ False
check_weight_hash_across_dp_replicas_interval ... None
ckpt_assume_constant_structure .................. False
ckpt_convert_format ............................. None
ckpt_convert_save ............................... None
ckpt_convert_update_legacy_dist_opt_format ...... False
ckpt_format ..................................... torch_dist
ckpt_fully_parallel_load ........................ True
ckpt_fully_parallel_save ........................ True
ckpt_fully_parallel_save_deprecated ............. True
ckpt_step ....................................... None
classes_fraction ................................ 1.0
clip_grad ....................................... 1.0
clone_scatter_output_in_embedding ............... True
config_logger_dir ...............................
consumed_train_samples .......................... 0
consumed_valid_samples .......................... 0
context_parallel_size ........................... 1
cp_comm_type .................................... ['p2p']
create_attention_mask_in_dataloader ............. True # different
cross_entropy_fusion_impl ....................... native
cross_entropy_loss_fusion ....................... False # Different
cuda_graph_scope ................................ full
cuda_graph_warmup_steps ......................... 3
data_args_path .................................. None
data_cache_path ................................. None
data_parallel_random_init ....................... False
data_parallel_sharding_strategy ................. no_shard
data_parallel_size .............................. 16
data_path ....................................... ['/shared/phoenix-pre-training-production-dataset/virtual_stage_1_250506_500B/training_mixed_data_prepacked/packed_8193_00003']
data_per_class_fraction ......................... 1.0
data_sharding ................................... True
dataloader_type ................................. single
ddp_average_in_collective ....................... False # Different
ddp_bucket_size ................................. None
ddp_num_buckets ................................. None
ddp_pad_buckets_for_high_nccl_busbw ............. False # Different
decoder_first_pipeline_num_layers ............... None
decoder_last_pipeline_num_layers ................ None
decoder_num_layers .............................. None
decoder_seq_length .............................. None
decoupled_lr .................................... None
decoupled_min_lr ................................ None
decrease_batch_size_if_needed ................... False
defer_embedding_wgrad_compute ................... False # Double Check Set t
deprecated_use_mcore_models ..................... True
deterministic_mode .............................. False
dino_bottleneck_size ............................ 256
dino_freeze_last_layer .......................... 1
dino_head_hidden_size ........................... 2048
dino_local_crops_number ......................... 10
dino_local_img_size ............................. 96
dino_norm_last_layer ............................ False
dino_teacher_temp ............................... 0.07
dino_warmup_teacher_temp ........................ 0.04
dino_warmup_teacher_temp_epochs ................. 30
disable_bf16_reduced_precision_matmul ........... False
disable_mamba_mem_eff_path ...................... False
disable_straggler_on_startup .................... False
dist_ckpt_format_deprecated ..................... None
dist_ckpt_strictness ............................ log_all
distribute_saved_activations .................... False
distributed_backend ............................. nccl
distributed_timeout_minutes ..................... 60
embedding_path .................................. None
empty_unused_memory_level ....................... 0
enable_cuda_graph ............................... False
enable_ft_package ............................... False
enable_gloo_process_groups ...................... True
enable_msc ...................................... True
enable_one_logger ............................... True
encoder_num_layers .............................. 48
encoder_pipeline_model_parallel_size ............ 0
encoder_seq_length .............................. 8192
encoder_tensor_model_parallel_size .............. 0
end_weight_decay ................................ 0.1
eod_mask_loss ................................... False
error_injection_rate ............................ 0
error_injection_type ............................ transient_error
eval_interval ................................... 1000
eval_iters ...................................... 0
evidence_data_path .............................. None
exit_duration_in_mins ........................... 220
exit_interval ................................... None
exit_on_missing_checkpoint ...................... False
exit_signal_handler ............................. False
exp_avg_dtype ................................... torch.float32 # Double Check not even used
exp_avg_sq_dtype ................................ torch.float32 # Double Check not even used
expert_model_parallel_size ...................... 4
expert_tensor_parallel_size ..................... 2
external_cuda_graph ............................. False
ffn_hidden_size ................................. 1344
finetune ........................................ False
first_last_layers_bf16 .......................... False
flash_decode .................................... False
fp16 ............................................ False
fp16_lm_cross_entropy ........................... False
fp32_residual_connection ........................ False
fp8 ............................................. None
fp8_amax_compute_algo ........................... most_recent
fp8_amax_history_len ............................ 1
fp8_interval .................................... 1
fp8_margin ...................................... 0
fp8_param_gather ................................ False
fp8_recipe ...................................... delayed
fp8_wgrad ....................................... True
global_batch_size ............................... 112
grad_reduce_in_bf16 ............................. False
gradient_accumulation_fusion .................... True
gradient_reduce_div_fusion ...................... True
group_query_attention ........................... False
head_lr_mult .................................... 1.0
heterogeneous_layers_config_encoded_json ........ None
heterogeneous_layers_config_path ................ None
hidden_dropout .................................. 0.0
hidden_size ..................................... 384
hierarchical_context_parallel_sizes ............. None
hybrid_attention_ratio .......................... 0.0
hybrid_mlp_ratio ................................ 0.0
hybrid_override_pattern ......................... None
hysteresis ...................................... 2
ict_head_size ................................... None
ict_load ........................................ None
img_h ........................................... 224
img_w ........................................... 224
indexer_batch_size .............................. 128
indexer_log_interval ............................ 1000
inference_batch_times_seqlen_threshold .......... -1
inference_dynamic_batching ...................... False
inference_dynamic_batching_buffer_guaranteed_fraction 0.2
inference_dynamic_batching_buffer_overflow_factor None
inference_dynamic_batching_buffer_size_gb ....... 40.0
inference_dynamic_batching_chunk_size ........... 256
inference_dynamic_batching_max_requests_override None
inference_dynamic_batching_max_tokens_override .. None
inference_max_batch_size ........................ 8
inference_max_seq_length ........................ 2560
inference_rng_tracker ........................... False
init_method_std ................................. 0.006
init_method_xavier_uniform ...................... False
init_model_with_meta_device ..................... False
initial_loss_scale .............................. 4294967296
is_hybrid_model ................................. False
iter_per_epoch .................................. 1250
iterations_to_skip .............................. []
keep_fp8_transpose_cache_when_using_custom_fsdp . False
kv_channels ..................................... 128
kv_lora_rank .................................... 128
lazy_mpu_init ................................... None
load ............................................ /shared/checkpoints/ashvinn/mlm_expreproduction-8-node
local_rank ...................................... 0
log_interval .................................... 1
log_loss_scale_to_tensorboard ................... True
log_memory_to_tensorboard ....................... True
log_num_zeros_in_grad ........................... False
log_params_norm ................................. False
log_progress .................................... False
log_straggler ................................... False
log_throughput .................................. True
log_timers_to_tensorboard ....................... False
log_validation_ppl_to_tensorboard ............... True
log_world_size_to_tensorboard ................... False
logging_level ................................... 40
loss_scale ...................................... None
loss_scale_window ............................... 1000
lr .............................................. 0.00024
lr_decay_iters .................................. None
lr_decay_samples ................................ None
lr_decay_style .................................. cosine # Different
lr_warmup_fraction .............................. None
lr_warmup_init .................................. 0.00024
lr_warmup_iters ................................. 0
lr_warmup_samples ............................... 0
lr_wsd_decay_iters .............................. None
lr_wsd_decay_samples ............................ None
lr_wsd_decay_style .............................. exponential
main_grads_dtype ................................ torch.float32
main_params_dtype ............................... torch.float32
make_vocab_size_divisible_by .................... 256
mamba_head_dim .................................. 64
mamba_num_groups ................................ 8
mamba_num_heads ................................. None
mamba_state_dim ................................. 128
manual_gc ....................................... True
manual_gc_eval .................................. True
manual_gc_interval .............................. 20
mask_factor ..................................... 1.0
mask_prob ....................................... 0.15
mask_type ....................................... random
masked_softmax_fusion ........................... True
max_position_embeddings ......................... 8192
max_tokens_to_oom ............................... 12000
memory_snapshot_path ............................ snapshot.pickle
merge_file ...................................... None
micro_batch_size ................................ 1
microbatch_group_size_per_vp_stage .............. 4
mid_level_dataset_surplus ....................... 0.005
min_loss_scale .................................. 1.0
min_lr .......................................... 0.0
mlp_chunks_for_prefill .......................... 1
mmap_bin_files .................................. True
mock_data ....................................... False
moe_aux_loss_coeff .............................. 0.0 # Different
moe_enable_deepep ............................... False # Different
moe_expert_capacity_factor ...................... None
moe_extended_tp ................................. False
moe_ffn_hidden_size ............................. 576
moe_grouped_gemm ................................ True
moe_input_jitter_eps ............................ None
moe_layer_freq .................................. [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
moe_layer_recompute ............................. False
moe_pad_expert_input_to_capacity ................ False
moe_per_layer_logging ........................... False
moe_permute_fusion .............................. False
moe_router_bias_update_rate ..................... 0.001
moe_router_dtype ................................ None # Double Check
moe_router_enable_expert_bias ................... False
moe_router_group_topk ........................... None
moe_router_load_balancing_type .................. aux_loss
moe_router_num_groups ........................... None
moe_router_pre_softmax .......................... False
moe_router_score_function ....................... softmax
moe_router_topk ................................. 2 # Different
moe_router_topk_scaling_factor .................. None
moe_shared_expert_intermediate_size ............. None
moe_shared_expert_overlap ....................... False
moe_token_dispatcher_type ....................... allgather # Change
moe_token_drop_policy ........................... probs
moe_use_legacy_grouped_gemm ..................... False
moe_use_upcycling ............................... False
moe_z_loss_coeff ................................ None
mrope_section ................................... None
mscale .......................................... 1.0
mscale_all_dim .................................. 1.0
mtp_loss_scaling_factor ......................... 0.1
mtp_num_layers .................................. None
multi_latent_attention .......................... True
nccl_communicator_config_path ................... None
no_load_optim ................................... None
no_load_rng ..................................... None # Double Check
no_persist_layer_norm ........................... False # Same inverse
no_rope_freq .................................... None
no_save_optim ................................... None
no_save_rng ..................................... None
non_persistent_ckpt_type ........................ None
non_persistent_global_ckpt_dir .................. None
non_persistent_local_ckpt_algo .................. fully_parallel
non_persistent_local_ckpt_dir ................... None
non_persistent_save_interval .................... None
norm_epsilon .................................... 1e-05
normalization ................................... RMSNorm
num_attention_heads ............................. 64
num_channels .................................... 3
num_classes ..................................... 1000
num_dataset_builder_threads ..................... 1
num_distributed_optimizer_instances ............. 1
num_experts ..................................... 576
num_layers ...................................... 48
num_layers_at_end_in_bf16 ....................... 1
num_layers_at_start_in_bf16 ..................... 1
num_layers_per_virtual_pipeline_stage ........... None
num_query_groups ................................ 1
num_virtual_stages_per_pipeline_rank ............ 4
num_workers ..................................... 2
object_storage_cache_path ....................... None
one_logger_async ................................ False
one_logger_project .............................. megatron-lm
one_logger_run_name ............................. None
onnx_safe ....................................... None
openai_gelu ..................................... False
optimizer ....................................... adam
optimizer_cpu_offload ........................... False
optimizer_offload_fraction ...................... 1.0
output_bert_embeddings .......................... False
overlap_cpu_optimizer_d2h_h2d ................... False
overlap_grad_reduce ............................. True # Double Check
overlap_p2p_comm ................................ True # Double Check
overlap_p2p_comm_warmup_flush ................... False # Double Check
overlap_param_gather ............................ True # Double Check
overlap_param_gather_with_optimizer_step ........ False # Double Check
override_opt_param_scheduler .................... False
params_dtype .................................... torch.bfloat16
patch_dim ....................................... 16
per_split_data_args_path ........................ None
perform_initialization .......................... True
pin_cpu_grads ................................... True # Double Check
pin_cpu_params .................................. True # Double Check
pipeline_model_parallel_comm_backend ............ None
pipeline_model_parallel_size .................... 2
pipeline_model_parallel_split_rank .............. None
position_embedding_type ......................... rope
pretrained_checkpoint ........................... None
profile ......................................... False
profile_ranks ................................... [0]
profile_step_end ................................ 12
profile_step_start .............................. 10
q_lora_rank ..................................... 284
qk_head_dim ..................................... 128
qk_layernorm .................................... False
qk_pos_emb_head_dim ............................. 64
query_in_block_prob ............................. 0.1
rampup_batch_size ............................... None
rank ............................................ 0
recompute_granularity ........................... None
recompute_method ................................ None
recompute_modules ............................... None
recompute_num_layers ............................ None
record_memory_history ........................... False
relative_attention_max_distance ................. 128
relative_attention_num_buckets .................. 32
replication ..................................... False
replication_factor .............................. 2
replication_jump ................................ None
rerun_mode ...................................... disabled
reset_attention_mask ............................ False
reset_position_ids .............................. False
result_rejected_tracker_filename ................ None
retriever_report_topk_accuracies ................ []
retriever_score_scaling ......................... False
retriever_seq_length ............................ 256
retro_add_retriever ............................. False
retro_attention_gate ............................ 1
retro_cyclic_train_iters ........................ None
retro_encoder_attention_dropout ................. 0.1
retro_encoder_hidden_dropout .................... 0.1
retro_encoder_layers ............................ 2
retro_num_neighbors ............................. 2
retro_num_retrieved_chunks ...................... 2
retro_project_dir ............................... None
retro_verify_neighbor_count ..................... True
rope_scaling_factor ............................. 8.0 # Different
rotary_base ..................................... 500000
rotary_interleaved .............................. False
rotary_percent .................................. 1.0
rotary_scaling_factor ........................... 1.0
rotary_seq_len_interpolation_factor ............. None
run_workload_inspector_server ................... False
sample_rate ..................................... 1.0
save ............................................ /shared/checkpoints/ashvinn/mlm_expreproduction-8-node
save_interval ................................... 1000
scatter_gather_tensors_in_pipeline .............. True
seed ............................................ 1234
seq_length ...................................... 8192
sequence_parallel ............................... True
sgd_momentum .................................... 0.9
short_seq_prob .................................. 0.1
skip_train ...................................... False
skipped_train_samples ........................... 0
spec ............................................ None
split ........................................... 100,0,0
squared_relu .................................... False
start_weight_decay .............................. 0.1
straggler_ctrlr_port ............................ 65535
straggler_minmax_count .......................... 1
suggested_communication_unit_size ............... None
swiglu .......................................... True
swin_backbone_type .............................. tiny
te_rng_tracker .................................. False
tensor_model_parallel_size ...................... 2
tensorboard_dir ................................. /shared/shared_experiments/ashvinn/mlm_expreproduction-8-node
tensorboard_log_interval ........................ 1
tensorboard_queue_size .......................... 1000
test_data_path .................................. None
test_mode ....................................... False
tiktoken_num_special_tokens ..................... 1000
tiktoken_pattern ................................ None
tiktoken_special_tokens ......................... None
timing_log_level ................................ 0
timing_log_option ............................... minmax
titles_data_path ................................ None
tokenizer_model ................................. /workspace/src/tokenizers/utm-2
tokenizer_type .................................. HuggingFaceTokenizer
torch_fsdp2_reshard_after_forward ............... True
tp_comm_bootstrap_backend ....................... nccl
tp_comm_bulk_dgrad .............................. True # Double Check
tp_comm_bulk_wgrad .............................. True # Double Check
tp_comm_overlap ................................. False
tp_comm_overlap_ag .............................. True
tp_comm_overlap_cfg ............................. None
tp_comm_overlap_rs .............................. True
tp_comm_overlap_rs_dgrad ........................ False
tp_comm_split_ag ................................ True
tp_comm_split_rs ................................ True
train_data_path ................................. None
train_iters ..................................... 20000
train_samples ................................... None
train_sync_interval ............................. None
transformer_impl ................................ transformer_engine
transformer_pipeline_model_parallel_size ........ 2
untie_embeddings_and_output_weights ............. True
use_checkpoint_args ............................. False
use_checkpoint_opt_param_scheduler .............. False
use_cpu_initialization .......................... None
use_custom_fsdp ................................. False
use_dist_ckpt ................................... True
use_dist_ckpt_deprecated ........................ False
use_distributed_optimizer ....................... True
use_flash_attn .................................. False
use_legacy_models ............................... False
use_mp_args_from_checkpoint_args ................ False
use_one_sent_docs ............................... False
use_persistent_ckpt_worker ...................... False
use_precision_aware_optimizer ................... False
use_pytorch_profiler ............................ False
use_ring_exchange_p2p ........................... False
use_rope_scaling ................................ False
use_rotary_position_embeddings .................. False
use_tokenizer_model_from_checkpoint_args ........ True
use_torch_fsdp2 ................................. False
use_torch_optimizer_for_cpu_offload ............. False
use_tp_pp_dp_mapping ............................ False
v_head_dim ...................................... 128
valid_data_path ................................. None
variable_seq_lengths ............................ False
virtual_pipeline_model_parallel_size ............ 4
vision_backbone_type ............................ vit
vision_pretraining .............................. False
vision_pretraining_type ......................... classify
vocab_extra_ids ................................. 0
vocab_file ...................................... None
vocab_size ...................................... None
wandb_exp_name .................................. reproduction-8-node
wandb_project ................................... nemo-rufus
wandb_save_dir ..................................
weight_decay .................................... 0.1
weight_decay_incr_style ......................... constant
wgrad_deferral_limit ............................ 0
world_size ...................................... 64
yaml_cfg ........................................ None
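
A minimal sanity check of the parallel layout and batch math, using values copied from the listing above; the script itself is illustrative and not part of Megatron-LM.

```python
# Values copied from the argument dump above.
args = {
    "world_size": 64,
    "data_parallel_size": 16,
    "tensor_model_parallel_size": 2,
    "pipeline_model_parallel_size": 2,
    "context_parallel_size": 1,
    "global_batch_size": 112,
    "micro_batch_size": 1,
    "num_layers": 48,
    "virtual_pipeline_model_parallel_size": 4,
    "num_experts": 576,
    "expert_model_parallel_size": 4,
}

# GPUs: DP x TP x PP x CP must equal world_size (16 * 2 * 2 * 1 = 64).
assert (args["data_parallel_size"]
        * args["tensor_model_parallel_size"]
        * args["pipeline_model_parallel_size"]
        * args["context_parallel_size"]) == args["world_size"]

# Gradient accumulation: global batch / (micro batch * DP) = 112 / 16 = 7 steps.
accum_steps, rem = divmod(args["global_batch_size"],
                          args["micro_batch_size"] * args["data_parallel_size"])
assert rem == 0 and accum_steps == 7

# Interleaved pipeline: 48 layers over 2 PP ranks x 4 virtual stages = 6 layers per stage.
layers_per_stage, rem = divmod(
    args["num_layers"],
    args["pipeline_model_parallel_size"] * args["virtual_pipeline_model_parallel_size"])
assert rem == 0 and layers_per_stage == 6

# MoE: 576 experts over 4 expert-parallel ranks = 144 local experts per rank.
local_experts, rem = divmod(args["num_experts"], args["expert_model_parallel_size"])
assert rem == 0 and local_experts == 144

print("parallel layout is internally consistent")
```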
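
With `lr_warmup_iters 0`, `lr_decay_style cosine`, `min_lr 0.0` and `lr_decay_iters None` (which Megatron-LM typically falls back to `train_iters` for), the schedule is a plain cosine decay over 20000 iterations. A rough sketch of the implied curve, not the exact scheduler implementation:

```python
import math

LR, MIN_LR, WARMUP_ITERS, DECAY_ITERS = 2.4e-4, 0.0, 0, 20_000

def lr_at(step):
    # Warmup branch is unused here since lr_warmup_iters = 0
    # (and this simplified form ignores lr_warmup_init).
    if step < WARMUP_ITERS:
        return LR * step / WARMUP_ITERS
    progress = min(step, DECAY_ITERS) / DECAY_ITERS
    return MIN_LR + 0.5 * (LR - MIN_LR) * (1.0 + math.cos(math.pi * progress))

for step in (0, 5_000, 10_000, 20_000):
    print(step, lr_at(step))   # 2.4e-4, ~2.05e-4, 1.2e-4, 0.0
```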
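
The `# Different` / `# Double Check` notes come from comparing this dump against another configuration. A hypothetical helper for automating that comparison is sketched below; the parser only handles the dotted `name .... value` lines (entries printed without dot padding are skipped), and the file names are placeholders.

```python
import re

def parse_args_dump(path):
    """Parse lines like 'adam_beta1 .......... 0.9  # note' into a dict of strings."""
    values = {}
    with open(path) as f:
        for line in f:
            parts = re.split(r"\s\.{2,}\s*", line.rstrip("\n"), maxsplit=1)
            if len(parts) != 2:
                continue
            key = parts[0].strip()
            value = parts[1].split("#", 1)[0].strip()  # drop trailing inline comments
            values[key] = value
    return values

def diff_dumps(path_a, path_b):
    """Print every argument whose value differs between the two dumps."""
    a, b = parse_args_dump(path_a), parse_args_dump(path_b)
    for key in sorted(a.keys() | b.keys()):
        if a.get(key) != b.get(key):
            print(f"{key}: {a.get(key, '<missing>')} -> {b.get(key, '<missing>')}")

# Example usage (paths are hypothetical):
# diff_dumps("megatron_args.txt", "reference_args.txt")
```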