Dolphin 2.9.3 Mistral Nemo: Axolotl training config (YAML)
base_model: /workspace/models/Mistral-Nemo-Base-2407
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
# load_in_4bit: true
strict: false

datasets:
  - path: /workspace/datasets/dolphin-2.9.3/dolphin201-sharegpt2.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/SystemChat_filtered_sharegpt.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/SystemChat_multilingual_sharegpt.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/dolphin-coder-translate-sharegpt2.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/dolphin-coder-codegen-sharegpt2.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/m-a-p_Code-Feedback-sharegpt-unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/m-a-p_CodeFeedback-Filtered-Instruction-sharegpt-unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/not_samantha_norefusals.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/Orca-Math-resort-unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/agent_instruct_react_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/toolbench_instruct_j1s1_3k_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/toolbench_negative_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/toolbench_react_10p_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/toolbench_tflan_cot_30p_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/openhermes200k_unfiltered.jsonl
    type: sharegpt
    conversation: chatml

chat_template: chatml

# adapter: qlora
# lora_r: 128
# lora_alpha: 16
# lora_modules_to_save: [embed_tokens, lm_head]
# lora_dropout: 0.05
# lora_target_linear: true

unfrozen_parameters:
  - ^lm_head.weight$
  - ^model.embed_tokens.weight$
  - input_layernorm
  - model.norm
  - post_attention_layernorm
  - self_attn.rotary_emb
  # mlp.down_proj layers
  - model.layers.0.mlp.down_proj
  - model.layers.1.mlp.down_proj
  - model.layers.4.mlp.down_proj
  - model.layers.37.mlp.down_proj
  - model.layers.24.mlp.down_proj
  - model.layers.2.mlp.down_proj
  - model.layers.38.mlp.down_proj
  - model.layers.35.mlp.down_proj
  - model.layers.25.mlp.down_proj
  - model.layers.6.mlp.down_proj
  - model.layers.22.mlp.down_proj
  - model.layers.23.mlp.down_proj
  - model.layers.3.mlp.down_proj
  - model.layers.21.mlp.down_proj
  - model.layers.5.mlp.down_proj
  - model.layers.28.mlp.down_proj
  - model.layers.20.mlp.down_proj
  - model.layers.26.mlp.down_proj
  - model.layers.19.mlp.down_proj
  - model.layers.34.mlp.down_proj
  # mlp.gate_proj layers
  - model.layers.2.mlp.gate_proj
  - model.layers.1.mlp.gate_proj
  - model.layers.3.mlp.gate_proj
  - model.layers.5.mlp.gate_proj
  - model.layers.4.mlp.gate_proj
  - model.layers.35.mlp.gate_proj
  - model.layers.36.mlp.gate_proj
  - model.layers.37.mlp.gate_proj
  - model.layers.38.mlp.gate_proj
  - model.layers.34.mlp.gate_proj
  - model.layers.33.mlp.gate_proj
  - model.layers.8.mlp.gate_proj
  - model.layers.32.mlp.gate_proj
  - model.layers.6.mlp.gate_proj
  - model.layers.28.mlp.gate_proj
  - model.layers.26.mlp.gate_proj
  - model.layers.30.mlp.gate_proj
  - model.layers.23.mlp.gate_proj
  - model.layers.29.mlp.gate_proj
  - model.layers.27.mlp.gate_proj
  # mlp.up_proj layers
  - model.layers.3.mlp.up_proj
  - model.layers.4.mlp.up_proj
  - model.layers.6.mlp.up_proj
  - model.layers.2.mlp.up_proj
  - model.layers.5.mlp.up_proj
  - model.layers.8.mlp.up_proj
  - model.layers.10.mlp.up_proj
  - model.layers.9.mlp.up_proj
  - model.layers.7.mlp.up_proj
  - model.layers.0.mlp.up_proj
  - model.layers.17.mlp.up_proj
  - model.layers.15.mlp.up_proj
  - model.layers.22.mlp.up_proj
  - model.layers.18.mlp.up_proj
  - model.layers.16.mlp.up_proj
  - model.layers.11.mlp.up_proj
  - model.layers.21.mlp.up_proj
  - model.layers.23.mlp.up_proj
  - model.layers.20.mlp.up_proj
  - model.layers.27.mlp.up_proj
  # self_attn.k_proj layers
  - model.layers.30.self_attn.k_proj
  - model.layers.27.self_attn.k_proj
  - model.layers.25.self_attn.k_proj
  - model.layers.33.self_attn.k_proj
  - model.layers.26.self_attn.k_proj
  - model.layers.31.self_attn.k_proj
  - model.layers.35.self_attn.k_proj
  - model.layers.39.self_attn.k_proj
  - model.layers.22.self_attn.k_proj
  - model.layers.24.self_attn.k_proj
  - model.layers.21.self_attn.k_proj
  - model.layers.28.self_attn.k_proj
  - model.layers.23.self_attn.k_proj
  - model.layers.36.self_attn.k_proj
  - model.layers.20.self_attn.k_proj
  - model.layers.37.self_attn.k_proj
  - model.layers.29.self_attn.k_proj
  - model.layers.32.self_attn.k_proj
  - model.layers.16.self_attn.k_proj
  - model.layers.18.self_attn.k_proj
  # self_attn.o_proj layers
  - model.layers.7.self_attn.o_proj
  - model.layers.6.self_attn.o_proj
  - model.layers.9.self_attn.o_proj
  - model.layers.5.self_attn.o_proj
  - model.layers.27.self_attn.o_proj
  - model.layers.26.self_attn.o_proj
  - model.layers.4.self_attn.o_proj
  - model.layers.31.self_attn.o_proj
  - model.layers.8.self_attn.o_proj
  - model.layers.16.self_attn.o_proj
  - model.layers.3.self_attn.o_proj
  - model.layers.10.self_attn.o_proj
  - model.layers.18.self_attn.o_proj
  - model.layers.33.self_attn.o_proj
  - model.layers.17.self_attn.o_proj
  - model.layers.32.self_attn.o_proj
  - model.layers.30.self_attn.o_proj
  - model.layers.2.self_attn.o_proj
  - model.layers.15.self_attn.o_proj
  - model.layers.11.self_attn.o_proj
  # self_attn.q_proj layers
  - model.layers.14.self_attn.q_proj
  - model.layers.11.self_attn.q_proj
  - model.layers.15.self_attn.q_proj
  - model.layers.9.self_attn.q_proj
  - model.layers.8.self_attn.q_proj
  - model.layers.18.self_attn.q_proj
  - model.layers.12.self_attn.q_proj
  - model.layers.13.self_attn.q_proj
  - model.layers.19.self_attn.q_proj
  - model.layers.16.self_attn.q_proj
  - model.layers.10.self_attn.q_proj
  - model.layers.17.self_attn.q_proj
  - model.layers.7.self_attn.q_proj
  - model.layers.5.self_attn.q_proj
  - model.layers.20.self_attn.q_proj
  - model.layers.3.self_attn.q_proj
  - model.layers.26.self_attn.q_proj
  - model.layers.27.self_attn.q_proj
  - model.layers.28.self_attn.q_proj
  - model.layers.33.self_attn.q_proj
  # self_attn.v_proj layers
  - model.layers.27.self_attn.v_proj
  - model.layers.20.self_attn.v_proj
  - model.layers.24.self_attn.v_proj
  - model.layers.25.self_attn.v_proj
  - model.layers.30.self_attn.v_proj
  - model.layers.2.self_attn.v_proj
  - model.layers.23.self_attn.v_proj
  - model.layers.22.self_attn.v_proj
  - model.layers.26.self_attn.v_proj
  - model.layers.33.self_attn.v_proj
  - model.layers.37.self_attn.v_proj
  - model.layers.7.self_attn.v_proj
  - model.layers.4.self_attn.v_proj
  - model.layers.18.self_attn.v_proj
  - model.layers.31.self_attn.v_proj
  - model.layers.17.self_attn.v_proj
  - model.layers.35.self_attn.v_proj
  - model.layers.32.self_attn.v_proj
  - model.layers.21.self_attn.v_proj
  - model.layers.3.self_attn.v_proj

dataset_prepared_path: /workspace/axolotl/dolph-2.9.3-nemo-prepared
val_set_size: 0.01
output_dir: /workspace/axolotl/dolphin-2.9.3-mistral-nemo

sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true

wandb_project: dolphin-2.9.3-Mistral-nemo
wandb_watch:
wandb_run_id:
wandb_log_model:

gradient_accumulation_steps: 16
micro_batch_size: 1
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 5e-6

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32:

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
# evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1
save_total_limit: 2
save_steps:
debug:
deepspeed: deepspeed_configs/zero3_bf16.json
weight_decay: 0.1

special_tokens:
  eos_token: "<|im_end|>"
  pad_token: "<pad>"
  bos_token: "<s>"
  unk_token: "<unk>"
tokens:
  - "<|im_start|>"

# fsdp:
#   - full_shard
#   - auto_wrap
# fsdp_config:
#   fsdp_limit_all_gathers: true
#   fsdp_sync_module_states: true
#   fsdp_offload_params: true
#   fsdp_use_orig_params: false
#   fsdp_cpu_ram_efficient_loading: true
#   fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
#   fsdp_state_dict_type: FULL_STATE_DICT
#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
#   fsdp_sharding_strategy: FULL_SHARD
#   fsdp_forward_prefetch: false
#   fsdp_backward_prefetch: BACKWARD_PRE
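
Every entry under datasets: above is a JSONL file of ShareGPT-style conversations, and conversation: chatml tells Axolotl to render each record with the <|im_start|> / <|im_end|> tokens registered at the bottom of the config. The Python sketch below illustrates that pairing; the conversations / from / value field names follow the common ShareGPT convention, and the sample messages are invented rather than taken from the Dolphin datasets.

import json

# One ShareGPT-style record, as it would appear on a single line of a
# *.jsonl dataset file (sample content is hypothetical).
record = {
    "conversations": [
        {"from": "system", "value": "You are Dolphin, a helpful AI assistant."},
        {"from": "human", "value": "What is 2 + 2?"},
        {"from": "gpt", "value": "2 + 2 = 4."},
    ]
}

# Map ShareGPT speaker tags onto ChatML role names.
ROLES = {"system": "system", "human": "user", "gpt": "assistant"}

def to_chatml(rec):
    # Each turn becomes "<|im_start|>{role}\n{text}<|im_end|>\n", the ChatML
    # layout implied by the special_tokens and tokens sections above.
    return "".join(
        "<|im_start|>{}\n{}<|im_end|>\n".format(ROLES[t["from"]], t["value"])
        for t in rec["conversations"]
    )

print(json.dumps(record))   # one dataset line
print(to_chatml(record))    # roughly what the trainer sees after templating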
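
To reproduce a run, the config would normally be saved to a file (say dolphin-2.9.3-mistral-nemo.yml, a name chosen here for illustration) and launched from an Axolotl checkout with the stock entry point, e.g. accelerate launch -m axolotl.cli.train dolphin-2.9.3-mistral-nemo.yml. The relative deepspeed: path assumes the deepspeed_configs directory that ships with the Axolotl repository; none of this invocation is stated in the gist itself.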