Created May 28, 2023 18:42
This config produces an `Error invalid device ordinal at line 359 in file /mnt/d/training_area/bitsandbytes/csrc/pythonInterface.c` error on an A6000 single-GPU run with axolotl.
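For reference, "invalid device ordinal" generally means a CUDA device index was requested that the process cannot see. A minimal standalone sanity check (not part of axolotl; just a sketch assuming PyTorch is installed) to confirm which GPUs are visible before launching:

```python
# Standalone check of GPU visibility; "invalid device ordinal" usually means
# a device index outside this list was requested (for example via local_rank
# or CUDA_VISIBLE_DEVICES settings).
import os
import torch

print("CUDA_VISIBLE_DEVICES =", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("visible device count =", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"  cuda:{i} -> {torch.cuda.get_device_name(i)}")
```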
# Base model used for training
base_model: huggyllama/llama-7b
# Configuration to load for the base model
base_model_config: huggyllama/llama-7b
# Model class used for training
model_type: LlamaForCausalLM
# Tokenizer class used for tokenizing the text data
tokenizer_type: LlamaTokenizer
# Set to true to load the model in 8-bit precision
load_in_8bit: false
# Set to true to load the model in 4-bit precision
load_in_4bit: true
# Set to true to enforce strict loading of the base model's configuration
strict: false
# Set to true to push the dataset to the Hugging Face Hub during training
push_dataset_to_hub:
# Datasets used for training
datasets:
  - path: teknium/GPT4-LLM-Cleaned  # Path to the dataset on the Hugging Face Hub
    type: alpaca  # Prompt format of the dataset
# Path where the prepared dataset is stored
dataset_prepared_path: last_run_prepared
# Size of the validation set as a fraction of the total dataset
val_set_size: 0.02
# Adapter type used during training
adapter: qlora
# Directory containing an existing LoRA model to load
lora_model_dir:
# Maximum sequence length for training examples
sequence_len: 256
# Maximum packed sequence length (if using packed sequences)
max_packed_sequence_len:
# Rank of LoRA's low-rank matrices; higher values increase expressiveness and memory usage
lora_r: 64
# LoRA alpha hyperparameter; scales the adapter updates relative to the base weights
lora_alpha: 32
# LoRA dropout rate during training
lora_dropout: 0.0
# Modules in the base model to target with LoRA adapters (if left blank, defaults apply)
lora_target_modules:
# Set to true to target all linear layers with LoRA adapters
lora_target_linear: true
# Set to true to use fan-in/fan-out initialization for LoRA adapters
lora_fan_in_fan_out:
# Weights & Biases project to log training information to (if left blank, no logging is performed)
wandb_project:
# Set to log gradients and parameters to Weights & Biases during training
wandb_watch:
# Weights & Biases run ID to resume logging from (if left blank, a new run is created)
wandb_run_id:
# Set to log the trained model to Weights & Biases after training
wandb_log_model:
# Directory where trained models and other output files are saved
output_dir: ./qlora-out
batch_size: 2  # Effective batch size per optimizer step
micro_batch_size: 1  # Examples processed per device per forward/backward pass
num_epochs: 3  # Number of passes through the entire dataset
optimizer: paged_adamw_32bit  # Optimizer used during training
torchdistx_path:
lr_scheduler: cosine  # Learning rate scheduler
learning_rate: 0.0002  # Initial learning rate for the optimizer
train_on_inputs: false
group_by_length: false
bf16: false
fp16: true  # Set to false to disable fp16 precision during training
tf32: false
gradient_checkpointing: true  # Set to false to disable gradient checkpointing
early_stopping_patience:
resume_from_checkpoint:
local_rank: 0
logging_steps: 1  # How often logging occurs during training
xformers_attention:
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10  # Number of warmup steps before the learning rate reaches its initial value
eval_steps: 50  # How often evaluation occurs during training
save_steps:
debug:
device_map: auto
deepspeed:
weight_decay: 0.0  # Weight decay hyperparameter for the optimizer
fsdp:
fsdp_config:
special_tokens:  # Special tokens used by the tokenizer
  bos_token: "<s>"  # Beginning-of-sequence token
  eos_token: "</s>"  # End-of-sequence token
  unk_token: "<unk>"  # Unknown token
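For context, here is a rough sketch of what the quantization and LoRA settings above correspond to in plain transformers/peft/bitsandbytes code. This is an illustrative approximation, not the exact code path axolotl executes; the `target_modules` list is an assumption, since axolotl resolves the linear layers itself when `lora_target_linear` is true.

```python
# Approximate equivalent of load_in_4bit + qlora adapter settings from the config above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load_in_4bit: true
    bnb_4bit_compute_dtype=torch.float16,  # fp16: true
)

model = AutoModelForCausalLM.from_pretrained(
    "huggyllama/llama-7b",                 # base_model
    quantization_config=bnb_config,
    device_map="auto",                     # device_map: auto
)
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")  # tokenizer_type: LlamaTokenizer

lora_config = LoraConfig(
    r=64,             # lora_r
    lora_alpha=32,    # lora_alpha
    lora_dropout=0.0, # lora_dropout
    # Assumed module names for LLaMA attention projections; axolotl derives
    # the actual list of linear layers when lora_target_linear is true.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```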