andrewor14 · May 19, 2025 22:12
diff --git a/gistfile1.txt b/gistfile1.txt
 # torchtune run configuration: quantization-aware fine-tuning of Llama 3.2 3B.
 # All paths are written out literally (no interpolation).
 #
 # -- batching --
 batch_size: 16
 batch_size_val: 8
 # -- checkpoint I/O --
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
   checkpoint_dir: /tmp/Llama-3.2-3B-Instruct/
   # Two safetensors shards; presumably resolved relative to checkpoint_dir.
   checkpoint_files:
     - model-00001-of-00002.safetensors
     - model-00002-of-00002.safetensors
   model_type: LLAMA3_2
   output_dir: /home/andrewor/local/logs/tune/Llama3.2-3B_qat
   recipe_checkpoint: null
 clip_grad_norm: null
 compile: false
 # -- data: train and validation use complementary slices of the same split --
 dataset:
   _component_: torchtune.datasets.alpaca_cleaned_dataset
   packed: false
   split: 'train[:95%]'
 dataset_val:
   _component_: torchtune.datasets.alpaca_cleaned_dataset
   split: 'train[95%:]'
 # -- runtime --
 device: cuda
 dtype: bf16
 enable_activation_checkpointing: true
 enable_activation_offloading: false
 epochs: 1
 gradient_accumulation_steps: 8
 log_every_n_steps: 1
 log_peak_memory_stats: true
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 # -- metrics --
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /home/andrewor/local/logs/tune/Llama3.2-3B_qat/metrics
 model:
   _component_: torchtune.models.llama3_2.llama3_2_3b
 optimizer:
   _component_: torch.optim.AdamW
   fused: true
   lr: 2.0e-05
 optimizer_in_bwd: false
 # NOTE(review): identical to metric_logger.log_dir and nested below
 # checkpointer.output_dir -- confirm this is the intended run directory.
 output_dir: /home/andrewor/local/logs/tune/Llama3.2-3B_qat/metrics
 # -- profiler: switched off via `enabled: false` --
 profiler:
   _component_: torchtune.training.setup_torch_profiler
   active_steps: 2
   cpu: true
   cuda: true
   enabled: false
   num_cycles: 1
   output_dir: /home/andrewor/local/logs/tune/Llama3.2-3B_qat/metrics/profiling_outputs
   profile_memory: false
   record_shapes: true
   wait_steps: 5
   warmup_steps: 3
   with_flops: false
   with_stack: false
 # -- QAT quantizer --
 quantizer:
   _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
   groupsize: 32
 resume_from_checkpoint: false
 # NOTE(review): null here while dataset_val / batch_size_val are configured --
 # confirm whether validation is actually meant to run.
 run_val_every_n_steps: null
 seed: null
 shuffle: true
 # -- tokenizer --
 tokenizer:
   _component_: torchtune.models.llama3.llama3_tokenizer
   max_seq_len: null
   path: /tmp/Llama-3.2-3B-Instruct/original/tokenizer.model
 # torchtune QAT run configuration for Llama 3.2 3B.
 # Indentation uses spaces only (YAML does not allow tabs for indentation);
 # each component's settings are nested under the section that owns them.
 batch_size: 16
 batch_size_val: 8
 # Checkpoint reader/writer settings.
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
   checkpoint_dir: /tmp/Llama-3.2-3B-Instruct/
   checkpoint_files:
     - model-00001-of-00002.safetensors
     - model-00002-of-00002.safetensors
   model_type: LLAMA3_2
   output_dir: /home/andrewor/local/logs/tune/Llama3.2-3B_qat
   recipe_checkpoint: null
 clip_grad_norm: null
 compile: false
 # Training data: the leading 95% slice of the train split.
 dataset:
   _component_: torchtune.datasets.alpaca_cleaned_dataset
   packed: false
   split: 'train[:95%]'
 # Validation data: the trailing 5% slice of the same train split.
 dataset_val:
   _component_: torchtune.datasets.alpaca_cleaned_dataset
   split: 'train[95%:]'
 device: cuda
 dtype: bf16
 enable_activation_checkpointing: true
 enable_activation_offloading: false
 epochs: 1
 gradient_accumulation_steps: 8
 log_every_n_steps: 1
 log_peak_memory_stats: true
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /home/andrewor/local/logs/tune/Llama3.2-3B_qat/metrics
 model:
   _component_: torchtune.models.llama3_2.llama3_2_3b
 optimizer:
   _component_: torch.optim.AdamW
   fused: true
   lr: 2.0e-05
 optimizer_in_bwd: false
 # Top-level output directory (distinct key from checkpointer.output_dir and
 # profiler.output_dir, which live inside their own sections).
 output_dir: /home/andrewor/local/logs/tune/Llama3.2-3B_qat/metrics
 # Profiler settings; `enabled: false` leaves it turned off.
 profiler:
   _component_: torchtune.training.setup_torch_profiler
   active_steps: 2
   cpu: true
   cuda: true
   enabled: false
   num_cycles: 1
   output_dir: /home/andrewor/local/logs/tune/Llama3.2-3B_qat/metrics/profiling_outputs
   profile_memory: false
   record_shapes: true
   wait_steps: 5
   warmup_steps: 3
   with_flops: false
   with_stack: false
 # Quantization-aware-training quantizer and its group size.
 quantizer:
   _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
   groupsize: 32
 resume_from_checkpoint: false
 run_val_every_n_steps: null
 seed: null
 shuffle: true
 tokenizer:
   _component_: torchtune.models.llama3.llama3_tokenizer
   max_seq_len: null
   path: /tmp/Llama-3.2-3B-Instruct/original/tokenizer.model
No results found