#!/bin/bash
#set -aex
echo "PWD = $(pwd)"
: "${SM_NUM_GPUS:=4}" | |
: "${MODEL_NAME:=gpt2}" | |
: "${OUTPUT_ROOT:=/mnt/scratch}" | |
: "${TRAINING_JOB_NAME:=haha}" | |

OUTPUT_DIR="$OUTPUT_ROOT/${MODEL_NAME}-finetuned/$TRAINING_JOB_NAME"

# https://docs.wandb.ai/guides/track/advanced/environment-variables
mkdir -p "$OUTPUT_DIR"
export WANDB_DIR="$OUTPUT_DIR"

# [BEWARE] When WANDB_MODE is "online" but wandb is not configured properly
# (e.g., missing API key), HF will interactively ask how to proceed with wandb
# and wait for an answer, which makes the training job "hang" forever when it
# runs unattended on a cluster.
export WANDB_MODE=offline
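# Offline mode writes run data under "$WANDB_DIR/wandb/" instead of uploading.
# A sketch of how to upload those runs later, once wandb credentials are set up:
#
#   wandb sync "$OUTPUT_DIR"/wandb/offline-run-*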

declare -a OPTS=(
    --report_to tensorboard wandb
    # https://docs.wandb.ai/guides/integrations/huggingface
    # If None, defaults to huggingface.
    #--run_name RUN_NAME  An optional descriptor for the run. Notably used for
    #                     wandb logging. (default: None)
    --model_name_or_path "$MODEL_NAME"
    --tokenizer_name "$MODEL_NAME"
    --train_file train_data/train_1669615964.csv
    --validation_file train_data/val_1669615964.csv
    --do_train
    --do_eval
    --evaluation_strategy steps
    #--logging_strategy steps  # Already the default
    --logging_steps 1
    --output_dir "$OUTPUT_DIR"
    --logging_dir "$OUTPUT_DIR"
    --num_train_epochs 3
    --eval_steps 1
    --gradient_accumulation_steps 32
    --per_device_train_batch_size 4
    --per_device_eval_batch_size 4
    --gradient_checkpointing
    --learning_rate 5e-06
    --warmup_steps 10
    --save_total_limit 1
    --save_steps 2  # NOTE: ignored because --save_strategy below is "epoch".
    --save_strategy epoch
)
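
# For reference: the effective global batch size with the settings above is
# per_device_train_batch_size x gradient_accumulation_steps x num_gpus,
# i.e., 4 x 32 x $SM_NUM_GPUS = 512 sequences at the default of 4 GPUs.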

echo torchrun --standalone --nnodes=1 --nproc_per_node="$SM_NUM_GPUS" run_clm.py "${OPTS[@]}" "$@"
torchrun --standalone --nnodes=1 --nproc_per_node="$SM_NUM_GPUS" run_clm.py "${OPTS[@]}" "$@" || exit 1
find "$OUTPUT_DIR" -type f -exec ls -alh {} +
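
# To browse the metrics logged to $OUTPUT_DIR (tensorboard is a separate install):
#
#   tensorboard --logdir "$OUTPUT_DIR"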