python scripts/submit_finetune_job.py \
--cluster ai2/augusta-google-1 \
--priority high \
--workspace ai2/tulu-3-dev \
--num_nodes 4 \
--image costah/open_instruct_ppo_ray_ninja \
--default_beaker_config configs/beaker_configs/default_finetune_multinode.yaml \
--config configs/train_configs/sft/tulu3_8b_preview_mix_v3.9.yaml \
--exp_name olmo1124_finetune
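This command submits a 4-node SFT job: the script below reads the default Beaker spec (--default_beaker_config) and the training config (--config), merges them, and creates the experiment in the given workspace. A quick sketch of the launch geometry it implies (8 GPUs per node is the script's --num_gpus default; this snippet is not part of the gist's files):

# Sketch: how --num_nodes translates into the accelerate flags patched by the script.
num_nodes = 4              # --num_nodes above
gpus_per_node = 8          # script default for --num_gpus
print(num_nodes * gpus_per_node)  # 32 -> becomes --num_processes in the final command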
version: v2
description: open-instruct-finetune-multinode
budget: ai2/oe-adapt
tasks:
  - name: open-instruct-finetune-multinode
    replicas: 4
    leaderSelection: true
    hostNetworking: true
    propagateFailure: true
    propagatePreemption: true
    synchronizedStartTimeout: 60m
    image:
      beaker: nathanl/open_instruct_auto
    command: [
      '/bin/sh', '-c'
    ]
    arguments: ['
      unset CUDA_LAUNCH_BLOCKING && PYTHONPATH="/stage:$PYTHONPATH" pip install git+https://github.com/vwxyzjn/transformers.git@olmo1124_classification && accelerate launch
      --mixed_precision bf16
      --num_machines 4
      --num_processes 32
      --machine_rank $BEAKER_REPLICA_RANK
      --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME
      --main_process_port 29400
      --use_deepspeed
      --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf
      --deepspeed_multinode_launcher standard
      open_instruct/finetune.py
      --model_name_or_path meta-llama/Meta-Llama-3-8B
      --tokenizer_name meta-llama/Meta-Llama-3-8B
      --use_slow_tokenizer
      --use_flash_attn
      --max_seq_length 4096
      --preprocessing_num_workers 16
      --per_device_train_batch_size 1
      --gradient_accumulation_steps 4
      --learning_rate 5e-6
      --lr_scheduler_type linear
      --warmup_ratio 0.03
      --weight_decay 0.
      --num_train_epochs 2
      --output_dir /output/
      --with_tracking
      --report_to tensorboard
      --logging_steps 1
      --reduce_loss sum
    ']
    envVars:
      - name: CUDA_DEVICE_ORDER
        value: PCI_BUS_ID
      - name: TRANSFORMERS_CACHE
        value: ./cache/
      - name: WANDB_API_KEY
        secret: WANDB_API_KEY
      - name: WANDB_PROJECT
        value: open-instruct
      - name: WANDB_WATCH
        value: false
      - name: WANDB_LOG_MODEL
        value: false
      - name: WANDB_DISABLED
        value: true
      - name: HF_TOKEN
        secret: HF_TOKEN
    # datasets:
    #   - mountPath: /oe-adapt-default
    #     source:
    #       weka: oe-adapt-default
    result:
      path: /output
    resources:
      gpuCount: 8
    context:
      priority: normal
      preemptible: true
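A minimal sketch (assuming PyYAML is installed and the spec above is saved at the path used in the submit command) that cross-checks the replica and GPU counts against the accelerate flags embedded in the task's arguments string:

# Sketch: sanity-check the Beaker spec above; the file path is taken from the submit command.
import re
import yaml

with open("configs/beaker_configs/default_finetune_multinode.yaml") as f:
    spec = yaml.safe_load(f)

task = spec["tasks"][0]
args_str = task["arguments"][0]
num_machines = int(re.search(r"--num_machines (\d+)", args_str).group(1))
num_processes = int(re.search(r"--num_processes (\d+)", args_str).group(1))
assert task["replicas"] == num_machines                                   # 4
assert task["replicas"] * task["resources"]["gpuCount"] == num_processes  # 4 * 8 == 32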
import copy
import subprocess
import yaml
from datetime import datetime
import argparse
import re
import shlex
from open_instruct.utils import get_beaker_whoami
def load_yaml(file_path):
    with open(file_path, 'r') as f:
        return yaml.load(f, Loader=yaml.FullLoader)
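# Example (path taken from the submit command above):
#   train_config = load_yaml("configs/train_configs/sft/tulu3_8b_preview_mix_v3.9.yaml")
#   train_config["learning_rate"]  # -> 5e-06
# yaml.safe_load would behave the same here, since the configs are plain mappings and scalars.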
def main():
    parser = argparse.ArgumentParser(description="Run experiment with Beaker config")
    parser.add_argument("--default_beaker_config", default="configs/beaker_configs/default_finetune.yaml",
                        help="Path to the default Beaker config file")
    parser.add_argument("--config", default=None,
                        help="Path to an additional config file to override default settings")
    parser.add_argument("--cluster", type=str, default="ai2/allennlp-cirrascale", help="Beaker cluster to use")
    parser.add_argument("--priority", type=str, default="high", help="Priority of the job")
    parser.add_argument("--preemptible", type=bool, default=True, help="Whether to use preemptible instances")
    parser.add_argument("--num_gpus", type=int, default=8, help="Number of GPUs to use")
    parser.add_argument("--num_nodes", type=int, default=1, help="Number of nodes to use")
    parser.add_argument("--image", type=str, default="nathanl/open_instruct_auto", help="Beaker image to use.")
    parser.add_argument("--workspace", type=str, default="ai2/tulu-2-improvements", help="Beaker workspace to use.")
    parser.add_argument("--datasets", nargs='+', help="List of datasets to mount in form <beaker_id>:<mount_path>")
    # allow unknown args from CLI, use this to modify loaded config in bash scripts for sweeping
    # Note, can only override args in --config passed (not default FlatArguments class in open_instruct/utils.py)
    # Use parse_known_args instead of parse_args
    args, unknown = parser.parse_known_args()
    # Process unknown arguments
    # must be of the form --{arg} {value}
    unknown_args = {}
    i = 0
    while i < len(unknown):
        if unknown[i].startswith('--'):
            key = unknown[i][2:]
            if i + 1 < len(unknown) and not unknown[i+1].startswith('--'):
                value = unknown[i+1]
                i += 2
            else:
                value = None
                i += 1
            unknown_args[key] = value
        else:
            i += 1
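    # Example of the split (values illustrative): invoking
    #   python scripts/submit_finetune_job.py --config <cfg>.yaml --learning_rate 1e-5
    # keeps the known flags on `args` and yields unknown_args == {"learning_rate": "1e-5"};
    # a bare --flag with no following value is stored as None.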
    # Print known arguments
    train_config = load_yaml(args.config)
    print("Config:", train_config)
    # Print unknown arguments
    print("Unknown arguments:", unknown_args)
    now = datetime.now().strftime("%m%d%Y%H%M%S")
    with open(args.default_beaker_config, 'r') as f:
        default_yaml = f.read()
    d1 = yaml.load(default_yaml, Loader=yaml.FullLoader)
    if args.num_nodes > 1:
        assert args.num_gpus == 8, "`num_gpus` must be set to 8 when training with multiple nodes"
        d1['tasks'][0]['replicas'] = args.num_nodes
    d1['tasks'][0]['image']['beaker'] = args.image
    d1['tasks'][0]['context']['cluster'] = args.cluster
    d1['tasks'][0]['context']['priority'] = args.priority
    d1['tasks'][0]['context']['preemptible'] = args.preemptible  # True required for Jupiter/Pluto
    d1['tasks'][0]['resources']['gpuCount'] = args.num_gpus
    # modify here for different set of experiments
    experiment_group = "dataset_comparison"
    wandb_project = "open_instruct"
    # if args.wandb_api_key:
    #     wandb_api_key = args.wandb_api_key
    # else:
    #     wandb_api_key = os.environ.get("WANDB_API_KEY")
    # if config is passed, load and merge that
    def override_and_reconstruct_command(original_command, train_config, unknown_args):
        def parse_args(args):
            cmd_dict = {}
            i = 0
            while i < len(args):
                if args[i].startswith('--'):
                    key = args[i][2:]
                    if i + 1 < len(args) and not args[i+1].startswith('--'):
                        cmd_dict[key] = args[i+1]
                        i += 2
                    else:
                        cmd_dict[key] = True
                        i += 1
                else:
                    i += 1
            return cmd_dict
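        # Example (illustrative): parse_args(["--mixed_precision", "bf16", "--use_deepspeed"])
        # returns {"mixed_precision": "bf16", "use_deepspeed": True}; value-less flags are
        # stored as True here, unlike the CLI loop above, which stores None.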
        # Split the original command into a list
        cmd_parts = shlex.split(original_command)
        # Find the index of open_instruct/finetune.py
        script_index = cmd_parts.index('open_instruct/finetune.py')
        # Find the index of 'accelerate launch'
        pre_index = cmd_parts.index('launch')
        # Separate the command into pre-script and post-script parts
        pre_script = cmd_parts[:pre_index+1]  # everything up to and including 'accelerate launch'
        pre_script_args = cmd_parts[pre_index+1:script_index]
        post_script_args = cmd_parts[script_index+1:]
        # Parse arguments
        pre_dict = parse_args(pre_script_args)
        post_dict = parse_args(post_script_args)
        # Combine dictionaries and apply overrides
        cmd_dict = {**post_dict}
        cmd_dict.update(train_config)
        cmd_dict.update(unknown_args)
        # Reconstruct the command string
        new_cmd_parts = pre_script
        # add pre python args
        for key, value in pre_dict.items():
            new_cmd_parts.append(f'--{key}')
            if value is not True:
                new_cmd_parts.append(str(value))
        # add python job + post args
        new_cmd_parts.append('open_instruct/finetune.py')
        for key, value in cmd_dict.items():
            if key == "dataset_mixer":
                key = "dataset_mixer_list"
                value = parse_dataset_mixer(value)
            new_cmd_parts.append(f'--{key}')
            # if the value is a list, expand it into separate args
            if isinstance(value, list):
                for v in value:
                    new_cmd_parts.append(str(v))
            elif value is not True:
                new_cmd_parts.append(str(value))
        return ' '.join(new_cmd_parts)
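    # Roughly: keys from the --config file (and CLI overrides in unknown_args) replace the
    # matching flags after open_instruct/finetune.py, while the accelerate flags before it
    # are kept; e.g. learning_rate: 5.0e-06 becomes "--learning_rate 5e-06", and a
    # dataset_mixer mapping is flattened into "--dataset_mixer_list <name> <weight> ...".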
    new_arguments = override_and_reconstruct_command(d1['tasks'][0]['arguments'][0], train_config, unknown_args)
    # replace --num_processes with args.num_gpus * args.num_nodes
    # (the default command contains --num_processes {N})
    new_arguments = re.sub(r'--num_processes \d+', f'--num_processes {args.num_gpus * args.num_nodes}', new_arguments)
    # replace --num_machines with args.num_nodes
    # (the default command contains --num_machines {N})
    new_arguments = re.sub(r'--num_machines \d+', f'--num_machines {args.num_nodes}', new_arguments)
    model_name = get_model_name(new_arguments)
    # if the model name has /, replace it with _
    model_name = model_name.replace("/", "_")
    # check that the given config selects only one dataset source
    dataset_name, dataset_mixer, train_file = check_dataset_selection(new_arguments)
    print("Dataset selection is valid.")
    print(f"Dataset name: {dataset_name}")
    print(f"Dataset mixer: {dataset_mixer}")
    print(f"Train file: {train_file}")
    d = copy.deepcopy(d1)
    d['tasks'][0]['arguments'][0] = new_arguments
    # name and description
    exp_name = f"open_instruct_finetune_{model_name}_{now}"
    d['description'] = exp_name
    d['tasks'][0]['name'] = exp_name
    # add cluster-specific env vars
    if args.num_nodes > 1:
        if args.cluster == "ai2/jupiter-cirrascale-2":
            d['tasks'][0]['envVars'] += [
                {
                    "name": "NCCL_SOCKET_IFNAME",
                    "value": "ib",
                },
                {
                    "name": "NCCL_IB_HCA",
                    "value": "^=mlx5_bond_0",
                },
                {
                    "name": "NCCL_DEBUG",
                    "value": "INFO",
                },
            ]
        elif args.cluster == "ai2/pluto-cirrascale":
            d['tasks'][0]['envVars'] += [
                {
                    "name": "NCCL_IB_HCA",
                    "value": "^=mlx5_1,mlx5_2",
                },
                {
                    "name": "NCCL_DEBUG",
                    "value": "INFO",
                },
            ]
        elif args.cluster == "ai2/augusta-google-1":
            d['tasks'][0]['envVars'] += [
                {
                    "name": "LD_LIBRARY_PATH",
                    "value": r"/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}",
                },
                {
                    "name": "NCCL_CROSS_NIC",
                    "value": "0",
                },
                {
                    "name": "NCCL_ALGO",
                    "value": "Ring,Tree",
                },
                {
                    "name": "NCCL_PROTO",
                    "value": "Simple",
                },
                {
                    "name": "NCCL_MIN_NCHANNELS",
                    "value": "4",
                },
                {
                    "name": "NCCL_P2P_NET_CHUNKSIZE",
                    "value": "524288",
                },
                {
                    "name": "NCCL_P2P_PCI_CHUNKSIZE",
                    "value": "524288",
                },
                {
                    "name": "NCCL_P2P_NVL_CHUNKSIZE",
                    "value": "1048576",
                },
                {
                    "name": "NCCL_FASTRAK_NUM_FLOWS",
                    "value": "2",
                },
                {
                    "name": "NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL",
                    "value": "0",
                },
                {
                    "name": "NCCL_BUFFSIZE",
                    "value": "8388608",
                },
                {
                    "name": "NCCL_FASTRAK_USE_SNAP",
                    "value": "1",
                },
                {
                    "name": "CUDA_VISIBLE_DEVICES",
                    "value": "0,1,2,3,4,5,6,7",
                },
                {
                    "name": "NCCL_NET_GDR_LEVEL",
                    "value": "PIX",
                },
                {
                    "name": "NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING",
                    "value": "0",
                },
                {
                    "name": "NCCL_TUNER_PLUGIN",
                    "value": "libnccl-tuner.so",
                },
                {
                    "name": "NCCL_TUNER_CONFIG_PATH",
                    "value": "/var/lib/tcpxo/lib64/a3plus_tuner_config.textproto",
                },
                {
                    "name": "NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE",
                    "value": "/var/lib/tcpxo/lib64/a3plus_guest_config.textproto",
                },
                {
                    "name": "NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS",
                    "value": "600000",
                },
                {
                    "name": "NCCL_NVLS_ENABLE",
                    "value": "0",
                },
                {
                    "name": "NCCL_DEBUG",
                    "value": "WARN",
                },
                {
                    "name": "NCCL_FASTRAK_CTRL_DEV",
                    "value": "enp0s12",
                },
                {
                    "name": "NCCL_FASTRAK_IFNAME",
                    "value": "enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0",
                },
                {
                    "name": "NCCL_SOCKET_IFNAME",
                    "value": "enp0s12",
                },
                {
                    "name": "NCCL_USE_SNAP",
                    "value": "1",
                },
                {
                    "name": "NCCL_FASTRAK_USE_LLCM",
                    "value": "1",
                },
                {
                    "name": "NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY",
                    "value": "/dev/aperture_devices",
                },
            ]
    # WANDB settings
    for env in d['tasks'][0]['envVars']:
        if env['name'] == "WANDB_DISABLED":
            env['value'] = False
        if env['name'] == "WANDB_PROJECT":
            env['value'] = wandb_project
    beaker_whoami = get_beaker_whoami()
    d['tasks'][0]['envVars'].append({
        'name': 'WANDB_NAME', 'value': exp_name
    })
    d['tasks'][0]['envVars'].append({
        'name': 'WANDB_RUN_GROUP', 'value': experiment_group
    })
    d['tasks'][0]['envVars'].append({
        'name': 'BEAKER_TOKEN', 'secret': f"{beaker_whoami}_BEAKER_TOKEN"
    })
    d['tasks'][0]['envVars'].append({
        'name': 'HF_TOKEN', 'secret': f"{beaker_whoami}_HF_TOKEN"
    })
    d['tasks'][0]['envVars'].append({
        'name': 'WANDB_API_KEY', 'secret': f"{beaker_whoami}_WANDB_API_KEY"
    })
    # mount datasets
    if args.datasets:
        if not d['tasks'][0].get('datasets'):
            d['tasks'][0]['datasets'] = []
        for dataset in args.datasets:
            beaker_id, mount_path = dataset.split(':')
            d['tasks'][0]['datasets'].append({
                'mountPath': mount_path,
                'source': {'beaker': beaker_id}
            })
    # optionally, print to debug config
    print(d)
    fn = "configs/beaker_configs/auto_created/{}.yaml".format(exp_name)
    with open(fn, "w") as file:
        yaml.dump(d, file, default_flow_style=True)
    cmd = f"beaker experiment create {fn} --workspace {args.workspace}"
    subprocess.Popen(cmd, shell=True)
def check_dataset_selection(command_string):
    parts = shlex.split(command_string)
    dataset_name = None
    dataset_mixer = None
    train_file = None
    for i, part in enumerate(parts):
        if part == '--dataset_name' and i + 1 < len(parts):
            dataset_name = parts[i + 1]
        elif part == '--dataset_mixer_list' and i + 1 < len(parts):
            dataset_mixer = parts[i + 1]
            # collect the remaining name/weight tokens until the next --flag
            j = i + 2
            while j < len(parts) - 1:
                dataset_mixer += ' ' + parts[j]
                if parts[j+1].startswith('--'):
                    break
                j += 1
        elif part == '--train_file' and i + 1 < len(parts):
            train_file = parts[i + 1]
    if ((dataset_name is not None and dataset_mixer is not None) or
            (dataset_name is not None and train_file is not None) or
            (dataset_mixer is not None and train_file is not None)):
        raise ValueError("Cannot provide two dataset selection mechanisms.")
    return dataset_name, dataset_mixer, train_file
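# Example (illustrative): a command containing both --dataset_mixer_list and --train_file
# raises ValueError("Cannot provide two dataset selection mechanisms."); a command with only
# --dataset_mixer_list returns (None, "<name> <weight> ...", None).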
def parse_dataset_mixer(mixer_dict):
    elems = []
    for k, v in mixer_dict.items():
        elems.append(k)
        elems.append(str(v))
    return ' '.join(elems)
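# Example: parse_dataset_mixer({"allenai/tulu-v3.9-tmp": 1.0}) -> "allenai/tulu-v3.9-tmp 1.0",
# i.e. the dataset_mixer mapping in the train configs below becomes the space-separated
# name/weight pairs expected by --dataset_mixer_list.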
def get_model_name(command_string):
    parts = shlex.split(command_string)
    for i, part in enumerate(parts):
        if part == '--model_name_or_path':
            if i + 1 < len(parts):
                return parts[i + 1]
    return None  # Return None if model name is not found
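# Example: for the default arguments string above this returns "meta-llama/Meta-Llama-3-8B";
# with the tulu3 config applied it returns "allenai/open_instruct_dev", which (with "/"
# replaced by "_") ends up in the experiment name.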
if __name__ == "__main__":
    main()
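Because the script uses parse_known_args, extra --key value pairs can be appended to sweep over settings in the --config file. A hedged sketch of such a driver (the learning rates are illustrative only, not recommendations):

# Illustrative sweep driver for the script above; values here are examples only.
import subprocess

for lr in ["5e-6", "1e-5"]:
    subprocess.run(
        [
            "python", "scripts/submit_finetune_job.py",
            "--cluster", "ai2/augusta-google-1",
            "--num_nodes", "4",
            "--default_beaker_config", "configs/beaker_configs/default_finetune_multinode.yaml",
            "--config", "configs/train_configs/sft/tulu3_8b_preview_mix_v3.9.yaml",
            "--learning_rate", lr,  # unknown to the submit parser, forwarded as an override
        ],
        check=True,
    )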
model_name_or_path: allenai/open_instruct_dev
model_revision: peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2_soup_step11931_hf
use_flash_attn: true
tokenizer_name: allenai/open_instruct_dev
tokenizer_revision: peteish7-weka-anneal-from-928646-50B-nowup_big-number-no-whammy-2_soup_step11931_hf
use_slow_tokenizer: true
dataset_mixer:
  # General datasets:
  allenai/tulu-v3.9-tmp: 1.0
  # Datasets removed because of licensing:
  # ai2-adapt-dev/flan_v2_converted: 89982 # ODC-BY
  # ai2-adapt-dev/sciriff_converted: 10000 # ODC-BY
  # ai2-adapt-dev/no_robots_converted: 9500 # NC
  # AI-MO/NuminaMath-TIR: 72441 # NC
max_seq_length: 2048
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes
learning_rate: 5.0e-06
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
add_bos: true
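For the batch-size comment above: with the 4-node, 8-GPU-per-node geometry from the submission at the top, the effective batch size works out as follows (a quick check, not part of the config):

# Effective batch size implied by the config above under 4 nodes x 8 GPUs.
per_device_train_batch_size = 1
gradient_accumulation_steps = 4
num_processes = 4 * 8
print(per_device_train_batch_size * gradient_accumulation_steps * num_processes)  # 128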
model_name_or_path: /model
use_flash_attn: true
tokenizer_name: /model
use_slow_tokenizer: true
dataset_mixer:
  # General datasets:
  /dataset/mixed_ds_train.json: 1.0
max_seq_length: 2048
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes
learning_rate: 5.0e-06
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
add_bos: true
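This last config reads the model and data from locally mounted paths (/model and /dataset/mixed_ds_train.json), so a submission using it would mount Beaker datasets at those paths via --datasets. A hypothetical invocation (the config path and dataset IDs are placeholders, not real values):

# Hypothetical submission for the /model + /dataset config above; IDs and config path are placeholders.
import subprocess

subprocess.run(
    [
        "python", "scripts/submit_finetune_job.py",
        "--default_beaker_config", "configs/beaker_configs/default_finetune_multinode.yaml",
        "--config", "PATH/TO/this_mounted_model_config.yaml",  # placeholder
        "--datasets", "MODEL_BEAKER_ID:/model", "DATA_BEAKER_ID:/dataset",  # placeholders
    ],
    check=True,
)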