Created
May 9, 2023 21:25
-
-
Save shaowei-su/333deb339ba72b7cdd43aae1b59da67d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Submit the GPT-NeoX-20B DeepSpeed fine-tuning run as a Ray job and print the
# CLI command that tails its logs.
from ray.job_submission import JobSubmissionClient

# Single source of truth for the Ray dashboard / Jobs API endpoint, so the
# address used for submission and the one shown in the log hint cannot drift.
RAY_DASHBOARD_ADDRESS = "http://127.0.0.1:8265"

client = JobSubmissionClient(RAY_DASHBOARD_ADDRESS)

# Shell command executed as the job entrypoint. One flag per line for
# readability; joined with single spaces into one command string.
# NOTE(review): '[S3]' looks like a redacted placeholder for --upload_dir --
# confirm and substitute a real destination before running.
kick_off_pytorch_benchmark = " ".join(
    [
        # Run the benchmark.
        "python3.8 ./run_clm_deepspeed_train.py",
        "--model_name_or_path EleutherAI/gpt-neox-20b",
        "--block_size 2048",
        "--output_dir /nvme/out2",
        "--num_train_epochs 3",
        "--learning_rate 5e-5",
        "--weight_decay 0.",
        "--num_workers 16",
        "--upload_dir '[S3]'",
        "--per_device_train_batch_size 1",
        "--per_device_eval_batch_size 1",
        "--gradient_accumulation_steps 1",
        "--train_file /tmp/gpt/train.csv",
        "--validation_file /tmp/gpt/val.csv",
        "--seed 42",
    ]
)

# Environment variables passed to the job through runtime_env["env_vars"].
# NOTE(review): these appear to be NCCL / libfabric (EFA) tuning knobs, and the
# empty CURL_CA_BUNDLE presumably relaxes CA-bundle lookup -- confirm intent.
job_env_vars = {
    "RDMAV_FORK_SAFE": "1",
    "NCCL_DEBUG": "INFO",
    "NCCL_PROTO": "simple",
    "FI_LOG_LEVEL": "warn",
    "FI_PROVIDER": "efa",
    "FI_EFA_USE_DEVICE_RDMA": "1",
    "NCCL_ALGO": "RING",
    "CURL_CA_BUNDLE": "",
}

submission_id = client.submit_job(
    entrypoint=kick_off_pytorch_benchmark,
    runtime_env={
        "pip": ["tabulate"],
        # The current directory is shipped as the job's working directory, so
        # ./run_clm_deepspeed_train.py must live next to this script.
        "working_dir": "./",
        "env_vars": job_env_vars,
    },
)

print("Use the following command to follow this Job's logs:")
print(f"ray job logs '{submission_id}' --follow --address {RAY_DASHBOARD_ADDRESS}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment