#!/bin/sh
set -x
# == Swarm training (alpha release) ==
# Setup:
#
# git clone https://github.com/shawwn/gpt-2
# cd gpt-2
# git checkout dev-shard
# python3 download_model.py 117M
# python3 download_model.py 1558M
# sudo pip3 install -r requirements.txt
#
# Also install tensorflow==1.15.0, or put it in a venv if you don't want to downgrade your global install.
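#
# A minimal sketch of the venv route (the ~/tf115 directory name is just an example):
#
# python3 -m venv ~/tf115
# . ~/tf115/bin/activate
# pip3 install tensorflow==1.15.0 -r requirements.txt
#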
# To prepare a dataset, you don't need to do anything! You can pass a plain text file directly via --dataset foo.txt
#
# However, for performance when training on dozens or hundreds of TPUs, you will want to pre-tokenize the dataset,
# since the tokenizer can only generate about 16k tokens per second. (That's only enough to keep up with about 4 TPUv3-8s training 117M, or 64 TPUv3-8s training 1.5B.)
#
# To tokenize the dataset, run:
#
# python3 tokenize_dataset.py foo.txt foo.txt.npz
#
# then pass --dataset foo.txt.npz to train_multi.py.
#
# ~~~WARNING~~~: The following scripts create TPUv3-8s in zone europe-west4-a. TFRC doesn't grant access to these TPUs by default.
# To train using the default 100 TPUv2-8s, replace "europe-west4-a" with "us-central1-f" in all following lines, and change
# --accelerator-type "v3-8" to --accelerator-type "v2-8".
#
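# A sketch of that swap with GNU sed (the -i flag edits files in place; the filenames here, including the
# name swarm_train.sh for this script, are just examples; point it at wherever you saved things):
#
# sed -i 's/europe-west4-a/us-central1-f/g; s/v3-8/v2-8/g' tpu-create-eu tpu-delete-eu swarm_train.sh
#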
# Note that training GPT-2 1.5B might not work on TPUv2-8s due to memory limitations, because right now the code is set up to
# use 3 out of 8 TPU cores. However, there are two ways around this:
#
# 1. You can create up to 5 non-preemptible TPUv3-8s in us-central1-f, and create a swarm consisting of those. (Be sure to
# **remove** the --preemptible flag when creating the five TPUs!)
#
# 2. Bug me on Twitter to add a command line option so that you can train on 100 preemptible TPUv2-8s by disabling multicore training.
#
# (This is an alpha release, so bear with me. The final release will support all these configurations without much effort.)
# ~~~END WARNING~~~
# ~~~ CRUCIAL WARNING ~~~ YOU MUST READ THIS SECTION!! ~~~
#
# Make **ABSOLUTELY CERTAIN** that your VM is in the *exact same region* as the TPUs. If your VM is in a different region,
# then swarm training will accrue hundreds of dollars of bandwidth charges very quickly! I learned this the hard way.
#
# YOU HAVE BEEN WARNED.
#
# On the other hand, this is pretty much the only thing you have to worry about; everything else is perfectly safe.
# The worst that can happen otherwise is that the code won't work.
#
# ~~~ END CRUCIAL WARNING ~~~
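#
# A quick sanity check (a sketch; run it on the VM): the GCE metadata server reports the VM's zone, and hence
# its region, which should match the TPU zone (europe-west4-a here):
#
# curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/zone"
#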
# Save the following script as `tpu-create-eu`:
#
# #!/bin/sh
# set -ex
# i=${1}
# shift 1
# exec gcloud compute tpus create tpeu${i} --zone europe-west4-a --network default --range 10.49.${i}.0/29 --version 1.15 --accelerator-type "v3-8" --preemptible "$@"
#
# Now you can create a bunch of TPUs by running:
#
# for i in {0..19}; do tpu-create-eu $i --async & done
#
# Each TPU will preempt after 24 hours. After that, you'll need to delete them and re-create them. Save this script as `tpu-delete-eu`:
#
# #!/bin/sh
# set -ex
# i=${1}
# shift 1
# exec gcloud compute tpus delete --zone europe-west4-a tpeu${i} "$@"
#
# Now you can delete the TPUs by running:
#
# for i in {0..19}; do tpu-delete-eu $i --quiet --async & done
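#
# A sketch of a full recreate cycle after preemption (assumes both helper scripts above are on your PATH;
# the delete runs without --async so the names are actually free before re-creating):
#
# for i in {0..19}; do tpu-delete-eu $i --quiet & done; wait
# for i in {0..19}; do tpu-create-eu $i --async & done
#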
# | |
# After creating the TPUs, fill in their IP addresses below. | |
# (The $targets variable should end up as a comma-separated list of IPs.) | |
# | |
# You can get your TPU ip addresses by running: | |
# | |
# gcloud compute tpus list --zone europe-west4-a | |
# | |
# (which I've aliased to `tpu-satus-eu` for convenience.) | |
# | |
# If you run into any problems or have any questions, message me or DM me on twitter: | |
# | |
# https://twitter.com/theshawwn | |
# | |
targets=
targets="${targets}grpc://10.49.0.2:8470"
targets="${targets},grpc://10.49.1.2:8470"
targets="${targets},grpc://10.49.2.2:8470"
targets="${targets},grpc://10.49.3.2:8470"
targets="${targets},grpc://10.49.4.2:8470"
targets="${targets},grpc://10.49.5.2:8470"
targets="${targets},grpc://10.49.6.2:8470"
targets="${targets},grpc://10.49.7.2:8470"
targets="${targets},grpc://10.49.8.2:8470"
targets="${targets},grpc://10.49.9.2:8470"
targets="${targets},grpc://10.49.10.2:8470"
targets="${targets},grpc://10.49.11.2:8470"
targets="${targets},grpc://10.49.12.2:8470"
targets="${targets},grpc://10.49.13.2:8470"
targets="${targets},grpc://10.49.14.2:8470"
targets="${targets},grpc://10.49.15.2:8470"
targets="${targets},grpc://10.49.16.2:8470"
targets="${targets},grpc://10.49.17.2:8470"
targets="${targets},grpc://10.49.18.2:8470"
targets="${targets},grpc://10.49.19.2:8470"
# 117M
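# (A note on the core flags, inferred from the warning above rather than documented: --skip_cores and --max_cores
# appear to choose how many of each TPU's 8 cores are used; this 117M line uses 7 of 8, while the 1.5B line below
# skips 4 and uses 3, matching the "3 out of 8 TPU cores" remark.)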
exec python3 -m pdb -c continue train_multi.py --targets "$targets" --dataset ./yourdataset.txt.npz --run_name yourmodelsmall --optimizer adam --model_name 117M --batch_size 28 --learning_rate 0.000055 --only_train_transformer_layers --dtype float32 --device 0 --skip_cores 1 --max_cores 7 --colocate_gradients --memory_saving_gradients --allow_soft_placement --init_tpu "$@"
# 1.5B
#exec python3 -m pdb -c continue train_multi.py --targets "$targets" --dataset ./yourdataset.txt.npz --run_name yourmodelxl --optimizer adam --model_name 1558M --batch_size 6 --learning_rate 0.000055 --only_train_transformer_layers --dtype float32 --device 1 --skip_cores 4 --max_cores 3 --colocate_gradients --memory_saving_gradients --allow_soft_placement --init_tpu "$@"