#!/usr/bin/env python3
# Usage:
#  PYTHONPATH=src ./train --dataset <file|directory|glob>
#
# Got 1558M to train on a TITAN RTX using IBM TensorFlow Large Model Support (TFLMS).
# TFLMS can insert explicit swaps into the graph between GPU and CPU to extend the usable memory,
# but this graph contains a While loop, so you have to use the TFLMSv2 version (which works with TF 1.x).
#
# You must install IBM Large Model Support 2.0 for TensorFlow:
# download the archive, expand it, get the egg out, and install it.
# https://github.com/IBM/tensorflow-large-model-support
# https://www.ibm.com/support/knowledgecenter/SS5SF7_1.6.0/navigation/pai_getstarted_tflmsv2.html
# https://developer.ibm.com/linuxonpower/2019/06/11/tensorflow-large-model-support-resources/
# Direct link to the TFLMSv2 egg:
# https://public.dhe.ibm.com/ibmdl/export/pub/software/server/ibm-ai/conda/linux-64/tensorflow-large-model-support-2.0.2-py36_970.gfa57a9e.tar.bz2
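# A rough sketch of the install steps (a sketch only -- the exact location of the egg
# inside the archive is an assumption here; list the extracted files and adjust the path):
#   tar xjf tensorflow-large-model-support-2.0.2-py36_970.gfa57a9e.tar.bz2
#   easy_install ./path/to/tensorflow_large_model_support-*.egg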
#
# Note the extra switches for time-based throttling, early termination, and GPU selection,
# plus the re-arranged graph construction required by TFLMS.
#
import argparse
import json
import os
import sys
import numpy as np
import tensorflow as tf
import time
import datetime
import tqdm
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow_large_model_support import LMS

import model
import sample
import encoder
from load_dataset import load_dataset, Sampler
from accumulate import AccumulatingOptimizer
import memory_saving_gradients

CHECKPOINT_DIR = 'checkpoint'
SAMPLE_DIR = 'samples'
parser = argparse.ArgumentParser(
    description='Fine-tune GPT-2 on your custom dataset.',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument('--dataset', metavar='PATH', type=str, required=True, help='Input file, directory, or glob pattern (utf-8 text, or preencoded .npz files).')
parser.add_argument('--model_name', metavar='MODEL', type=str, default='117M', help='Pretrained model name')
parser.add_argument('--combine', metavar='CHARS', type=int, default=50000, help='Concatenate input files with <|endoftext|> separator into chunks of this minimum size')
parser.add_argument('--encoding', type=str, default='utf-8', help='Set the encoding for reading and writing files.')

parser.add_argument('--batch_size', metavar='SIZE', type=int, default=1, help='Batch size')
parser.add_argument('--learning_rate', metavar='LR', type=float, default=0.0001, help='Learning rate for Adam')
parser.add_argument('--accumulate_gradients', metavar='N', type=int, default=1, help='Accumulate gradients across N minibatches.')
parser.add_argument('--memory_saving_gradients', default=False, action='store_true', help='Use gradient checkpointing to reduce vram usage.')
parser.add_argument('--only_train_transformer_layers', default=False, action='store_true', help='Restrict training to the transformer blocks.')
parser.add_argument('--optimizer', type=str, default='adam', help='Optimizer. <adam|sgd>.')
parser.add_argument('--noise', type=float, default=0.0, help='Add noise to input training data to regularize against typos.')

parser.add_argument('--top_k', type=int, default=40, help='K for top-k sampling.')
parser.add_argument('--top_p', type=float, default=0.0, help='P for top-p sampling. Overrides top_k if set > 0.')

parser.add_argument('--restore_from', type=str, default='latest', help='Either "latest", "fresh", or a path to a checkpoint file')
parser.add_argument('--run_name', type=str, default='run1', help='Run id. Name of subdirectory in checkpoint/ and samples/')
parser.add_argument('--run_until_count', metavar='N', type=int, default=sys.maxsize, help='Exit the program when the step counter reaches a multiple of N.')
parser.add_argument('--run_until_loss', metavar='N', type=float, default=-1.0, help='Exit the program when the average loss drops below N.')
parser.add_argument('--sample_every', metavar='N', type=int, default=100, help='Generate samples every N steps')
parser.add_argument('--sample_length', metavar='TOKENS', type=int, default=1023, help='Sample this many tokens')
parser.add_argument('--sample_num', metavar='N', type=int, default=1, help='Generate this many samples')
parser.add_argument('--save_every', metavar='N', type=int, default=1000, help='Write a checkpoint every N steps')

parser.add_argument('--val_dataset', metavar='PATH', type=str, default=None, help='Dataset for validation loss, defaults to --dataset.')
parser.add_argument('--val_batch_size', metavar='SIZE', type=int, default=2, help='Batch size for validation.')
parser.add_argument('--val_batch_count', metavar='N', type=int, default=40, help='Number of batches for validation.')
parser.add_argument('--val_every', metavar='STEPS', type=int, default=0, help='Calculate validation loss every STEPS steps.')

parser.add_argument('--use_gpu', metavar='IDS', type=str, default='', help='Sets CUDA_VISIBLE_DEVICES to these ids (e.g. "0,1") on multi-GPU machines.')
parser.add_argument('--use_throttle', type=str, default='', help='Local hours between which to throttle, e.g. "9 18" sleeps after each step between 09:00 and 18:00.')
parser.add_argument('--throttle_sleep', type=float, default=1.0, help='Seconds to sleep per step while throttled.')

parser.add_argument('--swapout_threshold', type=int, default=4096, help='LMS swapout_threshold')
parser.add_argument('--swapin_ahead', type=int, default=1024, help='LMS swapin_ahead')
parser.add_argument('--swapin_groupby', type=int, default=256, help='LMS swapin_groupby')
parser.add_argument('--sync_mode', type=int, default=0, help='LMS sync_mode')

def maketree(path):
    try:
        os.makedirs(path)
    except:
        pass


def randomize(context, hparams, p):
    if p > 0:
        mask = tf.random.uniform(shape=tf.shape(context)) < p
        noise = tf.random.uniform(shape=tf.shape(context), minval=0, maxval=hparams.n_vocab, dtype=tf.int32)
        return tf.where(mask, noise, context)
    else:
        return context

def main():
    args = parser.parse_args()

    if len(args.use_gpu) > 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.use_gpu  # e.g. '1'
        print('Set CUDA_VISIBLE_DEVICES=[{}]'.format(os.environ['CUDA_VISIBLE_DEVICES']))
    print('TF version: {}'.format(tf.__version__))
    if tf.test.gpu_device_name():
        print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
    else:
        print("Please install GPU version of TF")
    tf.logging.set_verbosity(tf.logging.INFO)

    enc = encoder.get_encoder(args.model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', args.model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if args.sample_length > hparams.n_ctx:
        raise ValueError(
            "Can't get samples longer than window size: %s" % hparams.n_ctx)

    if args.model_name == '345M':
        args.memory_saving_gradients = True
        if args.optimizer == 'adam':
            args.only_train_transformer_layers = True

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.graph_options.rewrite_options.layout_optimizer = rewriter_config_pb2.RewriterConfig.OFF

    # The optimizer is created up front (re-arranged from the original train.py)
    # so the graph is complete before TFLMS rewrites it.
    with tf.name_scope('de_optimizer'):
        if args.optimizer == 'adam':
            opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
        elif args.optimizer == 'sgd':
            opt = tf.train.GradientDescentOptimizer(learning_rate=args.learning_rate)
        else:
            exit('Bad optimizer: ' + args.optimizer)
    context = tf.placeholder(tf.int32, [args.batch_size, None])
    context_in = randomize(context, hparams, args.noise)
    output = model.model(hparams=hparams, X=context_in)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=context[:, 1:], logits=output['logits'][:, :-1]))

    if args.val_every > 0:
        val_context = tf.placeholder(tf.int32, [args.val_batch_size, None])
        val_output = model.model(hparams=hparams, X=val_context)
        val_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=val_context[:, 1:], logits=val_output['logits'][:, :-1]))
        val_loss_summary = tf.summary.scalar('val_loss', val_loss)

    tf_sample = sample.sample_sequence(
        hparams=hparams,
        length=args.sample_length,
        context=context,
        batch_size=args.batch_size,
        temperature=1.0,
        top_k=args.top_k,
        top_p=args.top_p)

    all_vars = [v for v in tf.trainable_variables() if 'model' in v.name]
    train_vars = [v for v in all_vars if '/h' in v.name] if args.only_train_transformer_layers else all_vars

    # The optimizer used to be built here; it is now created above, before the
    # rest of the graph, as part of the TFLMS re-arrangement.
    #if args.optimizer == 'adam':
    #    opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    #elif args.optimizer == 'sgd':
    #    opt = tf.train.GradientDescentOptimizer(learning_rate=args.learning_rate)
    #else:
    #    exit('Bad optimizer:', args.optimizer)

    if args.accumulate_gradients > 1:
        if args.memory_saving_gradients:
            exit("Memory saving gradients are not implemented for gradient accumulation yet.")
        opt = AccumulatingOptimizer(
            opt=opt,
            var_list=train_vars)
        opt_reset = opt.reset()
        opt_compute = opt.compute_gradients(loss)
        opt_apply = opt.apply_gradients()
        summary_loss = tf.summary.scalar('loss', opt_apply)
    else:
        if args.memory_saving_gradients:
            opt_grads = memory_saving_gradients.gradients(loss, train_vars)
        else:
            opt_grads = tf.gradients(loss, train_vars)
        opt_grads = list(zip(opt_grads, train_vars))
        opt_apply = opt.apply_gradients(opt_grads)
        summary_loss = tf.summary.scalar('loss', loss)

    summary_lr = tf.summary.scalar('learning_rate', args.learning_rate)
    summaries = tf.summary.merge([summary_lr, summary_loss])

    summary_log = tf.summary.FileWriter(
        os.path.join(CHECKPOINT_DIR, args.run_name))

    saver = tf.train.Saver(
        var_list=all_vars,
        max_to_keep=5,
        keep_checkpoint_every_n_hours=2)

    print("BEGIN LMS")
    # Enable Large Model Support: rewrite the (now complete) graph to insert
    # swap-out/swap-in ops between GPU and CPU memory.
    lms_model = LMS(swapout_threshold=args.swapout_threshold,
                    swapin_ahead=args.swapin_ahead,
                    swapin_groupby=args.swapin_groupby,
                    sync_mode=args.sync_mode)
    lms_model.run(tf.get_default_graph())
print("BEGIN SESS") | |
with tf.Session(config=config) as sess: | |
sess.run(tf.global_variables_initializer()) | |
if args.restore_from == 'latest': | |
ckpt = tf.train.latest_checkpoint( | |
os.path.join(CHECKPOINT_DIR, args.run_name)) | |
if ckpt is None: | |
# Get fresh GPT weights if new run. | |
ckpt = tf.train.latest_checkpoint( | |
os.path.join('models', args.model_name)) | |
elif args.restore_from == 'fresh': | |
ckpt = tf.train.latest_checkpoint( | |
os.path.join('models', args.model_name)) | |
else: | |
ckpt = tf.train.latest_checkpoint(args.restore_from) | |
print('Loading checkpoint', ckpt) | |
saver.restore(sess, ckpt) | |
print('Loading dataset...') | |
chunks = load_dataset(enc, args.dataset, args.combine, encoding=args.encoding) | |
data_sampler = Sampler(chunks) | |
if args.val_every > 0: | |
if args.val_dataset: | |
val_chunks = load_dataset(enc, args.val_dataset, args.combine, encoding=args.encoding) | |
else: | |
val_chunks = chunks | |
print('dataset has', data_sampler.total_size, 'tokens') | |
print('Training...') | |
if args.val_every > 0: | |
# Sample from validation set once with fixed seed to make | |
# it deterministic during training as well as across runs. | |
val_data_sampler = Sampler(val_chunks, seed=1) | |
val_batches = [[val_data_sampler.sample(1024) for _ in range(args.val_batch_size)] | |
for _ in range(args.val_batch_count)] | |
counter = 1 | |
counter_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'counter') | |
if os.path.exists(counter_path): | |
# Load the step number if we're resuming a run | |
# Add 1 so we don't immediately try to save again | |
with open(counter_path, 'r') as fp: | |
counter = int(fp.read()) + 1 | |
def save(): | |
maketree(os.path.join(CHECKPOINT_DIR, args.run_name)) | |
print( | |
'Saving', | |
os.path.join(CHECKPOINT_DIR, args.run_name, | |
'model-{}').format(counter)) | |
saver.save( | |
sess, | |
os.path.join(CHECKPOINT_DIR, args.run_name, 'model'), | |
global_step=counter) | |
with open(counter_path, 'w') as fp: | |
fp.write(str(counter) + '\n') | |
        def generate_samples():
            print('Generating samples...')
            context_tokens = data_sampler.sample(1)
            all_text = []
            index = 0
            while index < args.sample_num:
                out = sess.run(
                    tf_sample,
                    feed_dict={context: args.batch_size * [context_tokens]})
                for i in range(min(args.sample_num - index, args.batch_size)):
                    text = enc.decode(out[i])
                    text = '======== SAMPLE {} ========\n{}\n'.format(
                        index + 1, text)
                    all_text.append(text)
                    index += 1
                    print(text)
            maketree(os.path.join(SAMPLE_DIR, args.run_name))
            with open(
                    os.path.join(SAMPLE_DIR, args.run_name,
                                 'samples-{}').format(counter), 'w') as fp:
                fp.write('\n'.join(all_text))

        def validation():
            print('Calculating validation loss...')
            losses = []
            for batch in tqdm.tqdm(val_batches):
                losses.append(sess.run(val_loss, feed_dict={val_context: batch}))
            v_val_loss = np.mean(losses)
            v_summary = sess.run(val_loss_summary, feed_dict={val_loss: v_val_loss})
            summary_log.add_summary(v_summary, counter)
            summary_log.flush()
            print(
                '[{counter} | {time:2.2f}] validation loss = {loss:2.2f}'
                .format(
                    counter=counter,
                    time=time.time() - start_time,
                    loss=v_val_loss))

        def sample_batch():
            return [data_sampler.sample(1024) for _ in range(args.batch_size)]

        avg_loss = (0.0, 0.0)
        start_time = time.time()
        try:
            while True:
                if counter % args.save_every == 0:
                    save()
                if counter % args.sample_every == 0:
                    generate_samples()
                if args.val_every > 0 and (counter % args.val_every == 0 or counter == 1):
                    validation()

                if args.accumulate_gradients > 1:
                    sess.run(opt_reset)
                    for _ in range(args.accumulate_gradients):
                        sess.run(
                            opt_compute, feed_dict={context: sample_batch()})
                    (v_loss, v_summary) = sess.run((opt_apply, summary_loss))
                else:
                    (_, v_loss, v_summary) = sess.run(
                        (opt_apply, loss, summary_loss),
                        feed_dict={context: sample_batch()})

                summary_log.add_summary(v_summary, counter)

                avg_loss = (avg_loss[0] * 0.99 + v_loss,
                            avg_loss[1] * 0.99 + 1.0)

                print(
                    '[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'
                    .format(
                        counter=counter,
                        time=time.time() - start_time,
                        loss=v_loss,
                        avg=avg_loss[0] / avg_loss[1]))

                counter += 1

                # KHC mods: early termination and time-based throttling.
                avg_ls = avg_loss[0] / avg_loss[1]
                if args.run_until_count != sys.maxsize and counter % args.run_until_count == 0:
                    save()
                    quit()
                if avg_ls < args.run_until_loss:
                    save()
                    quit()
                if len(args.use_throttle) > 1:
                    now = datetime.datetime.now()
                    throttle_times = args.use_throttle.split()
                    throttle_start = int(throttle_times[0])
                    throttle_stop = int(throttle_times[1])
                    now_time = now.hour
                    sleep_time = args.throttle_sleep
                    if throttle_start < now_time < throttle_stop:
                        time.sleep(sleep_time)
                        print(' {} < {} < {} => Throttle {}'.format(throttle_start, now_time, throttle_stop, sleep_time))
        except KeyboardInterrupt:
            print('interrupted')
            save()


if __name__ == '__main__':
    main()
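The TFLMSv2 call pattern used above generalizes to other TF 1.x scripts: build the entire graph first (including the optimizer), call LMS(...).run() on the default graph, and only then open the session. A minimal sketch under that assumption; the toy two-layer model, its sizes, and the tuning values are purely illustrative, not part of the gist:

import tensorflow as tf
from tensorflow_large_model_support import LMS

# Toy graph: inputs, a small dense model, a loss, and a train op.
x = tf.placeholder(tf.float32, [None, 1024])
y = tf.placeholder(tf.float32, [None, 1])
hidden = tf.layers.dense(x, 4096, activation=tf.nn.relu)
pred = tf.layers.dense(hidden, 1)
loss = tf.reduce_mean(tf.square(pred - y))
train_op = tf.train.GradientDescentOptimizer(0.0001).minimize(loss)

# Rewrite the finished graph to insert GPU<->CPU swap ops, then open the session.
lms = LMS(swapout_threshold=4096, swapin_ahead=1024, swapin_groupby=256, sync_mode=0)
lms.run(tf.get_default_graph())

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # ... feed batches and run train_op as usual ...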
GenTxt commented on Oct 20, 2020 (via email):
Sorry that didn't work. I will be revisiting it in a few weeks.
Here are the other parameters I use locally; they may be worth trying.
sudo PYTHONPATH=src ./trainLMS.py --dataset ~/gpt-corpus/Run1558Lap1xx --model_name 1558M --run_name Run1558Lap1x --save_every 10000 --sample_every 1000 --sample_num 10 --sample_length 256 --memory_saving_gradients --val_every 5000 --restore_from /home/kinoc/gpt2_train2/gpt-2/checkpoint/Run1558Lap1xx --batch_size 1 --run_until_count 4500000 --run_until_loss 0.1 --use_gpu "0,1" --use_throttle "12 16" --throttle_sleep 2.0 --optimizer sgd --learning_rate 0.0001 --only_train_transformer_layers --swapout_threshold 512 --swapin_ahead 256 --swapin_groupby 16 --sync_mode 0
You will of course need to make local adjustments, and may not want the throttle. Let me know if it makes any difference.
Thank you - this is awesome, and 1558M model training works with my Titan RTX. It also seems to allow an RTX 2080 to train the 774M model, although I couldn't get the RTX 2080 to work with 1558M.
System specs:
- Ubuntu 18.10, Python 3.6, tensorflow-gpu=1.14
- Ryzen 2700, 128GB RAM with Titan RTX and RTX 2080 GPUs.
- Also note that if using Ryzen, IOMMU has issues that generate a different set of OOM-like errors. Disable IOMMU in the BIOS and pass amd_iommu=off to the kernel.