Skip to content

Instantly share code, notes, and snippets.

View williamFalcon's full-sized avatar
🎯
Focusing

William Falcon williamFalcon

🎯
Focusing
View GitHub Profile
# clear the gradients left over from the previous optimizer step
optimizer.zero_grad()
# 16 accumulated gradient steps: each backward() adds into the existing
# gradients, so zero_grad() must NOT be called inside the loop
scaled_loss = 0
for accumulated_step_i in range(16):
    out = model.forward()
    loss = some_loss(out, y)
    loss.backward()
    # .item() extracts a plain number, so the running sum keeps no graph alive
    scaled_loss += loss.item()
# apply the 16 accumulated gradients as one weight update
optimizer.step()
# ask lightning to accumulate gradients over this many batches
accumulated_batches = 16
trainer = Trainer(accumulate_grad_batches=accumulated_batches)
trainer.fit(model)
# index of the GPU that will hold both the model and its input
gpu_index = 0
# move the model onto that GPU
model.cuda(gpu_index)
# .cuda() on a variable returns a cuda copy, so rebind x to the copy
x = x.cuda(gpu_index)
# the forward pass runs on the GPU now
model(x)
# model and data must live on the same device; pick GPU 0 for both
device_id = 0
model.cuda(device_id)
# cuda() on a variable hands back a GPU copy rather than moving it in place
x = x.cuda(device_id)
# with both on the GPU, this call executes there
model(x)
# ask lightning to train on GPU 0 (GPUs are selected by a list of indices)
gpu_ids = [0]
trainer = Trainer(gpus=gpu_ids)
trainer.fit(model)
# enable 16-bit on the model and the optimizer
# (keep the optimizer returned here: it is the one amp.scale_loss expects)
model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
# when doing .backward, let amp do it so it can scale the loss
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
# ask lightning to train in 16-bit; use_amp must be True or amp_level is ignored
trainer = Trainer(amp_level='O2', use_amp=True)
trainer.fit(model)
# copy model on each GPU and give a fourth of the batch to each
# (the keyword is device_ids; DataParallel has no `devices` argument)
model = DataParallel(model, device_ids=[0, 1, 2, 3])
# each GPU processes its slice of the batch; DataParallel then gathers the
# per-GPU results into a single output on the first device (GPU 0)
out = model(x.cuda(0))
# ask lightning to train on four GPUs: indices 0, 1, 2 and 3
gpu_ids = list(range(4))
trainer = Trainer(gpus=gpu_ids)
trainer.fit(model)
# the two models are too big to share one GPU, so each gets its own
encoder_gpu, decoder_gpu = 0, 1
encoder_rnn.cuda(encoder_gpu)
decoder_rnn.cuda(decoder_gpu)
# the input has to be on the encoder's GPU before the encoder runs
encoder_input = x.cuda(encoder_gpu)
encoder_out = encoder_rnn(encoder_input)
# copy the encoder output to the decoder's GPU, then decode there
decoder_input = encoder_out.cuda(decoder_gpu)
out = decoder_rnn(decoder_input)