Language Modeling
""" Some language modeling code. | |
python lm.py --model LstmLm --devid 1 --lr 0.01 --clip 2 --optim Adam --nlayers 2 --nhid 512 --dropout 0.5 --epochs 50 --bsz 128 --bptt 32 --tieweights | |
Train: 42.90723478099889, Valid: 75.74921810452948, Test: 72.84913233987702 | |
python lm.py --model NnLm --devid 3 --lr 0.001 --clip 0 --optim Adam --nlayers 2 --nhid 512 --dropout 0.5 --epochs 20 --bsz 64 --bptt 64 | |
Train: 71.58999479091389, Valid: 158.07431086368382, Test: 146.13046578572258 | |
Tested on torch.__version__ == 0.3.1b0+2b47480 | |
""" | |
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable as V
from torch.nn.utils import clip_grad_norm
import torchtext
from tqdm import tqdm
import random
import math
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--devid", type=int, default=-1)
    parser.add_argument("--model", choices=["NnLm", "LstmLm"], default="LstmLm")
    parser.add_argument("--nhid", type=int, default=256)
    parser.add_argument("--nlayers", type=int, default=1)
    parser.add_argument("--tieweights", action="store_true")
    parser.add_argument("--maxnorm", type=float, default=None)
    parser.add_argument("--dropout", type=float, default=0)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--optim", choices=["SGD", "Adam"], default="Adam")
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--lrd", type=float, default=0.25)
    parser.add_argument("--wd", type=float, default=1e-4)
    parser.add_argument("--bsz", type=int, default=32)
    parser.add_argument("--bptt", type=int, default=32)
    parser.add_argument("--clip", type=float, default=5)
    # Adam parameters
    parser.add_argument("--b1", type=float, default=0.9)
    parser.add_argument("--b2", type=float, default=0.999)
    parser.add_argument("--eps", type=float, default=1e-8)
    # SGD parameters
    parser.add_argument("--mom", type=float, default=0.99)
    parser.add_argument("--dm", type=float, default=0)
    parser.add_argument("--nonag", action="store_true", default=False)
    return parser.parse_args()
args = parse_args()

random.seed(1111)
torch.manual_seed(1111)
if args.devid >= 0:
    torch.cuda.manual_seed_all(1111)
    # Presumably disabled for determinism, given the fixed seeds above.
    torch.backends.cudnn.enabled = False
    print("Cudnn is enabled: {}".format(torch.backends.cudnn.enabled))
TEXT = torchtext.data.Field()
# NB: test points at valid.txt here; the commented lines give the real split.
train, valid, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".",
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)
    #path="data/",
    #train="train.txt", validation="valid.txt", test="test.txt", text_field=TEXT)
TEXT.build_vocab(train)
padidx = TEXT.vocab.stoi["<pad>"]

train_iter, valid_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, valid, test), batch_size=args.bsz, device=args.devid, bptt_len=args.bptt, repeat=False)
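# A note on what BPTTIterator yields (a sketch, for orientation): the corpus
# is flattened into args.bsz parallel streams, and each batch exposes
#   batch.text:   bptt x bsz LongTensor of token ids
#   batch.target: the same stream shifted one step ahead in time,
# so column j of consecutive batches is contiguous text. That is why the
# recurrent model below can carry its hidden state across batches.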
class Lm(nn.Module):
    """ Boring superclass for all the language modeling code. """
    def __init__(self):
        super(Lm, self).__init__()

    def train_epoch(self, iter, loss, optimizer):
        self.train()
        train_loss = 0
        nwords = 0
        hid = None
        for batch in tqdm(iter):
            optimizer.zero_grad()
            x = batch.text
            y = batch.target
            out, hid = self(x, hid)
            bloss = loss(out.view(-1, self.vsize), y.view(-1))
            bloss.backward()
            # Accumulate python numbers, not Variables, so we don't keep
            # every batch's graph alive.
            train_loss += bloss.data[0]
            # bytetensor.sum overflows, so cast to int.
            nwords += y.ne(padidx).int().sum().data[0]
            if args.clip > 0:
                clip_grad_norm(self.parameters(), args.clip)
            optimizer.step()
        return train_loss, nwords
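    # Both train_epoch and validate return (summed masked cross-entropy,
    # number of non-pad tokens); the caller turns these into perplexity
    # via math.exp(loss / nwords).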
    def validate(self, iter, loss):
        self.eval()
        valid_loss = 0
        nwords = 0
        hid = None
        for batch in iter:
            x = batch.text
            y = batch.target
            out, hid = self(x, hid)
            valid_loss += loss(out.view(-1, self.vsize), y.view(-1)).data[0]
            nwords += y.ne(padidx).int().sum().data[0]
        return valid_loss, nwords
    def generate_predictions(self):
        self.eval()
        data = torchtext.datasets.LanguageModelingDataset(
            path="input.txt",
            text_field=TEXT)
        # Batch size 211 and bptt 12 are specific to the Kaggle input file.
        data_iter = torchtext.data.BPTTIterator(data, 211, 12, device=args.devid, train=False)
        outputs = [[] for _ in range(211)]
        print()
        print("Generating Kaggle predictions")
        for batch in tqdm(data_iter):
            # Model output is T x N x V; take the distribution at the
            # third-to-last time step and its top 20 candidate words.
            scores, idxs = self(batch.text, None)[0][-3].topk(20, dim=-1)
            for i in range(idxs.size(0)):
                outputs[i].append([TEXT.vocab.itos[x] for x in idxs[i].data.tolist()])
        with open(self.__class__.__name__ + ".preds.txt", "w") as f:
            f.write("id,word\n")
            ok = 1
            for sentences in outputs:
                f.write("\n".join(["{},{}".format(ok+i, " ".join(x)) for i, x in enumerate(sentences)]))
                f.write("\n")
                ok += len(sentences)
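    # Given the header and row format above, the preds file looks like:
    #   id,word
    #   1,w1 w2 ... w20
    #   2,w1 w2 ... w20
    # i.e. one row of 20 candidate words per input line.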
class NnLm(Lm):
    """ Feedforward neural network LM, pretends each bptt sequence is a sentence. """
    def __init__(self, vocab, nhid, kW=3, nlayers=1, dropout=0, tieweights=True):
        super(NnLm, self).__init__()
        self.vsize = len(vocab.itos)
        self.kW = kW
        self.nhid = nhid
        self.lut = nn.Embedding(self.vsize, nhid, max_norm=args.maxnorm)
        self.conv = nn.Conv1d(nhid, nhid, kW, stride=1)
        self.drop = nn.Dropout(dropout)
        m = []
        for i in range(nlayers-1):
            m.append(nn.Linear(nhid, nhid))
            m.append(nn.Tanh())
            m.append(nn.Dropout(dropout))
        self.mlp = nn.Sequential(*m)
        self.proj = nn.Linear(nhid, self.vsize)
        if tieweights:
            # Weight tying is horrible for this model.
            self.proj.weight = self.lut.weight
    def forward(self, input, hid):
        emb = self.lut(input)
        # T = time, N = batch, H = hidden
        T, N, H = emb.size()
        # Pad manually, since conv1d only does symmetric padding; left-padding
        # kW-1 zeros keeps the convolution causal. Ideally you'd instead have
        # an embedding for a separate padding token prepended to the sequence.
        pad = V(emb.data.new(self.kW-1, N, H).zero_())
        emb = torch.cat([pad, emb], 0)
        # conv1d wants N x H x T but our input is T x N x H, so permute in,
        # then permute the N x H x T output back to T x N x H.
        hid = self.conv(emb.permute(1, 2, 0)).permute(2, 0, 1)
        # Dropout on the conv features, as in the LSTM model.
        return self.proj(self.mlp(self.drop(hid))), hid
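# A quick shape check of the causal padding above (a sketch, not used in
# training): with kW=3 and input length T, left-padding kW-1 frames keeps
# the output length at T, and position t only sees inputs <= t:
#   emb: (T+2) x N x H -> permute -> N x H x (T+2) -> conv(kW=3) -> N x H x T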
class LstmLm(Lm):
    def __init__(self, vocab, nhid, nlayers=1, dropout=0, tieweights=True):
        super(LstmLm, self).__init__()
        self.vsize = len(vocab.itos)
        self.lut = nn.Embedding(self.vsize, nhid, max_norm=args.maxnorm)
        self.rnn = nn.LSTM(
            input_size=nhid,
            hidden_size=nhid,
            num_layers=nlayers,
            dropout=dropout)
        self.drop = nn.Dropout(dropout)
        self.proj = nn.Linear(nhid, self.vsize)
        if tieweights:
            # See https://arxiv.org/abs/1608.05859
            # Seems to improve ppl by 13%.
            self.proj.weight = self.lut.weight

    def forward(self, input, hid):
        emb = self.lut(input)
        hids, hid = self.rnn(emb, hid)
        # Detach hiddens to truncate the computational graph for BPTT.
        # Recall that hid = (h, c).
        return self.proj(self.drop(hids)), tuple(map(lambda x: x.detach(), hid))
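# Two details worth calling out above (reasoning only, no extra behavior):
# - Tying works because the embedding dim equals nhid, so lut.weight and
#   proj.weight share the same (vsize, nhid) shape.
# - Without the .detach(), backward() on chunk k would traverse the graphs
#   of all previous BPTT chunks, growing memory over the epoch.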
if __name__ == "__main__": | |
models = {model.__name__: model for model in [NnLm, LstmLm]} | |
model = models[args.model]( | |
TEXT.vocab, args.nhid, nlayers=args.nlayers, dropout=args.dropout, tieweights=args.tieweights) | |
print(model) | |
if args.devid >= 0: | |
model.cuda(args.devid) | |
# We do not want to give the model credit for predicting padding symbols, | |
# this can decrease ppl a few points. | |
weight = torch.FloatTensor(model.vsize).fill_(1) | |
weight[padidx] = 0 | |
if args.devid >= 0: | |
weight = weight.cuda(args.devid) | |
loss = nn.CrossEntropyLoss(weight=V(weight), size_average=False) | |
params = [p for p in model.parameters() if p.requires_grad] | |
if args.optim == "Adam": | |
optimizer = optim.Adam( | |
params, lr = args.lr, weight_decay = args.wd, betas=(args.b1, args.b2)) | |
elif args.optim == "SGD": | |
optimizer = optim.SGD( | |
params, lr = args.lr, weight_decay = args.wd, | |
nesterov = not args.nonag, momentum = args.mom, dampening = args.dm) | |
schedule = optim.lr_scheduler.ReduceLROnPlateau( | |
optimizer, patience=1, factor=args.lrd, threshold=1e-3) | |
for epoch in range(args.epochs): | |
print("Epoch {}, lr {}".format(epoch, optimizer.param_groups[0]['lr'])) | |
train_loss, train_words = model.train_epoch( | |
iter=train_iter, loss=loss, optimizer=optimizer) | |
valid_loss, valid_words = model.validate(valid_iter, loss) | |
schedule.step(valid_loss) | |
print("Train: {}, Valid: {}".format( | |
math.exp(train_loss / train_words), math.exp(valid_loss / valid_words))) | |
test_loss, test_words = model.validate(test_iter, loss) | |
print("Test: {}".format(math.exp(test_loss / test_words))) | |
model.generate_predictions() | |
torch.save(model.cpu(), model.__class__.__name__ + ".pth") |
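# Reloading the pickled module later (a minimal sketch; torch.load on a full
# module pickle needs this file importable so the class can be found):
#   model = torch.load("LstmLm.pth")
#   model.eval()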