Character-level text generation with PyTorch LSTM
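The gist contains two files: a training script that fits a multi-layer LSTM on a plain-text corpus one character at a time, and a companion script that reloads a saved checkpoint and samples new text from it. Both represent characters as one-hot vectors over the corpus vocabulary; a minimal sketch of that encoding (the toy corpus "hello world" is illustrative, not from the gist):

import torch as th
import torch.nn as nn

chars = sorted(set("hello world"))  # toy vocabulary of 8 characters
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ixs = th.tensor([char_to_ix[c] for c in "hello"])
x = nn.functional.one_hot(ixs, num_classes=len(chars)).float()
print(x.shape)  # torch.Size([5, 8]) -- (seq_len, vocab_size)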
# Based on min-char-rnn.py
# https://gist.github.com/karpathy/d4dee566867f8291f086

import torch as th
import torch.nn as nn
import numpy as np
class Model(nn.Module):
    def __init__(self, hidden_size, vocab_size, num_layers):
        super().__init__()
        self.vocab_size = vocab_size
        # Unbatched: inputs are (seq_len, vocab_size) one-hot vectors
        self.lstm = nn.LSTM(vocab_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h, train=True):
        y, h = self.lstm(x, h)
        y = self.fc(y)
        if not train:
            # Return a probability distribution for sampling
            y = nn.functional.softmax(y, dim=-1)
        return y, h
def sample(model_state, seed_ix, n, device="cpu"):
    """
    Sample a sequence of n integers from the LSTM model.
    seed_ix is the seed character index for the first time step.
    """
    model, h = model_state
    x = th.zeros(model.vocab_size).to(device)
    x[seed_ix] = 1
    ixes = [seed_ix]
    with th.no_grad():
        for t in range(n):
            x = x.view(1, -1)  # one-hot input of sequence length 1
            p, h = model(x, h, train=False)
            p = p.ravel()
            # Draw the next character index from the predicted distribution
            ix = np.random.choice(model.vocab_size, p=p.cpu().numpy())
            x = th.zeros(model.vocab_size).to(device)
            x[ix] = 1
            ixes.append(ix)
    return ixes
def main(device="cuda:0"):
    # Data I/O
    with open("input.txt", "r") as f:
        data = f.read()  # Must be a plain text file
    chars = list(set(data))
    data_size, vocab_size = len(data), len(chars)
    print("data has %d characters, %d unique." % (data_size, vocab_size))
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = {i: ch for i, ch in enumerate(chars)}

    # Hyperparameters
    hidden_size = 512  # size of hidden layer of neurons
    seq_length = 25  # number of steps to unroll the RNN for
    learning_rate = 1e-2
    num_layers = 3

    model = Model(hidden_size, vocab_size, num_layers).to(device)

    # Initialize optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=learning_rate)

    # Train
    n, p = 0, 0  # iteration counter and data pointer
    # Initialize the hidden state: (num_layers, hidden_size) for unbatched input
    h0 = (th.zeros(num_layers, hidden_size).to(device),
          th.zeros(num_layers, hidden_size).to(device))
    for _ in range(10_000_000 + 1):
        # Prepare inputs (we sweep from left to right in steps seq_length long)
        if p + seq_length + 1 >= len(data) or n == 0:
            # Reset LSTM memory and go back to the start of the data
            h0 = (th.zeros(num_layers, hidden_size).to(device),
                  th.zeros(num_layers, hidden_size).to(device))
            p = 0
        inputs = [char_to_ix[ch] for ch in data[p : p + seq_length]]
        targets = [char_to_ix[ch] for ch in data[p + 1 : p + seq_length + 1]]

        # Sample from the model now and then
        if n % 1000 == 0:
            sample_ix = sample((model, h0), inputs[0], 500, device=device)
            txt = "".join(ix_to_char[ix] for ix in sample_ix)
            print("\n%s\n----" % (txt,))

        # From int to one-hot: (seq_length, vocab_size)
        x = nn.functional.one_hot(th.tensor(inputs), vocab_size).float().to(device)
        tgt = nn.functional.one_hot(th.tensor(targets), vocab_size).float().to(device)

        # Optimize LSTM with cross-entropy loss
        # (probability targets require PyTorch >= 1.10)
        optimizer.zero_grad()
        y, h0 = model(x, h0)
        loss = nn.functional.cross_entropy(y, tgt)
        # Detach the hidden state from the graph to avoid
        # "RuntimeError: Trying to backward through the graph a second time"
        h0 = (h0[0].detach(), h0[1].detach())
        if n % 10000 == 0:
            # Cumulative progress over the data (can exceed 100% after one epoch)
            progress = 100 * n * seq_length / data_size
            print(f"{progress:.2f}% Loss: {loss.item():.4f}")
            print("----")
        loss.backward()
        # Clip gradients to avoid exploding gradients
        th.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        if n % 250_000 == 0:
            th.save(model.state_dict(), f"model_{n//1000}.pth")
            # Save hidden state and vocabulary
            th.save((h0, chars), f"model_{n//1000}_state.pth")

        p += seq_length  # move data pointer
        n += 1  # iteration counter


if __name__ == "__main__":
    main()
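For reference, the scripts rely on PyTorch's unbatched LSTM convention (supported since roughly PyTorch 1.11): inputs are (seq_len, input_size) and each hidden-state tensor is (num_layers, hidden_size). A minimal sketch with the hyperparameters used above and an assumed vocabulary of 65 characters:

import torch as th
import torch.nn as nn

vocab_size, hidden_size, num_layers, seq_length = 65, 512, 3, 25
lstm = nn.LSTM(vocab_size, hidden_size, num_layers)
x = th.zeros(seq_length, vocab_size)  # (seq_len, input_size), no batch dimension
h0 = (th.zeros(num_layers, hidden_size), th.zeros(num_layers, hidden_size))
y, h1 = lstm(x, h0)
print(y.shape)  # torch.Size([25, 512])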
The second file in the gist loads a saved checkpoint and samples text from it:
import torch as th
import torch.nn as nn
import numpy as np


class Model(nn.Module):
    def __init__(self, hidden_size, vocab_size, num_layers):
        super().__init__()
        self.vocab_size = vocab_size
        # Unbatched: inputs are (seq_len, vocab_size) one-hot vectors
        self.lstm = nn.LSTM(vocab_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h, train=True):
        y, h = self.lstm(x, h)
        y = self.fc(y)
        if not train:
            # Return a probability distribution for sampling
            y = nn.functional.softmax(y, dim=-1)
        return y, h


def sample(model_state, seed_ix, n, device="cpu"):
    """
    Sample a sequence of n integers from the LSTM model.
    seed_ix is the seed character index for the first time step.
    """
    model, h = model_state
    x = th.zeros(model.vocab_size).to(device)
    x[seed_ix] = 1
    ixes = [seed_ix]
    with th.no_grad():
        for t in range(n):
            x = x.view(1, -1)  # one-hot input of sequence length 1
            p, h = model(x, h, train=False)
            p = p.ravel()
            # Draw the next character index from the predicted distribution
            ix = np.random.choice(model.vocab_size, p=p.cpu().numpy())
            x = th.zeros(model.vocab_size).to(device)
            x[ix] = 1
            ixes.append(ix)
    return ixes
def test_model(device="cpu", model_filename="model.pth", state_filename="model_state.pth"):
    # Load the saved hidden state and vocabulary
    h0, chars = th.load(state_filename, map_location=device)
    h0 = (h0[0].to(device), h0[1].to(device))
    vocab_size = len(chars)
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = {i: ch for i, ch in enumerate(chars)}

    # Hyperparameters: must match the architecture of the saved checkpoint
    hidden_size = 128  # size of hidden layer of neurons
    num_layers = 2
    model = Model(hidden_size, vocab_size, num_layers).to(device)

    # Load the trained weights onto the target device
    model.load_state_dict(th.load(model_filename, map_location=device))
    model.eval()

    # Seed with a character that occurs in the training data
    sample_ix = sample((model, h0), char_to_ix["#"], 2000, device=device)
    txt = "".join(ix_to_char[ix] for ix in sample_ix)
    print(txt)


if __name__ == "__main__":
    f = "model_1500"
    test_model("cpu", f"{f}.pth", f"{f}_state.pth")
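One possible extension, not part of the gist: prime the hidden state on a multi-character prompt before calling sample, so generation continues from a chosen context. The name warm_start and the prompt string are illustrative; the sketch reuses the Model.forward signature defined above.

import torch as th

def warm_start(model, h, prompt, char_to_ix, device="cpu"):
    """Feed a prompt through the model one character at a time to prime the hidden state."""
    with th.no_grad():
        for ch in prompt:
            x = th.zeros(1, model.vocab_size, device=device)
            x[0, char_to_ix[ch]] = 1
            _, h = model(x, h, train=False)
    return h

# Hypothetical usage, assuming the corpus contains the prompt characters:
# h0 = warm_start(model, h0, "ROMEO", char_to_ix)
# sample_ix = sample((model, h0), char_to_ix[":"], 2000)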