Character-level text generation with PyTorch LSTM
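The gist contains two files: a training script that fits a multi-layer LSTM on a plain-text corpus one character at a time, and a companion script that reloads a saved checkpoint and samples new text from it. Both represent characters as one-hot vectors over the corpus vocabulary; a minimal sketch of that encoding (the toy corpus "hello world" is illustrative, not from the gist):

import torch as th
import torch.nn as nn

chars = sorted(set("hello world"))  # toy vocabulary of 8 characters
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ixs = th.tensor([char_to_ix[c] for c in "hello"])
x = nn.functional.one_hot(ixs, num_classes=len(chars)).float()
print(x.shape)  # torch.Size([5, 8]) -- (seq_len, vocab_size)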
# Based on min-char-rnn.py
# https://gist.github.com/karpathy/d4dee566867f8291f086

import torch as th
import torch.nn as nn
import numpy as np
class Model(nn.Module):
    def __init__(self, hidden_size, vocab_size, num_layers):
        super().__init__()
        self.vocab_size = vocab_size
        # Unbatched: inputs are (seq_len, vocab_size) one-hot vectors
        self.lstm = nn.LSTM(vocab_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h, train=True):
        y, h = self.lstm(x, h)
        y = self.fc(y)
        if not train:
            # Return a probability distribution for sampling
            y = nn.functional.softmax(y, dim=-1)
        return y, h
def sample(model_state, seed_ix, n, device="cpu"):
    """
    Sample a sequence of n integers from the LSTM model.
    seed_ix is the seed character index for the first time step.
    """
    model, h = model_state
    x = th.zeros(model.vocab_size).to(device)
    x[seed_ix] = 1
    ixes = [seed_ix]
    with th.no_grad():
        for t in range(n):
            x = x.view(1, -1)  # one-hot input of sequence length 1
            p, h = model(x, h, train=False)
            p = p.ravel()
            # Draw the next character index from the predicted distribution
            ix = np.random.choice(model.vocab_size, p=p.cpu().numpy())
            x = th.zeros(model.vocab_size).to(device)
            x[ix] = 1
            ixes.append(ix)
    return ixes
def main(device="cuda:0"):
    # Data I/O
    with open("input.txt", "r") as f:
        data = f.read()  # Must be a plain text file
    chars = list(set(data))
    data_size, vocab_size = len(data), len(chars)
    print("data has %d characters, %d unique." % (data_size, vocab_size))
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = {i: ch for i, ch in enumerate(chars)}

    # Hyperparameters
    hidden_size = 512  # size of hidden layer of neurons
    seq_length = 25  # number of steps to unroll the RNN for
    learning_rate = 1e-2
    num_layers = 3

    model = Model(hidden_size, vocab_size, num_layers).to(device)

    # Initialize optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=learning_rate)

    # Train
    n, p = 0, 0  # iteration counter and data pointer
    # Initialize the hidden state: (num_layers, hidden_size) for unbatched input
    h0 = (th.zeros(num_layers, hidden_size).to(device),
          th.zeros(num_layers, hidden_size).to(device))
    for _ in range(10_000_000 + 1):
        # Prepare inputs (we sweep from left to right in steps seq_length long)
        if p + seq_length + 1 >= len(data) or n == 0:
            # Reset LSTM memory and go back to the start of the data
            h0 = (th.zeros(num_layers, hidden_size).to(device),
                  th.zeros(num_layers, hidden_size).to(device))
            p = 0
        inputs = [char_to_ix[ch] for ch in data[p : p + seq_length]]
        targets = [char_to_ix[ch] for ch in data[p + 1 : p + seq_length + 1]]

        # Sample from the model now and then
        if n % 1000 == 0:
            sample_ix = sample((model, h0), inputs[0], 500, device=device)
            txt = "".join(ix_to_char[ix] for ix in sample_ix)
            print("\n%s\n----" % (txt,))

        # From int to one-hot: (seq_length, vocab_size)
        x = nn.functional.one_hot(th.tensor(inputs), vocab_size).float().to(device)
        tgt = nn.functional.one_hot(th.tensor(targets), vocab_size).float().to(device)

        # Optimize LSTM with cross-entropy loss
        # (probability targets require PyTorch >= 1.10)
        optimizer.zero_grad()
        y, h0 = model(x, h0)
        loss = nn.functional.cross_entropy(y, tgt)
        # Detach the hidden state from the graph to avoid
        # "RuntimeError: Trying to backward through the graph a second time"
        h0 = (h0[0].detach(), h0[1].detach())
        if n % 10000 == 0:
            # Cumulative progress over the data (can exceed 100% after one epoch)
            progress = 100 * n * seq_length / data_size
            print(f"{progress:.2f}% Loss: {loss.item():.4f}")
            print("----")
        loss.backward()
        # Clip gradients to avoid exploding gradients
        th.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        if n % 250_000 == 0:
            th.save(model.state_dict(), f"model_{n//1000}.pth")
            # Save hidden state and vocabulary
            th.save((h0, chars), f"model_{n//1000}_state.pth")

        p += seq_length  # move data pointer
        n += 1  # iteration counter


if __name__ == "__main__":
    main()
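For reference, the scripts rely on PyTorch's unbatched LSTM convention (supported since roughly PyTorch 1.11): inputs are (seq_len, input_size) and each hidden-state tensor is (num_layers, hidden_size). A minimal sketch with the hyperparameters used above and an assumed vocabulary of 65 characters:

import torch as th
import torch.nn as nn

vocab_size, hidden_size, num_layers, seq_length = 65, 512, 3, 25
lstm = nn.LSTM(vocab_size, hidden_size, num_layers)
x = th.zeros(seq_length, vocab_size)  # (seq_len, input_size), no batch dimension
h0 = (th.zeros(num_layers, hidden_size), th.zeros(num_layers, hidden_size))
y, h1 = lstm(x, h0)
print(y.shape)  # torch.Size([25, 512])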
The second file in the gist loads a saved checkpoint and samples text from it:
import torch as th
import torch.nn as nn
import numpy as np


class Model(nn.Module):
    def __init__(self, hidden_size, vocab_size, num_layers):
        super().__init__()
        self.vocab_size = vocab_size
        # Unbatched: inputs are (seq_len, vocab_size) one-hot vectors
        self.lstm = nn.LSTM(vocab_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h, train=True):
        y, h = self.lstm(x, h)
        y = self.fc(y)
        if not train:
            # Return a probability distribution for sampling
            y = nn.functional.softmax(y, dim=-1)
        return y, h


def sample(model_state, seed_ix, n, device="cpu"):
    """
    Sample a sequence of n integers from the LSTM model.
    seed_ix is the seed character index for the first time step.
    """
    model, h = model_state
    x = th.zeros(model.vocab_size).to(device)
    x[seed_ix] = 1
    ixes = [seed_ix]
    with th.no_grad():
        for t in range(n):
            x = x.view(1, -1)  # one-hot input of sequence length 1
            p, h = model(x, h, train=False)
            p = p.ravel()
            # Draw the next character index from the predicted distribution
            ix = np.random.choice(model.vocab_size, p=p.cpu().numpy())
            x = th.zeros(model.vocab_size).to(device)
            x[ix] = 1
            ixes.append(ix)
    return ixes
def test_model(device="cpu", model_filename="model.pth", state_filename="model_state.pth"):
    # Load the saved hidden state and vocabulary
    h0, chars = th.load(state_filename, map_location=device)
    h0 = (h0[0].to(device), h0[1].to(device))
    vocab_size = len(chars)
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = {i: ch for i, ch in enumerate(chars)}

    # Hyperparameters: must match the architecture of the saved checkpoint
    hidden_size = 128  # size of hidden layer of neurons
    num_layers = 2
    model = Model(hidden_size, vocab_size, num_layers).to(device)

    # Load the trained weights onto the target device
    model.load_state_dict(th.load(model_filename, map_location=device))
    model.eval()

    # Seed with a character that occurs in the training data
    sample_ix = sample((model, h0), char_to_ix["#"], 2000, device=device)
    txt = "".join(ix_to_char[ix] for ix in sample_ix)
    print(txt)


if __name__ == "__main__":
    f = "model_1500"
    test_model("cpu", f"{f}.pth", f"{f}_state.pth")
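One possible extension, not part of the gist: prime the hidden state on a multi-character prompt before calling sample, so generation continues from a chosen context. The name warm_start and the prompt string are illustrative; the sketch reuses the Model.forward signature defined above.

import torch as th

def warm_start(model, h, prompt, char_to_ix, device="cpu"):
    """Feed a prompt through the model one character at a time to prime the hidden state."""
    with th.no_grad():
        for ch in prompt:
            x = th.zeros(1, model.vocab_size, device=device)
            x[0, char_to_ix[ch]] = 1
            _, h = model(x, h, train=False)
    return h

# Hypothetical usage, assuming the corpus contains the prompt characters:
# h0 = warm_start(model, h0, "ROMEO", char_to_ix)
# sample_ix = sample((model, h0), char_to_ix[":"], 2000)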