Minimal character-level language model with a Vanilla Recurrent Neural Network, in Python/numpy
""" | |
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy). Updated By Aditya Thakker (@adityathakker) | |
BSD License | |
""" | |
import numpy as np | |
# data I/O | |
data = open('input.txt', 'r').read() # should be simple plain text file | |
chars = list(set(data)) # gives a list of unique characters in the data | |
data_size, vocab_size = len(data), len(chars) | |
print('Data has %d characters, %d unique.' % (data_size, vocab_size)) | |
char_to_ix = { ch:i for i,ch in enumerate(chars) } # character to index dictionary | |
ix_to_char = { i:ch for i,ch in enumerate(chars) } # index to character dictionary | |
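# Sanity check (an illustrative addition, not part of the original script):
# the two dictionaries should be exact inverses of each other.
assert all(ix_to_char[char_to_ix[ch]] == ch for ch in chars)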
# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1 # 1e-1 = 0.1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias
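# Shape sketch (an illustrative addition, not part of the original): a one-hot
# input column of shape (vocab_size, 1) yields a hidden state of shape
# (hidden_size, 1) and a logit vector of shape (vocab_size, 1).
_x = np.zeros((vocab_size, 1)); _x[0] = 1 # one-hot encode the character with index 0
_h = np.tanh(np.dot(Wxh, _x) + np.dot(Whh, np.zeros((hidden_size, 1))) + bh)
assert _h.shape == (hidden_size, 1)
assert (np.dot(Why, _h) + by).shape == (vocab_size, 1)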
def lossFun(inputs, targets, hprev):
  """
  inputs, targets are both lists of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev) # -1 is the index because in the first iteration of the loop the hidden state index is t-1 = 0-1 = -1, so we initialize it first
  loss = 0
  # forward pass
  # this loop goes over all the chars in the seq_length
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1 # set the current char index to 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state; hs[t].shape = (hidden_size x 1)
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars; ys[t].shape = (vocab_size x 1)
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars (i.e. softmax); ps[t].shape = (vocab_size x 1)
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy) loss
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) # initial zero derivatives for all weights
  dbh, dby = np.zeros_like(bh), np.zeros_like(by) # initial zero derivatives for all biases
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    # apply the chain rule and you can derive all of these
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
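# Optional gradient check (a minimal sketch, not part of the original file): it
# compares the analytic gradients from lossFun against a centered finite
# difference on a few randomly chosen entries of each parameter. Note that
# lossFun clips gradients to [-5, 5], so very large analytic gradients can
# legitimately disagree with the numerical estimate.
def grad_check(inputs, targets, hprev, num_checks=5, delta=1e-5):
  _, dWxh, dWhh, dWhy, dbh, dby, _ = lossFun(inputs, targets, hprev)
  for param, dparam, name in zip([Wxh, Whh, Why, bh, by],
                                 [dWxh, dWhh, dWhy, dbh, dby],
                                 ['Wxh', 'Whh', 'Why', 'bh', 'by']):
    for _ in range(num_checks):
      ri = int(np.random.randint(param.size)) # pick a random entry of this parameter
      old_val = param.flat[ri]
      param.flat[ri] = old_val + delta
      loss_plus, *_rest = lossFun(inputs, targets, hprev)
      param.flat[ri] = old_val - delta
      loss_minus, *_rest = lossFun(inputs, targets, hprev)
      param.flat[ri] = old_val # restore the original value
      grad_numerical = (loss_plus - loss_minus) / (2 * delta)
      grad_analytic = dparam.flat[ri]
      rel_error = abs(grad_analytic - grad_numerical) / max(abs(grad_numerical) + abs(grad_analytic), 1e-12)
      print('%s: numerical %f, analytic %f, relative error %e' % (name, grad_numerical, grad_analytic, rel_error))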
def sample(h, seed_ix, n):
  """
  sample a sequence of n integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes
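# Usage sketch (an illustrative addition, not in the original): draw 50
# characters from the untrained model, seeded with the first character of the
# data and a zero hidden state; since the weights are still random, the output
# is essentially random text.
_demo_ix = sample(np.zeros((hidden_size, 1)), char_to_ix[data[0]], 50)
print('untrained sample: %s' % ''.join(ix_to_char[i] for i in _demo_ix))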
n, p = 0, 0 # n is iteration counter; p is data pointer
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) # memory variables for Adagrad
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
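# Worked check (illustrative): with a uniform model every character has
# probability 1/vocab_size, so the expected cross-entropy per character is
# -log(1/vocab_size) = log(vocab_size); over a chunk of seq_length characters
# the initial loss is about seq_length * log(vocab_size), e.g. roughly
# 25 * log(65) ~= 104 if the vocabulary happened to have 65 characters.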
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] # targets are the inputs shifted by one character, because we want the model to predict the next character

  # sample from the model every 100 iterations
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 200) # params: previous hidden state (hprev), starting character (inputs[0]), number of characters to sample (200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix) # join all characters to form a string
    print('----\n %s \n----' % (txt, ))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev) # returns the loss, the gradients of all parameters, and the last hidden state vector
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 100 == 0: print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

  p += seq_length # move data pointer
  n += 1 # iteration counter