-
-
Save eromoe/4c34c10b84bc78d44ba79ab8c6d25865 to your computer and use it in GitHub Desktop.
Minimal character-level language model with a Vanilla Recurrent Neural Network, in Python/numpy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy) | |
BSD License | |
""" | |
import numpy as np | |
# data I/O | |
data = open('input.txt', 'r').read() # should be simple plain text file | |
chars = list(set(data)) | |
data_size, vocab_size = len(data), len(chars) | |
print 'data has %d characters, %d unique.' % (data_size, vocab_size) | |
char_to_ix = { ch:i for i,ch in enumerate(chars) } | |
ix_to_char = { i:ch for i,ch in enumerate(chars) } | |
# hyperparameters | |
hidden_size = 100 # size of hidden layer of neurons | |
# 感觉这里的seq_length相当于batch | |
# 先用input [0,25] ,tagert [1, 26] 训练, 然后input [25,50] ,tagert [26, 51] 。。。。 | |
seq_length = 25 # number of steps to unroll the RNN for | |
# 10**-1, 1 × 10^-1 | |
learning_rate = 1e-1 | |
# model parameters | |
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden | |
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden | |
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output | |
bh = np.zeros((hidden_size, 1)) # hidden bias | |
by = np.zeros((vocab_size, 1)) # output bias | |
def lossFun(inputs, targets, hprev): | |
""" | |
inputs,targets are both list of integers. | |
hprev is Hx1 array of initial hidden state | |
returns the loss, gradients on model parameters, and last hidden state | |
""" | |
xs, hs, ys, ps = {}, {}, {}, {} | |
# hs[-1], 也就是hprev 是用0 初始化的 | |
hs[-1] = np.copy(hprev) | |
loss = 0 | |
# forward pass | |
for t in xrange(len(inputs)): | |
# xs 长度是 input 长度 | |
# xs 每个值是 one hot 编码, 也就是 [0, ... 1, ..0] 长度是vocab_size | |
xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation | |
xs[t][inputs[t]] = 1 | |
# 正向传播的地方在这里hs[t-1] ,这里是个递归 | |
hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state | |
# Why外面随机初始化的 | |
ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars | |
ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars | |
loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss) | |
# backward pass: compute gradients going backwards | |
# 全0 初始化 | |
dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) | |
dbh, dby = np.zeros_like(bh), np.zeros_like(by) | |
dhnext = np.zeros_like(hs[0]) | |
for t in reversed(xrange(len(inputs))): | |
dy = np.copy(ps[t]) | |
dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here | |
dWhy += np.dot(dy, hs[t].T) | |
dby += dy | |
dh = np.dot(Why.T, dy) + dhnext # backprop into h | |
dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity | |
dbh += dhraw | |
dWxh += np.dot(dhraw, xs[t].T) | |
dWhh += np.dot(dhraw, hs[t-1].T) | |
dhnext = np.dot(Whh.T, dhraw) | |
for dparam in [dWxh, dWhh, dWhy, dbh, dby]: | |
# 限制参数范围,最小值-5,最大值5 | |
np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients | |
return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1] | |
def sample(h, seed_ix, n): | |
""" | |
sample a sequence of integers from the model | |
h is memory state, seed_ix is seed letter for first time step | |
""" | |
x = np.zeros((vocab_size, 1)) | |
x[seed_ix] = 1 | |
ixes = [] | |
for t in xrange(n): | |
h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh) | |
y = np.dot(Why, h) + by | |
p = np.exp(y) / np.sum(np.exp(y)) | |
ix = np.random.choice(range(vocab_size), p=p.ravel()) | |
x = np.zeros((vocab_size, 1)) | |
x[ix] = 1 | |
ixes.append(ix) | |
return ixes | |
n, p = 0, 0 | |
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) | |
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad | |
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0 | |
while True: | |
# prepare inputs (we're sweeping from left to right in steps seq_length long) | |
# 没有结束 or 没有开始 , 结束了再设p=0重新开始 | |
if p+seq_length+1 >= len(data) or n == 0: | |
hprev = np.zeros((hidden_size,1)) # reset RNN memory | |
p = 0 # go from start of data | |
inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]] | |
targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] | |
# sample from the model now and then | |
if n % 100 == 0: | |
sample_ix = sample(hprev, inputs[0], 200) | |
txt = ''.join(ix_to_char[ix] for ix in sample_ix) | |
print '----\n %s \n----' % (txt, ) | |
# forward seq_length characters through the net and fetch gradient | |
loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev) | |
# smooth_loss 是为了平滑输出结果设的,并没有其他用途 | |
smooth_loss = smooth_loss * 0.999 + loss * 0.001 | |
if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress | |
# perform parameter update with Adagrad | |
for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], | |
[dWxh, dWhh, dWhy, dbh, dby], | |
[mWxh, mWhh, mWhy, mbh, mby]): | |
# 因为mWxh, Wxh 都是直接设定的 | |
# numpy 直接操作内存,所以这里赋值也可以更新 | |
mem += dparam * dparam | |
param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update | |
p += seq_length # move data pointer | |
n += 1 # iteration counter |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
RNN反向传播原理
https://www.cnblogs.com/pinard/p/6509630.html