Minimal character-level language model with a Vanilla Recurrent Neural Network, in R
###
### Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
### BSD License
### Re-written in R by @georgeblck
###

rm(list = ls(all = TRUE))
options(digits = 10)
# Define a gradient-clipping function: use a fast Rcpp version if Rcpp is
# installed, otherwise fall back to base R (pmin/pmax)
if ("Rcpp" %in% rownames(installed.packages())) {
  library(Rcpp)
  cppFunction('NumericVector rcpp_clip(NumericVector x, double a, double b){
    return clamp(a, x, b);
  }')
  clip.mat <- function(mat) {
    matrix(apply(mat, 2, rcpp_clip, a = -5, b = 5),
           nrow = nrow(mat), ncol = ncol(mat))
  }
} else {
  clip <- function(vec, UB = 5, LB = -5) {
    pmax(LB, pmin(vec, UB))
  }
  clip.mat <- function(mat) {
    matrix(apply(mat, 2, clip), nrow = nrow(mat), ncol = ncol(mat))
  }
}
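# Quick sanity check (illustrative, not part of the original script): values
# outside [-5, 5] are clamped element-wise, e.g.
#   clip.mat(matrix(c(-10, 0.5, 10), nrow = 1))   # -> -5 0.5 5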
# Data I/O
# Download the tinyshakespeare data (requires internet access)
data <- readLines("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
data <- paste(data, collapse = "\n")
data <- strsplit(data, "")[[1]]
chars <- unique(data)
data_size <- length(data)
vocab_size <- length(chars)
# lookup table: row 1 holds the characters, row 2 their integer indices
char_to_ix <- rbind(chars, 1:vocab_size)
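# Illustrative lookups on this table: char_to_ix[1, i] returns the i-th
# character, and match("a", char_to_ix[1, ]) returns the index of "a"
# (provided "a" occurs in the text).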
# hyperparameters
hidden_size <- 100   # size of hidden layer of neurons
seq_length <- 25     # number of steps to unroll the RNN for
learning_rate <- 0.1

# model parameters
set.seed(12345)
Wxh <- matrix(rnorm(hidden_size * vocab_size),
              hidden_size, vocab_size) * 0.01   # input to hidden
Whh <- matrix(rnorm(hidden_size * hidden_size),
              hidden_size, hidden_size) * 0.01  # hidden to hidden
Why <- matrix(rnorm(hidden_size * vocab_size),
              vocab_size, hidden_size) * 0.01   # hidden to output
bh <- matrix(0, hidden_size, 1)  # hidden bias
by <- matrix(0, vocab_size, 1)   # output bias
weight.list <- list(Wxh = Wxh, Whh = Whh, Why = Why,
                    bh = bh, by = by)
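# At each time step t, the RNN defined by these parameters computes
#   h_t = tanh(Wxh %*% x_t + Whh %*% h_{t-1} + bh)
#   y_t = Why %*% h_t + by
#   p_t = softmax(y_t)
# where x_t is the one-hot encoding of the t-th input character.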
lossFun <- function(inputs, targets, hprev, w) {
  ## inputs, targets are both vectors of integers (i.e. x, y)
  ## hprev is Hx1 vector of initial hidden state
  ## w is the list of weights
  ## returns the loss, gradients on model parameters, and last hidden state
  # Initialize variables
  loss <- 0
  len_input <- length(inputs)
  xs <- matrix(0, nrow = vocab_size, ncol = len_input)
  hs <- matrix(0, nrow = hidden_size, ncol = (len_input + 1))
  hs[, 1] <- hprev
  ys <- xs
  ps <- ys
  # forward pass
  for (t in 1:len_input) {
    # encode in 1-of-k representation
    xs[inputs[t], t] <- 1
    # note the +1 column offset: hs[, 1] holds the initial state,
    # so hs[, t+1] is the state at time t
    hs[, (t + 1)] <- tanh(w$Wxh %*% xs[, t] +
                          w$Whh %*% hs[, (t - 1 + 1)] + w$bh)
    # unnormalized log probabilities for next chars
    ys[, t] <- w$Why %*% hs[, (t + 1)] + w$by
    # probabilities for next chars
    ps[, t] <- exp(ys[, t]) / sum(exp(ys[, t]))
    # softmax (cross-entropy loss)
    loss <- loss - log(ps[targets[t], t])
  }
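  # Note: for softmax with cross-entropy loss, the gradient with respect to the
  # unnormalized scores ys is simply ps with 1 subtracted at the target index;
  # the backward pass below exploits this.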
  # backward pass: compute gradients going backwards
  dWxh <- matrix(0, hidden_size, vocab_size)
  dWhh <- matrix(0, hidden_size, hidden_size)
  dWhy <- matrix(0, vocab_size, hidden_size)
  dbh <- matrix(0, hidden_size, 1)
  dby <- matrix(0, vocab_size, 1)
  dhnext <- matrix(0, hidden_size, 1)
  for (t in len_input:1) {
    dy <- ps[, t]
    # backprop into y
    dy[targets[t]] <- dy[targets[t]] - 1
    dWhy <- dWhy + dy %*% t(hs[, t + 1])
    dby <- dby + dy
    # backprop into h
    dh <- t(w$Why) %*% dy + dhnext
    # backprop through tanh nonlinearity
    dhraw <- (1 - hs[, t + 1]^2) * dh
    dbh <- dbh + dhraw
    dWxh <- dWxh + dhraw %*% t(xs[, t])
    dWhh <- dWhh + dhraw %*% t(hs[, (t - 1 + 1)])
    dhnext <- t(w$Whh) %*% dhraw
  }
  dweights.list <- list(dWxh = dWxh, dWhh = dWhh, dWhy = dWhy,
                        dbh = dbh, dby = dby)
  # clip to mitigate exploding gradients
  dweights.list <- lapply(dweights.list, clip.mat)
  return(list(loss = loss, dweights.list = dweights.list,
              hprev = hs[, len_input + 1]))
}
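# Illustrative call (not in the original script): score one window of data with
# a zero initial state; `ins` and `outs` are hypothetical integer vectors of
# length seq_length.
#   res <- lossFun(ins, outs, matrix(0, hidden_size, 1), weight.list)
#   res$loss                 # scalar cross-entropy loss
#   str(res$dweights.list)   # gradients, same shapes as weight.list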
sampled <- function(h, seed_ix, n, w) {
  ## samples a sequence of integers from the model
  ## h is the memory state, seed_ix is seed letter for first time step
  ## w is the list of weights
  x <- matrix(0, vocab_size, 1)
  x[seed_ix, ] <- 1
  ixes <- NULL
  for (t in 1:n) {
    h <- tanh(w$Wxh %*% x + w$Whh %*% h + w$bh)
    y <- w$Why %*% h + w$by  # use the bias from the weight list, not the global `by`
    p <- exp(y) / sum(exp(y))
    ix <- sample(x = 1:vocab_size, size = 1,
                 prob = drop(p), replace = TRUE)
    x <- matrix(0, vocab_size, 1)
    x[ix, ] <- 1
    ixes <- c(ixes, ix)
  }
  return(ixes)
}
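# Illustrative use: draw 50 characters from the (still untrained) model,
# starting from the first vocabulary entry with a blank memory state.
#   idx <- sampled(matrix(0, hidden_size, 1), 1, 50, weight.list)
#   cat(paste(char_to_ix[1, idx], collapse = ""))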
# memory variables for Adagrad
mWxh <- matrix(0, hidden_size, vocab_size)
mWhh <- matrix(0, hidden_size, hidden_size)
mWhy <- matrix(0, vocab_size, hidden_size)
mbh <- matrix(0, hidden_size, 1)
mby <- matrix(0, vocab_size, 1)
mweights.list <- list(mWxh = mWxh, mWhh = mWhh, mWhy = mWhy,
                      mbh = mbh, mby = mby)
# loss at iteration 0
smooth_loss <- (-1) * log(1 / vocab_size) * seq_length
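# Rationale: an untrained model that assigns uniform probability 1/vocab_size to
# every character incurs a cross-entropy of -log(1/vocab_size) per step, so the
# expected loss over one seq_length window is exactly this value.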
n <- 1  # iteration counter
p <- 1  # data pointer
while (TRUE) {
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if ((p + seq_length + 1) >= length(data) || n == 1) {
    cat("\nNew Epoch\n")
    hprev <- matrix(0, hidden_size, 1)  # reset RNN memory
    p <- 1  # go from start of data
  }
  # map the current window of characters to integer indices
  inputs <- match(data[p:(p + seq_length - 1)], char_to_ix[1, ])
  targets <- match(data[(p + 1):(p + seq_length)], char_to_ix[1, ])
  # Sample from the model now and then
  if (n %% 1000 == 0) {
    sample_ix <- sampled(hprev, inputs[1], 200, weight.list)
    cat(paste(char_to_ix[1, sample_ix], collapse = ""))
  }
  # forward seq_length characters through the net and fetch gradient
  results <- lossFun(inputs, targets, hprev, weight.list)
  smooth_loss <- smooth_loss * 0.999 + results$loss * 0.001
  if (n %% 1000 == 0) {
    cat("\nIteration ", n, ", loss:\t", smooth_loss)
  }
  hprev <- results$hprev
  # parameter update with Adagrad (see the note after this loop)
  updated <- mapply(FUN = function(we, dwe, mem) {
    mem <- mem + dwe * dwe
    # adagrad update
    we <- we - (learning_rate * dwe) / sqrt(mem + 1e-8)
    return(list(we = we, mem = mem))
  }, we = weight.list, dwe = results$dweights.list,
     mem = mweights.list, SIMPLIFY = FALSE)
  weight.list <- lapply(updated, function(x) x$we)
  mweights.list <- lapply(updated, function(x) x$mem)
  # move data pointer
  p <- p + seq_length
  # iteration counter
  n <- n + 1
  # stop after 10000 iterations
  if (n == 10001) break
}
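# Note on the Adagrad update inside the loop: each memory matrix accumulates the
# element-wise squared gradients, and the step w <- w - lr * g / sqrt(m + 1e-8)
# gives every parameter its own decaying effective learning rate; the 1e-8 term
# only guards against division by zero.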
# gradient checking function
gradCheck <- function(inputs, targets, hprev, w) {
  # How many checks per parameter?
  num_checks <- 10
  delta <- 1e-5
  dw <- lossFun(inputs, targets, hprev, w)$dweights.list
  doit <- mapply(FUN = function(weights, dweights, names) {
    s0 <- dim(weights)
    s1 <- dim(dweights)
    if (any(s0 != s1))
      cat("Error: dims don't match:\t", s0, "\t", s1, "\n")
    w.temp <- w
    cat("\nWeight:", names, "\n")
    for (i in 1:num_checks) {
      # pick a random element to perturb (must be an integer index)
      ri <- sample.int(length(weights), 1)
      weights_vec <- as.vector(weights)
      old_val <- weights_vec[ri]
      # evaluate cost at [x + delta] and [x - delta]
      weights_vec[ri] <- old_val + delta
      w.temp[[names]] <- matrix(weights_vec, nrow = nrow(weights), ncol = ncol(weights))
      cg0 <- lossFun(inputs, targets, hprev, w.temp)$loss
      weights_vec[ri] <- old_val - delta
      w.temp[[names]] <- matrix(weights_vec, nrow = nrow(weights), ncol = ncol(weights))
      cg1 <- lossFun(inputs, targets, hprev, w.temp)$loss
      # reset old value for this parameter
      w.temp[[names]] <- weights
      # fetch both numerical and analytic gradient
      grad_analytical <- as.vector(dweights)[ri]
      grad_numerical <- (cg0 - cg1) / (2 * delta)
      rel_error <- abs(grad_analytical - grad_numerical) / abs(grad_numerical + grad_analytical)
      # rel_error should be on the order of 1e-7 or less
      cat("Numerical: ", grad_numerical, "|| Analytical: ", grad_analytical, "||-->\t", rel_error, "\n")
    }
  }, weights = w, dweights = dw, names = names(w))
}
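# The numerical gradient above uses the central difference
# (loss(w + delta) - loss(w - delta)) / (2 * delta), which is accurate to
# O(delta^2) and is compared against the analytic gradient from lossFun.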
set.seed(123456)
gradCheck(inputs, targets, hprev, weight.list)