Minimal character-level language model with a Vanilla Recurrent Neural Network, in R
###
### Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
### BSD License
### Re-written in R by @georgeblck
###
rm(list=ls(all=TRUE))
options(digits=10)
# Define a gradient-clipping function; use Rcpp for speed if it is installed,
# otherwise fall back to a pure-R version. Both clip every entry to [-5, 5].
if ("Rcpp" %in% rownames(installed.packages())) {
    library(Rcpp)
    cppFunction('NumericVector rcpp_clip(NumericVector x, double a, double b){
        return clamp(a, x, b);
    }')
    clip.mat <- function(mat) {
        matrix(apply(mat, 2, rcpp_clip, a = -5, b = 5),
               nrow = nrow(mat), ncol = ncol(mat))
    }
} else {
    clip <- function(vec, UB = 5, LB = -5) {
        pmax(LB, pmin(vec, UB))
    }
    clip.mat <- function(mat) {
        matrix(apply(mat, 2, clip), nrow = nrow(mat), ncol = ncol(mat))
    }
}
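# Optional sanity check (not part of the original gist): every entry of the
# clipped matrix should lie in [-5, 5], e.g.
# range(clip.mat(matrix(c(-10, -1, 0, 3, 7, 100), nrow = 2)))   # -5 5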
# Data I/O
# If you have internet access, download the data directly from the char-rnn repo
data <- readLines("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
data <- paste(data, collapse = "\n")
data <- strsplit(data, "")[[1]]
chars <- unique(data)
data_size <- length(data)
vocab_size <- length(chars)
# lookup table: row 1 holds the characters, row 2 their (1-based) indices
char_to_ix <- rbind(chars, 1:vocab_size)
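# Illustration (not in the original gist): each character is mapped to its
# position in `chars`; match() gives the same result as the grep()-based
# lookup used in the training loop further below.
# match(c("T", "h", "e"), char_to_ix[1, ])   # integer indices of "T", "h", "e"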
# hyperparameters
hidden_size <- 100
seq_length <- 25
learning_rate <- 0.1
# model parameters
set.seed(12345)
Wxh <- matrix(rnorm(hidden_size * vocab_size),
              hidden_size, vocab_size) * 0.01  # input to hidden
Whh <- matrix(rnorm(hidden_size * hidden_size),
              hidden_size, hidden_size) * 0.01 # hidden to hidden
Why <- matrix(rnorm(hidden_size * vocab_size),
              vocab_size, hidden_size) * 0.01  # hidden to output
bh <- matrix(0, hidden_size, 1) # hidden bias
by <- matrix(0, vocab_size, 1)  # output bias
weight.list <- list(Wxh = Wxh, Whh = Whh, Why = Why,
                    bh = bh, by = by)
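# Optional shape check (not in the original gist): Wxh is hidden_size x vocab_size,
# Whh is hidden_size x hidden_size, Why is vocab_size x hidden_size, and the
# biases are column vectors.
# sapply(weight.list, dim)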
lossFun <- function(inputs, targets, hprev, w) {
    ## inputs, targets are both vectors of integers (i.e. x, y)
    ## hprev is a Hx1 vector holding the initial hidden state
    ## w is the list of weights
    ## returns the loss, the gradients on the model parameters, and the last hidden state
    # Initialize variables
    loss <- 0
    len_input <- length(inputs)
    xs <- matrix(0, nrow = vocab_size, ncol = len_input)
    # hs has one extra column: column 1 holds hprev, column t+1 the state at time t
    hs <- matrix(0, nrow = hidden_size, ncol = (len_input + 1))
    hs[, 1] <- hprev
    ys <- xs
    ps <- ys
    # forward pass
    for (t in 1:len_input) {
        # encode the input character in 1-of-k representation
        xs[inputs[t], t] <- 1
        hs[, (t + 1)] <- tanh(w$Wxh %*% xs[, t] +
                              w$Whh %*% hs[, t] + w$bh)
        # unnormalized log probabilities for the next char
        ys[, t] <- w$Why %*% hs[, (t + 1)] + w$by
        # probabilities for the next char
        ps[, t] <- exp(ys[, t]) / sum(exp(ys[, t]))
        # softmax (cross-entropy) loss
        loss <- loss - log(ps[targets[t], t])
    }
    # backward pass: compute gradients going backwards through time
    dWxh <- matrix(0, hidden_size, vocab_size)
    dWhh <- matrix(0, hidden_size, hidden_size)
    dWhy <- matrix(0, vocab_size, hidden_size)
    dbh <- matrix(0, hidden_size, 1)
    dby <- matrix(0, vocab_size, 1)
    dhnext <- matrix(0, hidden_size, 1)
    for (t in len_input:1) {
        # backprop into y: the gradient of softmax + cross-entropy w.r.t. the
        # logits is p - 1 at the target index and p elsewhere
        dy <- ps[, t]
        dy[targets[t]] <- dy[targets[t]] - 1
        dWhy <- dWhy + dy %*% t(hs[, t + 1])
        dby <- dby + dy
        # backprop into h (adding the gradient flowing in from the next time step)
        dh <- t(w$Why) %*% dy + dhnext
        # backprop through the tanh nonlinearity
        dhraw <- (1 - hs[, t + 1]^2) * dh
        dbh <- dbh + dhraw
        dWxh <- dWxh + dhraw %*% t(xs[, t])
        dWhh <- dWhh + dhraw %*% t(hs[, t])
        dhnext <- t(w$Whh) %*% dhraw
    }
    dweights.list <- list(dWxh = dWxh, dWhh = dWhh, dWhy = dWhy,
                          dbh = dbh, dby = dby)
    # clip to mitigate exploding gradients
    dweights.list <- lapply(dweights.list, clip.mat)
    return(list(loss = loss, dweights.list = dweights.list,
                hprev = hs[, len_input + 1]))
}
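# Optional sanity check (not part of the original gist): with freshly initialized
# weights, the loss over one window of seq_length characters should be close to
# -log(1/vocab_size) * seq_length, which is what smooth_loss is seeded with below.
# chk <- lossFun(inputs  = match(data[1:seq_length], chars),
#                targets = match(data[2:(seq_length + 1)], chars),
#                hprev   = matrix(0, hidden_size, 1),
#                w       = weight.list)
# chk$loss   # roughly seq_length * log(vocab_size)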
sampled <- function(h, seed_ix, n, w) {
    ## samples a sequence of n integers from the model
    ## h is the memory state, seed_ix is the seed letter for the first time step
    ## w is the list of weights
    x <- matrix(0, vocab_size, 1)
    x[seed_ix, ] <- 1
    ixes <- NULL
    for (t in 1:n) {
        h <- tanh(w$Wxh %*% x + w$Whh %*% h + w$bh)
        y <- w$Why %*% h + w$by
        p <- exp(y) / sum(exp(y))
        # draw the next character index from the predicted distribution
        ix <- sample(x = 1:vocab_size, size = 1,
                     prob = drop(p), replace = TRUE)
        x <- matrix(0, vocab_size, 1)
        x[ix, ] <- 1
        ixes <- c(ixes, ix)
    }
    return(ixes)
}
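# Usage illustration (not in the original gist): sample 50 characters starting
# from a zero hidden state and the first character of the data, then turn the
# indices back into text. Left commented out so the RNG state of the training
# run below is not disturbed.
# demo_ix <- sampled(matrix(0, hidden_size, 1), match(data[1], chars), 50, weight.list)
# cat(paste(char_to_ix[1, demo_ix], collapse = ""), "\n")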
# memory variables for Adagrad
mWxh <- matrix(0, hidden_size, vocab_size)
mWhh <- matrix(0, hidden_size, hidden_size)
mWhy <- matrix(0, vocab_size, hidden_size)
mbh <- matrix(0, hidden_size, 1)
mby <- matrix(0, vocab_size, 1)
mweights.list <- list(mWxh = mWxh, mWhh = mWhh, mWhy = mWhy,
                      mbh = mbh, mby = mby)
# loss at iteration 0
smooth_loss <- (-1) * log(1/vocab_size) * seq_length
n <- 1
p <- 1
while (TRUE) {
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if ((p + seq_length + 1) >= length(data) || n == 1) {
        cat("\nNew Epoch\n")
        hprev <- matrix(0, hidden_size, 1) # reset RNN memory
        p <- 1                             # go from the start of the data
    }
    inputs <- apply(as.matrix(data[p:(p + seq_length - 1)]), 1,
                    function(x) grep(x, char_to_ix[1, ], fixed = TRUE))
    targets <- apply(as.matrix(data[(p + 1):(p + seq_length)]), 1,
                     function(x) grep(x, char_to_ix[1, ], fixed = TRUE))
    # sample from the model now and then
    if (n %% 1000 == 0) {
        sample_ix <- sampled(hprev, inputs[1], 200, weight.list)
        cat(paste(char_to_ix[1, sample_ix], collapse = ""))
    }
    # forward seq_length characters through the net and fetch the gradient
    results <- lossFun(inputs, targets, hprev, weight.list)
    smooth_loss <- smooth_loss * 0.999 + results$loss * 0.001
    if (n %% 1000 == 0) {
        cat("\nIteration ", n, ", loss:\t", smooth_loss)
    }
    hprev <- results$hprev
    # Adagrad update: accumulate the squared gradient per parameter and scale
    # the step for each entry by 1/sqrt(accumulated squared gradient + 1e-8)
    updated <- mapply(FUN = function(we, dwe, mem) {
        mem <- mem + dwe * dwe
        we <- we - (learning_rate * dwe) / sqrt(mem + 1e-8)
        return(list(we = we, mem = mem))
    }, we = weight.list, dwe = results$dweights.list,
       mem = mweights.list, SIMPLIFY = FALSE)
    weight.list <- lapply(updated, function(x) x$we)
    mweights.list <- lapply(updated, function(x) x$mem)
    # move the data pointer
    p <- p + seq_length
    # iteration counter
    n <- n + 1
    # stop after 10000 iterations
    if (n == 10001) break
}
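# Optional (not part of the original gist): persist the trained weights so the
# sampling function can be reused later without retraining. The file name is
# arbitrary.
# saveRDS(weight.list, "char-rnn-weights.rds")
# weight.list <- readRDS("char-rnn-weights.rds")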
# gradient checking function
gradCheck <- function(inputs, targets, hprev, w) {
    # how many checks per parameter?
    num_checks <- 10
    delta <- 0.00001
    dw <- lossFun(inputs, targets, hprev, w)$dweights.list
    doit <- mapply(FUN = function(weights, dweights, name) {
        s0 <- dim(weights)
        s1 <- dim(dweights)
        if (any(s0 != s1))
            cat("Error: dims don't match:\t", s0, "\t", s1, "\n")
        w.temp <- w
        cat("\nWeight:", name, "\n")
        for (i in 1:num_checks) {
            # pick a random entry of this parameter matrix
            ri <- sample.int(length(weights), size = 1)
            weights_vec <- as.vector(weights)
            old_val <- weights_vec[ri]
            # evaluate the cost at [x + delta] and [x - delta]
            weights_vec[ri] <- old_val + delta
            w.temp[[name]] <- matrix(weights_vec, nrow = s0[1], ncol = s0[2])
            cg0 <- lossFun(inputs, targets, hprev, w.temp)$loss
            weights_vec[ri] <- old_val - delta
            w.temp[[name]] <- matrix(weights_vec, nrow = s0[1], ncol = s0[2])
            cg1 <- lossFun(inputs, targets, hprev, w.temp)$loss
            # reset the old value for this parameter
            w.temp[[name]] <- weights
            # fetch both the numerical and the analytic gradient
            grad_analytical <- as.vector(dweights)[ri]
            grad_numerical <- (cg0 - cg1) / (2 * delta)
            rel_error <- abs(grad_analytical - grad_numerical) / abs(grad_numerical + grad_analytical)
            # rel_error should be on the order of 1e-7 or less
            cat("Numerical: ", grad_numerical, "|| Analytical: ", grad_analytical, "||-->\t", rel_error, "\n")
        }
    }, weights = w, dweights = dw, name = names(w))
}
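# The call below reuses the inputs, targets and hprev left over from the last
# training iteration. Note (not in the original gist): lossFun returns clipped
# gradients, so any entry that hit the +/-5 bound will disagree with the
# numerical estimate.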
set.seed(123456)
gradCheck(inputs, targets, hprev, weight.list)