Build your own neural network classifier in R (source: http://junma5.weebly.com/data-blog)
# How to build your own NN classifier in R
# source: http://www.r-bloggers.com/build-your-own-neural-network-classifier-in-r/
# reference: http://junma5.weebly.com/data-blog/build-your-own-neural-network-classifier-in-r
# project:
# 1) build a simple NN with 2 fully-connected layers
# 2) use the NN to classify a dataset of 4-class 2D points & visualize the decision boundary
# 3) train the NN on the MNIST dataset
# ref: Stanford CS231n: http://cs231n.github.io/
# Build spiral dataset: 4 classes * 200 examples
library(ggplot2)
library(caret)
# caret wasn't installed by default; fixed inside the R environment with
# install.packages('caret')
# caret = predictive model creation library
# -- data splits
# -- pre-processing
# -- feature selection
# -- model tuning
# -- variable importance estimation
# (a tiny illustrative sketch of the two caret helpers used below follows)
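# Small throwaway sketch of the two caret helpers used later in this script
# (the 'toy' data frame is invented here purely for the demo, not part of the classifier):
toy <- data.frame(flat = rep(1, 10), noisy = rnorm(10), label = rep(0:1, 5))
nearZeroVar(toy)                                           # flags the near-constant 'flat' column
createDataPartition(y = toy$label, p = 0.7, list = FALSE)  # stratified 70/30 split indices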
N <- 200 # number of points per class
D <- 2   # dimensionality
K <- 4   # number of classes
X <- data.frame() # data matrix (rows = records)
y <- data.frame() # class labels
set.seed(308) # reset random number generator to a known state
for (j in (1:K)){ # for each class,
  r <- seq(0.05, 1, length.out = N) # radius = new sequence of length N
  t <- seq((j-1)*4.7, j*4.7, length.out = N) + rnorm(N, sd = 0.3) # theta
  Xtemp <- data.frame(x = r*sin(t), y = r*cos(t))
  ytemp <- data.frame(matrix(j, N, 1))
  X <- rbind(X, Xtemp)
  y <- rbind(y, ytemp)
}
data <- cbind(X, y) # data matrix
colnames(data) <- c(colnames(X), 'label')
# X & y are 800x2 & 800x1 data frames
x_min <- min(X[,1]) - 0.2
x_max <- max(X[,1]) + 0.2
y_min <- min(X[,2]) - 0.2
y_max <- max(X[,2]) + 0.2
# let's visualize the data:
ggplot(data) +
  geom_point(aes(x = x,
                 y = y,
                 color = as.character(label)),
             size = 2) +
  theme_bw(base_size = 15) +
  xlim(x_min, x_max) + ylim(y_min, y_max) +
  ggtitle('Spiral Data Visualization') +
  coord_fixed(ratio = 0.8) +
  theme(axis.ticks = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.text = element_blank(), axis.title = element_blank(), legend.position = 'none')
# construct the NN
# need a new one-hot matrix Y (800x4): in each row, the entry whose column index equals the label is 1 (0 otherwise)
X <- as.matrix(X)
Y <- matrix(0, N*K, K)
for (i in 1:(N*K)){
  Y[i, y[i,]] <- 1
}
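# Equivalent vectorized construction of the one-hot matrix (a sketch of an
# alternative to the loop above; Y_alt is a throwaway name used only here):
Y_alt <- matrix(0, N*K, K)
Y_alt[cbind(1:(N*K), y[,1])] <- 1
all.equal(Y, Y_alt) # should be TRUE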
################################################################################
# NN construction
#
# uses the X & Y matrices. Returns a list of 4 (W, b, W2, b2 = weight & bias for each layer)
# specify step_size, alias learning rate
# specify reg (regularization strength), alias lambda
#
# activation function: ReLU
# loss/cost function: softmax cross-entropy
#
# implementation uses vectorized ops
#
################################################################################
# %*% = matrix multiplication, * = element-wise product (quick demo below)
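# Throwaway illustration of that distinction (demo matrices only, unused below):
A_demo <- matrix(1:4, nrow = 2)
B_demo <- matrix(5:8, nrow = 2)
A_demo %*% B_demo # 2x2 matrix product
A_demo * B_demo   # 2x2 element-wise product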
nnet <- function(X, Y, step_size = 0.5, reg = 0.001, h = 10, niteration){
  # get dims of input
  N <- nrow(X) # number of examples
  K <- ncol(Y) # number of classes
  D <- ncol(X) # dimensionality
  # initialize parameters randomly.
  # rnorm = random numbers, normal distribution
  W  <- 0.01 * matrix(rnorm(D*h), nrow = D)
  b  <- matrix(0, nrow = 1, ncol = h)
  W2 <- 0.01 * matrix(rnorm(h*K), nrow = h)
  b2 <- matrix(0, nrow = 1, ncol = K)
  # gradient descent loop to update weights and biases
  for (i in 0:niteration){
    # hidden layer, ReLU activation
    hidden_layer <- pmax(0, X %*% W + matrix(rep(b, N), nrow = N, byrow = T))
    hidden_layer <- matrix(hidden_layer, nrow = N)
    # class scores
    scores <- hidden_layer %*% W2 + matrix(rep(b2, N), nrow = N, byrow = T)
    # compute and normalize class probabilities
    exp_scores <- exp(scores)
    probs <- exp_scores / rowSums(exp_scores)
    # compute the loss: softmax cross-entropy plus L2 regularization
    correct_logprobs <- -log(probs)
    data_loss <- sum(correct_logprobs*Y)/N
    reg_loss <- 0.5*reg*sum(W*W) + 0.5*reg*sum(W2*W2)
    loss <- data_loss + reg_loss
    # check progress
    if (i %% 1000 == 0 | i == niteration){
      print(paste("iteration", i, ': loss', loss))}
    # compute the gradient on the scores
    dscores <- probs - Y
    dscores <- dscores/N
    # backpropagate the gradient to the parameters
    dW2 <- t(hidden_layer) %*% dscores
    db2 <- colSums(dscores)
    # next, backprop into the hidden layer
    dhidden <- dscores %*% t(W2)
    # backprop through the ReLU non-linearity
    dhidden[hidden_layer <= 0] <- 0
    # finally into W, b
    dW <- t(X) %*% dhidden
    db <- colSums(dhidden)
    # add the regularization gradient contribution
    dW2 <- dW2 + reg*W2
    dW  <- dW  + reg*W
    # update parameters
    W  <- W  - step_size*dW
    b  <- b  - step_size*db
    W2 <- W2 - step_size*dW2
    b2 <- b2 - step_size*db2
  }
  return(list(W, b, W2, b2))
}
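# Illustrative quick run on the spiral data to inspect the returned parameter shapes
# (a hypothetical short training pass; the real call with h = 50 and 6000 iterations follows below):
demo.model <- nnet(X, Y, niteration = 100)
sapply(demo.model, dim) # columns: W (D x h), b (1 x h), W2 (h x K), b2 (1 x K)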
################################################################################
# Prediction function
#
################################################################################
nnetPred <- function(X, para = list()){
  W  <- para[[1]]
  b  <- para[[2]]
  W2 <- para[[3]]
  b2 <- para[[4]]
  # pmax = parallel (element-wise) max
  # A %*% B = matrix multiplication
  N <- nrow(X) # number of rows in X
  hidden_layer <- pmax(0, X %*% W + matrix(rep(b, N), nrow = N, byrow = T))
  hidden_layer <- matrix(hidden_layer, nrow = N)
  scores <- hidden_layer %*% W2 + matrix(rep(b2, N), nrow = N, byrow = T)
  predicted_class <- apply(scores, 1, which.max)
  return(predicted_class)
}
nnet.model <- nnet(X, Y, step_size = 0.4, reg = 0.0002, h = 50, niteration = 6000)
predicted_class <- nnetPred(X, nnet.model)
print(paste('training accuracy:', mean(predicted_class == (y))))
################################################################################
# plot decision boundary
#
################################################################################
hs <- 0.01 # grid step size
grid <- as.matrix(expand.grid(seq(x_min, x_max, by = hs), seq(y_min, y_max, by = hs)))
Z <- nnetPred(grid, nnet.model)
ggplot() +
  geom_tile(aes(x = grid[,1], y = grid[,2], fill = as.character(Z)), alpha = 0.3, show.legend = F) +
  geom_point(data = data, aes(x = x, y = y, color = as.character(label)), size = 2) + theme_bw(base_size = 15) +
  ggtitle('Neural Network Decision Boundary') +
  coord_fixed(ratio = 0.8) +
  theme(axis.ticks = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.text = element_blank(), axis.title = element_blank(), legend.position = 'none')
print('done')
################################################################################
# MNIST data & pre-processing
#
################################################################################
displayDigit <- function(X){
  # reshape a 784-element pixel row into a 28x28 matrix and display it
  m <- matrix(unlist(X), nrow = 28, byrow = T)
  m <- t(apply(m, 2, rev)) # flip so the digit appears upright
  image(m, col = grey.colors(255))
}
train <- read.csv("data/train.csv", header = TRUE, stringsAsFactors = F)
displayDigit(train[18, -1]) # show one example image (column 1 is the label)
################################################################################
# pre-process by removing near-zero-variance columns & scale by max(X)
#
# data is split for cross-validation
################################################################################
nzv <- nearZeroVar(train)
nzv.nolabel <- nzv - 1 # shift indices by 1 because column 1 (the label) is dropped below
inTrain <- createDataPartition(y = train$label, p = 0.7, list = F)
training <- train[inTrain, ]
CV <- train[-inTrain, ]
X <- as.matrix(training[, -1]) # data matrix (each row = single example)
N <- nrow(X) # number of examples
y <- training[, 1] # class labels
K <- length(unique(y)) # number of classes
X.proc <- X[, -nzv.nolabel]/max(X) # scale
D <- ncol(X.proc) # dimensionality
Xcv <- as.matrix(CV[, -1]) # data matrix (each row = single example)
ycv <- CV[, 1] # class labels
Xcv.proc <- Xcv[, -nzv.nolabel]/max(X) # scale CV data
# one-hot encode the digit labels 0-9 into columns 1-10
Y <- matrix(0, N, K)
for (i in 1:N){
  Y[i, y[i]+1] <- 1
}
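# Quick sanity check of the processed shapes (illustrative only; exact counts depend on the split):
dim(X.proc) # ~70% of the csv rows, D columns after dropping near-zero-variance pixels
dim(Y)      # N x K one-hot label matrix (K = 10 digits)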
################################################################################
# model training & CV accuracy
#
################################################################################
nnet.mnist <- nnet(X.proc, Y, step_size = 0.3,
                   reg = 0.0001, niteration = 3500) # hidden size h keeps its default of 10
predicted_class <- nnetPred(X.proc, nnet.mnist)
print(paste('training set accuracy:',
            mean(predicted_class == (y+1)))) # +1: predictions are 1-10, labels are 0-9
predicted_class <- nnetPred(Xcv.proc, nnet.mnist)
print(paste('CV accuracy:',
            mean(predicted_class == (ycv+1))))
################################################################################
# randomly select an image & predict the label
#
################################################################################
Xtest <- Xcv[sample(1:nrow(Xcv), 1), ]
Xtest.proc <- as.matrix(Xtest[-nzv.nolabel], nrow = 1)/max(X) # apply the same scaling used in training
predicted_test <- nnetPred(t(Xtest.proc), nnet.mnist)
print(paste('The predicted digit is:', predicted_test-1))
displayDigit(Xtest)