Build your own neural network classifier in R (source: http://junma5.weebly.com/data-blog)
# How to build your own NN classifier in R
# source: http://www.r-bloggers.com/build-your-own-neural-network-classifier-in-r/
# reference: http://junma5.weebly.com/data-blog/build-your-own-neural-network-classifier-in-r
# project:
# 1) build a simple NN with 2 fully-connected layers
# 2) use the NN to classify a dataset of 4-class 2D points & visualize the decision boundary
# 3) train the NN on the MNIST dataset
# ref: Stanford CS231n: http://cs231n.github.io/
# Build spiral dataset: 4 classes * 200 examples
library(ggplot2)
library(caret)
# caret wasn't installed by default; fixed inside the R environment with
# install.packages('caret')
# caret = predictive model creation library
# -- data splits
# -- pre-processing
# -- feature selection
# -- model tuning
# -- variable importance estimation
# (a tiny illustrative sketch of the two caret helpers used below follows)
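# Small throwaway sketch of the two caret helpers used later in this script
# (the 'toy' data frame is invented here purely for the demo, not part of the classifier):
toy <- data.frame(flat = rep(1, 10), noisy = rnorm(10), label = rep(0:1, 5))
nearZeroVar(toy)                                           # flags the near-constant 'flat' column
createDataPartition(y = toy$label, p = 0.7, list = FALSE)  # stratified 70/30 split indices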
N <- 200 # number of points per class
D <- 2   # dimensionality
K <- 4   # number of classes
X <- data.frame() # data matrix (rows = records)
y <- data.frame() # class labels
set.seed(308) # reset random number generator to a known state
for (j in (1:K)){ # for each class,
  r <- seq(0.05, 1, length.out = N) # radius = new sequence of length N
  t <- seq((j-1)*4.7, j*4.7, length.out = N) + rnorm(N, sd = 0.3) # theta
  Xtemp <- data.frame(x = r*sin(t), y = r*cos(t))
  ytemp <- data.frame(matrix(j, N, 1))
  X <- rbind(X, Xtemp)
  y <- rbind(y, ytemp)
}
data <- cbind(X, y) # data matrix
colnames(data) <- c(colnames(X), 'label')
# X & y are 800x2 & 800x1 data frames
x_min <- min(X[,1]) - 0.2
x_max <- max(X[,1]) + 0.2
y_min <- min(X[,2]) - 0.2
y_max <- max(X[,2]) + 0.2
# let's visualize the data:
ggplot(data) +
  geom_point(aes(x = x,
                 y = y,
                 color = as.character(label)),
             size = 2) +
  theme_bw(base_size = 15) +
  xlim(x_min, x_max) + ylim(y_min, y_max) +
  ggtitle('Spiral Data Visualization') +
  coord_fixed(ratio = 0.8) +
  theme(axis.ticks = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.text = element_blank(), axis.title = element_blank(), legend.position = 'none')
# construct the NN
# need a new one-hot matrix Y (800x4): in each row, the entry whose column index equals the label is 1 (0 otherwise)
X <- as.matrix(X)
Y <- matrix(0, N*K, K)
for (i in 1:(N*K)){
  Y[i, y[i,]] <- 1
}
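# Equivalent vectorized construction of the one-hot matrix (a sketch of an
# alternative to the loop above; Y_alt is a throwaway name used only here):
Y_alt <- matrix(0, N*K, K)
Y_alt[cbind(1:(N*K), y[,1])] <- 1
all.equal(Y, Y_alt) # should be TRUE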
################################################################################
# NN construction
#
# uses the X & Y matrices. Returns a list of 4 (W, b, W2, b2 = weight & bias for each layer)
# specify step_size, alias learning rate
# specify reg (regularization strength), alias lambda
#
# activation function: ReLU
# loss/cost function: softmax cross-entropy
#
# implementation uses vectorized ops
#
################################################################################
# %*% = matrix multiplication, * = element-wise product (quick demo below)
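# Throwaway illustration of that distinction (demo matrices only, unused below):
A_demo <- matrix(1:4, nrow = 2)
B_demo <- matrix(5:8, nrow = 2)
A_demo %*% B_demo # 2x2 matrix product
A_demo * B_demo   # 2x2 element-wise product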
nnet <- function(X, Y, step_size = 0.5, reg = 0.001, h = 10, niteration){
  # get dims of input
  N <- nrow(X) # number of examples
  K <- ncol(Y) # number of classes
  D <- ncol(X) # dimensionality
  # initialize parameters randomly.
  # rnorm = random numbers, normal distribution
  W  <- 0.01 * matrix(rnorm(D*h), nrow = D)
  b  <- matrix(0, nrow = 1, ncol = h)
  W2 <- 0.01 * matrix(rnorm(h*K), nrow = h)
  b2 <- matrix(0, nrow = 1, ncol = K)
  # gradient descent loop to update weights and biases
  for (i in 0:niteration){
    # hidden layer, ReLU activation
    hidden_layer <- pmax(0, X %*% W + matrix(rep(b, N), nrow = N, byrow = T))
    hidden_layer <- matrix(hidden_layer, nrow = N)
    # class scores
    scores <- hidden_layer %*% W2 + matrix(rep(b2, N), nrow = N, byrow = T)
    # compute and normalize class probabilities
    exp_scores <- exp(scores)
    probs <- exp_scores / rowSums(exp_scores)
    # compute the loss: softmax cross-entropy plus L2 regularization
    correct_logprobs <- -log(probs)
    data_loss <- sum(correct_logprobs*Y)/N
    reg_loss <- 0.5*reg*sum(W*W) + 0.5*reg*sum(W2*W2)
    loss <- data_loss + reg_loss
    # check progress
    if (i %% 1000 == 0 | i == niteration){
      print(paste("iteration", i, ': loss', loss))}
    # compute the gradient on the scores
    dscores <- probs - Y
    dscores <- dscores/N
    # backpropagate the gradient to the parameters
    dW2 <- t(hidden_layer) %*% dscores
    db2 <- colSums(dscores)
    # next, backprop into the hidden layer
    dhidden <- dscores %*% t(W2)
    # backprop through the ReLU non-linearity
    dhidden[hidden_layer <= 0] <- 0
    # finally into W, b
    dW <- t(X) %*% dhidden
    db <- colSums(dhidden)
    # add the regularization gradient contribution
    dW2 <- dW2 + reg*W2
    dW  <- dW  + reg*W
    # update parameters
    W  <- W  - step_size*dW
    b  <- b  - step_size*db
    W2 <- W2 - step_size*dW2
    b2 <- b2 - step_size*db2
  }
  return(list(W, b, W2, b2))
}
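# Illustrative quick run on the spiral data to inspect the returned parameter shapes
# (a hypothetical short training pass; the real call with h = 50 and 6000 iterations follows below):
demo.model <- nnet(X, Y, niteration = 100)
sapply(demo.model, dim) # columns: W (D x h), b (1 x h), W2 (h x K), b2 (1 x K)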
################################################################################
# Prediction function
#
################################################################################
nnetPred <- function(X, para = list()){
  W  <- para[[1]]
  b  <- para[[2]]
  W2 <- para[[3]]
  b2 <- para[[4]]
  # pmax = parallel (element-wise) max
  # A %*% B = matrix multiplication
  N <- nrow(X) # number of rows in X
  hidden_layer <- pmax(0, X %*% W + matrix(rep(b, N), nrow = N, byrow = T))
  hidden_layer <- matrix(hidden_layer, nrow = N)
  scores <- hidden_layer %*% W2 + matrix(rep(b2, N), nrow = N, byrow = T)
  predicted_class <- apply(scores, 1, which.max)
  return(predicted_class)
}
nnet.model <- nnet(X, Y, step_size = 0.4, reg = 0.0002, h = 50, niteration = 6000)
predicted_class <- nnetPred(X, nnet.model)
print(paste('training accuracy:', mean(predicted_class == (y))))
################################################################################
# plot decision boundary
#
################################################################################
hs <- 0.01 # grid step size
grid <- as.matrix(expand.grid(seq(x_min, x_max, by = hs), seq(y_min, y_max, by = hs)))
Z <- nnetPred(grid, nnet.model)
ggplot() +
  geom_tile(aes(x = grid[,1], y = grid[,2], fill = as.character(Z)), alpha = 0.3, show.legend = F) +
  geom_point(data = data, aes(x = x, y = y, color = as.character(label)), size = 2) + theme_bw(base_size = 15) +
  ggtitle('Neural Network Decision Boundary') +
  coord_fixed(ratio = 0.8) +
  theme(axis.ticks = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.text = element_blank(), axis.title = element_blank(), legend.position = 'none')
print('done')
################################################################################
# MNIST data & pre-processing
#
################################################################################
displayDigit <- function(X){
  # reshape a 784-element pixel row into a 28x28 matrix and display it
  m <- matrix(unlist(X), nrow = 28, byrow = T)
  m <- t(apply(m, 2, rev)) # flip so the digit appears upright
  image(m, col = grey.colors(255))
}
train <- read.csv("data/train.csv", header = TRUE, stringsAsFactors = F)
displayDigit(train[18, -1]) # show one example image (column 1 is the label)
################################################################################
# pre-process by removing near-zero-variance columns & scale by max(X)
#
# data is split for cross-validation
################################################################################
nzv <- nearZeroVar(train)
nzv.nolabel <- nzv - 1 # shift indices by 1 because column 1 (the label) is dropped below
inTrain <- createDataPartition(y = train$label, p = 0.7, list = F)
training <- train[inTrain, ]
CV <- train[-inTrain, ]
X <- as.matrix(training[, -1]) # data matrix (each row = single example)
N <- nrow(X) # number of examples
y <- training[, 1] # class labels
K <- length(unique(y)) # number of classes
X.proc <- X[, -nzv.nolabel]/max(X) # scale
D <- ncol(X.proc) # dimensionality
Xcv <- as.matrix(CV[, -1]) # data matrix (each row = single example)
ycv <- CV[, 1] # class labels
Xcv.proc <- Xcv[, -nzv.nolabel]/max(X) # scale CV data
# one-hot encode the digit labels 0-9 into columns 1-10
Y <- matrix(0, N, K)
for (i in 1:N){
  Y[i, y[i]+1] <- 1
}
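# Quick sanity check of the processed shapes (illustrative only; exact counts depend on the split):
dim(X.proc) # ~70% of the csv rows, D columns after dropping near-zero-variance pixels
dim(Y)      # N x K one-hot label matrix (K = 10 digits)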
################################################################################
# model training & CV accuracy
#
################################################################################
nnet.mnist <- nnet(X.proc, Y, step_size = 0.3,
                   reg = 0.0001, niteration = 3500) # hidden size h keeps its default of 10
predicted_class <- nnetPred(X.proc, nnet.mnist)
print(paste('training set accuracy:',
            mean(predicted_class == (y+1)))) # +1: predictions are 1-10, labels are 0-9
predicted_class <- nnetPred(Xcv.proc, nnet.mnist)
print(paste('CV accuracy:',
            mean(predicted_class == (ycv+1))))
################################################################################
# randomly select an image & predict the label
#
################################################################################
Xtest <- Xcv[sample(1:nrow(Xcv), 1), ]
Xtest.proc <- as.matrix(Xtest[-nzv.nolabel], nrow = 1)/max(X) # apply the same scaling used in training
predicted_test <- nnetPred(t(Xtest.proc), nnet.mnist)
print(paste('The predicted digit is:', predicted_test-1))
displayDigit(Xtest)