Backpropagation Algorithm
## Demo
set.seed(12321)
W_all_test1 = list(matrix(rnorm(1, 0, 0.5), nc = 1), matrix(10, nc = 1))
b_all_test1 = list(matrix(rnorm(1, 0, 0.5), nc = 1), matrix(-5, nc = 1))
X = matrix(rnorm(200, 0, 2), ncol = 1)
Y = apply(X, 1, function(x){forward_propagation(x, W_all_test1, b_all_test1)})
X.train = X[1:150,]
Y.train = Y[1:150]
X.test = X[151:200,]
Y.test = Y[151:200]
#plot(X, Y)
model = back_propagation(matrix(X.train, nc = 1), Y.train, c(1,1), 10, 0)
Y.pred = apply(matrix(X.test, nc = 1), 1, function(x){forward_propagation(x, model$W, model$b)})
plot(Y.test, Y.pred)
## The following functions should be loaded first.
## http://ufldl.stanford.edu/wiki/index.php/Backpropagation_Algorithm
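## In back_propagation below, each iteration of the while loop performs one
## batch gradient-descent step per layer l:
##   W^(l) <- W^(l) - alpha * [ (1/m) * sum_i delta_i^(l+1) %*% t(a_i^(l)) + lambda * W^(l) ]
##   b^(l) <- b^(l) - alpha * (1/m) * sum_i delta_i^(l+1)
## where delta is the error term propagated backwards, a the layer activations,
## m the number of training samples, and lambda the weight-decay parameter.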
act_f <- function(x){
  x = as.numeric(x)
  return(1/(1+exp(-x)))
}
act_f_der <- function(x){
  x = as.numeric(x)
  x1 = act_f(x)
  return(x1*(1-x1))
}
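## Quick sanity check (hypothetical value x0): act_f_der should agree with a
## central finite-difference approximation of act_f, since the sigmoid satisfies
## d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)).
# x0 = 0.3; h = 1e-6
# act_f_der(x0)                            # analytic derivative
# (act_f(x0 + h) - act_f(x0 - h)) / (2*h)  # numeric approximation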
forward_z <- function(a, W, b){
  a = as.numeric(a)
  return(W %*% a + b)
}
forward_a <- function(z){
  return(act_f(z))
}
forward_propagation <- function(x, W_all, b_all){
  n = length(W_all)
  if(!is.array(x))
    x = matrix(x, nr = 1)
  ## one output row per input row, one column per unit in the last layer
  y = matrix(0, nrow(x), nrow(W_all[[length(W_all)]]))
  for(j in 1:nrow(x)){
    a = as.numeric(x[j,])
    for(i in 1:n){
      z = forward_z(a, W_all[[i]], b_all[[i]])
      a = forward_a(z)
    }
    y[j, ] = a
  }
  return(y)
}
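## Example (hypothetical weights): one forward pass through a 2-unit hidden
## layer and a 1-unit output layer. Each element of W_all / b_all holds one
## layer's weight matrix and bias column vector.
# W_demo = list(matrix(c(0.1, -0.2, 0.4, 0.3), nrow = 2), matrix(c(0.5, -0.5), nrow = 1))
# b_demo = list(matrix(c(0, 0), ncol = 1), matrix(0, ncol = 1))
# forward_propagation(c(1, 2), W_demo, b_demo)  # returns a 1 x 1 matrix of output activations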
back_propagation <- function(X, Y, net_str, alpha, lambda, ...){
  if(!is.array(Y)){
    Y = matrix(Y, nc = 1)
  }
  m = dim(X)[1]
  p = dim(X)[2]
  n = length(as.numeric(net_str))
  W = list()
  b = list()
  ## optional arguments passed through ...: beta (weight of the sparsity
  ## penalty) and rho (target average activation of the hidden units)
  args = list(...)
  if(length(args) == 2){
    beta = args[[1]]
    rho = args[[2]]
    sparse = TRUE
  }else{
    sparse = FALSE
  }
  ## Init
  set.seed(216)
  for(i in 1:n){
    W = c(W, list(matrix(rnorm(net_str[i] * p, 0, 0.5), ncol = p)))
    b = c(b, list(matrix(rnorm(net_str[i], 0, 0.5), ncol = 1)))
    p = net_str[i]
  }
  W_new = W
  b_new = b
  err = 1
  k = 1
  while(err[k] > 0.00001){
    ## start the weight gradients at the weight-decay term lambda * W
    W_gradient = lapply(W, "*", lambda)
    b_gradient = lapply(b, "*", 0)
    ## if there exists a sparsity constraint on the hidden units,
    ## then scan through the samples, computing a forward pass on each to
    ## accumulate (sum up) the activations and compute rho_real
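    ## (The penalty being enforced corresponds to a term beta * KL(rho || rho_real)
    ## in the cost; its derivative, beta*(-rho/rho_real + (1-rho)/(1-rho_real)),
    ## is what gets added to the hidden-layer deltas further below.)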
    if(sparse == TRUE){
      rho_real = lapply(W, function(x){matrix(numeric(nrow(x)), nc = 1)})
      for(i in 1:m){
        activation_step = X[i, ]
        for(j in 1:n){
          activation_step = forward_propagation(activation_step, W[j], b[j])
          rho_real[[j]] = rho_real[[j]] + matrix(activation_step, nc = 1)
        }
      }
      rho_real = lapply(rho_real, "/", m)
    }
    for(i in 1:m){
      ## forward pass, storing the activations of every layer for sample i
      activation_all = list(matrix(X[i, ], nc = 1))
      activation_step = X[i, ]
      delta = lapply(W, function(x){numeric(nrow(x))})
      for(j in 1:n){
        activation_step = forward_propagation(activation_step, W[j], b[j])
        activation_all = c(activation_all, list(matrix(activation_step, nc = 1)))
      }
      ## error term of the output layer
      delta[[n]] = -(Y[i, ] - activation_all[[n+1]]) * activation_all[[n+1]] * (1 - activation_all[[n+1]])
      W_gradient[[n]] = W_gradient[[n]] + as.matrix(delta[[n]], nc = 1) %*% t(activation_all[[n]]) / m
      b_gradient[[n]] = b_gradient[[n]] + as.matrix(delta[[n]], nc = 1) / m
      ## propagate the error back through the hidden layers, adding the
      ## sparsity-penalty derivative when a sparsity constraint is active
      for(j in n:2){
        delta[[j-1]] = as.numeric(t(W[[j]]) %*% delta[[j]] + ifelse(rep(!sparse, net_str[j-1]), 0, beta*(-rho/rho_real[[j-1]]+(1-rho)/(1-rho_real[[j-1]])))) * activation_all[[j]] * (1 - activation_all[[j]])
        W_gradient[[j-1]] = W_gradient[[j-1]] + as.matrix(delta[[j-1]], nc = 1) %*% t(activation_all[[j-1]]) / m
        b_gradient[[j-1]] = b_gradient[[j-1]] + as.matrix(delta[[j-1]], nc = 1) / m
      }
    }
    ## batch gradient-descent step on every layer
    for(i in 1:n){
      W_new[[i]] = W[[i]] - alpha * W_gradient[[i]]
      b_new[[i]] = b[[i]] - alpha * b_gradient[[i]]
    }
    W = W_new
    b = b_new
    k = k + 1
    ## stop once the squared size of the update step is small enough
    err = c(err, sum(c(unlist(W_gradient)^2, unlist(b_gradient)^2))*alpha^2)
    print(c(k, err[k]))
  }
  return(list(W=W, b=b, k=k, err = err))
}
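## Example of the sparse variant (hypothetical hyper-parameters): two extra
## arguments passed through ... set beta (weight of the sparsity penalty) and
## rho (target average activation) for a 3-unit hidden layer.
# model_sparse = back_propagation(matrix(X.train, nc = 1), Y.train, c(3, 1), 10, 0.001, 1, 0.05)
# Y.pred.sparse = apply(matrix(X.test, nc = 1), 1, function(x){forward_propagation(x, model_sparse$W, model_sparse$b)})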