Simple brute-force gradient descent for linear regression in R
# Set up a random data frame of three features
set.seed(11111)
x <- data.frame(a = rnorm(n = 15) * 5,
                b = rnorm(n = 15) * 3 + 1,
                c = rnorm(n = 15) * 2 + 2)
# Set up the (noiseless) target: linear in the polynomial features built below
y <- 2 + (x[, 1] * 2) + (x[, 2] * 3) + (x[, 3] * 4) + (x[, 3] ^ 2) + (x[, 1] * x[, 2])
# Set up polynomial features (squares and pairwise products)
columns <- ncol(x)
for (i in 1:columns) {
  # j starts at i, so the squared term x_i * x_i is created here as well
  for (j in i:columns) {
    x[, paste0(colnames(x)[i], "X", colnames(x)[j])] <- x[, i] * x[, j]
  }
}
# Rename columns ("*k" suffixes mark the true coefficients used to build y)
# and add an intercept column
colnames(x) <- c("a*2", "b*3", "c*4", "aXa", "aXb*1", "aXc", "bXb", "bXc", "cXc*1")
x <- as.matrix(cbind(Intercept = 1, x))
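# For reference (an addition, not part of the original gist): the parameter
# vector implied by the construction of y above, in column order
# (Intercept, a, b, c, aXa, aXb, aXc, bXb, bXc, cXc), is:
true_param <- c(2, 2, 3, 4, 0, 1, 0, 0, 0, 1)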
# Brute-force gradient descent: explicit loops over iterations, features,
# and observations. param = initial coefficients, eta = learning rate,
# iters = number of iterations.
GradientDescent_brute <- function(x, y, param, eta, iters) {
  # Uncomment the commented lines (here and below) to monitor the cost
  # browser()
  # cost <- rep(0, iters)
  # Loop over gradient descent iterations
  for (k in 1:iters) {
    # Initialize the gradient vector to zero
    grad <- rep(0, ncol(x))
    # Loop through each feature (column)
    for (i in 1:ncol(x)) {
      # Loop through each observation (row)
      for (j in 1:nrow(x)) {
        # Accumulate this observation's gradient contribution for this feature
        # Squared Error = ((x %*% param) - y) ^ 2
        # Its gradient w.r.t. param[i] is 2 * ((x %*% param) - y) * x[, i]
        grad[i] <- grad[i] + 2 * (((x[j, ] %*% param) - y[j]) * x[j, i])
      }
    }
    # Scale the gradient by the learning rate and divide by the observation
    # count to take a mean (full-batch) gradient step
    param <- param - (eta * grad / nrow(x))
    # Cost-monitoring routines (err avoids reusing the name grad):
    # err <- ((x %*% param) - y) ^ 2
    # cost[k] <- sum(err) / nrow(x)
  }
  print(param)
  return(param)
}
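# For reference, the same full-batch update can be written without the inner
# loops using matrix algebra. A sketch (not part of the original gist), taking
# the same arguments as GradientDescent_brute:
GradientDescent_vectorized <- function(x, y, param, eta, iters) {
  for (k in 1:iters) {
    # Residuals: predictions minus targets, as a plain vector
    residual <- as.vector(x %*% param) - y
    # Full-batch squared-error gradient, summed over observations
    grad <- 2 * as.vector(t(x) %*% residual)
    # Same mean gradient step as in the brute-force version
    param <- param - (eta * grad / nrow(x))
  }
  return(param)
}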
# Run gradient descent: 10 zero-initialized parameters (intercept + 9 features),
# learning rate 0.0001, 100 iterations
GradientDescent_brute(x, y, rep(0, 10), 0.0001, 100)
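# Sanity check (an addition, not in the original gist): since the target is
# noiseless, the closed-form least-squares fit recovers the exact generating
# coefficients; "- 1" drops lm()'s own intercept because x already contains
# an Intercept column.
coef(lm(y ~ x - 1))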