### Arbitrarily define some simple data-generating process ("DGP"):
X <- 1:100
set.seed(9876) # this sets the random seed, ensuring replicability
irreducible_error <- rnorm(100)
# Here's the data-generating process ("DGP")
Y <- 10*X + 500*irreducible_error
data_set <- data.frame(X, Y)
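# Optional sanity check (mine, not part of the original exercise): since we
# know the true DGP, regressing Y on X over the full data should recover a
# slope near the true value of 10. The data are rebuilt here (same seed) so
# this snippet runs on its own; "full_fit" and the "_chk" names are mine.
X_chk <- 1:100
set.seed(9876)
Y_chk <- 10*X_chk + 500*rnorm(100)
full_fit <- lm(Y_chk ~ X_chk)
coef(full_fit)["X_chk"] # slope estimate; should land near 10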
# You could plot it if you wished...
plot(data_set$X, data_set$Y, col = "blue",
     pch = 16, # pch = 16 creates a filled circle
     cex = 2,  # cex = 2 doubles the size of the points
     xlab = "Values of X", ylab = "Values of Y", main = "The Data")
### Take the 100 observations in the data
### and split the data into a 50 observation training set
### and a 50 observation test set...
# 1 line of code below, joined with ";", so set.seed always runs together with sample
set.seed(5); training_set_rows <- sample(1:100, 50, replace = FALSE)
# define the training set by selecting only the 50 training set rows
training_set <- data_set[training_set_rows,]
# define the test set by deleting the 50 training set rows
test_set <- data_set[-training_set_rows,]
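# Optional check (mine, not in the original): the training and test row
# indices should be disjoint and together cover all 100 rows. The indices
# are regenerated here (same seed) so this snippet runs on its own; the
# "_chk" names are mine.
set.seed(5)
train_rows_chk <- sample(1:100, 50, replace = FALSE)
test_rows_chk <- setdiff(1:100, train_rows_chk) # the rows NOT sampled
stopifnot(length(intersect(train_rows_chk, test_rows_chk)) == 0) # disjoint
stopifnot(setequal(c(train_rows_chk, test_rows_chk), 1:100))     # full coverage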
# you can plot both sets to visually confirm sampling looks random
dev.off() # close the current graphics device, clearing the previous figure
plot(training_set[,1], training_set[,2], pch = 16, col = "blue",
     xlab = "Values of X", ylab = "Values of Y",
     main = "Training Set in Blue, Test Set in Orange")
# now add the test set points in orange
points(test_set[,1], test_set[,2], pch = 16, col = "orange")
### Now--get real. Ordinarily you would only have the Xs and Ys
### and you would not know the DGP (but you would WANT to know it.)
# You might decide to fit a linear regression.
# If you did fit a linear regression here, do you think you'd be overfitting?
# How could you assess if you were overfitting?
# Run a linear regression on the training set. call it "reg1"
reg1 <- lm(Y ~ X, data = training_set)
summary(reg1) # to observe the regression results--the below should appear
# Call:
# lm(formula = Y ~ X, data = training_set)
# Residuals:
#    Min     1Q Median     3Q    Max
# -968.8 -348.6   -2.0  287.8 1411.5
#
# Coefficients:
#              Estimate Std. Error t value Pr(>|t|)
# (Intercept) -172.630    146.342  -1.180    0.244
# X             14.222      2.584   5.505 1.42e-06 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual standard error: 485.6 on 48 degrees of freedom
# Multiple R-squared: 0.387, Adjusted R-squared: 0.3742
# F-statistic: 30.3 on 1 and 48 DF, p-value: 1.418e-06
# (1) What does your regression say "f", the relationship between X and Y, is?
# (2) Is the median residual -2? (residual = 'observed Y' - 'predicted Y')
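# One way to answer (2): compute the median residual directly. The data and
# regression are rebuilt here (same seeds) so this snippet runs on its own;
# the "_c" names are mine. The result should be close to the -2.0 reported by
# summary() above (the exact value can depend on your R version's RNG).
X_c <- 1:100
set.seed(9876)
Y_c <- 10*X_c + 500*rnorm(100)
d_c <- data.frame(X = X_c, Y = Y_c)
set.seed(5)
rows_c <- sample(1:100, 50, replace = FALSE)
train_c <- d_c[rows_c, ]
fit_c <- lm(Y ~ X, data = train_c)
median(residuals(fit_c)) # residual = observed Y - predicted Y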
# Obtain the training set Y values predicted by the regression
predict(reg1, newdata = training_set)
# Calculate the training set error (mean squared error)
# How?
## Calculate the test set error (mean squared error)
# First use the regression to make "Y" predictions for the test set
predict(reg1, newdata = test_set)
## The pseudocode, now as runnable code. What do the results imply? Are we overfitting?
mean((training_set$Y - predict(reg1, newdata = training_set))^2) # training set MSE
mean((test_set$Y - predict(reg1, newdata = test_set))^2)         # test set MSE
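# For reference, the whole pipeline end to end as one self-contained sketch
# (the "_d" names and train_mse/test_mse are mine, not from the exercise).
# A test MSE far above the training MSE would be evidence of overfitting;
# values of similar size suggest the linear fit generalizes to unseen data.
X_d <- 1:100
set.seed(9876)
Y_d <- 10*X_d + 500*rnorm(100)
d_d <- data.frame(X = X_d, Y = Y_d)
set.seed(5)
rows_d <- sample(1:100, 50, replace = FALSE)
train_d <- d_d[rows_d, ]
test_d <- d_d[-rows_d, ]
fit_d <- lm(Y ~ X, data = train_d)
train_mse <- mean((train_d$Y - predict(fit_d, newdata = train_d))^2)
test_mse  <- mean((test_d$Y - predict(fit_d, newdata = test_d))^2)
c(train = train_mse, test = test_mse)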
#######