doron2402 · October 28, 2017 05:13
diff --git a/Tip_prediction_multiple_linear_regression.r b/Tip_prediction_multiple_linear_regression.r
 # Multiple Linear Regression

 # Load the raw tipping data from the CSV file.
 dataset <- read.csv(file = 'tip.csv')

 # Convert Country to a factor; the five named levels are relabelled 1..5
 # (USA = 1, England = 2, France = 3, Israel = 4, Italy = 5).
 dataset$Country <- factor(
   x = dataset$Country,
   levels = c('USA', 'England', 'France', 'Israel', 'Italy'),
   labels = 1:5
 )

 # caTools supplies sample.split(); install it once if it is missing:
 # install.packages('caTools')
 library(caTools)

 # Fix the RNG seed so the split below is reproducible.
 set.seed(123)

 # Build a logical flag over the rows (SplitRatio = 0.8), then divide the
 # data on it: flagged rows train the model, the rest are held out.
 split <- sample.split(dataset$Tip, SplitRatio = 0.8)
 training_set <- subset(dataset, split)
 test_set <- subset(dataset, !split)


 # Baseline model: regress Tip on every other column of the training data.
 regressor <- lm(
   formula = Tip ~ .,
   data = training_set
 )

 # Score the held-out rows with the baseline model.
 y_pred <- predict(object = regressor, newdata = test_set)

 # By this point we should run `summary(regressor)`
 # In order to check the p-value 
 #Coefficients:
 #              Estimate Std. Error t value Pr(>|t|)    
 #(Intercept)   28.32170   15.30406   1.851 0.077702 .  
 #WaiterAge      0.60355    0.27501   2.195 0.039042 *  
 #GroupSize     -2.90429    2.91919  -0.995 0.330603    
 #DinersAvgAge  -0.07064    0.24696  -0.286 0.777532    
 #CostOfMeal     0.09314    0.04918   1.894 0.071446 .  
 #Country2     -48.60033   13.15088  -3.696 0.001263 ** 
 #Country3     -48.62730   12.46118  -3.902 0.000765 ***
 #Country4     -21.03267   13.19317  -1.594 0.125156    
 #Country5     -38.42068   15.17851  -2.531 0.019020 *  
 #---
 # Let's get rid of `DinersAvgAge` (it has the highest p-value, 0.78)

 # Refit on the training set without DinersAvgAge, keeping the remaining
 # predictors (WaiterAge, GroupSize, CostOfMeal, Country).
 regressor2 <- lm(
   formula = Tip ~ WaiterAge + GroupSize + CostOfMeal + Country,
   data = training_set
 )
 # Predict the test set with the reduced model itself. The earlier code passed
 # `regressor` here, so y_pred2 was just a copy of the full-model predictions.
 y_pred2 <- predict(regressor2, newdata = test_set)

 # Again let's run `summary(regressor2)`
 # WaiterAge     0.60737    0.26915   2.257 0.033841 *  
 # GroupSize    -2.72674    2.79491  -0.976 0.339408    
 # CostOfMeal    0.09151    0.04786   1.912 0.068410 .  
 # Country2    -50.10668   11.80735  -4.244 0.000307 ***
 # Country3    -49.63997   11.70673  -4.240 0.000309 ***
 # Country4    -22.21611   12.27500  -1.810 0.083404 .  
 # Country5    -39.49177   14.41273  -2.740 0.011666 *  
 # Let's get rid of Group Size
 # Refit on the training set without GroupSize, keeping WaiterAge, CostOfMeal
 # and Country.
 regressor3 <- lm(
   formula = Tip ~ WaiterAge + CostOfMeal + Country,
   data = training_set
 )
 # Predict the test set with this model. The earlier code passed `regressor`
 # here, so y_pred3 was just a copy of the full-model predictions.
 y_pred3 <- predict(regressor3, newdata = test_set)
 # Running `summary(regressor3)`
 # WaiterAge     0.57197    0.26643   2.147 0.042124 *  
 # CostOfMeal    0.04684    0.01392   3.366 0.002564 ** 
 # Country2    -48.31559   11.65204  -4.147 0.000364 ***
 # Country3    -48.69739   11.65508  -4.178 0.000336 ***
 # Country4    -21.92541   12.25906  -1.789 0.086327 .  
 # Country5    -34.71975   13.54374  -2.564 0.017050 *  

 # We're looking for p-value <= 0.05, so let's take out `Country` (Country4 is above that threshold)
 # Refit on the training set without Country, leaving only WaiterAge and
 # CostOfMeal as predictors.
 regressor4 <- lm(
   formula = Tip ~ WaiterAge + CostOfMeal,
   data = training_set
 )
 # Predict the test set with this model. The earlier code passed `regressor`
 # here, so y_pred4 was just a copy of the full-model predictions.
 y_pred4 <- predict(regressor4, newdata = test_set)

 # And we got
 # WaiterAge    0.37441    0.33924   1.104   0.2791  
 # CostOfMeal   0.03703    0.01795   2.063   0.0485 *

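 # From this we learn that the biggest factor in the tip is the cost of the meal
 # (`CostOfMeal`); the second factor is the waiter's age (`WaiterAge`), though at
 # p = 0.28 it is not significant at the 0.05 level in this last model.
 # Now we can run our model on the test dataset and compare it with the training dataset.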
	# Multiple Linear Regression

	# Load the raw tipping data from the CSV file.
	dataset <- read.csv(file = 'tip.csv')

	# Convert Country to a factor; the five named levels are relabelled 1..5
	# (USA = 1, England = 2, France = 3, Israel = 4, Italy = 5).
	dataset$Country <- factor(
	  x = dataset$Country,
	  levels = c('USA', 'England', 'France', 'Israel', 'Italy'),
	  labels = 1:5
	)

	# caTools supplies sample.split(); install it once if it is missing:
	# install.packages('caTools')
	library(caTools)

	# Fix the RNG seed so the split below is reproducible.
	set.seed(123)

	# Build a logical flag over the rows (SplitRatio = 0.8), then divide the
	# data on it: flagged rows train the model, the rest are held out.
	split <- sample.split(dataset$Tip, SplitRatio = 0.8)
	training_set <- subset(dataset, split)
	test_set <- subset(dataset, !split)


	# Baseline model: regress Tip on every other column of the training data.
	regressor <- lm(
	  formula = Tip ~ .,
	  data = training_set
	)

	# Score the held-out rows with the baseline model.
	y_pred <- predict(object = regressor, newdata = test_set)

	# By this point we should run `summary(regressor)`
	# In order to check the p-value
	#Coefficients:
	# Estimate Std. Error t value Pr(>|t|)
	#(Intercept) 28.32170 15.30406 1.851 0.077702 .
	#WaiterAge 0.60355 0.27501 2.195 0.039042 *
	#GroupSize -2.90429 2.91919 -0.995 0.330603
	#DinersAvgAge -0.07064 0.24696 -0.286 0.777532
	#CostOfMeal 0.09314 0.04918 1.894 0.071446 .
	#Country2 -48.60033 13.15088 -3.696 0.001263 **
	#Country3 -48.62730 12.46118 -3.902 0.000765 ***
	#Country4 -21.03267 13.19317 -1.594 0.125156
	#Country5 -38.42068 15.17851 -2.531 0.019020 *
	#---
	# Let's get rid of `DinersAvgAge` (it has the highest p-value, 0.78)

	# Refit on the training set without DinersAvgAge, keeping the remaining
	# predictors (WaiterAge, GroupSize, CostOfMeal, Country).
	regressor2 <- lm(
	  formula = Tip ~ WaiterAge + GroupSize + CostOfMeal + Country,
	  data = training_set
	)
	# Predict the test set with the reduced model itself. The earlier code passed
	# `regressor` here, so y_pred2 was just a copy of the full-model predictions.
	y_pred2 <- predict(regressor2, newdata = test_set)

	# Again let's run `summary(regressor2)`
	# WaiterAge 0.60737 0.26915 2.257 0.033841 *
	# GroupSize -2.72674 2.79491 -0.976 0.339408
	# CostOfMeal 0.09151 0.04786 1.912 0.068410 .
	# Country2 -50.10668 11.80735 -4.244 0.000307 ***
	# Country3 -49.63997 11.70673 -4.240 0.000309 ***
	# Country4 -22.21611 12.27500 -1.810 0.083404 .
	# Country5 -39.49177 14.41273 -2.740 0.011666 *
	# Let's get rid of Group Size
	# Refit on the training set without GroupSize, keeping WaiterAge, CostOfMeal
	# and Country.
	regressor3 <- lm(
	  formula = Tip ~ WaiterAge + CostOfMeal + Country,
	  data = training_set
	)
	# Predict the test set with this model. The earlier code passed `regressor`
	# here, so y_pred3 was just a copy of the full-model predictions.
	y_pred3 <- predict(regressor3, newdata = test_set)
	# Running `summary(regressor3)`
	# WaiterAge 0.57197 0.26643 2.147 0.042124 *
	# CostOfMeal 0.04684 0.01392 3.366 0.002564 **
	# Country2 -48.31559 11.65204 -4.147 0.000364 ***
	# Country3 -48.69739 11.65508 -4.178 0.000336 ***
	# Country4 -21.92541 12.25906 -1.789 0.086327 .
	# Country5 -34.71975 13.54374 -2.564 0.017050 *

	# We're looking for p-value <= 0.05, so let's take out `Country` (Country4 is above that threshold)
	# Refit on the training set without Country, leaving only WaiterAge and
	# CostOfMeal as predictors.
	regressor4 <- lm(
	  formula = Tip ~ WaiterAge + CostOfMeal,
	  data = training_set
	)
	# Predict the test set with this model. The earlier code passed `regressor`
	# here, so y_pred4 was just a copy of the full-model predictions.
	y_pred4 <- predict(regressor4, newdata = test_set)

	# And we got
	# WaiterAge 0.37441 0.33924 1.104 0.2791
	# CostOfMeal 0.03703 0.01795 2.063 0.0485 *

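	# From this we learn that the biggest factor in the tip is the cost of the meal
	# (`CostOfMeal`); the second factor is the waiter's age (`WaiterAge`), though at
	# p = 0.28 it is not significant at the 0.05 level in this last model.
	# Now we can run our model on the test dataset and compare it with the training dataset.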