Last active
October 28, 2017 05:13
-
-
Save doron2402/0bfcd133dc02bdbf192d8c9a5d53857d to your computer and use it in GitHub Desktop.
Multiple Linear Regression Predict Tip
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Multiple Linear Regression

# Importing the dataset
dataset = read.csv('tip.csv')

# Encoding categorical data, in this case country.
# `Country` is turned into a factor with one level per listed country.
# Values that are not in `levels` become NA.
dataset$Country = factor(
  dataset$Country,
  levels = c('USA', 'England', 'France', 'Israel', 'Italy'),
  labels = c(1, 2, 3, 4, 5)
)

# install.packages('caTools')
# Load `caTools` (provides `sample.split`)
library(caTools)

# Set some random seed so the train/test split is reproducible
set.seed(123)

# `SplitRatio = 0.8` requests an 80/20 split on the outcome column `Tip`;
# rows flagged TRUE go to the training set, the rest to the test set.
split = sample.split(dataset$Tip, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
# Fitting Multiple Linear Regression to the Training set
# `Tip ~ .` regresses `Tip` on every other column of `training_set`.
regressor = lm(formula = Tip ~ .,
               data = training_set)

# Predicting the Test set results
y_pred = predict(regressor, newdata = test_set)

# By this point we should run `summary(regressor)`
# in order to check the p-value of each predictor.
# Coefficients:
#               Estimate Std. Error t value Pr(>|t|)
# (Intercept)   28.32170   15.30406   1.851 0.077702 .
# WaiterAge      0.60355    0.27501   2.195 0.039042 *
# GroupSize     -2.90429    2.91919  -0.995 0.330603
# DinersAvgAge  -0.07064    0.24696  -0.286 0.777532
# CostOfMeal     0.09314    0.04918   1.894 0.071446 .
# Country2     -48.60033   13.15088  -3.696 0.001263 **
# Country3     -48.62730   12.46118  -3.902 0.000765 ***
# Country4     -21.03267   13.19317  -1.594 0.125156
# Country5     -38.42068   15.17851  -2.531 0.019020 *
# ---
# Let's get rid of `DinersAvgAge`: it has the highest p-value (0.777532)
# in the `summary(regressor)` output.
# Taking out DinersAvgAge
regressor2 = lm(formula = Tip ~ WaiterAge + GroupSize + CostOfMeal + Country,
                data = training_set)

# Predict with the refitted model (`regressor2`), not the original `regressor`,
# so that `y_pred2` actually reflects the reduced formula.
y_pred2 = predict(regressor2, newdata = test_set)

# Again let's run `summary(regressor2)`
#             Estimate Std. Error t value Pr(>|t|)
# WaiterAge    0.60737    0.26915   2.257 0.033841 *
# GroupSize   -2.72674    2.79491  -0.976 0.339408
# CostOfMeal   0.09151    0.04786   1.912 0.068410 .
# Country2   -50.10668   11.80735  -4.244 0.000307 ***
# Country3   -49.63997   11.70673  -4.240 0.000309 ***
# Country4   -22.21611   12.27500  -1.810 0.083404 .
# Country5   -39.49177   14.41273  -2.740 0.011666 *
# Let's get rid of Group Size: `GroupSize` has the highest p-value (0.339408)
# in the `summary(regressor2)` output.
# Taking out GroupSize
regressor3 = lm(formula = Tip ~ WaiterAge + CostOfMeal + Country,
                data = training_set)

# Predict with the refitted model (`regressor3`), not the original `regressor`,
# so that `y_pred3` actually reflects the reduced formula.
y_pred3 = predict(regressor3, newdata = test_set)

# Running `summary(regressor3)`
#             Estimate Std. Error t value Pr(>|t|)
# WaiterAge    0.57197    0.26643   2.147 0.042124 *
# CostOfMeal   0.04684    0.01392   3.366 0.002564 **
# Country2   -48.31559   11.65204  -4.147 0.000364 ***
# Country3   -48.69739   11.65508  -4.178 0.000336 ***
# Country4   -21.92541   12.25906  -1.789 0.086327 .
# Country5   -34.71975   13.54374  -2.564 0.017050 *
# We're looking for p-value <= 0.05 so let's take out `Country`
# NOTE(review): in the `summary(regressor3)` output only `Country4` is above
# 0.05 (0.086327); `Country2`, `Country3` and `Country5` are all below it.
# Confirm that dropping the whole factor is intended.
# Taking out Country
regressor4 = lm(formula = Tip ~ WaiterAge + CostOfMeal,
                data = training_set)

# Predict with the refitted model (`regressor4`), not the original `regressor`,
# so that `y_pred4` actually reflects the reduced formula.
y_pred4 = predict(regressor4, newdata = test_set)

# And we got
#            Estimate Std. Error t value Pr(>|t|)
# WaiterAge   0.37441    0.33924   1.104   0.2791
# CostOfMeal  0.03703    0.01795   2.063   0.0485 *
# We can learn that the biggest factor of a tip is the cost of the meal, but
# the second factor is the waiter age.
# NOTE(review): `WaiterAge` has p = 0.2791 in this model, above the 0.05
# threshold used above, so this conclusion should be double-checked.
# Now we can run our model on our test dataset and compare it to our train
# dataset.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment