Skip to content

Instantly share code, notes, and snippets.

@doron2402
Last active October 28, 2017 05:13
Show Gist options
  • Save doron2402/0bfcd133dc02bdbf192d8c9a5d53857d to your computer and use it in GitHub Desktop.
Save doron2402/0bfcd133dc02bdbf192d8c9a5d53857d to your computer and use it in GitHub Desktop.
Multiple Linear Regression Predict Tip
# Multiple Linear Regression

# Read the raw observations from the CSV file
dataset <- read.csv('tip.csv')

# Encode the categorical `Country` column as a factor whose five levels
# (in the order listed) carry the labels 1 through 5
dataset$Country <- factor(
  dataset$Country,
  levels = c('USA', 'England', 'France', 'Israel', 'Italy'),
  labels = 1:5
)

# install.packages('caTools')
# `caTools` provides `sample.split`
library(caTools)

# Fix the random seed so the split below is reproducible
set.seed(123)

# Logical mask over the rows: TRUE -> training set, FALSE -> test set (80/20)
split <- sample.split(dataset$Tip, SplitRatio = 0.8)
training_set <- subset(dataset, split)
test_set <- subset(dataset, !split)

# Fit Tip against every other column of the training set
regressor <- lm(
  formula = Tip ~ .,
  data = training_set
)

# Predict tips for the held-out test rows
y_pred <- predict(regressor, newdata = test_set)
# By this point we should run `summary(regressor)`
# In order to check the p-value
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 28.32170 15.30406 1.851 0.077702 .
#WaiterAge 0.60355 0.27501 2.195 0.039042 *
#GroupSize -2.90429 2.91919 -0.995 0.330603
#DinersAvgAge -0.07064 0.24696 -0.286 0.777532
#CostOfMeal 0.09314 0.04918 1.894 0.071446 .
#Country2 -48.60033 13.15088 -3.696 0.001263 **
#Country3 -48.62730 12.46118 -3.902 0.000765 ***
#Country4 -21.03267 13.19317 -1.594 0.125156
#Country5 -38.42068 15.17851 -2.531 0.019020 *
#---
# Let's get rid of `DinersAvgAge`: it has the highest p-value in the
# summary above, so refit the model without it
regressor2 <- lm(formula = Tip ~ WaiterAge + GroupSize + CostOfMeal + Country,
                 data = training_set)
# Predict with the refitted model (`regressor2`), not the original full model
y_pred2 <- predict(regressor2, newdata = test_set)
# Again let's run `summary(regressor2)`
# WaiterAge 0.60737 0.26915 2.257 0.033841 *
# GroupSize -2.72674 2.79491 -0.976 0.339408
# CostOfMeal 0.09151 0.04786 1.912 0.068410 .
# Country2 -50.10668 11.80735 -4.244 0.000307 ***
# Country3 -49.63997 11.70673 -4.240 0.000309 ***
# Country4 -22.21611 12.27500 -1.810 0.083404 .
# Country5 -39.49177 14.41273 -2.740 0.011666 *
# Let's get rid of `GroupSize`: it has the highest p-value in the
# `summary(regressor2)` output above, so refit the model without it
regressor3 <- lm(formula = Tip ~ WaiterAge + CostOfMeal + Country,
                 data = training_set)
# Predict with the refitted model (`regressor3`), not the original full model
y_pred3 <- predict(regressor3, newdata = test_set)
# Running `summary(regressor3)`
# WaiterAge 0.57197 0.26643 2.147 0.042124 *
# CostOfMeal 0.04684 0.01392 3.366 0.002564 **
# Country2 -48.31559 11.65204 -4.147 0.000364 ***
# Country3 -48.69739 11.65508 -4.178 0.000336 ***
# Country4 -21.92541 12.25906 -1.789 0.086327 .
# Country5 -34.71975 13.54374 -2.564 0.017050 *
# We're looking for p-value <= 0.05 so let's take out `Country`
# Refit using only WaiterAge and CostOfMeal as predictors
regressor4 <- lm(formula = Tip ~ WaiterAge + CostOfMeal,
                 data = training_set)
# Predict with the refitted model (`regressor4`), not the original full model
y_pred4 <- predict(regressor4, newdata = test_set)
# And we got
# WaiterAge 0.37441 0.33924 1.104 0.2791
# CostOfMeal 0.03703 0.01795 2.063 0.0485 *
# We can learn that the biggest factor in the tip is the cost of the meal, and the second factor is
# the waiter's age
# Now we can run our model on the test dataset and compare it to the training dataset
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment