Created
August 28, 2015 01:21
-
-
Save jpotts18/e725cf67bc0e06c9c49e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################### | |
# Identify the question | |
########################### | |
# Identify the question you are trying to solve | |
# Which car is the most undervalued? | |
# Are we trying to maximize or minimize a variable? | |
########################### | |
# Understand the data | |
########################### | |
# Understand what the variables mean | |
# Variable = Description | |
# Id = Record_ID | |
# Model = Model Description | |
# Price = Offer Price in EUROs | |
# Age_08_04 = Age in months as of August 2004 | |
# Mfg_Month = Manufacturing month (1-12) | |
# Mfg_Year = Manufacturing Year | |
# KM = Accumulated Kilometers on odometer | |
# Fuel_Type = Fuel Type (Petrol, Diesel, CNG) | |
# HP = Horse Power | |
# Met_Color = Metallic Color? (Yes=1, No=0) | |
# Color = Color (Blue, Red, Grey, Silver, Black, etc.) | |
# Automatic = Automatic (Yes=1, No=0) | |
# CC = Cylinder Volume in cubic centimeters | |
# Doors = Number of doors | |
# Cylinders = Number of cylinders | |
# Gears = Number of gear positions | |
# Quarterly_Tax = Quarterly road tax in EUROs | |
# Weight = Weight in Kilograms | |
# Mfr_Guarantee = Within Manufacturer's Guarantee period (Yes=1, No=0) | |
# BOVAG_Guarantee = BOVAG (Dutch dealer network) Guarantee (Yes=1, No=0) | |
# Guarantee_Period = Guarantee period in months | |
# ABS = Anti-Lock Brake System (Yes=1, No=0) | |
# Airbag_1 = Driver_Airbag (Yes=1, No=0) | |
# Airbag_2 = Passenger Airbag (Yes=1, No=0) | |
# Airco = Airconditioning (Yes=1, No=0) | |
# Automatic_airco = Automatic Airconditioning (Yes=1, No=0) | |
# Boardcomputer = Boardcomputer (Yes=1, No=0) | |
# CD_Player = CD Player (Yes=1, No=0) | |
# Central_Lock = Central Lock (Yes=1, No=0) | |
# Powered_Windows = Powered Windows (Yes=1, No=0) | |
# Power_Steering = Power Steering (Yes=1, No=0) | |
# Radio = Radio (Yes=1, No=0) | |
# Mistlamps = Mistlamps (Yes=1, No=0) | |
# Sport_Model = Sport Model (Yes=1, No=0) | |
# Backseat_Divider = Backseat Divider (Yes=1, No=0) | |
# Metallic_Rim = Metallic Rim (Yes=1, No=0) | |
# Radio_cassette = Radio Cassette (Yes=1, No=0) | |
# Parking_Assistant = Parking assistance system (Yes=1, No=0) | |
# Tow_Bar = Tow Bar (Yes=1, No=0) | |
########################### | |
# Explore the data | |
########################### | |
# Load the raw listings. The path is relative, so the working directory
# must be the folder that contains "toyota-corolla.csv".
raw.data <- read.csv("toyota-corolla.csv")

# View() opens a spreadsheet-style viewer; only call it in an interactive
# session so the script can still be run end-to-end with Rscript or on a
# machine without a display.
if (interactive()) {
  View(raw.data)
}

# What types of variables are in this data set?
str(raw.data)
# We won't be able to use all of these variables. Why?
# Price against manufacturing year, straight from the raw data.
plot(Price ~ Mfg_Year, data = raw.data)

# First look: price, age, manufacturing month/year and odometer reading.
# Columns are selected by name rather than by position so the selection
# does not silently change if the CSV column order does.
# NOTE: `subset` shares its name with base::subset(); the name is kept
# because the rest of the script refers to this data frame as `subset`.
subset <- raw.data[, c("Price", "Age_08_04", "Mfg_Month", "Mfg_Year", "KM")]
plot(subset)

plot(KM ~ Age_08_04, data = subset)
# What can we learn from this correlation?
plot(Price ~ KM, data = subset)
# What can we learn from this?
plot(subset)

plot(Age_08_04 ~ Mfg_Year, data = subset)
plot(Age_08_04 ~ Mfg_Month, data = subset)
# can we learn anything from this data?
plot(subset)

# Narrow the data down to Price plus the numeric candidate predictors
# that the regression models below draw from.
subset <- raw.data[, c(
  "Price", "Age_08_04", "KM", "HP", "CC",
  "Doors", "Cylinders", "Gears", "Quarterly_Tax", "Weight"
)]
plot(subset)
# Run one linear regression: Price explained by KM alone.
plot(Price ~ KM, data = subset)
# Name the columns in the formula and pass the data frame through `data =`.
# This labels the coefficient "KM" (instead of "subset$KM") and lets
# predict() match the columns of a `newdata` data frame against the model.
price_km_linear_regression <- lm(Price ~ KM, data = subset)
# Plot this line on the graph
abline(price_km_linear_regression, col = "red")
summary(price_km_linear_regression)
# What do all of these numbers mean?
# Intercept?
# KM Estimate
# Multiple R-squared
# (~32% of the variation in Price is explained by KM)

# Diagnostic plots for the fitted model.
plot(price_km_linear_regression)
# Two predictors: KM and Weight. Column names go in the formula and the
# data frame goes in `data =`, so coefficients are labelled by column and
# the models can be used with predict(newdata = ...).
mlr.2factors <- lm(Price ~ KM + Weight, data = subset)
summary(mlr.2factors)
# Two predictors again, this time KM and Age_08_04. This replaces the
# KM + Weight fit that was stored in `mlr.2factors` just above.
mlr.2factors <- lm(Price ~ KM + Age_08_04, data = subset)
summary(mlr.2factors)
# Three predictors: KM, Age_08_04 and Weight.
mlr.3factors <- lm(Price ~ KM + Age_08_04 + Weight, data = subset)
summary(mlr.3factors)
# Why weight?
# Run an MLR on every candidate predictor.
# The terms are written out explicitly (rather than `Price ~ .`) so the
# model does not pick up any column that is added to `subset` later.
mlr.allfactors <- lm(
  Price ~ KM + Age_08_04 + HP + CC + Doors + Cylinders + Gears +
    Quarterly_Tax + Weight,
  data = subset
)
summary(mlr.allfactors)
# How many factors do we need to get the most accurate MLR?
# How do we identify which ones give the most predictive power?
# Fit with bare column names and `data =` so that the model terms are
# "KM", "Age_08_04", "HP" and "Weight" -- the same names used by `new_data`
# below. With `subset$KM`-style terms predict() cannot match new data.
mlr.4factors <- lm(Price ~ KM + Age_08_04 + HP + Weight, data = subset)
summary(mlr.4factors)
# Are the extra factors worth it?
# (presumably the R-squared values from the two summaries -- re-check
# against the summary() output)
# .8614 - 4 factors
# .863 - All factors

# How do we calculate the predicted vs actual
coef(mlr.4factors)
plot(density(resid(mlr.4factors)))
resid(mlr.4factors)
# Keep each car's residual (actual minus fitted Price) next to its data.
subset["residual"] <- resid(mlr.4factors)

# Predicted price for a hypothetical car described by the four predictors.
new_data <- data.frame(KM = 1000, Age_08_04 = 15, HP = 100, Weight = 1200)
predict(mlr.4factors, newdata = new_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment