##############################################################
# DAY 11: LINEAR REGRESSION EXERCISES
##############################################################
# 1) Height and Mass. Scrape the height and mass data from here.
# ----------------------------------------------------------------------------
library(rvest)
heightweightdata <- read_html("http://socr.ucla.edu/docs/resources/SOCR_Data/SOCR_Data_Dinov_020108_HeightsWeights.html")
heightweightdata <- as.data.frame(heightweightdata %>% html_nodes("table") %>% .[[1]] %>% html_table())
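# a possible shortcut (sketch, same page as above): html_table() takes a header
# argument, so the first row can be treated as column names at parse time,
# which would make the manual header fix below unnecessary.
heightweightdata_alt <- read_html("http://socr.ucla.edu/docs/resources/SOCR_Data/SOCR_Data_Dinov_020108_HeightsWeights.html") %>%
  html_nodes("table") %>% .[[1]] %>% html_table(header = TRUE)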
# Convert from imperial to metric units.
head(heightweightdata) # shows that the header row got read in as a data row. let's fix that.
columnnames <- unlist(heightweightdata[1, ])
heightweightdata_v2 <- heightweightdata[-1, ] # drop the header row without hardcoding the row count
colnames(heightweightdata_v2) <- columnnames
# check.names = FALSE preserves names like "Height(Inches)" for the backtick references below
heightweightdata_v2 <- as.data.frame(lapply(heightweightdata_v2, as.numeric), check.names = FALSE)
## actual conversion
heightweightdata_v2$height_m <- heightweightdata_v2$`Height(Inches)` * 2.54 / 100 # 1 in = 2.54 cm
heightweightdata_v2$weight_kg <- heightweightdata_v2$`Weight(Pounds)` * 0.453592  # 1 lb = 0.453592 kg
# Add a column for Body Mass Index (BMI): weight in kg divided by height in metres squared.
heightweightdata_v2$bmi <- heightweightdata_v2$weight_kg / (heightweightdata_v2$height_m^2)
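# quick sanity check on the new column: adult BMI values should mostly land in
# roughly the 15-35 range; anything far outside that hints at a unit-conversion slip.
summary(heightweightdata_v2$bmi)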
# Plot mass versus height and use BMI to colour the points in the scatter plot.
## let's use ggplot for this
library(ggplot2)
ggplot(data = heightweightdata_v2, aes(weight_kg, height_m)) +
  geom_point(aes(color = bmi)) +
  scale_color_gradient(low = "blue", high = "red") + # a single colour scale; stacking two makes ggplot drop the first
  theme_bw()
# Create a linear model of mass against height.
fit.1 <- lm(height_m ~ weight_kg, heightweightdata_v2)
summary(fit.1)
# Would a quadratic model be a better fit? Check with anova().
fit.2 <- lm(height_m ~ poly(weight_kg, 2), data = heightweightdata_v2)
anova(fit.1, fit.2) # no, the quadratic model is not a better fit: the p-value is not significant.
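# as a cross-check on the anova() result: AIC() compares the same two models with
# a penalty for extra parameters, and the lower-AIC model is preferred.
AIC(fit.1, fit.2)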
# ----------------------------------------------------------------------------
# 2) Build a regression model using the Boston data from the MASS package, which gives median
# housing prices as a function of a variety of variables for census tracts in the Boston area.
# ----------------------------------------------------------------------------
# NOTE: medv is the median value of owner-occupied homes in $1000s.
library(MASS)
str(Boston) # looking at the structure of the Boston data.
fit.1 <- lm(medv ~ ., Boston)
summary(fit.1) # reports a 0.7338 adjusted r-squared, not bad.
# however, it also shows that indus and age are not significant. let's try it without them.
fit.2 <- lm(medv ~ . - age - indus, Boston)
summary(fit.2) # reports a 0.7348 adjusted r-squared, awesome! all variables are also significant.
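# the manual pruning above can also be automated: stepAIC() (from MASS, which is
# already loaded) drops and re-adds terms to minimise AIC. a sketch starting from
# the full model; it typically removes the same indus and age terms here.
boston.step <- stepAIC(fit.1, direction = "both", trace = FALSE)
summary(boston.step)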
# ----------------------------------------------------------------------------
# 3) Building an Automated Wine Critic. Grab a copy of the wine data. Use these data to build a
# model to predict the quality of a wine. Consider carefully how you will handle the two
# different types of wine. Consider using interactions.
# ----------------------------------------------------------------------------
load("wine.RData") # adjust this file path if wine.RData isn't in your current working directory!
wine.fit.1 <- lm(quality ~ ., wine)
summary(wine.fit.1) # this looks like a pretty poor r-squared (<0.3)
# let's take Andrew's hint and use some interaction variables.
# how do we know which interaction variables to include?
# there are a few different techniques, like including *every* potential
# interaction and then paring the model down, but that would mean
# including 78 interaction variables, and assessing the impact of each one could take a while.
# let's do a boxplot of each variable by wine grouping. if we see that the boxplot
# distributions are very different depending on group, maybe it's worth including
# an interaction term of wine type * that variable.
head(wine) # interesting. maybe we should do a facet boxplot with wine type.
library(reshape)
# melt() keeps the non-numeric type column as an id variable automatically,
# so each numeric measurement ends up in long (variable, value) form.
melted_wine <- melt(wine)
ggplot(data = melted_wine, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = type)) +
  facet_wrap(~ variable, scales = "free")
# it seems like interactions are appropriate for basically all
# variables except the amount of alcohol by type.
winefit.2 <- lm(quality ~ . + type*fixed.acidity + type*volatile.acidity +
                  type*citric.acid + type*residual.sugar + type*chlorides +
                  type*free.sulfur.dioxide + type*density + type*pH +
                  type*sulphates, wine)
summary(winefit.2) ## ok, the adjusted r-squared is only marginally better (~0.3)
# from here, you can strip out the insignificant interaction terms from winefit.2. there were a few.
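# a sketch of that pruning step using update(); which terms are insignificant
# depends on your summary() output, so the two dropped below are placeholders,
# not the definitive set.
winefit.3 <- update(winefit.2, . ~ . - type:citric.acid - type:chlorides)
summary(winefit.3)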
# ----------------------------------------------------------------------------
# 4) You'll find some data here which reflects the relationship between a laptop battery's active
# time and charging time. Generate a model which can be used to predict this relationship.
# ----------------------------------------------------------------------------
charge.data <- read.csv("laptop-charging.csv")
# REMEMBER to split things into training and testing sets!!
set.seed(1) # makes the random train/test split reproducible
percent_training <- 0.8
n_training_obs <- floor(percent_training * nrow(charge.data))
train_ind <- sample(1:nrow(charge.data), size = n_training_obs)
charge.data.train <- charge.data[train_ind, ]
charge.data.test <- charge.data[-train_ind, ]
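# quick check that the 80/20 split behaved: the two pieces should sum to the original
nrow(charge.data.train) + nrow(charge.data.test) == nrow(charge.data)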
### explore by plotting
if (dev.cur() > 1) dev.off() # clear any existing plot device
plot(charge.data.train$charge, charge.data.train$active)
# looks weird. probably can't fit a linear model to that. let's try anyway.
laptop.fit.one <- lm(active ~ charge, charge.data.train)
summary(laptop.fit.one) # Model 1
abline(laptop.fit.one, col = "red")
# the above achieves a 0.6465 adjusted r-squared. not bad, but not the best.
# abline() can only draw straight lines, so for the quadratic and log fits below
# we plot predictions over a grid of charge values with lines() instead.
charge_grid <- seq(min(charge.data.train$charge), max(charge.data.train$charge), length.out = 200)
laptop.fit.quad <- lm(active ~ poly(charge, 2), charge.data.train) # trying quadratic
summary(laptop.fit.quad) # model 2
lines(charge_grid, predict(laptop.fit.quad, data.frame(charge = charge_grid)), col = "blue")
laptop.fit.log <- lm(active ~ log(charge), charge.data.train) # trying log
summary(laptop.fit.log) # model 3 is better than 1 and 2 by r-squared
lines(charge_grid, predict(laptop.fit.log, data.frame(charge = charge_grid)), col = "green")
# user-defined model:
# as a gut check, it seems like every hour of charge results in 1.5 hours of active use
# until it hits three hours of charge, at which point active life plateaus entirely.
# let's make a function with that.
arthurs_function <- function(chargetime){
  # initialize prediction vector
  active_predictions <- rep(NA, length(chargetime))
  active_predictions[chargetime <= 3] <- 1.5 * chargetime[chargetime <= 3]
  active_predictions[chargetime > 3] <- 4.5 # 1.5 * 3 hours = 4.5 hours of active use
  return(active_predictions)
}
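# overlay the hand-built piecewise model on the same scatter plot for a visual
# check against the fitted curves above (the colour choice is arbitrary).
curve(arthurs_function, from = min(charge.data.train$charge),
      to = max(charge.data.train$charge), add = TRUE, col = "purple")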
## evaluating models against test data - using MSE
## here MSE is computed as the sum of (actual y - predicted y)^2 divided by (n - p),
## where p is the number of estimated parameters and n is the number of test observations
# predictions from each of the four candidate models
model1_pred <- predict(laptop.fit.one, charge.data.test)
model2_pred <- predict(laptop.fit.quad, charge.data.test)
model3_pred <- predict(laptop.fit.log, charge.data.test)
model4_pred <- arthurs_function(charge.data.test$charge)
mse <- function(actual, predicted) {
  # p = 2 is used for every model here; strictly the quadratic fit has 3
  # parameters, but with a reasonably large test set the ranking is unaffected
  return(sum((actual - predicted)^2) / (length(predicted) - 2))
}
# in interpreting mse, higher mse = worse model.
mse(charge.data.test$active, model1_pred) # highest mse, thus the worst model
mse(charge.data.test$active, model2_pred) # second lowest mse, thus the second best model
mse(charge.data.test$active, model3_pred) # second highest mse, thus the second worst model
mse(charge.data.test$active, model4_pred) # lowest mse, therefore the best model
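# the same comparison in one shot, reusing mse() from above; the list names are
# just readable labels for the output.
sapply(list(linear = model1_pred, quadratic = model2_pred,
            log = model3_pred, piecewise = model4_pred),
       function(p) mse(charge.data.test$active, p))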