##############################################################
# DAY 11: LINEAR REGRESSION EXERCISES
##############################################################
# 1) Height and Mass. Scrape the height and mass data from here.
# ----------------------------------------------------------------------------
library(rvest)
heightweightdata <- read_html("http://socr.ucla.edu/docs/resources/SOCR_Data/SOCR_Data_Dinov_020108_HeightsWeights.html")
heightweightdata <- as.data.frame(heightweightdata %>% html_nodes("table") %>% .[[1]] %>% html_table())
# Convert from imperial to metric units.
head(heightweightdata) # shows the header row got read in as a data row. Let's fix that.
columnnames <- unlist(heightweightdata[1, ])
heightweightdata_v2 <- heightweightdata[-1, ] # drop the header row
colnames(heightweightdata_v2) <- columnnames
# check.names = FALSE preserves names like "Height(Inches)" so the backtick
# references below still work
heightweightdata_v2 <- as.data.frame(lapply(heightweightdata_v2, as.numeric),
                                     check.names = FALSE)
## actual conversion
heightweightdata_v2$height_m <- heightweightdata_v2$`Height(Inches)`*2.54/100
heightweightdata_v2$weight_kg <- heightweightdata_v2$`Weight(Pounds)`*0.453592
# Add a column for Body Mass Index (BMI).
heightweightdata_v2$bmi <- heightweightdata_v2$weight_kg/(heightweightdata_v2$height_m^2)
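# As a quick sanity check on the conversion: adult BMI values typically fall
# roughly between 15 and 40, so the summary should land in that ballpark.
summary(heightweightdata_v2$bmi)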
# Plot mass versus height and use BMI to colour the points in the scatter plot.
## let's use ggplot2 for this
library(ggplot2)
ggplot(data = heightweightdata_v2, aes(weight_kg, height_m)) +
  geom_point(aes(color = bmi)) +
  scale_color_gradient(low = "blue", high = "red") + # blue = low BMI, red = high BMI
  theme_bw()
# Create a linear model of mass against height.
fit.1 <- lm(height_m ~ weight_kg, heightweightdata_v2)
summary(fit.1)
# Would a quadratic model be a better fit? Check with anova().
fit.2 <- lm(height_m ~ poly(weight_kg, 2), data = heightweightdata_v2)
anova(fit.1, fit.2) # no, the quadratic model is not a better fit: the p-value is not significant.
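# A complementary check: AIC penalizes the extra quadratic term, so the
# lower-AIC model is preferred. This should agree with the anova() verdict.
AIC(fit.1, fit.2)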
# ----------------------------------------------------------------------------
# 2) Build a regression model using the Boston data from the MASS package, which gives median
# housing prices as a function of a variety of variables for census tracts in the Boston area.
# ----------------------------------------------------------------------------
# NOTE: medv is the median value of owner-occupied homes in $1000s.
library(MASS)
str(Boston) # look at what the Boston data contains
fit.1 <- lm(medv ~ ., Boston)
summary(fit.1) #reports a 0.7338 adjusted r-squared, not bad.
# However, it also shows that indus and age are not significant. Let's refit without them.
fit.2 <- lm(medv ~ . -age -indus, Boston)
summary(fit.2) #reports a 0.7348 adjusted r-squared, awesome! all variables are also significant.
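# As an automated cross-check on the manual drop: MASS's stepAIC() runs
# backward selection from the full model, and should similarly discard weak
# predictors like indus and age.
fit.step <- stepAIC(fit.1, direction = "backward", trace = FALSE)
summary(fit.step)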
# ----------------------------------------------------------------------------
# 3) Building an Automated Wine Critic. Grab a copy of the wine data. Use these data to build a
# model to predict the quality of a wine. Consider carefully how you will handle the two
# different types of wine. Consider using interactions.
# ----------------------------------------------------------------------------
load("wine.RData") #make sure you correct this filepath if file not in your current wd!
wine.fit.1 <- lm(quality ~., wine)
summary(wine.fit.1) #this looks like a pretty poor r-squared (<0.3)
# Let's take Andrew's hint and use some interaction terms.
# How do we know which interactions to include? One technique is to include
# *every* potential interaction and then pare the model down, but that means
# 78 interaction terms, and assessing the impact of each one could take a while.
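# For reference, that brute-force option is a one-liner: the formula
# quality ~ .^2 expands to all main effects plus all pairwise interactions.
# This is just a sketch of that approach; the boxplot screen below is lighter.
wine.fit.all <- lm(quality ~ .^2, data = wine)
summary(wine.fit.all)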
# Instead, let's boxplot each variable by wine type: if a variable's
# distribution differs a lot between the two types, it's probably worth
# including a type * variable interaction term.
head(wine) #interesting. maybe we should do a facet boxplot with wine type.
library(reshape)
melted_wine <- melt(wine, id.vars = "type") # keep type as the id column
ggplot(data = melted_wine, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = type)) +
  facet_wrap(~ variable, scales = "free")
# it seems like interactions are appropriate for basically all
# variables except the amount of alcohol by type.
winefit.2 <- lm(quality ~ . + type*fixed.acidity + type*volatile.acidity +
                  type*citric.acid + type*residual.sugar + type*chlorides +
                  type*free.sulfur.dioxide + type*density + type*pH +
                  type*sulphates, wine)
summary(winefit.2) ## OK, the adjusted r-squared is marginally better (~0.3)
# From here, you can strip out the insignificant terms from winefit.2 (there
# were a few); one way to do that is sketched below.
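# A sketch of that pruning step; the two interactions dropped here are only
# hypothetical examples, so check your own summary(winefit.2) output for which
# terms are actually insignificant. update() refits a model with terms removed:
winefit.3 <- update(winefit.2, . ~ . - type:citric.acid - type:chlorides)
summary(winefit.3)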
# ----------------------------------------------------------------------------
# 4) You’ll find some data here which reflects the relationship between a laptop battery’s active
# time and charging time. Generate a model which can be used to predict this relationship.
# ----------------------------------------------------------------------------
charge.data <- read.csv("laptop-charging.csv")
# REMEMBER to split things into training and testing sets!!
set.seed(1) # make the random split reproducible
percent_training <- 0.8
n_training_obs <- floor(percent_training * nrow(charge.data))
train_ind <- sample(1:nrow(charge.data), size = n_training_obs)
charge.data.train <- charge.data[train_ind, ]
charge.data.test <- charge.data[-train_ind, ]
### explore by plotting
if (dev.cur() > 1) dev.off() # close any existing plot device
plot(charge.data.train$charge, charge.data.train$active)
# looks curved; a straight line probably won't fit well, but let's try anyway.
laptop.fit.one <- lm(active ~ charge, charge.data.train)
summary(laptop.fit.one) #Model 1
abline(laptop.fit.one, col = "red")
# the above achieves a 0.6465 adjusted r-squared. not bad, but not the best.
laptop.fit.quad <- lm(active ~ poly(charge, 2), charge.data.train) # trying quadratic
summary(laptop.fit.quad) # model 2
# abline() can only draw straight lines, so plot the quadratic and log fits
# as predicted curves over a grid of charge values instead
charge.grid <- data.frame(charge = seq(min(charge.data.train$charge),
                                       max(charge.data.train$charge),
                                       length.out = 100))
lines(charge.grid$charge, predict(laptop.fit.quad, charge.grid), col = "blue")
laptop.fit.log <- lm(active ~ log(charge), charge.data.train) # trying log
summary(laptop.fit.log) # model 3: better than models 1 and 2 by r-squared
lines(charge.grid$charge, predict(laptop.fit.log, charge.grid), col = "green")
#user-defined model:
# as a gut check, it seems like every hour of charge results in 1.5 hours of active use
# until it hits three hours of charge, which is when active life totally plateaus.
# let's make a function with that.
arthurs_function <- function(chargetime) {
  # initialize prediction vector
  active_predictions <- rep(NA, length(chargetime))
  # 1.5 hours of active use per hour of charge, plateauing at 4.5 hours
  active_predictions[chargetime <= 3] <- 1.5 * chargetime[chargetime <= 3]
  active_predictions[chargetime > 3] <- 4.5
  return(active_predictions)
}
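# Quick usage check: for charges of 1, 2.5, and 4 hours, the rule above
# predicts 1.5, 3.75, and 4.5 hours of active use respectively.
arthurs_function(c(1, 2.5, 4))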
## evaluating models against test data, using MSE
## here MSE is the sum of (actual y - predicted y)^2 divided by (n - p),
## where n is the number of test observations and p is the number of model
## parameters (the helper below hardcodes p = 2 for simplicity)
# predictions from each of the four models on the test set
model1_pred <- predict(laptop.fit.one, charge.data.test)
model2_pred <- predict(laptop.fit.quad, charge.data.test)
model3_pred <- predict(laptop.fit.log, charge.data.test)
model4_pred <- arthurs_function(charge.data.test$charge)
mse <- function(actual, predicted) {
  sum((actual - predicted)^2) / (length(predicted) - 2)
}
#in interpreting mse, higher mse = worse model.
mse(charge.data.test$active, model1_pred) # highest mse, thus the worst model
mse(charge.data.test$active, model2_pred) # second lowest mse, thus the second best model
mse(charge.data.test$active, model3_pred) # second highest mse, thus the second worst model
mse(charge.data.test$active, model4_pred) # has the lowest mse, therefore the best model
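# To see all four side by side, collect the test-set MSEs into a named vector
# and sort ascending, so the best (lowest-MSE) model comes first:
results <- c(linear    = mse(charge.data.test$active, model1_pred),
             quadratic = mse(charge.data.test$active, model2_pred),
             log       = mse(charge.data.test$active, model3_pred),
             piecewise = mse(charge.data.test$active, model4_pred))
sort(results)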