##############################################################
# DAY 11: LINEAR REGRESSION EXERCISES
##############################################################
# 1) Height and Mass. Scrape the height and mass data from here.
# ----------------------------------------------------------------------------
library(rvest)
heightweightdata <- read_html("http://socr.ucla.edu/docs/resources/SOCR_Data/SOCR_Data_Dinov_020108_HeightsWeights.html")
heightweightdata <- as.data.frame(heightweightdata %>% html_nodes("table") %>% .[[1]] %>% html_table())
# Convert from imperial to metric units.
head(heightweightdata) # shows the header row got read in as a data row. Let's fix that.
columnnames <- unlist(heightweightdata[1, ])
heightweightdata_v2 <- heightweightdata[-1, ] # drop the header row
colnames(heightweightdata_v2) <- columnnames
# check.names = FALSE preserves names like "Height(Inches)" so the backtick
# references below still work
heightweightdata_v2 <- as.data.frame(lapply(heightweightdata_v2, as.numeric),
                                     check.names = FALSE)
## actual conversion
heightweightdata_v2$height_m <- heightweightdata_v2$`Height(Inches)`*2.54/100
heightweightdata_v2$weight_kg <- heightweightdata_v2$`Weight(Pounds)`*0.453592
# Add a column for Body Mass Index (BMI).
heightweightdata_v2$bmi <- heightweightdata_v2$weight_kg/(heightweightdata_v2$height_m^2)
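# As a quick sanity check on the conversion: adult BMI values typically fall
# roughly between 15 and 40, so the summary should land in that ballpark.
summary(heightweightdata_v2$bmi)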
# Plot mass versus height and use BMI to colour the points in the scatter plot.
## let's use ggplot2 for this
library(ggplot2)
ggplot(data = heightweightdata_v2, aes(weight_kg, height_m)) +
  geom_point(aes(color = bmi)) +
  scale_color_gradient(low = "blue", high = "red") + # blue = low BMI, red = high BMI
  theme_bw()
# Create a linear model of mass against height.
fit.1 <- lm(height_m ~ weight_kg, heightweightdata_v2)
summary(fit.1)
# Would a quadratic model be a better fit? Check with anova().
fit.2 <- lm(height_m ~ poly(weight_kg, 2), data = heightweightdata_v2)
anova(fit.1, fit.2) # no, the quadratic model is not a better fit: the p-value is not significant.
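# A complementary check: AIC penalizes the extra quadratic term, so the
# lower-AIC model is preferred. This should agree with the anova() verdict.
AIC(fit.1, fit.2)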
# ----------------------------------------------------------------------------
# 2) Build a regression model using the Boston data from the MASS package, which gives median
# housing prices as a function of a variety of variables for census tracts in the Boston area.
# ----------------------------------------------------------------------------
# NOTE: medv is the median value of owner-occupied homes in $1000s.
library(MASS)
str(Boston) # look at what the Boston data contains
fit.1 <- lm(medv ~ ., Boston)
summary(fit.1) #reports a 0.7338 adjusted r-squared, not bad.
# However, it also shows that indus and age are not significant. Let's refit without them.
fit.2 <- lm(medv ~ . -age -indus, Boston)
summary(fit.2) #reports a 0.7348 adjusted r-squared, awesome! all variables are also significant.
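# As an automated cross-check on the manual drop: MASS's stepAIC() runs
# backward selection from the full model, and should similarly discard weak
# predictors like indus and age.
fit.step <- stepAIC(fit.1, direction = "backward", trace = FALSE)
summary(fit.step)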
# ----------------------------------------------------------------------------
# 3) Building an Automated Wine Critic. Grab a copy of the wine data. Use these data to build a
# model to predict the quality of a wine. Consider carefully how you will handle the two
# different types of wine. Consider using interactions.
# ----------------------------------------------------------------------------
load("wine.RData") #make sure you correct this filepath if file not in your current wd!
wine.fit.1 <- lm(quality ~., wine)
summary(wine.fit.1) #this looks like a pretty poor r-squared (<0.3)
# Let's take Andrew's hint and use some interaction terms.
# How do we know which interactions to include? One technique is to include
# *every* potential interaction and then pare the model down, but that means
# 78 interaction terms, and assessing the impact of each one could take a while.
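# For reference, that brute-force option is a one-liner: the formula
# quality ~ .^2 expands to all main effects plus all pairwise interactions.
# This is just a sketch of that approach; the boxplot screen below is lighter.
wine.fit.all <- lm(quality ~ .^2, data = wine)
summary(wine.fit.all)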
# Instead, let's boxplot each variable by wine type: if a variable's
# distribution differs a lot between the two types, it's probably worth
# including a type * variable interaction term.
head(wine) #interesting. maybe we should do a facet boxplot with wine type.
library(reshape)
melted_wine <- melt(wine, id.vars = "type") # keep type as the id column
ggplot(data = melted_wine, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = type)) +
  facet_wrap(~ variable, scales = "free")
# it seems like interactions are appropriate for basically all
# variables except the amount of alcohol by type.
winefit.2 <- lm(quality ~ . + type*fixed.acidity + type*volatile.acidity +
                  type*citric.acid + type*residual.sugar + type*chlorides +
                  type*free.sulfur.dioxide + type*density + type*pH +
                  type*sulphates, wine)
summary(winefit.2) ## OK, the adjusted r-squared is marginally better (~0.3)
# From here, you can strip out the insignificant terms from winefit.2 (there
# were a few); one way to do that is sketched below.
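# A sketch of that pruning step; the two interactions dropped here are only
# hypothetical examples, so check your own summary(winefit.2) output for which
# terms are actually insignificant. update() refits a model with terms removed:
winefit.3 <- update(winefit.2, . ~ . - type:citric.acid - type:chlorides)
summary(winefit.3)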
# ----------------------------------------------------------------------------
# 4) You’ll find some data here which reflects the relationship between a laptop battery’s active
# time and charging time. Generate a model which can be used to predict this relationship.
# ----------------------------------------------------------------------------
charge.data <- read.csv("laptop-charging.csv")
# REMEMBER to split things into training and testing sets!!
set.seed(1) # make the random split reproducible
percent_training <- 0.8
n_training_obs <- floor(percent_training * nrow(charge.data))
train_ind <- sample(1:nrow(charge.data), size = n_training_obs)
charge.data.train <- charge.data[train_ind, ]
charge.data.test <- charge.data[-train_ind, ]
### explore by plotting
if (dev.cur() > 1) dev.off() # close any existing plot device
plot(charge.data.train$charge, charge.data.train$active)
# looks curved; a straight line probably won't fit well, but let's try anyway.
laptop.fit.one <- lm(active ~ charge, charge.data.train)
summary(laptop.fit.one) #Model 1
abline(laptop.fit.one, col = "red")
# the above achieves a 0.6465 adjusted r-squared. not bad, but not the best.
laptop.fit.quad <- lm(active ~ poly(charge, 2), charge.data.train) # trying quadratic
summary(laptop.fit.quad) # model 2
# abline() can only draw straight lines, so plot the quadratic and log fits
# as predicted curves over a grid of charge values instead
charge.grid <- data.frame(charge = seq(min(charge.data.train$charge),
                                       max(charge.data.train$charge),
                                       length.out = 100))
lines(charge.grid$charge, predict(laptop.fit.quad, charge.grid), col = "blue")
laptop.fit.log <- lm(active ~ log(charge), charge.data.train) # trying log
summary(laptop.fit.log) # model 3: better than models 1 and 2 by r-squared
lines(charge.grid$charge, predict(laptop.fit.log, charge.grid), col = "green")
#user-defined model:
# as a gut check, it seems like every hour of charge results in 1.5 hours of active use
# until it hits three hours of charge, which is when active life totally plateaus.
# let's make a function with that.
arthurs_function <- function(chargetime) {
  # initialize prediction vector
  active_predictions <- rep(NA, length(chargetime))
  # 1.5 hours of active use per hour of charge, plateauing at 4.5 hours
  active_predictions[chargetime <= 3] <- 1.5 * chargetime[chargetime <= 3]
  active_predictions[chargetime > 3] <- 4.5
  return(active_predictions)
}
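# Quick usage check: for charges of 1, 2.5, and 4 hours, the rule above
# predicts 1.5, 3.75, and 4.5 hours of active use respectively.
arthurs_function(c(1, 2.5, 4))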
## evaluating models against test data, using MSE
## here MSE is the sum of (actual y - predicted y)^2 divided by (n - p),
## where n is the number of test observations and p is the number of model
## parameters (the helper below hardcodes p = 2 for simplicity)
# predictions from each of the four models on the test set
model1_pred <- predict(laptop.fit.one, charge.data.test)
model2_pred <- predict(laptop.fit.quad, charge.data.test)
model3_pred <- predict(laptop.fit.log, charge.data.test)
model4_pred <- arthurs_function(charge.data.test$charge)
mse <- function(actual, predicted) {
  sum((actual - predicted)^2) / (length(predicted) - 2)
}
#in interpreting mse, higher mse = worse model.
mse(charge.data.test$active, model1_pred) # highest mse, thus the worst model
mse(charge.data.test$active, model2_pred) # second lowest mse, thus the second best model
mse(charge.data.test$active, model3_pred) # second highest mse, thus the second worst model
mse(charge.data.test$active, model4_pred) # has the lowest mse, therefore the best model
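# To see all four side by side, collect the test-set MSEs into a named vector
# and sort ascending, so the best (lowest-MSE) model comes first:
results <- c(linear    = mse(charge.data.test$active, model1_pred),
             quadratic = mse(charge.data.test$active, model2_pred),
             log       = mse(charge.data.test$active, model3_pred),
             piecewise = mse(charge.data.test$active, model4_pred))
sort(results)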