benfasoli · January 5, 2018 05:02
diff --git a/regression-coefficient-uncertainty.r b/regression-coefficient-uncertainty.r
 # Ben Fasoli
 rm(list = ls())

 library(gtrendsR)
 library(tidyverse)

 # Get google trend data for searches for skis and ski boots
 query <- c('skis', 'ski boots')
 trend <- gtrends(query, geo = 'US', time = 'all')
 trend_ts <- trend$interest_over_time %>%
  spread(key = keyword, value = hits) %>%
  select(date, skis, `ski boots`)

 # Linear regression to predict searches for skis by searches for ski boots
 # skis = m * ski boots + b
 mod <- lm(skis ~ `ski boots`, data = trend_ts)


 # Calculate the confidence interval
 yi <- trend_ts$skis        # actual values
 yih <- predict(mod)        # model predicted values
 N <- nrow(trend_ts)        # number of observations
 xi <- trend_ts$`ski boots` # independent variable values
 xh <- mean(xi)             # mean of independent variable

 # Calculate coefficient errors from the sum of squared errors
 se <- sqrt(sum((yi - yih)^2) / (N - 2)) / sqrt(sum((xi - xh)^2))
 se
 # 0.06204275

 # or from the variances (diagonals given by variance-covariance matrix)
 se <- sqrt(diag(vcov(mod)))[2]
 # 0.06204275

 # Manually calculate 95% confidence interval
 alpha <- 1 - 0.95
 cp <- 1 - alpha / 2 # critical probability
 DF <- N - 2         # degrees of freedom
 cv <- qt(cp, DF)    # critical value from quantile of t-distribution
 me <- cv * se       # margin of error

 ci <- rep(coef(mod)['`ski boots`'], times = 3) + c(-me, 0, me)
 ci
 # `ski boots` `ski boots` `ski boots` 
 #  2.485902    2.608391    2.730880 

 # Validate with confidence interval derived by R from the model object
 confint(mod, level = 0.95)
 #                 2.5 %   97.5 %
 #   (Intercept) 13.476521 16.26735
 #   `ski boots`  2.485902  2.73088


 # Compare coefficient standard error calculation methods
 # On first order, the margin of error at the 95% confidence interval should be 
 # roughly 2 * se assuming a gaussian distribution
 # Derived from variance-coviariance matrix diagonals (variances)
 se
 # [1] 0.06204275
 se * 2
 # 0.1240855
 me
 # 0.1224892

 # Calculated within summary.lm table
 summary(mod)
 # Coefficients:
 #             Estimate Std. Error t value Pr(>|t|)    
 # (Intercept) 14.87194    0.70680   21.04   <2e-16 ***
 # `ski boots`  2.60839    0.06204   42.04   <2e-16 ***

 # Alternative technique to estimate error in regression coefficients using 
 # random sampling with replacement (bootstrapping). Results are highly sensitive
 # to the size parameter of the resampled populations
 bootstrap <- function (x, fun, size = nrow(x), iter = 10) {
  sapply(1:iter, function(i) {
    id <- sample(1:nrow(x), size = size, replace = T)
    y <- x[id, ]
    fun(y)
  })
 }
 co <- bootstrap(trend_ts, iter = 10000, fun = function(df) {
  # Extract second coefficient (slope) from model output
  coef(lm(skis ~ `ski boots`, data = df))[2]
 })
 sd(co)
 # 0.06463646
	# Ben Fasoli
	rm(list = ls())

	library(gtrendsR)
	library(tidyverse)

	# Get google trend data for searches for skis and ski boots
	query <- c('skis', 'ski boots')
	trend <- gtrends(query, geo = 'US', time = 'all')
	trend_ts <- trend$interest_over_time %>%
	spread(key = keyword, value = hits) %>%
	select(date, skis, `ski boots`)

	# Linear regression to predict searches for skis by searches for ski boots
	# skis = m * ski boots + b
	mod <- lm(skis ~ `ski boots`, data = trend_ts)


	# Calculate the confidence interval
	yi <- trend_ts$skis # actual values
	yih <- predict(mod) # model predicted values
	N <- nrow(trend_ts) # number of observations
	xi <- trend_ts$`ski boots` # independent variable values
	xh <- mean(xi) # mean of independent variable

	# Calculate coefficient errors from the sum of squared errors
	se <- sqrt(sum((yi - yih)^2) / (N - 2)) / sqrt(sum((xi - xh)^2))
	se
	# 0.06204275

	# or from the variances (diagonals given by variance-covariance matrix)
	se <- sqrt(diag(vcov(mod)))[2]
	# 0.06204275

	# Manually calculate 95% confidence interval
	alpha <- 1 - 0.95
	cp <- 1 - alpha / 2 # critical probability
	DF <- N - 2 # degrees of freedom
	cv <- qt(cp, DF) # critical value from quantile of t-distribution
	me <- cv * se # margin of error

	ci <- rep(coef(mod)['`ski boots`'], times = 3) + c(-me, 0, me)
	ci
	# `ski boots` `ski boots` `ski boots`
	# 2.485902 2.608391 2.730880

	# Validate with confidence interval derived by R from the model object
	confint(mod, level = 0.95)
	# 2.5 % 97.5 %
	# (Intercept) 13.476521 16.26735
	# `ski boots` 2.485902 2.73088


	# Compare coefficient standard error calculation methods
	# On first order, the margin of error at the 95% confidence interval should be
	# roughly 2 * se assuming a gaussian distribution
	# Derived from variance-coviariance matrix diagonals (variances)
	se
	# [1] 0.06204275
	se * 2
	# 0.1240855
	me
	# 0.1224892

	# Calculated within summary.lm table
	summary(mod)
	# Coefficients:
	# Estimate Std. Error t value Pr(>\|t\|)
	# (Intercept) 14.87194 0.70680 21.04 <2e-16 ***
	# `ski boots` 2.60839 0.06204 42.04 <2e-16 ***

	# Alternative technique to estimate error in regression coefficients using
	# random sampling with replacement (bootstrapping). Results are highly sensitive
	# to the size parameter of the resampled populations
	bootstrap <- function (x, fun, size = nrow(x), iter = 10) {
	sapply(1:iter, function(i) {
	id <- sample(1:nrow(x), size = size, replace = T)
	y <- x[id, ]
	fun(y)
	})
	}
	co <- bootstrap(trend_ts, iter = 10000, fun = function(df) {
	# Extract second coefficient (slope) from model output
	coef(lm(skis ~ `ski boots`, data = df))[2]
	})
	sd(co)
	# 0.06463646