Last active
January 5, 2018 05:02
-
-
Save benfasoli/91f2fab4a20d4ba2e40379cdfa76e02c to your computer and use it in GitHub Desktop.
Methods for confidence estimation for regression coefficients
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ben Fasoli | |
rm(list = ls()) | |
library(gtrendsR) | |
library(tidyverse) | |
# Get google trend data for searches for skis and ski boots | |
query <- c('skis', 'ski boots') | |
trend <- gtrends(query, geo = 'US', time = 'all') | |
trend_ts <- trend$interest_over_time %>% | |
spread(key = keyword, value = hits) %>% | |
select(date, skis, `ski boots`) | |
# Linear regression to predict searches for skis by searches for ski boots | |
# skis = m * ski boots + b | |
mod <- lm(skis ~ `ski boots`, data = trend_ts) | |
# Calculate the confidence interval | |
yi <- trend_ts$skis # actual values | |
yih <- predict(mod) # model predicted values | |
N <- nrow(trend_ts) # number of observations | |
xi <- trend_ts$`ski boots` # independent variable values | |
xh <- mean(xi) # mean of independent variable | |
# Calculate coefficient errors from the sum of squared errors | |
se <- sqrt(sum((yi - yih)^2) / (N - 2)) / sqrt(sum((xi - xh)^2)) | |
se | |
# 0.06204275 | |
# or from the variances (diagonals given by variance-covariance matrix) | |
se <- sqrt(diag(vcov(mod)))[2] | |
# 0.06204275 | |
# Manually calculate 95% confidence interval | |
alpha <- 1 - 0.95 | |
cp <- 1 - alpha / 2 # critical probability | |
DF <- N - 2 # degrees of freedom | |
cv <- qt(cp, DF) # critical value from quantile of t-distribution | |
me <- cv * se # margin of error | |
ci <- rep(coef(mod)['`ski boots`'], times = 3) + c(-me, 0, me) | |
ci | |
# `ski boots` `ski boots` `ski boots` | |
# 2.485902 2.608391 2.730880 | |
# Validate with confidence interval derived by R from the model object | |
confint(mod, level = 0.95) | |
# 2.5 % 97.5 % | |
# (Intercept) 13.476521 16.26735 | |
# `ski boots` 2.485902 2.73088 | |
# Compare coefficient standard error calculation methods | |
# On first order, the margin of error at the 95% confidence interval should be | |
# roughly 2 * se assuming a gaussian distribution | |
# Derived from variance-coviariance matrix diagonals (variances) | |
se | |
# [1] 0.06204275 | |
se * 2 | |
# 0.1240855 | |
me | |
# 0.1224892 | |
# Calculated within summary.lm table | |
summary(mod) | |
# Coefficients: | |
# Estimate Std. Error t value Pr(>|t|) | |
# (Intercept) 14.87194 0.70680 21.04 <2e-16 *** | |
# `ski boots` 2.60839 0.06204 42.04 <2e-16 *** | |
# Alternative technique to estimate error in regression coefficients using | |
# random sampling with replacement (bootstrapping). Results are highly sensitive | |
# to the size parameter of the resampled populations | |
bootstrap <- function (x, fun, size = nrow(x), iter = 10) { | |
sapply(1:iter, function(i) { | |
id <- sample(1:nrow(x), size = size, replace = T) | |
y <- x[id, ] | |
fun(y) | |
}) | |
} | |
co <- bootstrap(trend_ts, iter = 10000, fun = function(df) { | |
# Extract second coefficient (slope) from model output | |
coef(lm(skis ~ `ski boots`, data = df))[2] | |
}) | |
sd(co) | |
# 0.06463646 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment