dceoy · August 29, 2015 14:21
diff --git a/glm_confint.R b/glm_confint.R
 #!/usr/bin/env Rscript

 # Diabetes survey on Pima Indians
 #
 # variables:
 #   'glucose'   Plasma glucose concentration at 2 hours in an oral glucose tolerance test
 #   'diabetes'  Diabetes pedigree function
 #   'test'      test whether the patient shows signs of diabetes (coded 0 if negative, 1 if positive)
 data(pima, package = 'faraway')

 #
 # Logistic Regression
 #
 m <- glm(factor(test) ~ glucose + diabetes, family = binomial, data = pima)

 print(summary(m))
 #
 # Call:
 # glm(formula = test ~ glucose + diabetes, family = binomial, data = pima)
 #
 # Deviance Residuals:
 #     Min       1Q   Median       3Q      Max
 # -2.7557  -0.7801  -0.5295   0.8460   3.2897
 #
 # Coefficients:
 #              Estimate Std. Error z value Pr(>|z|)
 # (Intercept) -5.724331   0.447126 -12.803  < 2e-16 ***
 # glucose      0.037356   0.003284  11.375  < 2e-16 ***
 # diabetes     0.918678   0.273089   3.364 0.000768 ***
 # ---
 # Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 #
 # (Dispersion parameter for binomial family taken to be 1)
 #
 #     Null deviance: 993.48  on 767  degrees of freedom
 # Residual deviance: 796.99  on 765  degrees of freedom
 # AIC: 802.99
 #
 # Number of Fisher Scoring iterations: 4
 #

 # Alpha Level
 alpha <- 0.05

 results <- list()

 #
 # Profile Likelihood CI
 #
 results$profile_likelihood_ci <- confint(m, level = 1 - alpha)

 #
 # Wald CI
 #
 results$wald_ci <- confint.default(m, level = 1 - alpha)

 #
 # Wald CI (manual calculation)
 #
 wald_ci <- function(model, alpha) {
  ce <- summary(model)$coefficients[,1:2]
  ci <- cbind(ce[,1] - qnorm(1 - alpha / 2) * ce[,2],
              ce[,1] + qnorm(1 - alpha / 2) * ce[,2])
  colnames(ci) <- c(paste((alpha / 2) * 100, '%'),
                    paste((1 - alpha / 2) * 100, '%'))
  return(ci)
 }
 results$wald_ci_manual_calc <- wald_ci(m, alpha)

 #
 # Odds Ratio
 #
 results$odds_ratio <- exp(cbind(Estimate = m$coefficients, confint(m, level = 1 - alpha)))

 print(results)
 #
 # $profile_likelihood_ci
 #                   2.5 %      97.5 %
 # (Intercept) -6.62859431 -4.87394015
 # glucose      0.03109338  0.04398227
 # diabetes     0.38948850  1.46127936
 #
 # $wald_ci
 #                   2.5 %      97.5 %
 # (Intercept) -6.60068146 -4.84798018
 # glucose      0.03091909  0.04379221
 # diabetes     0.38343319  1.45392188
 #
 # $wald_ci_manual_calc
 #                   2.5 %      97.5 %
 # (Intercept) -6.60068146 -4.84798018
 # glucose      0.03091909  0.04379221
 # diabetes     0.38343319  1.45392188
 #
 # $odds_ratio
 #                Estimate      2.5 %      97.5 %
 # (Intercept) 0.003265538 0.00132202 0.007643191
 # glucose     1.038062146 1.03158183 1.044963828
 # diabetes    2.505974144 1.47622552 4.311471930
 #
diff --git a/glm_predict.R b/glm_predict.R
 #!/usr/bin/env R

 sapply(c('dplyr', 'data.table', 'ggplot2'), function(p) require(p, character.only = TRUE))
 select <- dplyr::select

 # Diabetes survey on Pima Indians
 data(pima, package = 'faraway')

 #
 # Logistic Regression
 #
 m <- glm(factor(test) ~ glucose + diabetes, family = binomial, data = pima)

 # Alpha Level
 alpha = 0.05

 #
 # New Data having various values of Glucose and Diabetes
 #
 new_d <- with(pima, data.table(glucose = rep(quantile(pima$glucose)[2:4], each = 100),
                               diabetes = rep(seq(from = min(diabetes),
                                                  to = max(diabetes),
                                                  length.out = 100)), 3))

 #
 # Prediction of Probabilities
 #
 prd_p <- new_d %>%
           predict(m, newdata = ., type = 'link', se.fit = TRUE) %>%
           as.data.table() %>%
           mutate(predicted_prob = plogis(fit),
                  lower_limit = plogis(fit - qnorm(1 - alpha / 2) * se.fit),
                  upper_limit = plogis(fit + qnorm(1 - alpha / 2) * se.fit)) %>%
           cbind(new_d, .) %>%
           select(glucose, diabetes, predicted_prob, lower_limit, upper_limit)

 #
 # Visualization
 #
 prd_p$glucose <- factor(prd_p$glucose)

 ribbon <- ggplot(prd_p, aes(x = diabetes, y = predicted_prob)) +
            geom_ribbon(aes(ymin = lower_limit, ymax = upper_limit, fill = glucose), alpha = 0.2) +
            geom_line(aes(colour = glucose), size = 1) +
            labs(x = 'diabetes pedigree function', y = 'test of diabetes')

 svg('plot.svg', width = 6, height = 4)
 plot(ribbon)
 dev.off()
diff --git a/plot.svg b/plot.svg
	#!/usr/bin/env Rscript

	# Diabetes survey on Pima Indians
	#
	# variables:
	# 'glucose' Plasma glucose concentration at 2 hours in an oral glucose tolerance test
	# 'diabetes' Diabetes pedigree function
	# 'test' test whether the patient shows signs of diabetes (coded 0 if negative, 1 if positive)
	data(pima, package = 'faraway')

	#
	# Logistic Regression
	#
	m <- glm(factor(test) ~ glucose + diabetes, family = binomial, data = pima)

	print(summary(m))
	#
	# Call:
	# glm(formula = test ~ glucose + diabetes, family = binomial, data = pima)
	#
	# Deviance Residuals:
	# Min 1Q Median 3Q Max
	# -2.7557 -0.7801 -0.5295 0.8460 3.2897
	#
	# Coefficients:
	# Estimate Std. Error z value Pr(>\|z\|)
	# (Intercept) -5.724331 0.447126 -12.803 < 2e-16 ***
	# glucose 0.037356 0.003284 11.375 < 2e-16 ***
	# diabetes 0.918678 0.273089 3.364 0.000768 ***
	# ---
	# Signif. codes: 0 ‘*’ 0.001 ‘’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
	#
	# (Dispersion parameter for binomial family taken to be 1)
	#
	# Null deviance: 993.48 on 767 degrees of freedom
	# Residual deviance: 796.99 on 765 degrees of freedom
	# AIC: 802.99
	#
	# Number of Fisher Scoring iterations: 4
	#

	# Alpha Level
	alpha <- 0.05

	results <- list()

	#
	# Profile Likelihood CI
	#
	results$profile_likelihood_ci <- confint(m, level = 1 - alpha)

	#
	# Wald CI
	#
	results$wald_ci <- confint.default(m, level = 1 - alpha)

	#
	# Wald CI (manual calculation)
	#
	wald_ci <- function(model, alpha) {
	ce <- summary(model)$coefficients[,1:2]
	ci <- cbind(ce[,1] - qnorm(1 - alpha / 2) * ce[,2],
	ce[,1] + qnorm(1 - alpha / 2) * ce[,2])
	colnames(ci) <- c(paste((alpha / 2) * 100, '%'),
	paste((1 - alpha / 2) * 100, '%'))
	return(ci)
	}
	results$wald_ci_manual_calc <- wald_ci(m, alpha)

	#
	# Odds Ratio
	#
	results$odds_ratio <- exp(cbind(Estimate = m$coefficients, confint(m, level = 1 - alpha)))

	print(results)
	#
	# $profile_likelihood_ci
	# 2.5 % 97.5 %
	# (Intercept) -6.62859431 -4.87394015
	# glucose 0.03109338 0.04398227
	# diabetes 0.38948850 1.46127936
	#
	# $wald_ci
	# 2.5 % 97.5 %
	# (Intercept) -6.60068146 -4.84798018
	# glucose 0.03091909 0.04379221
	# diabetes 0.38343319 1.45392188
	#
	# $wald_ci_manual_calc
	# 2.5 % 97.5 %
	# (Intercept) -6.60068146 -4.84798018
	# glucose 0.03091909 0.04379221
	# diabetes 0.38343319 1.45392188
	#
	# $odds_ratio
	# Estimate 2.5 % 97.5 %
	# (Intercept) 0.003265538 0.00132202 0.007643191
	# glucose 1.038062146 1.03158183 1.044963828
	# diabetes 2.505974144 1.47622552 4.311471930
	#
	#!/usr/bin/env R

	sapply(c('dplyr', 'data.table', 'ggplot2'), function(p) require(p, character.only = TRUE))
	select <- dplyr::select

	# Diabetes survey on Pima Indians
	data(pima, package = 'faraway')

	#
	# Logistic Regression
	#
	m <- glm(factor(test) ~ glucose + diabetes, family = binomial, data = pima)

	# Alpha Level
	alpha = 0.05

	#
	# New Data having various values of Glucose and Diabetes
	#
	new_d <- with(pima, data.table(glucose = rep(quantile(pima$glucose)[2:4], each = 100),
	diabetes = rep(seq(from = min(diabetes),
	to = max(diabetes),
	length.out = 100)), 3))

	#
	# Prediction of Probabilities
	#
	prd_p <- new_d %>%
	predict(m, newdata = ., type = 'link', se.fit = TRUE) %>%
	as.data.table() %>%
	mutate(predicted_prob = plogis(fit),
	lower_limit = plogis(fit - qnorm(1 - alpha / 2) * se.fit),
	upper_limit = plogis(fit + qnorm(1 - alpha / 2) * se.fit)) %>%
	cbind(new_d, .) %>%
	select(glucose, diabetes, predicted_prob, lower_limit, upper_limit)

	#
	# Visualization
	#
	prd_p$glucose <- factor(prd_p$glucose)

	ribbon <- ggplot(prd_p, aes(x = diabetes, y = predicted_prob)) +
	geom_ribbon(aes(ymin = lower_limit, ymax = upper_limit, fill = glucose), alpha = 0.2) +
	geom_line(aes(colour = glucose), size = 1) +
	labs(x = 'diabetes pedigree function', y = 'test of diabetes')

	svg('plot.svg', width = 6, height = 4)
	plot(ribbon)
	dev.off()