dantonnoriega · April 16, 2018 07:25 · cyruskavwele12 · May 1, 2020
diff --git a/tweedie-simulations-with-dispersion-estimate.R b/tweedie-simulations-with-dispersion-estimate.R
 # inspired by https://stats.stackexchange.com/questions/174121/can-a-model-for-non-negative-data-with-clumping-at-zeros-tweedie-glm-zero-infl
 # additions by Danton Noriega
 library(statmod)
 library(tweedie)
 library(mgcv)

 # generate fake mu (poisson count rates)
 set.seed(1789)
 x <- seq(1,100, by = .1)
 mutrue <- exp(-1+x/25)
 summary(mutrue)
 n = length(x)

 y <- rtweedie(n, mu=mutrue, phi=1.4, power=1.3)
 summary(y)
 sum(y==0)

 fit.glm <- glm(y ~ x, family=tweedie(var.power=1.3, link.power=0))
 summary(fit.glm)
 ### estimates of dispersion parameter
 # found in stats::summary.glm
 # confirmed in https://www2.math.su.se/matstat/reports/serieb/2004/rep5/report.pdf and
 # page 15 of http://www.statsci.org/smyth/pubs/tweediepdf-series-preprint.pdf
 # (although without dividing by var(mu) = mu^p)

 # pearson estimator
 sum((fit.glm$weights * fit.glm$residuals^2)[fit.glm$weights > 0])/fit.glm$df.residual
 # mean deviance estimator
 fit.glm$deviance/fit.glm$df.residual

 # gam, specifying tweedie p
 fit.gam <- gam(y ~ x, family = Tweedie(p = 1.3, link = power(0)))
 summary(fit.gam)
 sum((fit.gam$weights * fit.gam$residuals^2)[fit.gam$weights > 0])/fit.gam$df.residual
 fit.gam$deviance/fit.gam$df.residual

 # gam, estimating p
 fit.gam.tw <- gam(y ~ x, family = tw(link = 'log'))
 summary(fit.gam.tw)
 sum((fit.gam.tw$weights * fit.gam.tw$residuals^2)[fit.gam.tw$weights > 0])/fit.gam.tw$df.residual
 fit.gam.tw$deviance/fit.gam.tw$df.residual

 # use fit.gam.tw estimates
 phi.hat <- fit.gam.tw$deviance/sum(fit.gam.tw$prior.weights)
 mu.hat <- fitted(fit.gam.tw)
 p.hat <- fit.gam.tw$family$getTheta(TRUE)
 prob_zero <- exp(-mu.hat^(2-p.hat) / phi.hat / (2-p.hat))
 prob_zero[1:5]

 # simulate random draws using fitted values
 y.tw <- mgcv::rTweedie(mu.hat, p = p.hat, phi = phi.hat)
 sum(y.tw == 0)

 # plot generated y vs simulated y from fitted values
 brks = seq(0, ceiling(max(max(y), max(y.tw))), by = 0.5)
 hist(y, breaks = brks, col = scales::alpha('red', .9))
 par(new = TRUE)
 hist(y.tw, breaks = brks, col = scales::alpha('blue', .5), axes = FALSE, xlab = NULL, ylab = NULL, main = NULL)

 # plot generate mu (trues) vs fitted
 hist(mutrue, breaks = seq(0, 25, by = .5), col = scales::alpha('red', .9))
 par(new = TRUE)
 hist(mu.hat, breaks = seq(0, 25, by = .5), col = scales::alpha('blue', .5), axes = FALSE, xlab = NULL, ylab = NULL, main = NULL)
	# inspired by https://stats.stackexchange.com/questions/174121/can-a-model-for-non-negative-data-with-clumping-at-zeros-tweedie-glm-zero-infl
	# additions by Danton Noriega
	library(statmod)
	library(tweedie)
	library(mgcv)

	# generate fake mu (poisson count rates)
	set.seed(1789)
	x <- seq(1,100, by = .1)
	mutrue <- exp(-1+x/25)
	summary(mutrue)
	n = length(x)

	y <- rtweedie(n, mu=mutrue, phi=1.4, power=1.3)
	summary(y)
	sum(y==0)

	fit.glm <- glm(y ~ x, family=tweedie(var.power=1.3, link.power=0))
	summary(fit.glm)
	### estimates of dispersion parameter
	# found in stats::summary.glm
	# confirmed in https://www2.math.su.se/matstat/reports/serieb/2004/rep5/report.pdf and
	# page 15 of http://www.statsci.org/smyth/pubs/tweediepdf-series-preprint.pdf
	# (although without dividing by var(mu) = mu^p)

	# pearson estimator
	sum((fit.glm$weights * fit.glm$residuals^2)[fit.glm$weights > 0])/fit.glm$df.residual
	# mean deviance estimator
	fit.glm$deviance/fit.glm$df.residual

	# gam, specifying tweedie p
	fit.gam <- gam(y ~ x, family = Tweedie(p = 1.3, link = power(0)))
	summary(fit.gam)
	sum((fit.gam$weights * fit.gam$residuals^2)[fit.gam$weights > 0])/fit.gam$df.residual
	fit.gam$deviance/fit.gam$df.residual

	# gam, estimating p
	fit.gam.tw <- gam(y ~ x, family = tw(link = 'log'))
	summary(fit.gam.tw)
	sum((fit.gam.tw$weights * fit.gam.tw$residuals^2)[fit.gam.tw$weights > 0])/fit.gam.tw$df.residual
	fit.gam.tw$deviance/fit.gam.tw$df.residual

	# use fit.gam.tw estimates
	phi.hat <- fit.gam.tw$deviance/sum(fit.gam.tw$prior.weights)
	mu.hat <- fitted(fit.gam.tw)
	p.hat <- fit.gam.tw$family$getTheta(TRUE)
	prob_zero <- exp(-mu.hat^(2-p.hat) / phi.hat / (2-p.hat))
	prob_zero[1:5]

	# simulate random draws using fitted values
	y.tw <- mgcv::rTweedie(mu.hat, p = p.hat, phi = phi.hat)
	sum(y.tw == 0)

	# plot generated y vs simulated y from fitted values
	brks = seq(0, ceiling(max(max(y), max(y.tw))), by = 0.5)
	hist(y, breaks = brks, col = scales::alpha('red', .9))
	par(new = TRUE)
	hist(y.tw, breaks = brks, col = scales::alpha('blue', .5), axes = FALSE, xlab = NULL, ylab = NULL, main = NULL)

	# plot generate mu (trues) vs fitted
	hist(mutrue, breaks = seq(0, 25, by = .5), col = scales::alpha('red', .9))
	par(new = TRUE)
	hist(mu.hat, breaks = seq(0, 25, by = .5), col = scales::alpha('blue', .5), axes = FALSE, xlab = NULL, ylab = NULL, main = NULL)