BioSciEconomist · March 8, 2023 16:55
diff --git a/ex marginal effects.R b/ex marginal effects.R
 # *-----------------------------------------------------------------
 # | PROGRAM NAME: ex marginal effects.R 
 # | DATE: 2/26/21 
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:            
 # *----------------------------------------------------------------
 # | PURPOSE: example calculations of marginal effects
 # *----------------------------------------------------------------


 #------------------------------------
 # simulating data for logistic regresssion and marginal effects 
 #-------------------------------------

 # all code at once
 set.seed(1)
 gender <- sample(c(0,1), size = 10000, replace = TRUE)
 age <- round(runif(10000, 18, 80))
 xb <- -9 + 3.5*gender + 0.2*age
 p <- 1/(1 + exp(-xb))
 y <- rbinom(n = 10000, size = 1, prob = p)
 mod <- glm(y ~ gender + age, family = "binomial")
 summary(mod)

 dat <- data.frame(y,gender,age)


 #--------------------------------------------
 # model of interest
 #--------------------------------------------

 mylogit <- glm(y ~ gender + age,family=binomial(link='logit'),data = dat)
 summary(mylogit)



 #-------------------------------
 # linear probability model
 #-------------------------------

 summary(lm(y ~ gender + age, data = dat))

 # notice in all the examples below the ME from the lpm above
 # are nearly identical to the ME derived from logistic regression

 #--------------------------------------------------
 # function for calculating marginal effects
 #--------------------------------------------------

 # simplified based on: Alan Fernihough - https://econpapers.repec.org/paper/ucnwpaper/201122.htm
 mfx <- function(x){
  pdf <- mean(dlogis(predict(x, type = "link")))
  marginal.effects <- pdf*coef(x)
  print("Marginal Effects")
  return(marginal.effects)
 }

 mfx(mylogit) # results

 #----------------------------------------------------
 # mfx with bootsrapped SEs 
 #----------------------------------------------------

 # (ref: https://diffuseprior.wordpress.com/2012/04/23/probitlogit-marginal-effects-in-r-2/)

 mfxboot <- function(modform,dist,data,boot=1000,digits=3){
  x <- glm(modform, family=binomial(link=dist),data)
  # get marginal effects
  pdf <- ifelse(dist=="probit",
                mean(dnorm(predict(x, type = "link"))),
                mean(dlogis(predict(x, type = "link"))))
  marginal.effects <- pdf*coef(x)
  # start bootstrap
  bootvals <- matrix(rep(NA,boot*length(coef(x))), nrow=boot)
  set.seed(1111)
  for(i in 1:boot){
    samp1 <- data[sample(1:dim(data)[1],replace=T,dim(data)[1]),]
    x1 <- glm(modform, family=binomial(link=dist),samp1)
    pdf1 <- ifelse(dist=="probit",
                   mean(dnorm(predict(x, type = "link"))),
                   mean(dlogis(predict(x, type = "link"))))
    bootvals[i,] <- pdf1*coef(x1)
  }
  res <- cbind(marginal.effects,apply(bootvals,2,sd),marginal.effects/apply(bootvals,2,sd))
  if(names(x$coefficients[1])=="(Intercept)"){
    res1 <- res[2:nrow(res),]
    res2 <- matrix(as.numeric(sprintf(paste("%.",paste(digits,"f",sep=""),sep=""),res1)),nrow=dim(res1)[1])     
    rownames(res2) <- rownames(res1)
  } else {
    res2 <- matrix(as.numeric(sprintf(paste("%.",paste(digits,"f",sep=""),sep="")),nrow=dim(res)[1]))
    rownames(res2) <- rownames(res)
  }
  colnames(res2) <- c("marginal.effect","standard.error","z.ratio")  
  return(res2)
 }


 # calculate MEs
 mfx2 <- mfxboot(y ~ gender + age,"logit",dat)

 # extract ME and SE
 mfxdat <- data.frame(cbind(rownames(mfx2),mfx2))
 mfxdat$me <- as.numeric(as.character(mfxdat$marginal.effect))
 mfxdat$se <- as.numeric(as.character(mfxdat$standard.error))

 # report 
 (rpt <- mfxdat[c("V1","me","se")])

 #---------------------------------------
 # using mfx package by Alan Fernihough
 #--------------------------------------

 library(mfx)

 logitmfx(y ~ gender + age, dat, atmean = TRUE, robust = FALSE, clustervar1 = NULL,
         clustervar2 = NULL, start = NULL, control = list())


 #----------------------------------------------
 # using new R margins package
 #----------------------------------------------

 library(margins)

 margins(mylogit)

 summary(margins(mylogit))

 #-------------------------------------------------
 # crude stimate of marginal effects at the mean
 #-------------------------------------------------

 ### averaging the meff for the whole data set

 summary(mylogit)

 b1 <- 3.44 # from coefficient of interest from logistic regression model
 b2 <- .200 # coefficient on age (control)
 b0 <- -8.97 # intercept from logistic regression model


 dat$XB <- dat$gender*b1 + dat$age*b2 + b0

 meffx <- (exp(dat$XB)/((1+exp(dat$XB))^2))*b1
 summary(meffx) # get mean

 #--------------------------------------------------
 # demonstrating divide by 4 rule for marginal effects
 #--------------------------------------------------

 ### !!!!! Notice that the maximum value above .859 corresponds to b1/4 or 3.44/4 = .86 DIVIDE BY FOUR RULE in action
 ### !!!!! this illustrates that the divide by 4 rule provides the upper bound of estimated individual marginal
 ### !!!!! effects in the data set

 # see http://econometricsense.blogspot.com/2016/05/divide-by-4-rule-for-marginal-effects.html

 # other

 (OR <- exp(3.44)) # this gives us an odds ratio
 (log(OR)) # this gets back the OR

 # so 

 log(OR)/4 # upper bound on marginal effects


 #-----------------------------------------------------
 # converting ORs to MEs
 #------------------------------------------------------

 # this function uses baseline rates (Pc) for the control group to convert ORs to marginal effects
 # see: https://stats.stackexchange.com/questions/324410/converting-odds-ratio-to-percentage-increase-reduction 

 # this works for an unadjusted OR so to demonstrate we run a single variabel model

 mylogit2 <- glm(y ~ gender,family=binomial(link='logit'),data = dat)

 mfx(mylogit2) # get ME for comparison
 summary(margins(mylogit2)) # ME for comparison


 meffOR <- function(Pc,OR){
  Pt <- (OR * Pc )/(1 + (OR*Pc) - Pc)
  me <- (Pt - Pc)
  print("Marginal Effects Based on OR")
  return(me)
 }

 (OR <- exp(1.33))  # 3.78 

 # baseline for control
 library(dplyr)

 dat%>%  
  group_by(gender)%>% 
  summarize(tavg = mean(y)) # .574 (control group)

 meffOR(.574,OR) # results. # .26 similar to results above

 #---------------------------------------------
 # studenmund's rule of thumb
 #---------------------------------------------

 # ref: Using Econometrics: A Practical Guide 5th Ed

 # p. 458
 # according to Studenmund this is approximating the derivitive of the logit
 # the expected value of D (probability of y = 1) caused by a 
 # one unit increase in X holding constant the other independent variables 
 # in the equation = b1*D*(1-D) where D and b1 are estimates 

 # this seems to work in a single variable regression but not the full model


 b1 <- 1.33 # from gender on single variable model mylogit 2
 Di <- mean(y)

 (me <- b1*(Di*(1-Di))) # .276

 # does not work on gender from multivariable logit
 b1 <- 3.45 # from gender on single variable model mylogit
 Di <- mean(y)

 (me <- b1*(Di*(1-Di)))

 # does not work on age from multivariable logit
 b1 <- .20 # from gender on single variable model mylogit
 Di <- mean(y)

 (me <- b1*(Di*(1-Di)))


 # does not work well on age from single variable logit (running with age instead of gender)
 b1 <- .1391 # from gender on single variable model mylogit
 Di <- mean(y)

 (me <- b1*(Di*(1-Di)))
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: ex marginal effects.R
	# \| DATE: 2/26/21
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: example calculations of marginal effects
	# *----------------------------------------------------------------


	#------------------------------------
	# simulating data for logistic regresssion and marginal effects
	#-------------------------------------

	# all code at once
	set.seed(1)
	gender <- sample(c(0,1), size = 10000, replace = TRUE)
	age <- round(runif(10000, 18, 80))
	xb <- -9 + 3.5gender + 0.2age
	p <- 1/(1 + exp(-xb))
	y <- rbinom(n = 10000, size = 1, prob = p)
	mod <- glm(y ~ gender + age, family = "binomial")
	summary(mod)

	dat <- data.frame(y,gender,age)


	#--------------------------------------------
	# model of interest
	#--------------------------------------------

	mylogit <- glm(y ~ gender + age,family=binomial(link='logit'),data = dat)
	summary(mylogit)



	#-------------------------------
	# linear probability model
	#-------------------------------

	summary(lm(y ~ gender + age, data = dat))

	# notice in all the examples below the ME from the lpm above
	# are nearly identical to the ME derived from logistic regression

	#--------------------------------------------------
	# function for calculating marginal effects
	#--------------------------------------------------

	# simplified based on: Alan Fernihough - https://econpapers.repec.org/paper/ucnwpaper/201122.htm
	mfx <- function(x){
	pdf <- mean(dlogis(predict(x, type = "link")))
	marginal.effects <- pdf*coef(x)
	print("Marginal Effects")
	return(marginal.effects)
	}

	mfx(mylogit) # results

	#----------------------------------------------------
	# mfx with bootsrapped SEs
	#----------------------------------------------------

	# (ref: https://diffuseprior.wordpress.com/2012/04/23/probitlogit-marginal-effects-in-r-2/)

	mfxboot <- function(modform,dist,data,boot=1000,digits=3){
	x <- glm(modform, family=binomial(link=dist),data)
	# get marginal effects
	pdf <- ifelse(dist=="probit",
	mean(dnorm(predict(x, type = "link"))),
	mean(dlogis(predict(x, type = "link"))))
	marginal.effects <- pdf*coef(x)
	# start bootstrap
	bootvals <- matrix(rep(NA,boot*length(coef(x))), nrow=boot)
	set.seed(1111)
	for(i in 1:boot){
	samp1 <- data[sample(1:dim(data)[1],replace=T,dim(data)[1]),]
	x1 <- glm(modform, family=binomial(link=dist),samp1)
	pdf1 <- ifelse(dist=="probit",
	mean(dnorm(predict(x, type = "link"))),
	mean(dlogis(predict(x, type = "link"))))
	bootvals[i,] <- pdf1*coef(x1)
	}
	res <- cbind(marginal.effects,apply(bootvals,2,sd),marginal.effects/apply(bootvals,2,sd))
	if(names(x$coefficients[1])=="(Intercept)"){
	res1 <- res[2:nrow(res),]
	res2 <- matrix(as.numeric(sprintf(paste("%.",paste(digits,"f",sep=""),sep=""),res1)),nrow=dim(res1)[1])
	rownames(res2) <- rownames(res1)
	} else {
	res2 <- matrix(as.numeric(sprintf(paste("%.",paste(digits,"f",sep=""),sep="")),nrow=dim(res)[1]))
	rownames(res2) <- rownames(res)
	}
	colnames(res2) <- c("marginal.effect","standard.error","z.ratio")
	return(res2)
	}


	# calculate MEs
	mfx2 <- mfxboot(y ~ gender + age,"logit",dat)

	# extract ME and SE
	mfxdat <- data.frame(cbind(rownames(mfx2),mfx2))
	mfxdat$me <- as.numeric(as.character(mfxdat$marginal.effect))
	mfxdat$se <- as.numeric(as.character(mfxdat$standard.error))

	# report
	(rpt <- mfxdat[c("V1","me","se")])

	#---------------------------------------
	# using mfx package by Alan Fernihough
	#--------------------------------------

	library(mfx)

	logitmfx(y ~ gender + age, dat, atmean = TRUE, robust = FALSE, clustervar1 = NULL,
	clustervar2 = NULL, start = NULL, control = list())


	#----------------------------------------------
	# using new R margins package
	#----------------------------------------------

	library(margins)

	margins(mylogit)

	summary(margins(mylogit))

	#-------------------------------------------------
	# crude stimate of marginal effects at the mean
	#-------------------------------------------------

	### averaging the meff for the whole data set

	summary(mylogit)

	b1 <- 3.44 # from coefficient of interest from logistic regression model
	b2 <- .200 # coefficient on age (control)
	b0 <- -8.97 # intercept from logistic regression model


	dat$XB <- dat$genderb1 + dat$ageb2 + b0

	meffx <- (exp(dat$XB)/((1+exp(dat$XB))^2))*b1
	summary(meffx) # get mean

	#--------------------------------------------------
	# demonstrating divide by 4 rule for marginal effects
	#--------------------------------------------------

	### !!!!! Notice that the maximum value above .859 corresponds to b1/4 or 3.44/4 = .86 DIVIDE BY FOUR RULE in action
	### !!!!! this illustrates that the divide by 4 rule provides the upper bound of estimated individual marginal
	### !!!!! effects in the data set

	# see http://econometricsense.blogspot.com/2016/05/divide-by-4-rule-for-marginal-effects.html

	# other

	(OR <- exp(3.44)) # this gives us an odds ratio
	(log(OR)) # this gets back the OR

	# so

	log(OR)/4 # upper bound on marginal effects


	#-----------------------------------------------------
	# converting ORs to MEs
	#------------------------------------------------------

	# this function uses baseline rates (Pc) for the control group to convert ORs to marginal effects
	# see: https://stats.stackexchange.com/questions/324410/converting-odds-ratio-to-percentage-increase-reduction

	# this works for an unadjusted OR so to demonstrate we run a single variabel model

	mylogit2 <- glm(y ~ gender,family=binomial(link='logit'),data = dat)

	mfx(mylogit2) # get ME for comparison
	summary(margins(mylogit2)) # ME for comparison


	meffOR <- function(Pc,OR){
	Pt <- (OR * Pc )/(1 + (OR*Pc) - Pc)
	me <- (Pt - Pc)
	print("Marginal Effects Based on OR")
	return(me)
	}

	(OR <- exp(1.33)) # 3.78

	# baseline for control
	library(dplyr)

	dat%>%
	group_by(gender)%>%
	summarize(tavg = mean(y)) # .574 (control group)

	meffOR(.574,OR) # results. # .26 similar to results above

	#---------------------------------------------
	# studenmund's rule of thumb
	#---------------------------------------------

	# ref: Using Econometrics: A Practical Guide 5th Ed

	# p. 458
	# according to Studenmund this is approximating the derivitive of the logit
	# the expected value of D (probability of y = 1) caused by a
	# one unit increase in X holding constant the other independent variables
	# in the equation = b1D(1-D) where D and b1 are estimates

	# this seems to work in a single variable regression but not the full model


	b1 <- 1.33 # from gender on single variable model mylogit 2
	Di <- mean(y)

	(me <- b1(Di(1-Di))) # .276

	# does not work on gender from multivariable logit
	b1 <- 3.45 # from gender on single variable model mylogit
	Di <- mean(y)

	(me <- b1(Di(1-Di)))

	# does not work on age from multivariable logit
	b1 <- .20 # from gender on single variable model mylogit
	Di <- mean(y)

	(me <- b1(Di(1-Di)))


	# does not work well on age from single variable logit (running with age instead of gender)
	b1 <- .1391 # from gender on single variable model mylogit
	Di <- mean(y)

	(me <- b1(Di(1-Di)))