mingjiphd · January 27, 2026 18:32 · mingjiphd · Jan 26, 2026
diff --git a/Readme.txt b/Readme.txt
 This R script provides a step-by-step guide for causal analysis using propensity score matching in R. The primary R package demonstrated is MatchIt. Multiple matching methods available in MatchIt are covered in detail. The tutorial also demonstrates how to visualize balance after propensity score subclassification. After matching, the video walks through calculating causal effect estimates using different matching methods.
 A step by step video demonstration can be found at: https://youtu.be/iaRos9pkI5k 
diff --git a/R Script b/R Script
 ## How to Perform a Propensity Score Matching Analysis using R

 #The propensity score is the probability that an individual receives a treatment given 
 #  their #observed characteristics.
 #It is used in observational studies to create a balance between treated and untreated 
 #   groups, aiming to mimic the conditions of a randomized controlled trial.
 #By matching or comparing individuals with similar propensity scores, researchers can 
 #    reduce bias due to confounding factors.
 #Propensity scores are commonly estimated using logistic regression models based on #baseline covariates.
 #  Methods using propensity scores include matching, stratification, weighting, and 
 #  covariate adjustment to estimate treatment effects more reliably.
 #Effective use of propensity scores depends on carefully choosing relevant covariates and 
 # Checking that treated and untreated groups are balanced after adjustment.


 # Load required libraries
 #install.packages("MatchIt")
 library(MatchIt)  # For propensity score matching
 library(dplyr)    # For data manipulation

 # Step 1: Generate a synthetic dataset
 set.seed(123)  # For reproducibility

 # Generate 200 observations with covariates and treatment assignment
 n <- 200

 # Covariates: age, gender, income
 age <- rnorm(n, mean = 50, sd = 10)
 gender <- rbinom(n, 1, 0.5)  # 0 = Female, 1 = Male
 income <- rnorm(n, mean = 60000, sd = 15000)

 # Treatment assignment based on logistic model of covariates
 logit_p <- -5 + 0.05*age + 0.8*gender + 0.00002*income
 p <- 1 / (1 + exp(-logit_p))
 treatment <- rbinom(n, 1, p)

 # Outcome variable depends on treatment and covariates
 outcome <- 3 + 2*treatment + 0.03*age + 0.5*gender + rnorm(n)

 # Combine into a data frame
 data <- data.frame(treatment = factor(treatment),
                   age = age,
                   gender = factor(gender),
                   income = income,
                   outcome = outcome)
 head(data)

 # Step 2: Perform propensity score matching
 # Match treated (treatment == 1) with untreated (treatment == 0) on covariates age, gender, income
 # Using nearest neighbor matching without replacement
 match_obj <- matchit(treatment ~ age + gender + income,
                     data = data,
                     method = "nearest",
                     ratio = 1,
                     replace = FALSE)

 # Step 3: Check balance before and after matching
 summary(match_obj)

 # Step 4: Extract matched data
 matched_data <- match.data(match_obj)

 # Step 5: Outcome analysis on matched data
 # Estimate treatment effect by linear regression on matched data
 outcome_model <- lm(outcome ~ treatment, data = matched_data)
 summary(outcome_model)

 ##matching on exact covariates
 match_obj<- matchit(treatment ~ age + gender + income,
                     data = data,
                     method = "exact")
 summary(match_obj)

 matched_data<-match.data(match_obj)
 outcome_model<-lm(outcome ~ treatment, data = matched_data)
 summary(outcome_model)

 ##partial exact match
 match_obj <- matchit(treatment ~ age + gender + income,
                     data = data,
                     method = "nearest",
                     exact = ~ gender)
 summary(match_obj)
 matched_data<-match.data(match_obj)
 outcome_model<-lm(outcome ~ treatment, data = matched_data)
 summary(outcome_model)

 ##CEM matching (Coarsened Exact Match) 
 ##coarsens continuous covariates into bins and applies exact matching on these grouped values, 
 ##improving overlap while retaining comparability.
 match_obj <- matchit(treatment ~ age + income + gender,
                     data = data,
                     method = "cem")

 summary(match_obj)
 matched_data<-match.data(match_obj)
 outcome_model<-lm(outcome ~ treatment, data = matched_data)
 summary(outcome_model)

 ## Optimal Match
 ##Minimizes the total distance across all pairs. Require the package optmatch
 match_obj <- matchit(treatment ~ age + income + gender,
                     data = data,
                     method = "optimal")
 summary(match_obj)
 matched_data<-match.data(match_obj)
 outcome_model<-lm(outcome ~ treatment, data = matched_data)
 summary(outcome_model)

 ##Full Match
 ##Assigns treated and controls to groups that minimize average within-group distance
 match_obj <- matchit(treatment ~ age + income + gender,
                     data = data,
                     method = "full")
 summary(match_obj)
 matched_data<-match.data(match_obj)
 outcome_model<-lm(outcome ~ treatment, data = matched_data)
 summary(outcome_model)

 ##Subclassification Match
 ##Divides data into strata based on the propensity score, commonly using quintiles
 match_obj <- matchit(treatment ~ age + gender + income,
 data = data,
 method = "subclass",
 subclass = 5)  # This creates 5 subclasses
 summary(match_obj)
 matched_data<-match.data(match_obj)
 outcome_model<-lm(outcome ~ treatment, data = matched_data)
 summary(outcome_model)

 # Load necessary packages
 #install.packages("cobalt")
 library(cobalt)  # for balance plots

 # Visualize covariate balance before and after matching using Love plot
 love.plot(match_obj, 
          stats = "mean.diffs", # standardized mean differences
          threshold = 0.1,      # commonly used threshold for balance
          var.order = "unadjusted", 
          colors = c("red", "blue"),
          stars = "raw")        # show stars for significance

 # Plotting jitter plot to see propensity score distribution by subclass
 plot(match_obj, type = "jitter")

 # Summary of balance statistics
 summary(match_obj, standardize = TRUE)

 summary(mathc_obj,standardiz=FALSE)

 # Optional: use bal.plot from cobalt to plot balance of specific covariates
 bal.plot(match_obj, var.name = "age")
 bal.plot(match_obj, var.name = "income")



 ## Mahalanobis Distance Matching
 ## 
 match_obj <- matchit(treatment ~ age + gender + income, 
                     data = data,
                     method = "nearest",
                     distance = "mahalanobis",
                     mahvars = ~ age + income)
 
 summary(match_obj)
 matched_data<-match.data(match_obj)
 outcome_model<-lm(outcome ~ treatment, data = matched_data)
 summary(outcome_model)
diff --git a/VideoDemo b/VideoDemo
 A step by step video demonstration can be found at: https://youtu.be/iaRos9pkI5k
	## How to Perform a Propensity Score Matching Analysis using R

	#The propensity score is the probability that an individual receives a treatment given
	# their #observed characteristics.
	#It is used in observational studies to create a balance between treated and untreated
	# groups, aiming to mimic the conditions of a randomized controlled trial.
	#By matching or comparing individuals with similar propensity scores, researchers can
	# reduce bias due to confounding factors.
	#Propensity scores are commonly estimated using logistic regression models based on #baseline covariates.
	# Methods using propensity scores include matching, stratification, weighting, and
	# covariate adjustment to estimate treatment effects more reliably.
	#Effective use of propensity scores depends on carefully choosing relevant covariates and
	# Checking that treated and untreated groups are balanced after adjustment.


	# Load required libraries
	#install.packages("MatchIt")
	library(MatchIt) # For propensity score matching
	library(dplyr) # For data manipulation

	# Step 1: Generate a synthetic dataset
	set.seed(123) # For reproducibility

	# Generate 200 observations with covariates and treatment assignment
	n <- 200

	# Covariates: age, gender, income
	age <- rnorm(n, mean = 50, sd = 10)
	gender <- rbinom(n, 1, 0.5) # 0 = Female, 1 = Male
	income <- rnorm(n, mean = 60000, sd = 15000)

	# Treatment assignment based on logistic model of covariates
	logit_p <- -5 + 0.05age + 0.8gender + 0.00002*income
	p <- 1 / (1 + exp(-logit_p))
	treatment <- rbinom(n, 1, p)

	# Outcome variable depends on treatment and covariates
	outcome <- 3 + 2treatment + 0.03age + 0.5*gender + rnorm(n)

	# Combine into a data frame
	data <- data.frame(treatment = factor(treatment),
	age = age,
	gender = factor(gender),
	income = income,
	outcome = outcome)
	head(data)

	# Step 2: Perform propensity score matching
	# Match treated (treatment == 1) with untreated (treatment == 0) on covariates age, gender, income
	# Using nearest neighbor matching without replacement
	match_obj <- matchit(treatment ~ age + gender + income,
	data = data,
	method = "nearest",
	ratio = 1,
	replace = FALSE)

	# Step 3: Check balance before and after matching
	summary(match_obj)

	# Step 4: Extract matched data
	matched_data <- match.data(match_obj)

	# Step 5: Outcome analysis on matched data
	# Estimate treatment effect by linear regression on matched data
	outcome_model <- lm(outcome ~ treatment, data = matched_data)
	summary(outcome_model)

	##matching on exact covariates
	match_obj<- matchit(treatment ~ age + gender + income,
	data = data,
	method = "exact")
	summary(match_obj)

	matched_data<-match.data(match_obj)
	outcome_model<-lm(outcome ~ treatment, data = matched_data)
	summary(outcome_model)

	##partial exact match
	match_obj <- matchit(treatment ~ age + gender + income,
	data = data,
	method = "nearest",
	exact = ~ gender)
	summary(match_obj)
	matched_data<-match.data(match_obj)
	outcome_model<-lm(outcome ~ treatment, data = matched_data)
	summary(outcome_model)

	##CEM matching (Coarsened Exact Match)
	##coarsens continuous covariates into bins and applies exact matching on these grouped values,
	##improving overlap while retaining comparability.
	match_obj <- matchit(treatment ~ age + income + gender,
	data = data,
	method = "cem")

	summary(match_obj)
	matched_data<-match.data(match_obj)
	outcome_model<-lm(outcome ~ treatment, data = matched_data)
	summary(outcome_model)

	## Optimal Match
	##Minimizes the total distance across all pairs. Require the package optmatch
	match_obj <- matchit(treatment ~ age + income + gender,
	data = data,
	method = "optimal")
	summary(match_obj)
	matched_data<-match.data(match_obj)
	outcome_model<-lm(outcome ~ treatment, data = matched_data)
	summary(outcome_model)

	##Full Match
	##Assigns treated and controls to groups that minimize average within-group distance
	match_obj <- matchit(treatment ~ age + income + gender,
	data = data,
	method = "full")
	summary(match_obj)
	matched_data<-match.data(match_obj)
	outcome_model<-lm(outcome ~ treatment, data = matched_data)
	summary(outcome_model)

	##Subclassification Match
	##Divides data into strata based on the propensity score, commonly using quintiles
	match_obj <- matchit(treatment ~ age + gender + income,
	data = data,
	method = "subclass",
	subclass = 5) # This creates 5 subclasses
	summary(match_obj)
	matched_data<-match.data(match_obj)
	outcome_model<-lm(outcome ~ treatment, data = matched_data)
	summary(outcome_model)

	# Load necessary packages
	#install.packages("cobalt")
	library(cobalt) # for balance plots

	# Visualize covariate balance before and after matching using Love plot
	love.plot(match_obj,
	stats = "mean.diffs", # standardized mean differences
	threshold = 0.1, # commonly used threshold for balance
	var.order = "unadjusted",
	colors = c("red", "blue"),
	stars = "raw") # show stars for significance

	# Plotting jitter plot to see propensity score distribution by subclass
	plot(match_obj, type = "jitter")

	# Summary of balance statistics
	summary(match_obj, standardize = TRUE)

	summary(mathc_obj,standardiz=FALSE)

	# Optional: use bal.plot from cobalt to plot balance of specific covariates
	bal.plot(match_obj, var.name = "age")
	bal.plot(match_obj, var.name = "income")



	## Mahalanobis Distance Matching
	##
	match_obj <- matchit(treatment ~ age + gender + income,
	data = data,
	method = "nearest",
	distance = "mahalanobis",
	mahvars = ~ age + income)

	summary(match_obj)
	matched_data<-match.data(match_obj)
	outcome_model<-lm(outcome ~ treatment, data = matched_data)
	summary(outcome_model)
No results found