tyarkoni · April 3, 2016 15:59
diff --git a/simulate_matching.py b/simulate_matching.py
 '''
 A small simulation to demonstrate that matching trials does not solve the
 problem of residual confounding. For description of original problem, see
 http://dx.doi.org/10.1371/journal.pone.0152719
 Here we simulate a situation where we match trials from two conditions that
 differ in Y on an indicator M. By hypothesis, there is no difference in Y in
 the population after controlling for M. But because of measurement error,
 matching on M will, on average, leave a residual mean difference in the Y's.
 Raising the reliability of M (REL_M) will decrease this difference, and setting
 it to 1.0 will eliminate it completely, demonstrating that matching works just
 as well as statistical control when the matching indicator is measured with
 perfect reliability.
 '''
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 # %matplotlib inline  # If running in notebook

 # Number of trials to match
 N = 100

 # Number of simulation reps
 REPS = 1000

 # Difference (in sd) between conditions
 X_DIFF = 0.5

 # Correlation between M and Y
 R_MY = 0.5

 # Reliability of M: set to 1 to eliminate difference
 REL_M = 0.5

 # Pool to sample trials from
 POOL = 10000


 ### Simulation ###
 y_diffs = np.zeros(REPS)
 matched_diffs = np.zeros(REPS)

 for rep_i in range(REPS):
    
    # Create variables
    M = np.random.normal([0, X_DIFF], size=(POOL, 2))
    Y = M * R_MY + np.sqrt(1 - R_MY ** 2) * np.random.normal(size=(POOL, 2))

    # Introduce error to M
    M = M * REL_M + np.sqrt(1 - REL_M ** 2) * np.random.normal(size=(POOL, 2))

    # Compute zero-order difference in Y for same sample size
    y_diffs[rep_i] = np.diff(Y[np.random.choice(POOL, N, False)].mean(0))

    # Match on M
    c1, c2 = np.c_[M[:, 0], Y[:, 0]], np.c_[M[:, 1], Y[:, 1]]
    c1 = c1[np.argsort(c1[:, 0])]
    c2 = c2[np.argsort(c2[:, 0])]
    j = 0
    matched = []
    for i, v in enumerate(c1[:, 0]):
        other = c2[j, 0]
        # Call it a match if values are very close
        if np.abs(v - other) < 0.0002:
            matched.append([c1[i, 1], c2[j, 1]])
            j += 1
        elif v > other:
            j += 1

    matched = np.array(matched)
    matched = matched[np.random.choice(len(matched), N, False)]
    matched_diffs[rep_i] = np.diff(matched.mean(0))

 print("Original difference in y:", y_diffs.mean().round(2))
 print("Difference in y after matching:", matched_diffs.mean().round(2))
 ax = sns.kdeplot(y_diffs)
 ax = sns.kdeplot(matched_diffs, ax=ax)
	'''
	A small simulation to demonstrate that matching trials does not solve the
	problem of residual confounding. For description of original problem, see
	http://dx.doi.org/10.1371/journal.pone.0152719
	Here we simulate a situation where we match trials from two conditions that
	differ in Y on an indicator M. By hypothesis, there is no difference in Y in
	the population after controlling for M. But because of measurement error,
	matching on M will, on average, leave a residual mean difference in the Y's.
	Raising the reliability of M (REL_M) will decrease this difference, and setting
	it to 1.0 will eliminate it completely, demonstrating that matching works just
	as well as statistical control when the matching indicator is measured with
	perfect reliability.
	'''
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	# %matplotlib inline # If running in notebook

	# Number of trials to match
	N = 100

	# Number of simulation reps
	REPS = 1000

	# Difference (in sd) between conditions
	X_DIFF = 0.5

	# Correlation between M and Y
	R_MY = 0.5

	# Reliability of M: set to 1 to eliminate difference
	REL_M = 0.5

	# Pool to sample trials from
	POOL = 10000


	### Simulation ###
	y_diffs = np.zeros(REPS)
	matched_diffs = np.zeros(REPS)

	for rep_i in range(REPS):

	# Create variables
	M = np.random.normal([0, X_DIFF], size=(POOL, 2))
	Y = M * R_MY + np.sqrt(1 - R_MY ** 2) * np.random.normal(size=(POOL, 2))

	# Introduce error to M
	M = M * REL_M + np.sqrt(1 - REL_M ** 2) * np.random.normal(size=(POOL, 2))

	# Compute zero-order difference in Y for same sample size
	y_diffs[rep_i] = np.diff(Y[np.random.choice(POOL, N, False)].mean(0))

	# Match on M
	c1, c2 = np.c_[M[:, 0], Y[:, 0]], np.c_[M[:, 1], Y[:, 1]]
	c1 = c1[np.argsort(c1[:, 0])]
	c2 = c2[np.argsort(c2[:, 0])]
	j = 0
	matched = []
	for i, v in enumerate(c1[:, 0]):
	other = c2[j, 0]
	# Call it a match if values are very close
	if np.abs(v - other) < 0.0002:
	matched.append([c1[i, 1], c2[j, 1]])
	j += 1
	elif v > other:
	j += 1

	matched = np.array(matched)
	matched = matched[np.random.choice(len(matched), N, False)]
	matched_diffs[rep_i] = np.diff(matched.mean(0))

	print("Original difference in y:", y_diffs.mean().round(2))
	print("Difference in y after matching:", matched_diffs.mean().round(2))
	ax = sns.kdeplot(y_diffs)
	ax = sns.kdeplot(matched_diffs, ax=ax)