Skip to content

Instantly share code, notes, and snippets.

@jseabold
Created September 27, 2016 20:13
Show Gist options
  • Save jseabold/b91555532584a918e51b88a818003e2e to your computer and use it in GitHub Desktop.
Save jseabold/b91555532584a918e51b88a818003e2e to your computer and use it in GitHub Desktop.
Create correlated binary variables. Based on Leisch, Weingessel, and Hornik (1998).
"""
Heavily inspired by the R package bindata.
"""
import numpy as np
from scipy import interpolate
from scipy import stats
def corr_to_joint(corr, marginals):
"""
Create joint probabilities from a correlations and marginal probabilities
Parameters
----------
corr : array_like
A correlation matrix
marginals : array_like
Marginal probabilities
Returns
-------
joint : ndarray
An array of joint probabilities
"""
corr = np.asarray(corr)
marginals = np.asarray(marginals)
std = (marginals * (1 - marginals)) ** .5
return corr * np.outer(std, std) + np.outer(marginals, marginals)
def create_joint_probability_grid(marginals, corr):
"""
Probability that two random variates are above zero.
Parameters
----------
marginals : array_like
1d array containing the grid of marginal probabilities
corr : array_like
1d array containing the grid of correlations
Returns
-------
joint : ndarray
A N x N X P array with N == marginals.shape[0] and P == corr.shape[0]
Each (i, j, k) entry contains the probability that both components
of a normal random variable with mean `
scipy.stats.norm.ppf(marginals[i, j])` and correlation corr[k] are
larger than zero.
Examples
--------
>>> marginal_grid = np.linspace(0, 1, 51)
>>> corr_grid = np.linspace(-1, 1, 101)
>>> values = create_joint_probability_grid(marginal_grid, corr_grid)
"""
marginals = np.asarray(marginals)
corr = np.asarray(corr)
n_marginals = marginals.shape[0]
n_corr = corr.shape[0]
z = np.zeros((n_marginals, n_marginals, n_corr))
mu = stats.norm.ppf(marginals)
for k in range(n_corr):
cov = np.array([[1, corr[k]], [corr[k], 1]])
for m in range(n_marginals):
mu1 = mu[m]
for n in range(m, n_marginals):
# could take care of the special cases of corr == -1, 0, 1
# as well
# if corr[k] == -1: # this can't ever be non-zero?
# z[m, n, k] = max(marginals[m] + marginals[n] - 1, 0)
# elif corr[k] == 0:
# z[m, n, k] = marginals[m] * marginals[n]
# elif corr[k] == 1:
# z[m, n, k] = min(marginals[m], marginals[n])
if marginals[m] * marginals[n] == 0:
z[m, n, k] = 0
elif marginals[m] == 1:
z[m, n, k] = marginals[n]
elif marginals[n] == 1:
z[m, n, k] = marginals[m]
# 10 is a "large" variable for the upper bound of integration
else:
z[m, n, k], _ = stats.mvn.mvnun(
[0, 0], [10, 10], [mu1, mu[n]], cov)
z[n, m, k] = z[m, n, k]
return z
def joint_to_cov(joint):
"""
Convert a joint probability matrix to a multivariate normal covariance
Parameters
----------
joint : array_like
An ndarray of joint probabilities for two random variables
Returns
-------
cov : ndarray
The covariance matrix
Notes
-----
Computes a covariance matrix for a normal distribution correpsonding to
a binary distribution with marginal probabilities on the diagonal of
`joint` and pairwise probabilities on the off-diagonal. This uses the
hard-coded SIMULVALS created by `create_joint_probability_grid` to do
interpolation.
"""
marginals = np.diag(joint) # you might already have these
# TODO: check joint for pos-def and symmetric
cov = np.eye(joint.shape[0])
# alternatively simulvals could be a dataframe so we can keep up with
# the grid it was generated on
corr_grid = np.linspace(-1, 1, SIMULVALS.shape[-1])
marginal_grid = np.linspace(0, 1, SIMULVALS.shape[0])
simul_grid = (marginal_grid, marginal_grid, corr_grid)
for m in range(joint.shape[1] - 1):
for n in range(m + 1, joint.shape[0]):
# given the observed mu1, mu2, and the joint probabilities
# back out the covariance that would have given these observations
x = marginals[np.repeat([m, n], corr_grid.shape[0])]
x = x.reshape(-1, 2, order='F')
x = np.column_stack((x, corr_grid))
# interpolate joint probabilities for these marginals over the
# correlations
y = interpolate.interpolate.interpn(simul_grid, SIMULVALS, x)
# given this get a function that maps joint -> cov and
# evaluate at joint
func = interpolate.interp1d(y, corr_grid)
cov[m, n] = cov[n, m] = func(joint[m, n])
return cov
def correlated_binary_rvs(marginals, corr, size=0):
"""
Generate correlated binary random variables
Parameters
----------
marginals : array_like
1d array of marginal probabilities
corr : array_like
2d array of binary correlation for the variables
size : int
The number of observations for the returned variables
Returns
-------
y : ndarray
The correlated random binary variables
"""
joint = corr_to_joint(corr, marginals)
cov = joint_to_cov(joint)
X = np.random.multivariate_normal(stats.norm.ppf(marginals),
cov=cov, size=size) > 0
return X.astype(float)
MARGINAL_GRID = np.linspace(0, 1, 51)
CORR_GRID = np.linspace(-1, 1, 101)
SIMULVALS = create_joint_probability_grid(MARGINAL_GRID, CORR_GRID)
@xdavio
Copy link

xdavio commented Oct 5, 2016

rough code to simulate a valid binary correlation matrix

https://gist.github.com/xdavio/c90347adda522ba784e4e37b31d36535

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment