Created
September 27, 2016 20:13
-
-
Save jseabold/b91555532584a918e51b88a818003e2e to your computer and use it in GitHub Desktop.
Create correlated binary variables. Based on Leisch, Weingessel, and Hornik (1998).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Heavily inspired by the R package bindata. | |
""" | |
import numpy as np | |
from scipy import interpolate | |
from scipy import stats | |
def corr_to_joint(corr, marginals): | |
""" | |
Create joint probabilities from a correlations and marginal probabilities | |
Parameters | |
---------- | |
corr : array_like | |
A correlation matrix | |
marginals : array_like | |
Marginal probabilities | |
Returns | |
------- | |
joint : ndarray | |
An array of joint probabilities | |
""" | |
corr = np.asarray(corr) | |
marginals = np.asarray(marginals) | |
std = (marginals * (1 - marginals)) ** .5 | |
return corr * np.outer(std, std) + np.outer(marginals, marginals) | |
def create_joint_probability_grid(marginals, corr): | |
""" | |
Probability that two random variates are above zero. | |
Parameters | |
---------- | |
marginals : array_like | |
1d array containing the grid of marginal probabilities | |
corr : array_like | |
1d array containing the grid of correlations | |
Returns | |
------- | |
joint : ndarray | |
A N x N X P array with N == marginals.shape[0] and P == corr.shape[0] | |
Each (i, j, k) entry contains the probability that both components | |
of a normal random variable with mean ` | |
scipy.stats.norm.ppf(marginals[i, j])` and correlation corr[k] are | |
larger than zero. | |
Examples | |
-------- | |
>>> marginal_grid = np.linspace(0, 1, 51) | |
>>> corr_grid = np.linspace(-1, 1, 101) | |
>>> values = create_joint_probability_grid(marginal_grid, corr_grid) | |
""" | |
marginals = np.asarray(marginals) | |
corr = np.asarray(corr) | |
n_marginals = marginals.shape[0] | |
n_corr = corr.shape[0] | |
z = np.zeros((n_marginals, n_marginals, n_corr)) | |
mu = stats.norm.ppf(marginals) | |
for k in range(n_corr): | |
cov = np.array([[1, corr[k]], [corr[k], 1]]) | |
for m in range(n_marginals): | |
mu1 = mu[m] | |
for n in range(m, n_marginals): | |
# could take care of the special cases of corr == -1, 0, 1 | |
# as well | |
# if corr[k] == -1: # this can't ever be non-zero? | |
# z[m, n, k] = max(marginals[m] + marginals[n] - 1, 0) | |
# elif corr[k] == 0: | |
# z[m, n, k] = marginals[m] * marginals[n] | |
# elif corr[k] == 1: | |
# z[m, n, k] = min(marginals[m], marginals[n]) | |
if marginals[m] * marginals[n] == 0: | |
z[m, n, k] = 0 | |
elif marginals[m] == 1: | |
z[m, n, k] = marginals[n] | |
elif marginals[n] == 1: | |
z[m, n, k] = marginals[m] | |
# 10 is a "large" variable for the upper bound of integration | |
else: | |
z[m, n, k], _ = stats.mvn.mvnun( | |
[0, 0], [10, 10], [mu1, mu[n]], cov) | |
z[n, m, k] = z[m, n, k] | |
return z | |
def joint_to_cov(joint): | |
""" | |
Convert a joint probability matrix to a multivariate normal covariance | |
Parameters | |
---------- | |
joint : array_like | |
An ndarray of joint probabilities for two random variables | |
Returns | |
------- | |
cov : ndarray | |
The covariance matrix | |
Notes | |
----- | |
Computes a covariance matrix for a normal distribution correpsonding to | |
a binary distribution with marginal probabilities on the diagonal of | |
`joint` and pairwise probabilities on the off-diagonal. This uses the | |
hard-coded SIMULVALS created by `create_joint_probability_grid` to do | |
interpolation. | |
""" | |
marginals = np.diag(joint) # you might already have these | |
# TODO: check joint for pos-def and symmetric | |
cov = np.eye(joint.shape[0]) | |
# alternatively simulvals could be a dataframe so we can keep up with | |
# the grid it was generated on | |
corr_grid = np.linspace(-1, 1, SIMULVALS.shape[-1]) | |
marginal_grid = np.linspace(0, 1, SIMULVALS.shape[0]) | |
simul_grid = (marginal_grid, marginal_grid, corr_grid) | |
for m in range(joint.shape[1] - 1): | |
for n in range(m + 1, joint.shape[0]): | |
# given the observed mu1, mu2, and the joint probabilities | |
# back out the covariance that would have given these observations | |
x = marginals[np.repeat([m, n], corr_grid.shape[0])] | |
x = x.reshape(-1, 2, order='F') | |
x = np.column_stack((x, corr_grid)) | |
# interpolate joint probabilities for these marginals over the | |
# correlations | |
y = interpolate.interpolate.interpn(simul_grid, SIMULVALS, x) | |
# given this get a function that maps joint -> cov and | |
# evaluate at joint | |
func = interpolate.interp1d(y, corr_grid) | |
cov[m, n] = cov[n, m] = func(joint[m, n]) | |
return cov | |
def correlated_binary_rvs(marginals, corr, size=0): | |
""" | |
Generate correlated binary random variables | |
Parameters | |
---------- | |
marginals : array_like | |
1d array of marginal probabilities | |
corr : array_like | |
2d array of binary correlation for the variables | |
size : int | |
The number of observations for the returned variables | |
Returns | |
------- | |
y : ndarray | |
The correlated random binary variables | |
""" | |
joint = corr_to_joint(corr, marginals) | |
cov = joint_to_cov(joint) | |
X = np.random.multivariate_normal(stats.norm.ppf(marginals), | |
cov=cov, size=size) > 0 | |
return X.astype(float) | |
MARGINAL_GRID = np.linspace(0, 1, 51) | |
CORR_GRID = np.linspace(-1, 1, 101) | |
SIMULVALS = create_joint_probability_grid(MARGINAL_GRID, CORR_GRID) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
rough code to simulate a valid binary correlation matrix
https://gist.github.com/xdavio/c90347adda522ba784e4e37b31d36535