Skip to content

Instantly share code, notes, and snippets.

@raghavrv
Last active December 12, 2016 17:03
Show Gist options
  • Save raghavrv/3237a1753077b0e3bbc3aef870315771 to your computer and use it in GitHub Desktop.
Alternative (simplified?) api for dropping values (NMAR and MCAR)
import numpy as np
from sklearn.utils.validation import check_random_state
from sklearn.externals import six
from functools import partial
def mcar_mask(X, y=None, proba=0.1, random_state=None):
    """Generate an MCAR (missing completely at random) boolean mask.

    Every entry of X is dropped independently with probability ``proba``,
    regardless of the data values.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data whose shape determines the mask shape.

    y : ignored
        Present for API consistency with the other mask functions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    mask : boolean ndarray of shape ``X.shape``
        True where a value should be dropped.
    """
    rng = check_random_state(random_state)
    # BUG FIX: the original called ``rng.uniform(X.shape)``, which passes the
    # shape tuple as the ``low`` argument and returns a single scalar instead
    # of an array.  ``size=`` must be passed by keyword, as the other mask
    # functions in this file already do.
    return rng.uniform(size=X.shape) < proba
def mnar_softmax_mask(X, y=None, proba=0.1, random_state=None):
    """Generate an MNAR mask whose drop-probabilities follow a row-wise
    softmax of X (larger values are more likely to be dropped).

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data whose values determine the per-entry drop-probabilities.

    y : ignored
        Present for API consistency with the other mask functions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    mask : boolean ndarray of shape ``X.shape``
        True where a value should be dropped.
    """
    # Row-wise softmax.  Subtracting the per-row maximum before
    # exponentiating is mathematically equivalent (the factor cancels in the
    # normalization) but avoids overflow in np.exp for large values of X.
    shifted = X - X.max(axis=1).reshape(X.shape[0], 1)
    proba_multiplier = np.exp(shifted)
    proba_multiplier /= proba_multiplier.sum(axis=1).reshape(X.shape[0], 1)
    rng = check_random_state(random_state)
    return rng.uniform(size=X.shape) < proba * proba_multiplier
def mnar_label_based_mask(X, y, label_importances, proba=0.1,
                          random_state=None):
    """Generate an MNAR mask whose drop-probabilities depend on the label y.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data whose shape determines the mask shape.

    y : ndarray of shape (n_samples,)
        Class labels; each sample's drop-probability is scaled by the
        importance registered for its label.

    label_importances : dict
        The label values are keys and the label importances are values.
        For instance if one wants missing that depends on the label,
        such that values of samples from class 1 have high drop-probabilities
        and values from samples of class 0 have low drop-probabilities, and
        0 drop-probabilities for values of samples from class 2, it could
        be represented with a dict::

            {0: 0.2, 1: 0.8}

        Labels absent from the dict get a multiplier of 0 (never dropped).

    proba : float, default 0.1
        Base drop-probability, scaled per-sample by the label importance.

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    mask : boolean ndarray of shape ``X.shape``
        True where a value should be dropped.
    """
    rng = check_random_state(random_state)
    # FIX: ``np.float`` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin ``float`` is the documented replacement.
    multiplier = np.zeros(X.shape[0], dtype=float)
    for key, value in label_importances.items():
        multiplier[y == key] = value
    return rng.uniform(size=X.shape) < proba * multiplier[:, np.newaxis]
def drop_values(X, y=None, missing_values=np.nan, mask_function=None,
                random_state=None, **kwargs):
    """Return a copy of X with some entries replaced by ``missing_values``.

    The entries to replace are selected by ``mask_function``, which receives
    X, y, a validated random-number generator and any extra ``kwargs``, and
    must return a boolean mask of shape ``X.shape``.
    """
    generator = check_random_state(random_state)
    X_dropped = X.copy()  # never mutate the caller's array
    drop_mask = mask_function(X_dropped, y, random_state=generator, **kwargs)
    X_dropped[drop_mask] = missing_values
    return X_dropped
# ===============================
# Example Snippet
# ===============================
X = np.random.RandomState(0).randint(0, 9, (20, 4)).astype(float)
y = np.random.RandomState(0).randint(0, 3, (20,))

print("\nBased on softmax of X")
for drop_proba in (0.4, 0.9):
    X_softmax = drop_values(X, y, missing_values=np.nan,
                            mask_function=mnar_softmax_mask,
                            proba=drop_proba)
    print(X_softmax)
    print()

print("\n Based on class label of samples, y")
for drop_proba in (0.2, 0.6):
    # Twice class 1 samples missing compared to class 0
    # No missing in class 2
    X_dropped = drop_values(X, y, missing_values=np.nan,
                            mask_function=mnar_label_based_mask,
                            label_importances={0: 0.5, 1: 1},
                            proba=drop_proba)
    for row, label in zip(X_dropped, y):
        print(row, label)
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment