Last active
December 12, 2016 17:03
-
-
Save raghavrv/3237a1753077b0e3bbc3aef870315771 to your computer and use it in GitHub Desktop.
Alternative (simplified?) API for dropping values from data (MNAR and MCAR missingness)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn.utils.validation import check_random_state | |
from sklearn.externals import six | |
from functools import partial | |
def mcar_mask(X, y=None, proba=0.1, random_state=None):
    """Generate a MCAR mask to uniformly drop values.

    Every entry of ``X`` is marked for dropping independently with
    probability ``proba`` (Missing Completely At Random).

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data for which the drop-mask is generated (values unused,
        only the shape matters).

    y : ignored
        Present for API consistency with the other mask functions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    mask : ndarray of bool of shape ``X.shape``
        True where the corresponding value of ``X`` should be dropped.
    """
    rng = check_random_state(random_state)
    # BUG FIX: the shape must go to the ``size`` keyword; passed
    # positionally it is interpreted as ``low`` and the draw does not
    # have the shape of X at all.
    return rng.uniform(size=X.shape) < proba
def mnar_softmax_mask(X, y=None, proba=0.1, random_state=None):
    """Generate an MNAR mask based on the values of X with a softmax probability.

    The drop-probability of entry ``(i, j)`` is
    ``proba * softmax(X[i])[j]``, so within each row the larger values
    are the more likely to go missing (Missing Not At Random).

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data whose values determine the drop-probabilities.

    y : ignored
        Present for API consistency with the other mask functions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    mask : ndarray of bool of shape ``X.shape``
        True where the corresponding value of ``X`` should be dropped.
    """
    # Row-wise softmax. Subtracting the per-row max before exponentiating
    # keeps np.exp from overflowing for large entries; the result is
    # mathematically unchanged.
    proba_multiplier = np.exp(X - X.max(axis=1, keepdims=True))
    proba_multiplier /= proba_multiplier.sum(axis=1, keepdims=True)
    rng = check_random_state(random_state)
    return rng.uniform(size=X.shape) < proba * proba_multiplier
def mnar_label_based_mask(X, y, label_importances, proba=0.1,
                          random_state=None):
    """Generate mask that will depend on the label, y.

    The drop-probability of every value in sample ``i`` is
    ``proba * label_importances[y[i]]``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data for which the drop-mask is generated.

    y : ndarray of shape (n_samples,)
        The class label of each sample.

    label_importances : dict
        The label values are keys and the label importances are values.
        For instance if one wants missing that depends on the label,
        such that values of samples from class 1 has high drop-probabilities
        and values from samples of class 0 have low drop-probabilities, and
        0 drop-probabilities from values of samples from class 2. It could
        be represented with a dict::

            {0: 0.2, 1: 0.8}

        Labels absent from the dict get an importance of 0, i.e. their
        samples never have values dropped.

    proba : float, default 0.1
        Baseline drop-probability, scaled per sample by the importance
        of its label.

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    mask : ndarray of bool of shape ``X.shape``
        True where the corresponding value of ``X`` should be dropped.
    """
    rng = check_random_state(random_state)
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is
    # the equivalent (64-bit) dtype.
    multiplier = np.zeros(X.shape[0], dtype=float)
    for key, value in label_importances.items():
        multiplier[y == key] = value
    return rng.uniform(size=X.shape) < proba * multiplier[:, np.newaxis]
def drop_values(X, y=None, missing_values=np.nan, mask_function=None,
                random_state=None, **kwargs):
    """Return a copy of ``X`` with some values replaced by ``missing_values``.

    ``mask_function(X, y, random_state=..., **kwargs)`` decides which
    entries go missing; ``X`` itself is left untouched.
    """
    rng = check_random_state(random_state)
    X_dropped = X.copy()
    # The mask function receives the (validated) RNG plus any extra
    # keyword arguments such as ``proba`` or ``label_importances``.
    drop_mask = mask_function(X_dropped, y, random_state=rng, **kwargs)
    X_dropped[drop_mask] = missing_values
    return X_dropped
# ===============================
# Example Snippet
# ===============================
X = np.random.RandomState(0).randint(0, 9, (20, 4)).astype(float)
y = np.random.RandomState(0).randint(0, 3, (20,))

# Value-dependent missingness: larger entries of X go missing more often.
print("\nBased on softmax of X")
for drop_proba in (0.4, 0.9):
    X_softmax_dropped = drop_values(X, y, missing_values=np.nan,
                                    mask_function=mnar_softmax_mask,
                                    proba=drop_proba)
    print(X_softmax_dropped)
    print()

# Label-dependent missingness.
print("\n Based on class label of samples, y")
for drop_proba in (0.2, 0.6):
    # Twice class 1 samples missing compared to class 0
    # No missing in class 2
    X_dropped = drop_values(X, y, missing_values=np.nan,
                            mask_function=mnar_label_based_mask,
                            label_importances={0: 0.5, 1: 1},
                            proba=drop_proba)
    for row, label in zip(X_dropped, y):
        print(row, label)
    print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment