Skip to content

Instantly share code, notes, and snippets.

@raghavrv
Created December 12, 2016 16:13
Show Gist options
  • Save raghavrv/9fb33819f6f0c447c5dacaa3eb92cbef to your computer and use it in GitHub Desktop.
Save raghavrv/9fb33819f6f0c447c5dacaa3eb92cbef to your computer and use it in GitHub Desktop.
Value dropper: an alternative API for injecting MCAR / NMAR missing values into a data matrix.
import numpy as np
from sklearn.utils.validation import check_random_state
from sklearn.externals import six
from functools import partial
def drop_values_mcar(X, y=None, missing_values=np.nan,
                     proba=0.1, random_state=None):
    """Drop values from X missing-completely-at-random (MCAR).

    Each entry of X is independently replaced by ``missing_values``
    with probability ``proba``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data matrix. It is copied; the input is not modified.

    y : ignored
        Present only for API consistency with ``drop_values_nmar``.

    missing_values : object, default np.nan
        The placeholder written into the dropped positions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    X_dropped : ndarray of shape (n_samples, n_features)
        A copy of X with the selected values replaced by
        ``missing_values``.
    """
    rng = check_random_state(random_state)
    X = X.copy()
    # BUG FIX: the original called ``rng.uniform(X.shape)``, which passes
    # the shape tuple as the ``low`` argument.  ``size=`` is required to
    # draw one uniform value per entry of X (as the NMAR variant in this
    # file already does).
    X[rng.uniform(size=X.shape) < proba] = missing_values
    return X
def drop_values_nmar(
        X, y=None, missing_values=np.nan, proba=0.1,
        proba_multiplier_func=None,
        random_state=None):
    """Drop values from X not-missing-at-random (NMAR), based on X itself.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data matrix. It is copied; the input is not modified.

    y : ndarray of shape (n_samples,), optional
        Labels, forwarded unchanged to ``proba_multiplier_func``.

    missing_values : object, default np.nan
        The placeholder written into the dropped positions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    proba_multiplier_func : function object, default None
        ``proba_multiplier_func(X, y=None)`` is multiplied with the
        ``proba`` vector to generate the final thresholding values for
        the NMAR mask.

        When None (the default), the ``softmax_proba_multiplier``
        function from this module is used, so that the missing mask is::

            missing_mask = np.random.uniform(size=X.shape) < softmax(X) * proba

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    X_dropped : ndarray of shape (n_samples, n_features)
        A copy of X with the selected values replaced by
        ``missing_values``.
    """
    # BUG FIX: the default was previously ``softmax_proba_multiplier``
    # written directly in the signature, but that function is defined
    # *after* this one in the file, so evaluating this ``def`` raised a
    # NameError at import time.  A None sentinel resolved at call time
    # is backward-compatible and defers the lookup until the name exists.
    if proba_multiplier_func is None:
        proba_multiplier_func = softmax_proba_multiplier
    rng = check_random_state(random_state)
    X = X.copy()
    threshold = proba * proba_multiplier_func(X, y)
    X[rng.uniform(size=X.shape) < threshold] = missing_values
    return X
def softmax_proba_multiplier(X, y=None):
    """Compute the row-wise softmax of X, usable as a drop-proba multiplier.

    Parameters
    ----------
    X : nd-array like of shape (n_samples, n_features)
        The data on which the softmax will be computed.

    y : ignored
        Present only for API consistency with the other multiplier
        functions.

    Returns
    -------
    softmax : nd-array of shape (n_samples, n_features)
        The row-wise softmax of X::

            e^X / sum(e^X)

        with the sum taken over each row.
    """
    # Subtract the per-row maximum before exponentiating: softmax is
    # shift-invariant, and this prevents np.exp from overflowing to inf
    # (and the ratio from becoming nan) for large entries of X.
    exp_X = np.exp(X - X.max(axis=1, keepdims=True))
    return exp_X / exp_X.sum(axis=1, keepdims=True)
def label_based_proba_multiplier(X, y, label_importances):
    """Generate per-sample multiplier values that depend on the label, y.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Only ``X.shape[0]`` is used, to size the multiplier.

    y : ndarray of shape (n_samples,)
        The class label of each sample.

    label_importances : dict
        The label values are keys and the label importances are values.
        Labels absent from the dict get a multiplier of 0 (their values
        are never dropped).  For instance, to make values of samples
        from class 1 have high drop-probabilities, values from samples
        of class 0 have low drop-probabilities, and zero
        drop-probability for samples from class 2::

            {0: 0.2, 1: 0.8}

    Returns
    -------
    multiplier : ndarray of shape (n_samples, 1)
        Column vector of per-sample multipliers, shaped to broadcast
        against X.
    """
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is
    # the documented replacement (same float64 dtype).
    multiplier = np.zeros(X.shape[0], dtype=float)
    for label, importance in label_importances.items():
        multiplier[y == label] = importance
    # (n_samples,) -> (n_samples, 1) so it broadcasts over features.
    return multiplier[:, np.newaxis]
# ===============================
# Example Snippet
# ===============================
# Small demo: 20 samples, 4 integer-valued features, 3 classes.
X = np.random.RandomState(0).randint(0, 9, (20, 4)).astype(float)
y = np.random.RandomState(0).randint(0, 3, (20,))

print("\nBased on softmax of X")
for proba in (0.4, 0.9):
    dropped = drop_values_nmar(X, y, missing_values=np.nan,
                               proba_multiplier_func=softmax_proba_multiplier,
                               proba=proba)
    print(dropped)
    print()

print("\n Based on class label of samples, y")
# Twice class 1 samples missing compared to class 0
# No missing in class 2
label_multiplier = partial(label_based_proba_multiplier,
                           label_importances={0: 0.5, 1: 1})
for proba in (0.2, 0.6):
    X_dropped = drop_values_nmar(X, y, missing_values=np.nan,
                                 proba_multiplier_func=label_multiplier,
                                 proba=proba)
    for row, label in zip(X_dropped, y):
        print(row, label)
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment