Created
December 12, 2016 16:13
-
-
Save raghavrv/9fb33819f6f0c447c5dacaa3eb92cbef to your computer and use it in GitHub Desktop.
value dropper alternative api
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn.utils.validation import check_random_state | |
from sklearn.externals import six | |
from functools import partial | |
def drop_values_mcar(X, y=None, missing_values=np.nan,
                     proba=0.1, random_state=None):
    """Drop values from X uniformly at random (MCAR missingness).

    Each entry of X is independently replaced by ``missing_values``
    with probability ``proba``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data matrix. A copy is modified and returned; the input
        array is left untouched.

    y : ignored
        Present only for API consistency with the other droppers.

    missing_values : object, default np.nan
        The placeholder written into the dropped positions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float, or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    X_dropped : ndarray of shape (n_samples, n_features)
        A copy of X with the selected entries replaced.
    """
    rng = check_random_state(random_state)
    X = X.copy()
    # BUG FIX: the original called ``rng.uniform(X.shape)``, which passes
    # X.shape as the ``low`` bound rather than the output ``size``, so it
    # did not draw one uniform value per entry of X.
    X[rng.uniform(size=X.shape) < proba] = missing_values
    return X
def drop_values_nmar(
        X, y=None, missing_values=np.nan, proba=0.1,
        proba_multiplier_func=None,
        random_state=None):
    """Drop values from X with NMAR missingness based on the values of X.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data matrix. A copy is modified and returned.

    y : array-like of shape (n_samples,), default None
        Optional labels, forwarded to ``proba_multiplier_func``.

    missing_values : object, default np.nan
        The placeholder written into the dropped positions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float, or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    proba_multiplier_func : function object, default None
        The ``proba_multiplier_func(X, y=None)`` will be multiplied with
        the ``proba`` vector to generate the final thresholding values to
        generate the NMAR mask.

        When None (the default), ``softmax_proba_multiplier`` is used,
        which returns ``softmax(X)`` such that the missing mask is::

            missing_mask = np.random.uniform(X.shape) < softmax(X) * proba

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    X_dropped : ndarray of shape (n_samples, n_features)
        A copy of X with the selected entries replaced.
    """
    # BUG FIX: the original used ``softmax_proba_multiplier`` directly as
    # the default argument, but that function is defined *after* this one
    # in the file, so the ``def`` statement raised NameError at import
    # time. Resolve the default lazily at call time instead.
    if proba_multiplier_func is None:
        proba_multiplier_func = softmax_proba_multiplier
    rng = check_random_state(random_state)
    X = X.copy()
    X[rng.uniform(size=X.shape) < (proba * proba_multiplier_func(X, y))] = missing_values
    return X
def softmax_proba_multiplier(X, y=None):
    """Generate the row-wise softmax(X), usable as a drop-proba multiplier.

    Parameters
    ----------
    X : nd-array like of shape (n_samples, n_features)
        The data on which the softmax will be computed.

    y : ignored
        Present only for API consistency with the other multipliers.

    Returns
    -------
    softmax : nd-array like of shape (n_samples, n_features)
        The softmax for the data matrix, X, is returned::

            e^X / sum(e^X)

        computed independently per row, so each row sums to 1.
    """
    # FIX: subtract the per-row maximum before exponentiating. This is
    # mathematically a no-op for softmax but prevents np.exp overflowing
    # to inf (and the result becoming nan) for large values of X.
    shifted = X - np.max(X, axis=1, keepdims=True)
    exp_X = np.exp(shifted)
    return exp_X / exp_X.sum(axis=1, keepdims=True)
def label_based_proba_multiplier(X, y, label_importances):
    """Generate multiplier values that depend on the class label, y.

    Parameters
    ----------
    X : nd-array like of shape (n_samples, n_features)
        The data matrix; only its number of samples is used here.

    y : array-like of shape (n_samples,)
        The class labels of the samples.

    label_importances : dict
        The label values are keys and the label importances are values.
        For instance if one wants missingness that depends on the label,
        such that values of samples from class 1 have high
        drop-probabilities, values from samples of class 0 have low
        drop-probabilities, and 0 drop-probabilities for values of
        samples from class 2, it could be represented with a dict::

            {0: 0.2, 1: 0.8}

        Labels absent from the dict get a multiplier of 0 (never dropped).

    Returns
    -------
    multiplier : ndarray of shape (n_samples, 1)
        Per-sample multipliers, shaped to broadcast across features.
    """
    # FIX: ``dtype=np.float`` — the np.float alias was removed in
    # NumPy 1.20+; the builtin float is the documented replacement.
    multiplier = np.zeros(X.shape[0], dtype=float)
    for label, importance in label_importances.items():
        multiplier[y == label] = importance
    return multiplier[:, np.newaxis]
# ===============================
# Example Snippet
# ===============================
# Two independent RandomState(0) streams so X and y are reproducible.
X = np.random.RandomState(0).randint(0, 9, (20, 4)).astype(float)
y = np.random.RandomState(0).randint(0, 3, (20,))

print("\nBased on softmax of X")
for drop_proba in (0.4, 0.9):
    X_softmax_dropped = drop_values_nmar(X, y, missing_values=np.nan,
                                         proba_multiplier_func=softmax_proba_multiplier,
                                         proba=drop_proba)
    print(X_softmax_dropped)
    print()

print("\n Based on class label of samples, y")
for drop_proba in (0.2, 0.6):
    # Twice class 1 samples missing compared to class 0
    # No missing in class 2
    class_multiplier = partial(label_based_proba_multiplier,
                               label_importances={0: 0.5, 1: 1})
    X_dropped = drop_values_nmar(X, y, missing_values=np.nan,
                                 proba_multiplier_func=class_multiplier,
                                 proba=drop_proba)
    for row, label in zip(X_dropped, y):
        print(row, label)
    print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment