Last active
December 12, 2016 17:03
-
-
Save raghavrv/3237a1753077b0e3bbc3aef870315771 to your computer and use it in GitHub Desktop.
Alternative (simplified?) API for dropping values from data (MNAR and MCAR missingness)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn.utils.validation import check_random_state | |
from sklearn.externals import six | |
from functools import partial | |
def mcar_mask(X, y=None, proba=0.1, random_state=None):
    """Generate a MCAR mask to uniformly drop values.

    Every entry of ``X`` is marked for dropping independently with
    probability ``proba`` (Missing Completely At Random).

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data for which the drop-mask is generated (values unused,
        only the shape matters).

    y : ignored
        Present for API consistency with the other mask functions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    mask : ndarray of bool of shape ``X.shape``
        True where the corresponding value of ``X`` should be dropped.
    """
    rng = check_random_state(random_state)
    # BUG FIX: the shape must go to the ``size`` keyword; passed
    # positionally it is interpreted as ``low`` and the draw does not
    # have the shape of X at all.
    return rng.uniform(size=X.shape) < proba
def mnar_softmax_mask(X, y=None, proba=0.1, random_state=None):
    """Generate an MNAR mask based on the values of X with a softmax probability.

    The drop-probability of entry ``(i, j)`` is
    ``proba * softmax(X[i])[j]``, so within each row the larger values
    are the more likely to go missing (Missing Not At Random).

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data whose values determine the drop-probabilities.

    y : ignored
        Present for API consistency with the other mask functions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    mask : ndarray of bool of shape ``X.shape``
        True where the corresponding value of ``X`` should be dropped.
    """
    # Row-wise softmax. Subtracting the per-row max before exponentiating
    # keeps np.exp from overflowing for large entries; the result is
    # mathematically unchanged.
    proba_multiplier = np.exp(X - X.max(axis=1, keepdims=True))
    proba_multiplier /= proba_multiplier.sum(axis=1, keepdims=True)
    rng = check_random_state(random_state)
    return rng.uniform(size=X.shape) < proba * proba_multiplier
def mnar_label_based_mask(X, y, label_importances, proba=0.1,
                          random_state=None):
    """Generate mask that will depend on the label, y.

    The drop-probability of every value in sample ``i`` is
    ``proba * label_importances[y[i]]``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data for which the drop-mask is generated.

    y : ndarray of shape (n_samples,)
        The class label of each sample.

    label_importances : dict
        The label values are keys and the label importances are values.
        For instance if one wants missing that depends on the label,
        such that values of samples from class 1 has high drop-probabilities
        and values from samples of class 0 have low drop-probabilities, and
        0 drop-probabilities from values of samples from class 2. It could
        be represented with a dict::

            {0: 0.2, 1: 0.8}

        Labels absent from the dict get an importance of 0, i.e. their
        samples never have values dropped.

    proba : float, default 0.1
        Baseline drop-probability, scaled per sample by the importance
        of its label.

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    mask : ndarray of bool of shape ``X.shape``
        True where the corresponding value of ``X`` should be dropped.
    """
    rng = check_random_state(random_state)
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is
    # the equivalent (64-bit) dtype.
    multiplier = np.zeros(X.shape[0], dtype=float)
    for key, value in label_importances.items():
        multiplier[y == key] = value
    return rng.uniform(size=X.shape) < proba * multiplier[:, np.newaxis]
def drop_values(X, y=None, missing_values=np.nan, mask_function=None,
                random_state=None, **kwargs):
    """Return a copy of ``X`` with some values replaced by ``missing_values``.

    ``mask_function(X, y, random_state=..., **kwargs)`` decides which
    entries go missing; ``X`` itself is left untouched.
    """
    rng = check_random_state(random_state)
    X_dropped = X.copy()
    # The mask function receives the (validated) RNG plus any extra
    # keyword arguments such as ``proba`` or ``label_importances``.
    drop_mask = mask_function(X_dropped, y, random_state=rng, **kwargs)
    X_dropped[drop_mask] = missing_values
    return X_dropped
# ===============================
# Example Snippet
# ===============================
X = np.random.RandomState(0).randint(0, 9, (20, 4)).astype(float)
y = np.random.RandomState(0).randint(0, 3, (20,))

# Value-dependent missingness: larger entries of X go missing more often.
print("\nBased on softmax of X")
for drop_proba in (0.4, 0.9):
    X_softmax_dropped = drop_values(X, y, missing_values=np.nan,
                                    mask_function=mnar_softmax_mask,
                                    proba=drop_proba)
    print(X_softmax_dropped)
    print()

# Label-dependent missingness.
print("\n Based on class label of samples, y")
for drop_proba in (0.2, 0.6):
    # Twice class 1 samples missing compared to class 0
    # No missing in class 2
    X_dropped = drop_values(X, y, missing_values=np.nan,
                            mask_function=mnar_label_based_mask,
                            label_importances={0: 0.5, 1: 1},
                            proba=drop_proba)
    for row, label in zip(X_dropped, y):
        print(row, label)
    print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment