Skip to content

Instantly share code, notes, and snippets.

@raghavrv
Created December 12, 2016 16:13
Show Gist options
  • Save raghavrv/9fb33819f6f0c447c5dacaa3eb92cbef to your computer and use it in GitHub Desktop.
Save raghavrv/9fb33819f6f0c447c5dacaa3eb92cbef to your computer and use it in GitHub Desktop.
Value dropper: an alternative API for injecting MCAR / NMAR missing values into a data matrix.
import numpy as np
from sklearn.utils.validation import check_random_state
from sklearn.externals import six
from functools import partial
def drop_values_mcar(X, y=None, missing_values=np.nan,
                     proba=0.1, random_state=None):
    """Drop values from X missing-completely-at-random (MCAR).

    Each entry of X is independently replaced by ``missing_values``
    with probability ``proba``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data matrix. It is copied; the input is not modified.

    y : ignored
        Present only for API consistency with ``drop_values_nmar``.

    missing_values : object, default np.nan
        The placeholder written into the dropped positions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    X_dropped : ndarray of shape (n_samples, n_features)
        A copy of X with the selected values replaced by
        ``missing_values``.
    """
    rng = check_random_state(random_state)
    X = X.copy()
    # BUG FIX: the original called ``rng.uniform(X.shape)``, which passes
    # the shape tuple as the ``low`` argument.  ``size=`` is required to
    # draw one uniform value per entry of X (as the NMAR variant in this
    # file already does).
    X[rng.uniform(size=X.shape) < proba] = missing_values
    return X
def drop_values_nmar(
        X, y=None, missing_values=np.nan, proba=0.1,
        proba_multiplier_func=None,
        random_state=None):
    """Drop values from X not-missing-at-random (NMAR), based on X itself.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        The data matrix. It is copied; the input is not modified.

    y : ndarray of shape (n_samples,), optional
        Labels, forwarded unchanged to ``proba_multiplier_func``.

    missing_values : object, default np.nan
        The placeholder written into the dropped positions.

    proba : float or a vector of floats of shape (n_features,)
        Specifies the drop-probability for all values of X, if
        a single float or the per-feature drop-probabilities if
        a vector of shape (n_features,).

    proba_multiplier_func : function object, default None
        ``proba_multiplier_func(X, y=None)`` is multiplied with the
        ``proba`` vector to generate the final thresholding values for
        the NMAR mask.

        When None (the default), the ``softmax_proba_multiplier``
        function from this module is used, so that the missing mask is::

            missing_mask = np.random.uniform(size=X.shape) < softmax(X) * proba

    random_state : int or np.random.RandomState instance, default None
        Seed for the random number generator.

    Returns
    -------
    X_dropped : ndarray of shape (n_samples, n_features)
        A copy of X with the selected values replaced by
        ``missing_values``.
    """
    # BUG FIX: the default was previously ``softmax_proba_multiplier``
    # written directly in the signature, but that function is defined
    # *after* this one in the file, so evaluating this ``def`` raised a
    # NameError at import time.  A None sentinel resolved at call time
    # is backward-compatible and defers the lookup until the name exists.
    if proba_multiplier_func is None:
        proba_multiplier_func = softmax_proba_multiplier
    rng = check_random_state(random_state)
    X = X.copy()
    threshold = proba * proba_multiplier_func(X, y)
    X[rng.uniform(size=X.shape) < threshold] = missing_values
    return X
def softmax_proba_multiplier(X, y=None):
    """Compute the row-wise softmax of X, usable as a drop-proba multiplier.

    Parameters
    ----------
    X : nd-array like of shape (n_samples, n_features)
        The data on which the softmax will be computed.

    y : ignored
        Present only for API consistency with the other multiplier
        functions.

    Returns
    -------
    softmax : nd-array of shape (n_samples, n_features)
        The row-wise softmax of X::

            e^X / sum(e^X)

        with the sum taken over each row.
    """
    # Subtract the per-row maximum before exponentiating: softmax is
    # shift-invariant, and this prevents np.exp from overflowing to inf
    # (and the ratio from becoming nan) for large entries of X.
    exp_X = np.exp(X - X.max(axis=1, keepdims=True))
    return exp_X / exp_X.sum(axis=1, keepdims=True)
def label_based_proba_multiplier(X, y, label_importances):
    """Generate per-sample multiplier values that depend on the label, y.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Only ``X.shape[0]`` is used, to size the multiplier.

    y : ndarray of shape (n_samples,)
        The class label of each sample.

    label_importances : dict
        The label values are keys and the label importances are values.
        Labels absent from the dict get a multiplier of 0 (their values
        are never dropped).  For instance, to make values of samples
        from class 1 have high drop-probabilities, values from samples
        of class 0 have low drop-probabilities, and zero
        drop-probability for samples from class 2::

            {0: 0.2, 1: 0.8}

    Returns
    -------
    multiplier : ndarray of shape (n_samples, 1)
        Column vector of per-sample multipliers, shaped to broadcast
        against X.
    """
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is
    # the documented replacement (same float64 dtype).
    multiplier = np.zeros(X.shape[0], dtype=float)
    for label, importance in label_importances.items():
        multiplier[y == label] = importance
    # (n_samples,) -> (n_samples, 1) so it broadcasts over features.
    return multiplier[:, np.newaxis]
# ===============================
# Example Snippet
# ===============================
# Small demo: 20 samples, 4 integer-valued features, 3 classes.
X = np.random.RandomState(0).randint(0, 9, (20, 4)).astype(float)
y = np.random.RandomState(0).randint(0, 3, (20,))

print("\nBased on softmax of X")
for proba in (0.4, 0.9):
    dropped = drop_values_nmar(X, y, missing_values=np.nan,
                               proba_multiplier_func=softmax_proba_multiplier,
                               proba=proba)
    print(dropped)
    print()

print("\n Based on class label of samples, y")
# Twice class 1 samples missing compared to class 0
# No missing in class 2
label_multiplier = partial(label_based_proba_multiplier,
                           label_importances={0: 0.5, 1: 1})
for proba in (0.2, 0.6):
    X_dropped = drop_values_nmar(X, y, missing_values=np.nan,
                                 proba_multiplier_func=label_multiplier,
                                 proba=proba)
    for row, label in zip(X_dropped, y):
        print(row, label)
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment