import numpy as np

def fit_PU_estimator(X, y, hold_out_ratio, estimator):
    # The training set will be divided into a fitting set, used to fit the
    # estimator and estimate P(s=1|X), and a held-out set of positive samples
    # used to estimate P(s=1|y=1).
    # --------
    # find the indices of the positive/labeled elements
    assert type(y) == np.ndarray, "Must pass np.ndarray rather than list as y"
    positives = np.where(y == 1.)[0]
    # hold_out_size = the *number* of positive/labeled samples
    # that we will use later to estimate P(s=1|y=1)
    hold_out_size = int(np.ceil(len(positives) * hold_out_ratio))
    np.random.shuffle(positives)
    # hold_out = the *indices* of the positive elements
    # that we will later use to estimate P(s=1|y=1)
    hold_out = positives[:hold_out_size]
    # the actual positive *elements* that we will keep aside
    X_hold_out = X[hold_out]
    # remove the held-out elements from X and y
    X = np.delete(X, hold_out, 0)
    y = np.delete(y, hold_out)
    # Fit the estimator on the unlabeled samples plus the remaining positive/labeled
    # ones, in order to estimate P(s=1|X), i.e. the probability that an element is *labeled*
    estimator.fit(X, y)
    # Then use the estimator to predict on the positive held-out set
    # in order to estimate P(s=1|y=1)
    hold_out_predictions = estimator.predict_proba(X_hold_out)
    # take the probability that it is 1
    hold_out_predictions = hold_out_predictions[:, 1]
    # save the mean probability
    c = np.mean(hold_out_predictions)
    return estimator, c

def predict_PU_prob(X, estimator, prob_s1y1):
    # P(y=1|X) = P(s=1|X) / P(s=1|y=1)
    prob_pred = estimator.predict_proba(X)
    prob_pred = prob_pred[:, 1]
    return prob_pred / prob_s1y1
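
A minimal usage sketch follows. The base estimator and the dataset are assumptions for illustration only (scikit-learn's LogisticRegression and a synthetic positive-unlabeled set built with make_classification); neither is part of the gist itself.

# Minimal usage sketch. Assumed for illustration: scikit-learn's
# LogisticRegression as the base estimator and a synthetic PU dataset.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# build a toy dataset and hide most positive labels to simulate PU data
X, y_true = make_classification(n_samples=2000, n_features=10, random_state=0)
y_labeled = np.zeros(len(y_true))
positives = np.where(y_true == 1)[0]
labeled_subset = np.random.choice(positives, size=len(positives) // 4, replace=False)
y_labeled[labeled_subset] = 1.  # only a quarter of the true positives are labeled

# fit on the PU labels, keeping 20% of the labeled positives held out to estimate c = P(s=1|y=1)
estimator, c = fit_PU_estimator(X, y_labeled, hold_out_ratio=0.2,
                                estimator=LogisticRegression(max_iter=1000))

# recover estimated P(y=1|X) by scaling P(s=1|X) with c, then threshold
y_prob = predict_PU_prob(X, estimator, c)
y_pred = (y_prob >= 0.5).astype(int)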