import numpy as np

def fit_PU_estimator(X, y, hold_out_ratio, estimator):
    # The training set will be divided into a fitting set, used to fit the
    # estimator and estimate P(s=1|X), and a held-out set of positive samples
    # used to estimate P(s=1|y=1).
    # --------
    # find the indices of the positive/labeled elements
    assert type(y) == np.ndarray, "Must pass np.ndarray rather than list as y"
    positives = np.where(y == 1.)[0]
    # hold_out_size = the *number* of positive/labeled samples
    # that we will use later to estimate P(s=1|y=1)
    hold_out_size = int(np.ceil(len(positives) * hold_out_ratio))
    np.random.shuffle(positives)
    # hold_out = the *indices* of the positive elements
    # that we will later use to estimate P(s=1|y=1)
    hold_out = positives[:hold_out_size]
    # the actual positive *elements* that we will keep aside
    X_hold_out = X[hold_out]
    # remove the held-out elements from X and y
    X = np.delete(X, hold_out, 0)
    y = np.delete(y, hold_out)
    # Fit the estimator on the unlabeled samples plus the remaining positive/labeled
    # ones, in order to estimate P(s=1|X), i.e. the probability that an element is *labeled*
    estimator.fit(X, y)
    # Then use the estimator to predict on the positive held-out set
    # in order to estimate P(s=1|y=1)
    hold_out_predictions = estimator.predict_proba(X_hold_out)
    # take the probability that it is 1
    hold_out_predictions = hold_out_predictions[:, 1]
    # save the mean probability
    c = np.mean(hold_out_predictions)
    return estimator, c

def predict_PU_prob(X, estimator, prob_s1y1):
    # P(y=1|X) = P(s=1|X) / P(s=1|y=1)
    prob_pred = estimator.predict_proba(X)
    prob_pred = prob_pred[:, 1]
    return prob_pred / prob_s1y1
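
A minimal usage sketch follows. The base estimator and the dataset are assumptions for illustration only (scikit-learn's LogisticRegression and a synthetic positive-unlabeled set built with make_classification); neither is part of the gist itself.

# Minimal usage sketch. Assumed for illustration: scikit-learn's
# LogisticRegression as the base estimator and a synthetic PU dataset.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# build a toy dataset and hide most positive labels to simulate PU data
X, y_true = make_classification(n_samples=2000, n_features=10, random_state=0)
y_labeled = np.zeros(len(y_true))
positives = np.where(y_true == 1)[0]
labeled_subset = np.random.choice(positives, size=len(positives) // 4, replace=False)
y_labeled[labeled_subset] = 1.  # only a quarter of the true positives are labeled

# fit on the PU labels, keeping 20% of the labeled positives held out to estimate c = P(s=1|y=1)
estimator, c = fit_PU_estimator(X, y_labeled, hold_out_ratio=0.2,
                                estimator=LogisticRegression(max_iter=1000))

# recover estimated P(y=1|X) by scaling P(s=1|X) with c, then threshold
y_prob = predict_PU_prob(X, estimator, c)
y_pred = (y_prob >= 0.5).astype(int)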