Last active
March 2, 2020 12:49
-
-
Save a-agmon/51981b7a8f2b0b8eb5df6ac393ad8083 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def fit_PU_estimator(X, y, hold_out_ratio, estimator):
    """Fit ``estimator`` on positive/unlabeled data and estimate c = P(s=1|y=1).

    The training set is split into a fitting set, used to fit the estimator
    so that it models P(s=1|X) (the probability that a sample is *labeled*),
    and a held-out set of labeled positives, used to estimate P(s=1|y=1).

    Args:
        X: Samples, one row per sample. Coerced with ``np.asarray``.
        y: Labels aligned with ``X``; entries equal to 1 mark the
            positive/labeled samples. Coerced with ``np.asarray``, so a
            plain list is accepted as well as an ndarray.
        hold_out_ratio: Fraction of the labeled positives to set aside; the
            held-out count is ``ceil(n_positives * hold_out_ratio)``.
        estimator: Object exposing ``fit(X, y)`` and ``predict_proba(X)``,
            where column 1 of the ``predict_proba`` output is the
            probability of class 1.

    Returns:
        A tuple ``(estimator, c)``: the same estimator object after fitting,
        and the mean predicted class-1 probability over the held-out
        positives.

    Raises:
        ValueError: If no positive sample ends up in the hold-out set (no
            entries equal to 1 in ``y``, or a non-positive
            ``hold_out_ratio``), because c cannot be estimated then.
    """
    # Work on arrays: fancy indexing and np.delete below need ndarray
    # semantics. The caller's objects are never modified, because np.delete
    # returns new arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    # Indices of the positive/labeled elements.
    positives = np.where(y == 1.)[0]

    # The *number* of labeled positives kept aside to estimate P(s=1|y=1).
    hold_out_size = int(np.ceil(len(positives) * hold_out_ratio))
    if hold_out_size < 1:
        # An empty hold-out set would make c the mean of nothing (NaN).
        raise ValueError(
            "Cannot estimate P(s=1|y=1): no positive samples were held out "
            "(check that y contains 1s and that hold_out_ratio > 0)"
        )

    # Pick the held-out positives at random. This uses NumPy's global random
    # state, so seed it with np.random.seed for a reproducible split.
    np.random.shuffle(positives)
    hold_out = positives[:hold_out_size]

    # The actual positive *elements* that are kept aside.
    X_hold_out = X[hold_out]

    # Remove the held-out elements from the data used for fitting.
    X_fit = np.delete(X, hold_out, 0)
    y_fit = np.delete(y, hold_out)

    # Fit on the unlabeled samples plus the remaining labeled positives, so
    # the estimator models P(s=1|X): the probability that an element is
    # *labeled*.
    estimator.fit(X_fit, y_fit)

    # Score the held-out positives and keep the probability of class 1;
    # its mean is the estimate of P(s=1|y=1).
    hold_out_predictions = np.asarray(estimator.predict_proba(X_hold_out))[:, 1]
    c = np.mean(hold_out_predictions)
    return estimator, c
def predict_PU_prob(X, estimator, prob_s1y1):
    """Return the estimator's class-1 probabilities for ``X`` scaled by 1/c.

    Args:
        X: Samples to score; passed unchanged to ``estimator.predict_proba``.
        estimator: Fitted object exposing ``predict_proba(X)``, where
            column 1 of the output is the probability of class 1.
        prob_s1y1: Scalar estimate of P(s=1|y=1) used as the divisor.

    Returns:
        Array holding column 1 of ``predict_proba(X)`` divided by
        ``prob_s1y1``. Values are not clipped, so they can exceed 1 when the
        predicted probability is larger than ``prob_s1y1``.

    Raises:
        ValueError: If ``prob_s1y1`` is not strictly positive (this also
            rejects NaN), since the division would give inf/NaN scores.
    """
    # "not > 0" rather than "<= 0" so that NaN is rejected as well.
    if not prob_s1y1 > 0:
        raise ValueError(
            "prob_s1y1 must be a positive number, got %r" % (prob_s1y1,)
        )
    # Keep only the probability of class 1.
    prob_pred = np.asarray(estimator.predict_proba(X))[:, 1]
    return prob_pred / prob_s1y1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment