This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit the PU (positive-unlabeled) estimator. fit_PU_estimator returns the
# fitted model together with probs1y1, the constant used below to rescale
# the model's scores.
# NOTE(review): 0.2 is presumably a hold-out ratio and probs1y1 presumably an
# estimate of P(s=1 | y=1) -- confirm against fit_PU_estimator.
pu_estimator, probs1y1 = fit_PU_estimator(
    x_train,
    y_train,
    0.2,
    xgb.XGBClassifier(),
)

# Keep only column 1 of the predicted probabilities
# (presumably the "labeled" class, s=1 -- confirm the class ordering).
predicted_s = pu_estimator.predict_proba(x_train)[:, 1]

# Rescale the labeled-class score into an estimate for the true class.
# NOTE(review): the quotient is not clipped, so values above 1 are possible.
predicted_y = predicted_s / probs1y1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Work on a copy so the original `data` frame is left untouched.
mod_data = data.copy()

# Row positions of the positive samples (last column equal to 1).
# This must run before 'class_test' is appended below, because afterwards
# the last column would be 'class_test' rather than the original target.
pos_ind = np.where(mod_data.iloc[:, -1].values == 1)[0]

# Shuffle in place so the slice below is a random draw of positives.
np.random.shuffle(pos_ind)

# The first 150 shuffled positives are the ones that will remain labeled.
pos_sample = pos_ind[:150]

# Create the new target variable and mark the whole data set as unlabeled.
mod_data['class_test'] = -1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lifelines import KaplanMeierFitter

# Set the default figure size BEFORE plotting: rcParams only applies to
# figures created after the assignment, so setting it after plot() would
# leave this figure at the previous default size.
plt.rcParams['figure.figsize'] = [6, 5]

# Fit a Kaplan-Meier estimator on the 'days_with_bad_req' durations, using
# the 'event' column as the event-observed indicator.
kmf_breq = KaplanMeierFitter()
kmf_breq.fit(df['days_with_bad_req'], event_observed=df['event'])

# Plot the fitted estimate and keep the returned axes for later tweaks.
ax = kmf_breq.plot()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Report the durations at which the fitted curve reaches selected levels.
# NOTE(review): percentile(p) is presumably the time t at which the survival
# function equals p, so p = 0.25 corresponds to "75% no longer affected" --
# confirm against the lifelines documentation.
print(f'50% of the population will not make bad req after {kmf_breq.percentile(0.50)} days')
print(f'75% of the population will not make bad req after {kmf_breq.percentile(0.25)} days')
print(f'99% of the population will not make bad req after {kmf_breq.percentile(0.01)} days')
print(f'no one will make bad req after {kmf_breq.percentile(0.00)} days')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Evaluate the fitted Kaplan-Meier estimate at 3 and 14 days.
# NOTE(review): predict(t) is presumably the estimated survival probability
# at time t -- confirm against the lifelines documentation.
print(f'Probability that user will made BAD requests after 3 days is: {kmf_breq.predict(3)}')
print(f'Probability that user will made BAD requests after 14 days is: {kmf_breq.predict(14)}')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lifelines import CoxPHFitter

# Restrict the frame to the columns passed to the model: the event
# indicator, the duration, and the two remaining columns, which the fitter
# treats as covariates.
df_reg = df.loc[:, ['event', 'days_with_bad_req', 'req_day', 'bad_req_days_rate']]

# Fit a Cox proportional-hazards model and print its summary.
cp = CoxPHFitter()
cp.fit(df_reg, duration_col='days_with_bad_req', event_col='event')
cp.print_summary()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Size of the vocabulary handed to the tokenizer.
VOCAB_SIZE = 750

# Take just the target feature.
clean_sequences = sequences.loc[:, FEAT_FIELD]

# Create a tokenizer limited to VOCAB_SIZE 'words'; each retained word is
# represented by an integer index.
# NOTE(review): Keras documents that only the num_words - 1 most common
# words are kept (index 0 is reserved) -- confirm for the version in use.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)

# Fit the tokenizer on our data.
tokenizer.fit_on_texts(clean_sequences)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def vectorize_sequences(sequences, dimension):
    """Multi-hot encode integer sequences into a dense 2-D array.

    Args:
        sequences: Sized iterable of integer index collections, one per row.
        dimension: Number of columns in the output; every index in
            ``sequences`` must be a valid column index for this width.

    Returns:
        A float array of shape ``(len(sequences), dimension)`` in which
        ``results[i, j]`` is 1.0 when ``j`` occurs in ``sequences[i]`` and
        0.0 otherwise. Repeated indices within a row still yield 1.0.
    """
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        # Fancy indexing sets every listed column of row i in one step.
        results[i, sequence] = 1.
    return results
# Multi-hot encode the padded sequences, one column per vocabulary index.
vec_seqs = vectorize_sequences(pad_seqs, VOCAB_SIZE)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create the train and test sets with a positional (unshuffled) split:
# the first TRAIN_RATIO share of the rows is used for training and the
# remainder for testing.
TRAIN_RATIO = 0.75
train_size = int(len(vec_seqs) * TRAIN_RATIO)  # int() truncates toward zero
X_train = vec_seqs[:train_size]
X_test = vec_seqs[train_size:]
# Define the encoder.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adam optimizer with a learning rate of 1e-2. `learning_rate` is the
# supported keyword; the older `lr` alias is deprecated and rejected by
# recent Keras releases.
optimizer = optimizers.Adam(learning_rate=1e-2)

# Compile the autoencoder with a mean-squared-error loss.
autoencoder.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=['accuracy'],
)

# Checkpoint callback writing to model_bin.h5; save_best_only=True keeps
# only the best model seen so far instead of overwriting on every save.
checkpointer = ModelCheckpoint(
    filepath="model_bin.h5",
    verbose=0,
    save_best_only=True,
)