# Code snippets collected from a GitHub gist (web-page chrome removed).
# Fit the PU estimator (20% of positives held out) and score the training set.
pu_estimator, probs1y1 = fit_PU_estimator(x_train, y_train, 0.2, xgb.XGBClassifier())

# P(s=1|x) for each training sample; dividing by c = P(s=1|y=1) gives the
# Elkan–Noto estimate of P(y=1|x).
predicted_s = pu_estimator.predict_proba(x_train)[:, 1]
predicted_y = predicted_s / probs1y1
# Work on a copy so the original frame is untouched.
mod_data = data.copy()

# Indices of the positive samples (last column == 1), shuffled so the
# labeled subset below is a random draw.
pos_ind = np.where(mod_data.iloc[:, -1].values == 1)[0]
np.random.shuffle(pos_ind)

# Keep 150 positives as the labeled set; mark the whole data set as
# unlabeled (-1) in the new target column.
pos_sample = pos_ind[:150]
mod_data['class_test'] = -1
from lifelines import KaplanMeierFitter

# Set the figure size BEFORE plotting: rcParams only affects figures created
# after it is set (the original set it after kmf_breq.plot(), a no-op for
# the figure already on screen).
plt.rcParams['figure.figsize'] = [6, 5]

# Fit the Kaplan-Meier survival curve: duration = days_with_bad_req,
# event = whether a bad request was observed.
kmf_breq = KaplanMeierFitter()
kmf_breq.fit(df['days_with_bad_req'], event_observed=df['event'])
ax = kmf_breq.plot()

# percentile(p) returns the time t at which the survival function S(t) == p,
# i.e. the day by which a (1 - p) fraction of the population has made a bad request.
print(f'50% of the population will not make bad req after {kmf_breq.percentile(0.50)} days')
print(f'75% of the population will not make bad req after {kmf_breq.percentile(0.25)} days')
print(f'99% of the population will not make bad req after {kmf_breq.percentile(0.01)} days')
print(f'no one will make bad req after {kmf_breq.percentile(0.00)} days')

# predict(t) returns the survival probability S(t) — the chance a user has
# NOT made a bad request by day t. The original messages stated the opposite
# (and read "will made").
print(f'Probability that user has NOT made BAD requests by day 3 is: {kmf_breq.predict(3)}')
print(f'Probability that user has NOT made BAD requests by day 14 is: {kmf_breq.predict(14)}')
from lifelines import CoxPHFitter

# Regression frame: event indicator, duration, and the two covariates.
covariate_cols = ['event', 'days_with_bad_req', 'req_day', 'bad_req_days_rate']
df_reg = df.loc[:, covariate_cols]

# Fit a Cox proportional-hazards model and show coefficients / p-values.
cp = CoxPHFitter()
cp.fit(df_reg, duration_col='days_with_bad_req', event_col='event')
cp.print_summary()
# Cap the vocabulary at the 750 most frequent tokens.
VOCAB_SIZE = 750

# Keep only the target feature column.
clean_sequences = sequences.loc[:, FEAT_FIELD]

# The tokenizer assigns an integer id to each of the top VOCAB_SIZE words.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(clean_sequences)
def vectorize_sequences(sequences, dimension):
    """Multi-hot encode integer sequences into a (len(sequences), dimension) matrix.

    Each row has 1.0 at every index that appears in the corresponding
    sequence and 0.0 elsewhere.
    """
    encoded = np.zeros((len(sequences), dimension))
    for row_idx, token_ids in enumerate(sequences):
        for token in token_ids:
            encoded[row_idx, token] = 1.0
    return encoded
vec_seqs = vectorize_sequences(pad_seqs, VOCAB_SIZE)

# Hold out the last 25% of samples as the test set.
TRAIN_RATIO = 0.75
train_size = int(len(vec_seqs) * TRAIN_RATIO)
X_train, X_test = vec_seqs[:train_size], vec_seqs[train_size:]
#define the encoder
optimizer = optimizers.Adam(lr=1e-2)
autoencoder.compile(optimizer=optimizer,
loss='mean_squared_error',
metrics=['accuracy'])
checkpointer = ModelCheckpoint(filepath="model_bin.h5",
verbose=0,
save_best_only=True)