import numpy as np

VOCAB_SIZE = 750

def vectorize_sequences(sequences, dimension):
    # one-hot encode: each sequence becomes a vector of length `dimension`
    # with a 1 at every index that appears in the sequence
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results

vec_seqs = vectorize_sequences(pad_seqs, VOCAB_SIZE)
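
# Sanity check (illustrative, not part of the original gist): each row gets
# a 1 at every index that appears in the corresponding sequence.
print(vectorize_sequences([[1, 3], [2]], 5))
# -> [[0. 1. 0. 1. 0.]
#     [0. 0. 1. 0. 0.]]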

from keras.preprocessing.text import Tokenizer

# take just the target feature
clean_sequences = sequences.loc[:, FEAT_FIELD]
# create a tokenizer with 750 'words' -
# we will have a number representing each of the top 750 words
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
# fit the tokenizer on our data
tokenizer.fit_on_texts(clean_sequences)
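
# The pad_seqs fed to vectorize_sequences above can be produced from the fitted
# tokenizer roughly like this (a sketch; MAX_LEN is an assumed constant that
# does not appear in the original gist):
from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 30  # assumed maximum sequence length
int_seqs = tokenizer.texts_to_sequences(clean_sequences)
pad_seqs = pad_sequences(int_seqs, maxlen=MAX_LEN)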

from lifelines import CoxPHFitter

# regression frame: event indicator, duration column, and the covariates
df_reg = df.loc[:, ['event', 'days_with_bad_req', 'req_day', 'bad_req_days_rate']]
# fit a Cox proportional-hazards model on time-to-first-bad-request
cp = CoxPHFitter()
cp.fit(df_reg, duration_col='days_with_bad_req', event_col='event')
cp.print_summary()
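
# lifelines can also visualize the fitted hazard ratios and produce per-user
# survival curves (illustrative usage of the standard CoxPHFitter API, not
# shown in the original gist):
cp.plot()  # coefficient estimates with confidence intervals
surv_curves = cp.predict_survival_function(df_reg)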

print(f'Probability that a user will make BAD requests after 3 days is: {kmf_breq.predict(3)}')
print(f'Probability that a user will make BAD requests after 14 days is: {kmf_breq.predict(14)}')
print(f'50% of the population will not make bad req after {kmf_breq.percentile(0.50)} days')
print(f'75% of the population will not make bad req after {kmf_breq.percentile(0.25)} days')
print(f'99% of the population will not make bad req after {kmf_breq.percentile(0.01)} days')
print(f'no one will make bad req after {kmf_breq.percentile(0.00)} days')

import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter

plt.rcParams['figure.figsize'] = [6, 5]
# fit a Kaplan-Meier estimator on time-to-first-bad-request
kmf_breq = KaplanMeierFitter()
kmf_breq.fit(df['days_with_bad_req'], event_observed=df['event'])
ax = kmf_breq.plot()
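
# A single robust summary of the curve above is the median time-to-event
# (illustrative; requires the fitted estimator):
print(f'Median days until first bad request: {kmf_breq.median_survival_time_}')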

mod_data = data.copy()
# get the indices of the positive samples
pos_ind = np.where(mod_data.iloc[:, -1].values == 1)[0]
# shuffle them
np.random.shuffle(pos_ind)
# now let's take 150 positives that will remain labeled
pos_sample = pos_ind[:150]
# create the new target variable, mark the whole data set as unlabeled (-1),
# then flag the sampled positives as labeled (assumes a default integer index)
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
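
# quick sanity check of the masked labels (illustrative): expect 150 rows
# marked 1 and the rest -1
print(mod_data['class_test'].value_counts())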

import xgboost as xgb

# fit the PU estimator, holding out 20% of the positives to estimate P(s=1|y=1)
pu_estimator, probs1y1 = fit_PU_estimator(
    x_train,
    y_train,
    0.2,
    xgb.XGBClassifier())

# P(s=1|X): the probability that a sample is *labeled*
predicted_s = pu_estimator.predict_proba(x_train)
predicted_s = predicted_s[:, 1]
# Elkan & Noto correction: P(y=1|X) = P(s=1|X) / P(s=1|y=1)
predicted_y = predicted_s / probs1y1
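
# To turn the corrected probabilities into hard labels, one plausible
# (assumed) cutoff is 0.5; the original gist does not fix a threshold:
predicted_label = np.where(predicted_y > 0.5, 1, 0)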

def fit_PU_estimator(X, y, hold_out_ratio, estimator):
    # The training set is divided into a fitting-set, used to fit the estimator
    # and estimate P(s=1|X), and a held-out set of positive samples, used to
    # estimate P(s=1|y=1). The body below follows the standard Elkan-Noto recipe.
    assert (type(y) == np.ndarray), "Must pass np.ndarray rather than list as y"
    # find the indices of the positive/labeled elements
    positives = np.where(y == 1.)[0]
    hold_out_size = int(np.ceil(len(positives) * hold_out_ratio))
    np.random.shuffle(positives)
    hold_out = positives[:hold_out_size]
    # fit on everything except the held-out positives
    estimator.fit(np.delete(X, hold_out, axis=0), np.delete(y, hold_out))
    # P(s=1|y=1): mean predicted probability over the held-out positives
    probs1y1 = estimator.predict_proba(X[hold_out])[:, 1].mean()
    return estimator, probs1y1

from sklearn.preprocessing import MinMaxScaler

# encode all the data
encoded_seqs = encode_sequence_list(seqs_ds.iloc[:, 0])
# scale it to [0, 1]
scaled_data = MinMaxScaler().fit_transform(encoded_seqs)
# run it through the autoencoder
predicted = autoencoder.predict(scaled_data)
# the per-sequence reconstruction error (MSE) is our anomaly score
mse = np.mean(np.power(scaled_data - predicted, 2), axis=1)
# now add the scores to our data frame
seqs_ds['MSE'] = mse
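
# Sequences that the autoencoder reconstructs poorly are anomaly candidates.
# One plausible (assumed) cutoff is a high quantile of the MSE distribution;
# the original gist does not fix a threshold:
threshold = np.quantile(mse, 0.95)
seqs_ds['is_anomaly'] = seqs_ds['MSE'] > threshold
print(seqs_ds['is_anomaly'].value_counts())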