This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_sequence_anomaly_rank(seq_a): | |
t_vec = tokenizer.texts_to_sequences([seq_a]) | |
pad_seq = pad_sequences(t_vec, maxlen=MAX_FEAT_LEN, padding='post', truncating='post') | |
vec_seq = vectorize_sequences(pad_seq, VOCAB_SIZE) | |
pred_seq = autoencoder.predict(vec_seq) | |
# get anomaly score | |
score = np.mean(np.power(vec_seq - pred_seq, 2), axis=1) | |
# get anomaly rank |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reconstruction error per row: run the inputs through the autoencoder and
# average the squared difference between input and output over axis 1.
predictions = autoencoder.predict(vec_seqs)
squared_error = np.square(vec_seqs - predictions)
mse = np.mean(squared_error, axis=1)
# Store the per-row error next to the sequences
# (assumes `sequences` has one row per row of `vec_seqs` -- TODO confirm).
sequences['MSE'] = mse
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Assign each vector the cluster with which it is most associated:
# per row, take the index of the largest value returned by lda.transform.
cluster_scores = lda.transform(df_avg)
df_avg['cluster'] = np.argmax(cluster_scores, axis=1)

# Chart the different clusters on three stacked axes sharing both scales.
fig, axs = plt.subplots(nrows=3, ncols=1, sharex=True, sharey=True)

# Mean of every column except the last one, restricted to cluster 0.
# NOTE(review): the last column is presumably the 'cluster' column just added
# above -- this holds only if 'cluster' did not already exist in df_avg.
is_cluster_0 = df_avg.cluster == 0
df_avg.iloc[:, :-1].loc[is_cluster_0].mean().plot.bar(ax=axs[0])
axs[0].set_title('Category - 0 (Evening/Night)')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Show a pie chart of the groups.
# value_counts() returns counts ordered by frequency (largest first), not by
# cluster id. Sorting by index puts the counts in cluster-id order so that
# position i lines up with labels[i].
# NOTE(review): labels are assumed to be listed in cluster-id order (label 0
# matches the 'Category - 0' chart title) -- confirm for clusters 1 and 2.
# A cluster with no members is absent from value_counts() and would still
# shift the alignment.
summs = df_avg.cluster.value_counts().sort_index()
# Same counts as a float array (replaces the manual element-by-element fill).
cats = summs.to_numpy(dtype=float)
labels = ['Evening / Night', 'Normal Day', 'Late Night / Early Morning']
sizes = summs.astype(int)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def user_embedding_model(embedding_size = 50): | |
#Embed items and users in vec space | |
# Both inputs are 1-dimensional | |
user = Input(name = 'user', shape = [1]) | |
item = Input(name = 'item', shape = [1]) | |
# (None, 1, 50)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# text_parts: dictionary keyed by userID; each value is the collection of
#   items that user engaged with (one "sentence" per user).
# vector_size: size of the embedding vector, i.e. the latent-factor dimension.
# window: should be set to the max # of items of the user with the greatest
#   # of items, so that all of a user's items fall in one context window.
model = Word2Vec(
    sentences=text_parts.values(),
    vector_size=20,
    window=36,
    min_count=3,
    sg=1,
    ns_exponent=-0.5,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# a dictionary consisting of K:user and V:mean of the user's items | |
user_means = {} | |
for user in list(text_parts.keys()): | |
# get a list of artists the user listens to | |
# (but only if we have a vec for it) | |
artists = [artist for artist in text_parts[user] if model.wv.has_index_for(artist)] | |
if len(artists) == 0: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.mixture import GaussianMixture as GMM | |
def cluster_gmm(matrix, k=4, *, covariance_type='full', random_state=0, n_init=10):
    """Fit a Gaussian mixture model to ``matrix`` and label its rows.

    Args:
        matrix: Data to cluster; passed unchanged to ``fit`` and ``predict``.
        k: Number of mixture components.
        covariance_type: Forwarded to the mixture model (default ``'full'``).
        random_state: Seed forwarded to the mixture model (default ``0``).
        n_init: Number of initializations forwarded to the model (default 10).

    Returns:
        A tuple ``(gmm_model, gmm_labels, centers)``: the fitted model, the
        component predicted for each row of ``matrix``, and the model's
        ``means_`` attribute.
    """
    gmm_model = GMM(
        n_components=k,
        covariance_type=covariance_type,
        random_state=random_state,
        n_init=n_init,
    )
    gmm_model.fit(matrix)
    # Labels are predicted on the same data the model was fitted on.
    gmm_labels = gmm_model.predict(matrix)
    centers = gmm_model.means_
    return gmm_model, gmm_labels, centers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First cluster the item vectors with an 8-component GMM, keeping the fitted
# model, the per-item labels and the cluster centers.
item_vectors = exp_model.wv.vectors
items_model, items_labels, items_cluster_centers = cluster_gmm(item_vectors, k=8)
# [user_means] holds one vector per user: the mean of the item vectors that user has listened to.
# Then use the model to create a new vector for each user,
# based on their probability of belonging to each item cluster.
# this is the empty array |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Encoder model: input -> one ReLU hidden layer -> two parallel linear heads
# (z_mean, z_log_var), both of width latent_dim.
inputs = Input(name='encoder_input', shape=input_shape)
x = Dense(units=intermediate_dim, activation='relu')(inputs)
z_mean = Dense(units=latent_dim, name='z_mean')(x)
z_log_var = Dense(units=latent_dim, name='z_log_var')(x)

# Reparameterization trick: the Lambda layer calls sample() on
# [z_mean, z_log_var] and its output is the latent vector z.
z = Lambda(
    function=sample,
    output_shape=(latent_dim,),
    name='z',
)([z_mean, z_log_var])

# The encoder maps the raw inputs to the sampled latent vector only.
encoder = Model(inputs=inputs, outputs=z, name='encoder')
encoder.summary()