Skip to content

Instantly share code, notes, and snippets.

# Score a single sequence by its autoencoder reconstruction error.
# NOTE(review): this excerpt has lost its indentation and stops after the
# "get anomaly rank" comment -- no rank computation or return is visible here.
def get_sequence_anomaly_rank(seq_a):
# Tokenize seq_a (wrapped in a one-element list) with the module-level tokenizer.
t_vec = tokenizer.texts_to_sequences([seq_a])
# Pad or truncate at the end ('post') to a length of MAX_FEAT_LEN.
pad_seq = pad_sequences(t_vec, maxlen=MAX_FEAT_LEN, padding='post', truncating='post')
# presumably expands the token ids into VOCAB_SIZE-wide vectors -- confirm against vectorize_sequences
vec_seq = vectorize_sequences(pad_seq, VOCAB_SIZE)
# Run the vectorized sequence through the autoencoder.
pred_seq = autoencoder.predict(vec_seq)
# get anomaly score: mean squared difference between input and prediction along axis 1
score = np.mean(np.power(vec_seq - pred_seq, 2), axis=1)
# get anomaly rank
# Get the MSE (the loss / error term) of the autoencoder on vec_seqs.
predictions = autoencoder.predict(vec_seqs)
# Mean of the squared differences along axis 1.
mse = np.mean(np.power(vec_seqs - predictions, 2), axis=1)
# Store the errors under the 'MSE' key of `sequences`.
sequences['MSE'] = mse
# Assign each vector the cluster with which it is most associated: the index
# of the largest value along axis 1 of lda.transform(df_avg).
df_avg['cluster'] = np.argmax(lda.transform(df_avg), axis=1)
# Chart the different clusters on three stacked axes that share x and y.
fig, axs = plt.subplots(3, 1, sharex=True, sharey=True)
# Bar chart of the column means over the rows in cluster 0. iloc[:, :-1]
# drops the last column -- presumably the 'cluster' column added above; confirm.
df_avg.iloc[:,:-1].loc[df_avg.cluster == 0].mean().plot.bar(ax=axs[0])
axs[0].set_title('Category - 0 (Evening/Night)')
# NOTE(review): only axs[0] is filled in within this excerpt.
# Show a pie chart of the groups
# Count how many rows fall into each cluster.
summs = df_avg.cluster.value_counts()
# Copy the counts into a float array of the same length.
cats = np.zeros(len(summs))
for i, x in enumerate(summs): cats[i] = x
# NOTE(review): value_counts() orders by frequency by default, not by cluster
# id, so these labels match the counts only if the two orders agree -- confirm.
labels = ['Evening / Night','Normal Day', 'Late Night / Early Morning']
sizes = summs.astype(int)
# NOTE(review): the plotting call itself is not part of this excerpt.
# NOTE(review): this excerpt has lost its indentation and is cut off after the
# two Input layers -- the embedding layers, merge and return are not visible.
def user_embedding_model(embedding_size = 50):
# Embed items and users in vec space
# Both inputs are 1-dimensional
user = Input(name = 'user', shape = [1])
item = Input(name = 'item', shape = [1])
# (None, 1, 50)) -- presumably the output shape of the embedding layer that
# followed here (batch, 1, embedding_size); confirm against the full source.
# text_parts is a dictionary in which each key is a userID and the corresponding values are the items the user engaged with
# vector_size is the size of the embedding vector (the latent-factor dimension)
# window should be set to the max # of items of the user with the greatest # of items
# sg=1 selects the skip-gram variant; min_count=3 ignores items seen fewer than 3 times
model = Word2Vec(text_parts.values(), min_count=3, vector_size=20, window=36, sg=1, ns_exponent=-0.5)
# a dictionary consisting of K:user and V:mean of the user's items
user_means = {}
for user in list(text_parts.keys()):
# get a list of artists the user listens to
# (but only if we have a vec for it)
artists = [artist for artist in text_parts[user] if model.wv.has_index_for(artist)]
if len(artists) == 0:
# NOTE(review): the excerpt ends here (and has lost its indentation) -- the
# handling of users with no known artists and the mean computation are not visible.
from sklearn.mixture import GaussianMixture as GMM
def cluster_gmm(matrix, k=4):
    """Fit a Gaussian mixture to ``matrix`` and label its rows.

    Args:
        matrix: Data to cluster; passed unchanged to ``fit`` and ``predict``.
        k: Number of mixture components.

    Returns:
        A ``(model, labels, centers)`` tuple: the fitted mixture, the
        component predicted for each sample in ``matrix``, and the fitted
        model's ``means_``.
    """
    # Full covariance matrices, a fixed seed, and the best of 10 initialisations.
    mixture = GMM(
        n_components=k,
        covariance_type='full',
        random_state=0,
        n_init=10,
    )
    mixture.fit(matrix)
    return mixture, mixture.predict(matrix), mixture.means_
# First cluster the item data and return the model
# (cluster_gmm with k=8 on the item vectors in exp_model.wv.vectors).
items_model, items_labels, items_cluster_centers = cluster_gmm(exp_model.wv.vectors, k=8)
# [user_means] is a list of vectors, each represents the mean of the item vectors each user has listened to
# then use the model to create a new user vector for each user
# based on their probability of being part of each item cluster
# this is the empty array
# NOTE(review): the code these comments describe is not part of this excerpt.
# encoder model
inputs = Input(shape=input_shape, name='encoder_input')
# Hidden layer of width intermediate_dim with ReLU activation.
x = Dense(intermediate_dim, activation='relu')(inputs)
# Two heads on the hidden layer, each latent_dim wide, named for the mean and
# the log-variance of z.
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)
# use the reparameterization trick and get the output from the sample() function
z = Lambda(sample, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
# The encoder model exposes only the sampled z as its output (not z_mean / z_log_var).
encoder = Model(inputs, z, name='encoder')
encoder.summary()