
from sklearn.mixture import GaussianMixture as GMM

def cluster_gmm(matrix, k=4):
    # fit a k-component Gaussian mixture and return the model,
    # the per-row cluster labels, and the component means
    gmm_model = GMM(n_components=k, covariance_type='full', random_state=0, n_init=10)
    gmm_model.fit(matrix)
    gmm_labels = gmm_model.predict(matrix)
    centers = gmm_model.means_
    return gmm_model, gmm_labels, centers
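# Usage sketch (an assumption, not from the original gist): cluster the
# per-user mean vectors built in the next snippet, stacked into a 2-D array.
import numpy as np
user_matrix = np.array(list(user_means.values()))
gmm_model, gmm_labels, centers = cluster_gmm(user_matrix, k=4)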
# a dictionary mapping each user (key) to the mean of the user's item vectors (value)
user_means = {}
for user, items in text_parts.items():
    # keep only the artists the user listens to that have a vector in the model
    artists = [artist for artist in items if model.wv.has_index_for(artist)]
    if len(artists) == 0:
        # skip users with no known artists (the original snippet cuts off here)
        continue
    user_means[user] = np.mean([model.wv[artist] for artist in artists], axis=0)
from gensim.models import Word2Vec

# text_parts is a dictionary mapping each userID to the list of items the user engaged with
# vector_size is the size of the embedding vectors (the latent-factor dimension)
# window is set to the item count of the user with the most items, so that
# every pair of a user's items falls within the same context
model = Word2Vec(list(text_parts.values()), min_count=3, vector_size=20, window=36, sg=1, ns_exponent=-0.5)
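# Example query (hypothetical; 'some_artist_id' is a placeholder, not a real key):
# similar_items = model.wv.most_similar('some_artist_id', topn=10)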
def user_embedding_model(embedding_size=50):
    # Embed items and users in the same vector space.
    # Both inputs are 1-dimensional: a single integer ID.
    user = Input(name='user', shape=[1])
    item = Input(name='item', shape=[1])
    # each Embedding lookup below has output shape (None, 1, 50)
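# The snippet above is truncated. A minimal sketch of how such a model is
# typically completed (an assumption, not the author's code): map each ID to
# an embedding and score a user-item pair with a dot product. n_users and
# n_items are hypothetical vocabulary sizes assumed to be defined elsewhere.
from tensorflow.keras.layers import Input, Embedding, Dot, Reshape
from tensorflow.keras.models import Model

def user_embedding_model_sketch(n_users, n_items, embedding_size=50):
    user = Input(name='user', shape=[1])
    item = Input(name='item', shape=[1])
    # (None, 1, embedding_size) lookup for each ID
    user_vec = Embedding(n_users, embedding_size, name='user_embedding')(user)
    item_vec = Embedding(n_items, embedding_size, name='item_embedding')(item)
    # dot product over the embedding axis -> (None, 1, 1), flattened to (None, 1)
    score = Dot(axes=2, name='dot_product')([user_vec, item_vec])
    score = Reshape((1,))(score)
    model = Model(inputs=[user, item], outputs=score)
    model.compile(optimizer='adam', loss='mse')
    return model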
# Show a pie chart of the cluster sizes
summs = df_avg.cluster.value_counts()
# labels follow the value_counts order of the clusters
labels = ['Evening / Night', 'Normal Day', 'Late Night / Early Morning']
sizes = summs.astype(int)
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.show()
# assign each vector to the cluster with which it is most associated
df_avg['cluster'] = np.argmax(lda.transform(df_avg), axis=1)

# chart the mean feature profile of each cluster
fig, axs = plt.subplots(3, 1, sharex=True, sharey=True)
df_avg.iloc[:, :-1].loc[df_avg.cluster == 0].mean().plot.bar(ax=axs[0])
axs[0].set_title('Category - 0 (Evening/Night)')
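# The gist presumably repeats the same pattern for the remaining clusters;
# a sketch (titles assumed to follow the labels list above):
df_avg.iloc[:, :-1].loc[df_avg.cluster == 1].mean().plot.bar(ax=axs[1])
axs[1].set_title('Category - 1 (Normal Day)')
df_avg.iloc[:, :-1].loc[df_avg.cluster == 2].mean().plot.bar(ax=axs[2])
axs[2].set_title('Category - 2 (Late Night / Early Morning)')
plt.show()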
# get the reconstruction error (MSE) of each sequence under the autoencoder
predictions = autoencoder.predict(vec_seqs)
mse = np.mean(np.power(vec_seqs - predictions, 2), axis=1)
sequences['MSE'] = mse
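# Example follow-up (an assumption, not from the original gist): flag the
# sequences whose reconstruction error falls in the top 5% as anomalous.
threshold = sequences['MSE'].quantile(0.95)
anomalies = sequences[sequences['MSE'] > threshold]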
def get_sequence_anomaly_rank(seq_a):
    t_vec = tokenizer.texts_to_sequences([seq_a])
    pad_seq = pad_sequences(t_vec, maxlen=MAX_FEAT_LEN, padding='post', truncating='post')
    vec_seq = vectorize_sequences(pad_seq, VOCAB_SIZE)
    pred_seq = autoencoder.predict(vec_seq)
    # get anomaly score: the reconstruction MSE of the sequence
    score = np.mean(np.power(vec_seq - pred_seq, 2), axis=1)
    # get anomaly rank (sketch; the original cuts off here): count how many
    # known sequences reconstruct with a lower error
    rank = (sequences['MSE'] < score[0]).sum()
    return score[0], rank
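# Hypothetical call (the sequence string is a placeholder):
# score, rank = get_sequence_anomaly_rank('item_a item_b item_c')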
optimizer = optimizers.Adam(learning_rate=1e-2)
autoencoder.compile(optimizer=optimizer,
                    loss='mean_squared_error',
                    metrics=['accuracy'])
# save the best model (by validation loss) seen during training
checkpointer = ModelCheckpoint(filepath="model_bin.h5",
                               verbose=0,
                               save_best_only=True)
# Create the train and test sets (no shuffling; a simple 75/25 split)
TRAIN_RATIO = 0.75
train_size = int(len(vec_seqs) * TRAIN_RATIO)
X_train = vec_seqs[:train_size]
X_test = vec_seqs[train_size:]
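# Training sketch (an assumption, not the author's exact call; epochs and
# batch_size are hypothetical): fit the autoencoder to reconstruct its own
# input, keeping the best checkpoint via the callback defined above.
history = autoencoder.fit(X_train, X_train,
                          epochs=20,
                          batch_size=128,
                          validation_data=(X_test, X_test),
                          callbacks=[checkpointer])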
# define the encoder
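# The gist cuts off at this comment. A minimal sketch of a dense
# encoder/decoder pair (an assumption; the layer sizes are hypothetical),
# taking multi-hot vectors of dimension VOCAB_SIZE as input:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

input_layer = Input(shape=(VOCAB_SIZE,))
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(32, activation='relu')(encoded)
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(VOCAB_SIZE, activation='sigmoid')(decoded)
autoencoder = Model(inputs=input_layer, outputs=decoded)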