Skip to content

Instantly share code, notes, and snippets.

@momvart
Created January 20, 2021 18:37
Show Gist options
  • Save momvart/ed113320c5a36191f00173f9c39cd14d to your computer and use it in GitHub Desktop.
Save momvart/ed113320c5a36191f00173f9c39cd14d to your computer and use it in GitHub Desktop.
Implementation of Gaussian Mixture Models clustering using numpy and scipy.
import numpy as np
from numpy.random import choice
from numpy.linalg import norm
from scipy.stats import multivariate_normal
def train_gmm(data, k, convergence_threshold):
raw_data = data
data = pd.DataFrame(data) # Should be removed
feature_columns = data.columns
pis = np.ones((k, 1)) * 1 / k
mus = X[np.random.choice(len(X), size=k, replace=False), :]
sigmas = np.repeat(np.eye(len(feature_columns))[np.newaxis, :, :] * 0.1, k, axis=0)
N = len(raw_data)
log_likelihood = -1000
change = 10000
while change > convergence_threshold:
# E step
probabilities = np.array([multivariate_normal.pdf(raw_data, mu, sigma) for mu, sigma in zip(mus, sigmas)])
probabilities *= pis
assert probabilities.shape == (k, N)
gammas = probabilities / np.sum(probabilities, axis=0, keepdims=True)
assert gammas.shape == (k, N)
gamma_sums = np.sum(gammas, axis=1, keepdims=True)
assert gamma_sums.shape == (k, 1)
# M step
# shape: (k, n) @ (n, f) = (k, f)
mus = (gammas @ raw_data) / gamma_sums
assert mus.shape == (k, len(feature_columns))
sigmas = np.array([1 / gamma_sums[i] * (gammas[i] * (raw_data - mus[i]).T) @ (raw_data - mus[i]) for i in range(k)])
assert sigmas.shape == (k, len(feature_columns), len(feature_columns))
pis = gamma_sums / np.sum(gamma_sums)
assert np.abs(np.sum(pis) - 1) < 1e-6
log_likelihood_new = np.sum(np.log(np.sum(probabilities, axis=0)))
change = np.abs(log_likelihood_new - log_likelihood)
log_likelihood = log_likelihood_new
return pis, mus, sigmas
def predict_gmm(data, mus, sigmas):
probabilities = np.array([multivariate_normal.pdf(data, mu, sigma) for mu, sigma in zip(mus, sigmas)])
return np.argmax(probabilities, axis=0)
# Example
from sklearn.datasets import make_classification
X, Y = make_classification(n_samples=700, n_features=2,
n_informative=2, n_redundant=0,
n_classes=2)
weights, means, covars = train_gmm(X, 2, 0.1)
gmm_pred = predict_gmm(X, means, covars)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment