Last active
May 28, 2016 05:16
MajorClust algorithm implementation using scikit-learn, based on a Stack Overflow discussion about clustering text in Python (http://stackoverflow.com/questions/1789254/clustering-text-in-python).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import linear_kernel | |
import numpy as np | |
from itertools import combinations | |
from random import shuffle | |
def majorclust_sklearn(texts=None):
    """Cluster short texts with MajorClust over TF-IDF cosine similarities.

    Every text starts in its own cluster. Texts are then visited in random
    order and each one adopts the cluster label whose current members have
    the largest summed similarity to it. Passes repeat until a full pass
    changes no label.

    Args:
        texts: Optional iterable of strings to cluster. When omitted, a
            small built-in demo corpus is used.

    Returns:
        A dict mapping each surviving cluster label (an int) to the list of
        texts assigned to it, in input order. Returns an empty dict for
        empty input. The clusters are also printed to stdout, each
        preceded by a line of 80 "=" characters.
    """
    if texts is None:
        texts = [
            "foo blub baz",
            "foo bar baz",
            "asdf bsdf csdf",
            "foo bab blub",
            "csdf hddf kjtz",
            "123 456 890",
            "321 890 456 foo",
            "123 890 uiop",
        ]
    else:
        # Materialise once so generators work and indexing is stable.
        texts = list(texts)
    if not texts:
        return {}

    corpus_mat = TfidfVectorizer().fit_transform(texts)
    num_of_samples = corpus_mat.shape[0]

    # TfidfVectorizer L2-normalises rows by default, so the plain dot
    # product computed by linear_kernel is the cosine similarity.
    similarities = linear_kernel(corpus_mat, corpus_mat)
    # A text must not vote for its own cluster.
    np.fill_diagonal(similarities, 0)

    # labels[i] is the cluster label currently held by texts[i].
    labels = np.arange(num_of_samples)
    converged = False
    while not converged:
        converged = True
        visit_order = list(range(num_of_samples))
        shuffle(visit_order)
        for node in visit_order:
            # Sum this node's edge weights per cluster label.
            weight_per_label = np.bincount(labels, weights=similarities[node])
            # argmax over the bincount yields a cluster *label*, not a
            # sample index, so it is compared and assigned directly.
            best_label = np.argmax(weight_per_label)
            if weight_per_label[best_label] <= 0:
                # No similarity to any other text: keep the current label
                # instead of drifting into cluster 0 via argmax's tie-break.
                continue
            if best_label != labels[node]:
                labels[node] = best_label
                converged = False

    clusters = {}
    for label, text in zip(labels.tolist(), texts):
        clusters.setdefault(label, []).append(text)

    for members in clusters.values():
        print(80 * "=")
        print("\n".join(members))
    return clusters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment