Skip to content

Instantly share code, notes, and snippets.

@mcrisc
Created February 26, 2016 13:16
Show Gist options
  • Select an option

  • Save mcrisc/06b8650c201c2152227f to your computer and use it in GitHub Desktop.

Select an option

Save mcrisc/06b8650c201c2152227f to your computer and use it in GitHub Desktop.
Document clustering from similarity matrix
# coding: utf-8
import logging
import random
import numpy as np
# Label of a document that has not been assigned to any centroid yet.
UNKNOWN = -1
# Sentinel meaning "no existing centroid is similar enough to this document".
# It has the same value as UNKNOWN, so the two compare equal.
NOT_FOUND = -1
# A document may join a centroid only when their similarity is strictly
# greater than this threshold; it is also encoded in the output file name.
MIN_SIMILARITY = 0.90
def select_centroid(sim_matrix, centroids, doc):
    """Return the existing centroid that `doc` is most similar to.

    Args:
        sim_matrix: square, index-addressable similarity matrix;
            `sim_matrix[i][j]` is the similarity between documents i and j.
        centroids: indices of the documents currently acting as centroids.
        doc: index of the document to place.

    Returns:
        The index of the centroid with the highest similarity to `doc`,
        considering only centroids whose similarity is strictly greater
        than MIN_SIMILARITY. On ties the earliest centroid in `centroids`
        wins. NOT_FOUND is returned when no centroid qualifies.
    """
    row = sim_matrix[doc]
    # Starting from the threshold makes the strict `>` below serve both as
    # the MIN_SIMILARITY filter and as the "better than current best" test.
    # Higher similarity means closer, so we keep the maximum (the previous
    # implementation picked the minimum, i.e. the least similar candidate).
    best_centroid, best_similarity = NOT_FOUND, MIN_SIMILARITY
    for candidate in centroids:
        similarity = row[candidate]
        if similarity > best_similarity:
            best_centroid, best_similarity = candidate, similarity
    return best_centroid
def find_clusters(sim_matrix):
    """
    Greedily cluster documents using a precomputed similarity matrix.

    Documents are visited in random order. Each one is attached to the
    centroid chosen by `select_centroid`; when no centroid is returned the
    document itself becomes a new centroid and the running centroid count
    is printed. Because of the shuffle, results depend on the state of the
    `random` module.

    Returns `labels`, an integer array with one entry per row of
    `sim_matrix`, holding the index of the centroid document that row
    belongs to (a centroid is labelled with its own index).
    """
    nrows = len(sim_matrix)
    # `np.int` was merely an alias of the builtin `int`; it was deprecated
    # in NumPy 1.20 and removed in 1.24, so use the builtin directly.
    labels = np.full(nrows, UNKNOWN, dtype=int)
    centroids = []

    docs = list(range(nrows))
    random.shuffle(docs)
    for doc in docs:
        centroid = select_centroid(sim_matrix, centroids, doc)
        if centroid == NOT_FOUND:
            # Nothing similar enough exists yet: promote doc to a centroid.
            centroid = doc
            centroids.append(centroid)
            print(len(centroids), 'centroids')
        labels[doc] = centroid
    return labels
def main():
    """Load the similarity matrix, cluster it and write the labels to disk.

    Reads 'mydocuments.matrix-similarity.npy' from the current directory
    and writes 'labels_NNN.txt', where NNN is MIN_SIMILARITY expressed as a
    zero-padded percentage. The output has a header line followed by one
    "<label> <docid>" line per document.
    """
    logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                        level=logging.INFO)

    logging.info('loading similarity matrix')
    sims = np.load('mydocuments.matrix-similarity.npy')

    logging.info('finding clusters')
    labels = find_clusters(sims)

    logging.info('saving labels')
    # Round before formatting: `%d` truncates, and products such as
    # 0.29 * 100 == 28.999999999999996 would otherwise be written as 028.
    threshold_pct = round(MIN_SIMILARITY * 100)
    with open('labels_%03d.txt' % threshold_pct, 'w') as fout:
        print('# labels docid', file=fout)
        for docid, label in enumerate(labels):
            print(label, docid, file=fout)
    logging.info('finished')
# Run the clustering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment