Created
February 26, 2016 13:16
-
-
Save mcrisc/06b8650c201c2152227f to your computer and use it in GitHub Desktop.
Document clustering from similarity matrix
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # coding: utf-8 | |
| import logging | |
| import random | |
| import numpy as np | |
# Label of a document that has not been assigned to any centroid; also the
# value select_centroid() returns when no centroid qualifies.
UNKNOWN = -1
# Value find_clusters() compares against to detect "no centroid selected".
# It is the same sentinel value as UNKNOWN.
NOT_FOUND = UNKNOWN
# A centroid is a candidate for a document only when their similarity is
# strictly greater than this threshold. Also used in the output file name.
MIN_SIMILARITY = 0.9
def select_centroid(sim_matrix, centroids, doc):
    """Return the existing centroid that `doc` is most similar to.

    Only centroids whose similarity to `doc` is strictly greater than
    `MIN_SIMILARITY` are considered.

    Args:
        sim_matrix: similarity lookup, indexed as ``sim_matrix[doc][c]``.
        centroids: iterable of document indices currently used as centroids.
        doc: index of the document to assign.

    Returns:
        The index of the best-matching centroid, or `UNKNOWN` when no
        centroid exceeds the similarity threshold.
    """
    row = sim_matrix[doc]
    candidates = [(c, row[c]) for c in centroids if row[c] > MIN_SIMILARITY]
    # max(), not min(): among the qualifying centroids the document should
    # join the one it resembles most, not the one that barely passes.
    centroid, _ = max(candidates, default=(UNKNOWN, 1), key=lambda s: s[1])
    return centroid
def find_clusters(sim_matrix):
    """Group documents around centroids in a single randomized pass.

    Documents are visited in shuffled order. Each document is given the
    centroid chosen by `select_centroid`; when none is chosen, the document
    itself becomes a new centroid (and is labelled with its own index).

    Args:
        sim_matrix: similarity lookup with one row per document, indexed as
            ``sim_matrix[doc][other_doc]``.

    Returns:
        `labels`, an integer NumPy array with one entry per row of
        `sim_matrix`, holding the index of the centroid each row belongs to.
    """
    centroids = []
    nrows = len(sim_matrix)
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy; use the builtin so this works on current releases.
    labels = np.full(nrows, UNKNOWN, dtype=int)
    docs = list(range(nrows))
    # Visit order is random, so results can differ between runs.
    random.shuffle(docs)
    for doc in docs:
        centroid = select_centroid(sim_matrix, centroids, doc)
        if centroid == NOT_FOUND:
            # No existing centroid is similar enough: promote this document.
            centroid = doc
            centroids.append(centroid)
            print(len(centroids), 'centroids')
        labels[doc] = centroid
    return labels
def main():
    """Load the similarity matrix, cluster it, and write the labels file.

    Reads ``mydocuments.matrix-similarity.npy`` from the current directory
    and writes ``labels_NNN.txt`` (NNN is `MIN_SIMILARITY` as a percentage),
    one ``<label> <docid>`` pair per line after a header line.
    """
    logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                        level=logging.INFO)
    logging.info('loading similarity matrix')
    sims = np.load('mydocuments.matrix-similarity.npy')
    logging.info('finding clusters')
    labels = find_clusters(sims)
    logging.info('saving labels')
    # Round before formatting: `%d` truncates, and a product such as
    # 0.57 * 100 == 56.99999999999999 would otherwise be written as 056.
    threshold_pct = round(MIN_SIMILARITY * 100)
    with open('labels_%03d.txt' % threshold_pct, 'w') as fout:
        print('# labels docid', file=fout)
        for docid, label in enumerate(labels):
            print(label, docid, file=fout)
    logging.info('finished')
# Run the clustering pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment