Skip to content

Instantly share code, notes, and snippets.

@hakanilter
Last active April 5, 2019 09:30
Show Gist options
  • Save hakanilter/afa0d77f184262d304dae0fa6eb445ab to your computer and use it in GitHub Desktop.
Save hakanilter/afa0d77f184262d304dae0fa6eb445ab to your computer and use it in GitHub Desktop.
Poor Man's text clustering using cosine similarity
from scipy import spatial
# Pairwise cosine distances between the entries of `message_embeddings`.
# `pdist` returns the distances in condensed (flat) form; `squareform` expands
# them into a square matrix so that a pair can be looked up as distances[i][j].
# NOTE(review): `message_embeddings` is not defined in the visible source; it
# must already exist when this line runs (presumably one embedding vector per
# message; confirm against the code that produces it).
distances = spatial.distance.squareform(spatial.distance.pdist(message_embeddings, 'cosine'))
def progress(i):
    """Print a spinner frame followed by the counter ``i``, without a newline.

    The output starts with a carriage return and ends without a line break,
    so on a terminal each call is written over the previous one.

    Args:
        i: Current step number; it selects the spinner frame (``i % 4``) and
            is printed after it.
    """
    # The backslash is escaped explicitly: a bare ``\|`` inside a normal
    # string literal is an invalid escape sequence, which newer Python
    # versions warn about. The resulting characters are unchanged: - \ | /
    frames = '-\\|/'
    print('\r{} {}'.format(frames[i % 4], i), end='')
def cluster(items, distances, similarity_threshold=0.11):
    """Greedily group item indices whose pairwise distance is small enough.

    Every ordered pair ``(i, j)`` with ``i != j`` is visited once. A pair is
    only examined while at least one of its members is still unassigned, and
    it is linked when ``distances[i][j] <= similarity_threshold``:

    * neither member assigned: a new cluster ``[i, j]`` is created;
    * exactly one member assigned: the other joins that member's cluster.

    Existing clusters are never merged, and an index is placed in at most one
    cluster. Indices that are never linked to anything do not appear in the
    result.

    Args:
        items: Sequence being clustered; only its length is used.
        distances: Two-level indexable (``distances[i][j]``) holding the
            value compared against the threshold for each pair.
        similarity_threshold: Largest ``distances[i][j]`` value at which two
            items are still put in the same cluster.

    Returns:
        A list of clusters, each a list of item indices.
    """
    print('Clustering threshold:', similarity_threshold)
    clusters = []
    inverted_index = {}  # item index -> position of its cluster in `clusters`
    total = len(items)

    for i in range(total):
        progress(i)
        for j in range(total):
            if i == j:
                continue

            # Looked up on every pass: either entry may have been assigned
            # by an earlier pair in this same loop.
            src = inverted_index.get(i, -1)
            dst = inverted_index.get(j, -1)
            if src != -1 and dst != -1:
                # Both already belong to a cluster; nothing to decide.
                continue

            if distances[i][j] <= similarity_threshold:
                if src == -1 and dst == -1:
                    # Neither is clustered yet: open a new cluster for the pair.
                    new_id = len(clusters)
                    clusters.append([i, j])
                    inverted_index[i] = new_id
                    inverted_index[j] = new_id
                elif dst == -1:
                    # Only i is clustered: pull j into i's cluster.
                    clusters[src].append(j)
                    inverted_index[j] = src
                else:
                    # Only j is clustered: pull i into j's cluster.
                    clusters[dst].append(i)
                    inverted_index[i] = dst

    return clusters
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment