Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save datavudeja/64b90099d48fbbb65df759ccbedf455f to your computer and use it in GitHub Desktop.
Save datavudeja/64b90099d48fbbb65df759ccbedf455f to your computer and use it in GitHub Desktop.
import llm
import sqlite_utils
import numpy as np
from scipy.cluster.hierarchy import linkage, cut_tree
from collections import defaultdict
# Cluster the embeddings stored in an `llm` embeddings collection with
# complete-linkage hierarchical clustering (cosine distance) and print the
# item ids grouped by cluster label.

# Path to the SQLite database that holds the embeddings collection.
collection_path = "your path here"
# Number of flat clusters to cut the dendrogram into.
n_clusters = 7

db = sqlite_utils.Database(collection_path)
collection = llm.Collection("pocket", db, create=False)

# Materialize the rows once so that ids and vectors are guaranteed to stay
# aligned, and restrict them to this collection: the `embeddings` table is
# shared by every collection in the database.
rows = list(
    collection.db["embeddings"].rows_where(
        "collection_id = ?", [collection.id]
    )
)
if len(rows) < 2:
    # linkage() needs at least two observations; fail with a clear message
    # instead of an opaque NumPy/SciPy error.
    raise SystemExit(
        f"Need at least 2 embeddings to cluster, found {len(rows)}"
    )

article_ids = [row["id"] for row in rows]
embeddings_array = np.vstack([llm.decode(row["embedding"]) for row in rows])

complete_clustering = linkage(
    embeddings_array, method="complete", metric="cosine"
)
# cut_tree returns an (n_samples, 1) array; flatten to one label per row.
cluster_labels = cut_tree(complete_clustering, n_clusters=n_clusters).reshape(-1)

groups = defaultdict(list)
for article_id, label in zip(article_ids, cluster_labels):
    # Use plain ints as keys so the printed mapping is readable.
    groups[int(label)].append(article_id)
print(groups)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment