Skip to content

Instantly share code, notes, and snippets.

@NikhilVerma
Last active December 13, 2024 06:29
Show Gist options
  • Save NikhilVerma/ef74d6117acdcc9fbaccfbc0b47b1949 to your computer and use it in GitHub Desktop.
Save NikhilVerma/ef74d6117acdcc9fbaccfbc0b47b1949 to your computer and use it in GitHub Desktop.
thresholdClusteringWithAdaptiveSimilarity
import { cos_sim } from "@huggingface/transformers";
/**
 * One item to be clustered: a piece of text paired with its embedding vector.
 */
export type ClusteringSentence = {
  /** Text that is emitted in the resulting clusters. */
  str: string;
  /** Vector compared against other items' vectors with cosine similarity. */
  embedding: number[];
};
/**
 * Runs a single greedy clustering pass at a fixed similarity threshold.
 *
 * Items are visited in input order. Each item that is not yet in a cluster
 * seeds a new one and absorbs every later unassigned item whose cosine
 * similarity to the seed is at least `threshold`.
 *
 * @param data - Items to cluster.
 * @param threshold - Minimum similarity to the seed required to join its cluster.
 * @returns The clusters, in seed order, as arrays of each member's `str`.
 */
function greedyClusterPass(
  data: ClusteringSentence[],
  threshold: number
): string[][] {
  const assigned = new Array<boolean>(data.length).fill(false);
  const clusters: string[][] = [];

  for (let i = 0; i < data.length; i++) {
    if (assigned[i]) {
      continue;
    }

    // Both indices below are in bounds by their loop conditions, hence the
    // non-null assertions (needed under `noUncheckedIndexedAccess`).
    const seed = data[i]!;
    assigned[i] = true;
    const members = [seed.str];

    // Every index below `i` has already been assigned by the time `i` is
    // reached (it was either a seed or absorbed by one), so only later
    // items can still join this cluster.
    for (let j = i + 1; j < data.length; j++) {
      if (assigned[j]) {
        continue;
      }
      const candidate = data[j]!;
      if (cos_sim(seed.embedding, candidate.embedding) >= threshold) {
        members.push(candidate.str);
        assigned[j] = true;
      }
    }

    clusters.push(members);
  }

  return clusters;
}

/**
 * Groups sentences by cosine similarity, tightening the threshold until the
 * clusters are small enough.
 *
 * Starting at `minSimilarityThreshold`, the data is re-clustered from scratch
 * with the threshold raised by 0.01 per pass, until no cluster has more than
 * `maxClusterSize` members or the next threshold would be 1 or more.
 * Every pass is reported through `console.log`.
 *
 * Note: `maxClusterSize` is a target, not a guarantee. If the threshold
 * reaches 1 first (e.g. many items with identical embeddings), the clusters
 * from the last pass are returned even if some are still oversized.
 *
 * @param data - Sentences with their embeddings; not modified.
 * @param minSimilarityThreshold - Similarity threshold used for the first pass.
 * @param maxClusterSize - Largest cluster size considered acceptable.
 * @returns Clusters of sentence strings, largest first; equally sized
 *   clusters keep the order in which they were formed.
 */
export function thresholdClusteringWithAdaptiveSimilarity(
  data: ClusteringSentence[],
  minSimilarityThreshold: number,
  maxClusterSize = 100
): string[][] {
  let clusters: string[][] = [];
  let adjustedThreshold = minSimilarityThreshold;

  // Adjust similarity threshold until all clusters are within the size limit
  do {
    clusters = greedyClusterPass(data, adjustedThreshold);

    console.log(
      `🧠 Trying threshold ${adjustedThreshold.toFixed(2)} with ${clusters.length} clusters`
    );

    adjustedThreshold += 0.01;
    // Cosine similarity cannot exceed 1, so stop stepping once we get there.
    if (adjustedThreshold >= 1) {
      break;
    }
  } while (clusters.some(cluster => cluster.length > maxClusterSize));

  // `clusters` is local to this call, so sorting it in place is safe.
  return clusters.sort((a, b) => b.length - a.length);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment