HDBSCAN
# Grid-search HDBSCAN over min_cluster_size, cluster_selection_epsilon and
# cluster_selection_method on a random subset of the features, then propagate
# the subset labels to all points with a 1-nearest-neighbour lookup.

import numpy as np
import hdbscan
from collections import Counter
from sklearn.utils import shuffle
from sklearn.neighbors import NearestNeighbors

# Pool the train and validation features (train_features / val_features are
# assumed to be 2-D arrays produced earlier in the pipeline).
all_features = np.concatenate([train_features, val_features])

subset_size = 50000  # adjust based on your RAM

# --- Random selection from all features ---
# subset_indices maps each subset row back to its position in all_features.
subset_features, subset_indices = shuffle(
    all_features, np.arange(len(all_features)), random_state=42
)
subset_features = subset_features[:subset_size]
subset_indices = subset_indices[:subset_size]

# Hyperparameter grid for HDBSCAN.
EPSILON_VALUES = [0.05, 0.01, 0.1]
MIN_CLUSTER_SIZES = [5, 10, 20, 30]
METHODS = ['leaf', 'eom']

DEFAULT_EPSILON = 0.05
DEFAULT_CLUSTER_KEY = (5, DEFAULT_EPSILON, 'leaf')

# Labels for the full feature set, keyed by (min_cluster_size, eps, method).
hdbscan_labels_all = {}

for min_cluster_size in MIN_CLUSTER_SIZES:
    for eps in EPSILON_VALUES:
        for method in METHODS:
            print(f"Running HDBSCAN on subset (size={subset_size}) with min_cluster_size={min_cluster_size}, eps={eps}, method={method}")

            clusterer = hdbscan.HDBSCAN(
                min_cluster_size=min_cluster_size,
                min_samples=1,
                cluster_selection_epsilon=eps,
                cluster_selection_method=method
            )
            labels_subset = clusterer.fit_predict(subset_features)

            # --- How to remove the noise group (label -1) ---
            non_noise_mask = labels_subset != -1
            non_noise_labels = labels_subset[non_noise_mask]
            non_noise_features = subset_features[non_noise_mask]

            cluster_counts = Counter(non_noise_labels)
            print("Cluster sizes:", dict(cluster_counts))

            # Propagate the subset labels to every point: each row of
            # all_features takes the label of its nearest subset point
            # (noise points in the subset still propagate label -1).
            nn = NearestNeighbors(n_neighbors=1, algorithm='auto')
            nn.fit(subset_features)
            distances, nearest_indices = nn.kneighbors(all_features)
            assigned_labels = labels_subset[nearest_indices.flatten()]

            key = (min_cluster_size, eps, method)
            hdbscan_labels_all[key] = assigned_labels
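
# The grid above populates hdbscan_labels_all but never compares the
# configurations; a minimal sketch of how the results could be summarised and
# the default key pulled out (the summary metrics below are an assumption,
# not part of the original workflow).
for (mcs, eps, method), labels in hdbscan_labels_all.items():
    labels = np.asarray(labels)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    noise_fraction = float(np.mean(labels == -1))
    print(f"min_cluster_size={mcs}, eps={eps}, method={method}: "
          f"{n_clusters} clusters, {noise_fraction:.1%} noise")

# Labels for the default configuration, e.g. for downstream use.
default_labels = hdbscan_labels_all[DEFAULT_CLUSTER_KEY]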
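
# Alternative to the manual 1-NN propagation above: hdbscan can assign
# out-of-sample points itself via approximate_predict when the clusterer is
# fitted with prediction_data=True. A minimal sketch for a single parameter
# setting, reusing subset_features / all_features from above (this is an
# assumption, not part of the original code).
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=10,
    min_samples=1,
    cluster_selection_epsilon=DEFAULT_EPSILON,
    cluster_selection_method='leaf',
    prediction_data=True
).fit(subset_features)
all_labels, strengths = hdbscan.approximate_predict(clusterer, all_features)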