Skip to content

Instantly share code, notes, and snippets.

@allthesignals
Last active February 25, 2026 00:25
Show Gist options
  • Select an option

  • Save allthesignals/bb9c1d9e25cb5005ae2d21621d6bd83a to your computer and use it in GitHub Desktop.

Select an option

Save allthesignals/bb9c1d9e25cb5005ae2d21621d6bd83a to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
from k_means_constrained import KMeansConstrained
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# 2. Vectorize Categories (One-Hot Encoding)
# Each categorical column becomes a group of 0/1 indicator columns.
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(df[['Contract', 'Domain', 'Portfolio', 'Department']])
feature_names = encoder.get_feature_names_out()

# 3. Apply Hierarchy Weights
# Scale each one-hot column by its source column's weight so that distances in
# the clustering space prioritize Contract > Domain > Portfolio > Department.
weights = {
    'Contract': 10.0,   # Top priority
    'Domain': 7.0,      # Medium priority # todo - need supragroupings
    'Portfolio': 3.0,   # Medium priority # todo - need supragroupings
    'Department': 1.0,  # Lowest priority
}

weighted_features = encoded_features.copy()
for i, col in enumerate(feature_names):
    for category, weight in weights.items():
        # get_feature_names_out() yields names shaped '<column>_<value>'.
        # Match the exact '<column>_' prefix instead of a substring test:
        # `category in col` would mis-weight any category VALUE that happens
        # to contain another column's name (e.g. a Department value of
        # 'Contracts' would also receive the Contract weight).
        if col.startswith(category + '_'):
            weighted_features[:, i] *= weight
            break  # each encoded column belongs to exactly one source column
# 4. Run Constrained K-Means
# size_min=15 / size_max=20 force every cluster into the 15-20 member range.
# Aim for ~17 members per group. Guard with max(1, ...): for len(df) < 17 the
# floor division yields 0 and KMeansConstrained would fail on n_clusters=0,
# even though a single 15-20 member cluster may still be feasible.
n_clusters = max(1, len(df) // 17)
# NOTE(review): feasibility requires n_clusters * 15 <= len(df) <= n_clusters * 20;
# KMeansConstrained raises if the size constraints cannot be satisfied.
clf = KMeansConstrained(
    n_clusters=n_clusters,
    size_min=15,
    size_max=20,
    random_state=42,  # fixed seed so repeated runs produce the same grouping
)
df['cluster'] = clf.fit_predict(weighted_features)

# 5. Output Results
print(df.groupby('cluster').size())  # Verify sizes are between 15-20
print(df.sort_values('cluster').head(20))
df.sort_values('cluster').to_csv('clustered.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment