cayetanobv · September 11, 2020 11:36
diff --git a/kmeans_balanced.py b/kmeans_balanced.py
 import math

 import pandas as pd
 import numpy as np

 from k_means_constrained import KMeansConstrained

 # Clustering parameters: how many points are sampled and how many clusters they go into.
 SAMPLE_SIZE = 5300
 N_CLUSTERS = 4
 # Lower size bound: the even share per cluster, rounded down.
 SIZE_MIN = SAMPLE_SIZE // N_CLUSTERS
 # Upper size bound: the even share plus the remainder left by the even split.
 SIZE_MAX = SIZE_MIN + SAMPLE_SIZE % N_CLUSTERS

 # Draw a (SAMPLE_SIZE, 2) array of normally distributed points: column 0 is
 # centred on -3.7 and column 1 on 40.4, both with a standard deviation of 0.05.
 # NOTE(review): the centres look like a longitude/latitude pair - confirm.
 X = np.random.normal((-3.7, 40.4), (0.05, 0.05), (SAMPLE_SIZE, 2))

 # Size-constrained k-means: the size_min / size_max arguments bound how many
 # points each of the N_CLUSTERS clusters may receive. The estimator's
 # random_state is pinned to 0.
 clf = KMeansConstrained(
     random_state=0,
     n_clusters=N_CLUSTERS,
     size_max=SIZE_MAX,
     size_min=SIZE_MIN,
 )
 # Fit on the sampled points; the labels are read from clf.labels_ afterwards.
 clf.fit(X)

 # Tabulate the sampled points and attach the label the fitted model gave each one.
 df = pd.DataFrame(X, columns=['x', 'y']).assign(cluster=clf.labels_)

 # Report the per-cluster row counts (non-null count of every remaining column).
 cluster_counts = df.groupby('cluster').count()
 print(cluster_counts)

 # Save the labelled points as CSV, leaving out the row index.
 df.to_csv('/tmp/clusters.csv', index=False)
	import math

	import pandas as pd
	import numpy as np

	from k_means_constrained import KMeansConstrained

	# Clustering parameters: how many points are sampled and how many clusters they go into.
	SAMPLE_SIZE = 5300
	N_CLUSTERS = 4
	# Lower size bound: the even share per cluster, rounded down.
	SIZE_MIN = SAMPLE_SIZE // N_CLUSTERS
	# Upper size bound: the even share plus the remainder left by the even split.
	SIZE_MAX = SIZE_MIN + SAMPLE_SIZE % N_CLUSTERS

	# Draw a (SAMPLE_SIZE, 2) array of normally distributed points: column 0 is
	# centred on -3.7 and column 1 on 40.4, both with a standard deviation of 0.05.
	# NOTE(review): the centres look like a longitude/latitude pair - confirm.
	X = np.random.normal((-3.7, 40.4), (0.05, 0.05), (SAMPLE_SIZE, 2))

	# Size-constrained k-means: the size_min / size_max arguments bound how many
	# points each of the N_CLUSTERS clusters may receive. The estimator's
	# random_state is pinned to 0.
	clf = KMeansConstrained(
		random_state=0,
		n_clusters=N_CLUSTERS,
		size_max=SIZE_MAX,
		size_min=SIZE_MIN,
	)
	# Fit on the sampled points; the labels are read from clf.labels_ afterwards.
	clf.fit(X)

	# Tabulate the sampled points and attach the label the fitted model gave each one.
	df = pd.DataFrame(X, columns=['x', 'y']).assign(cluster=clf.labels_)

	# Report the per-cluster row counts (non-null count of every remaining column).
	cluster_counts = df.groupby('cluster').count()
	print(cluster_counts)

	# Save the labelled points as CSV, leaving out the row index.
	df.to_csv('/tmp/clusters.csv', index=False)