|
"""Script demonstrating the lack of adjustment for randomness of V-Measure |
|
|
|
Here we make 2 independent clustering labels labels_a and labels_b for various |
|
values if n_samples and k the number of clusters in both the a and b labelings. |
|
|
|
""" |
|
import numpy as np |
|
|
|
from sklearn.metrics import v_measure_score |
|
|
|
|
|
def v_measures_same_k(n_samples=100, k_range=range(2, 100), n_runs=10,
                      seed=42):
    """Score pairs of independent random labelings sharing a cluster count.

    For every run and every ``k`` in ``k_range``, two label vectors of length
    ``n_samples`` are drawn from ``{0, ..., k - 1}`` with a single
    ``RandomState`` seeded by ``seed``, and compared with
    ``v_measure_score``.

    Parameters
    ----------
    n_samples : int
        Length of each random label vector.
    k_range : sequence of int
        Numbers of clusters to evaluate; must support ``len()``.
    n_runs : int
        Number of repetitions for each value of ``k``.
    seed : int
        Seed passed to ``np.random.RandomState``.

    Returns
    -------
    scores : ndarray of shape (len(k_range), n_runs)
        ``scores[j, i]`` is the V-Measure obtained for the ``j``-th value of
        ``k`` on run ``i``.
    """
    rng = np.random.RandomState(seed)
    scores = np.zeros((len(k_range), n_runs))
    for i in range(n_runs):
        for j, k in enumerate(k_range):
            # ``random_integers`` is deprecated; ``randint`` excludes its
            # upper bound, so ``high=k`` yields labels in 0..k-1 as before.
            labels_a = rng.randint(0, k, size=n_samples)
            labels_b = rng.randint(0, k, size=n_samples)
            scores[j, i] = v_measure_score(labels_a, labels_b)
    return scores
|
|
|
|
|
def v_measures_fixed_k_a(n_samples=100, k_a=10, k_b_range=range(2, 100),
                         n_runs=10, seed=42):
    """Score random labelings where only the second cluster count varies.

    For every run and every ``k_b`` in ``k_b_range``, ``labels_a`` is drawn
    from ``{0, ..., k_a - 1}`` and ``labels_b`` from ``{0, ..., k_b - 1}``,
    both of length ``n_samples`` and both from a single ``RandomState``
    seeded by ``seed``. The pair is compared with ``v_measure_score``.

    Parameters
    ----------
    n_samples : int
        Length of each random label vector.
    k_a : int
        Fixed number of clusters used for ``labels_a``.
    k_b_range : sequence of int
        Numbers of clusters to evaluate for ``labels_b``; must support
        ``len()``.
    n_runs : int
        Number of repetitions for each value of ``k_b``.
    seed : int
        Seed passed to ``np.random.RandomState``.

    Returns
    -------
    scores : ndarray of shape (len(k_b_range), n_runs)
        ``scores[j, i]`` is the V-Measure obtained for the ``j``-th value of
        ``k_b`` on run ``i``.
    """
    rng = np.random.RandomState(seed)
    scores = np.zeros((len(k_b_range), n_runs))
    for i in range(n_runs):
        for j, k_b in enumerate(k_b_range):
            # ``random_integers`` is deprecated; ``randint`` excludes its
            # upper bound, so ``high=k`` yields labels in 0..k-1 as before.
            labels_a = rng.randint(0, k_a, size=n_samples)
            labels_b = rng.randint(0, k_b, size=n_samples)
            scores[j, i] = v_measure_score(labels_a, labels_b)
    return scores
|
|
|
|
|
if __name__ == '__main__':
    # Imported here so the functions above can be used without matplotlib.
    # ``pyplot`` replaces the discouraged legacy ``pylab`` interface.
    import matplotlib.pyplot as plt

    n_samples = 100
    k_range = range(2, n_samples + 1)

    # Experiment 1: both random labelings use the same number of clusters.
    scores = v_measures_same_k(n_samples=n_samples, k_range=k_range)
    mean = scores.mean(axis=1)
    std = scores.std(axis=1)
    # Explicit new figure so the two experiments never share axes, even when
    # ``show()`` returns immediately (e.g. non-interactive backends).
    plt.figure()
    plt.errorbar(k_range, mean, yerr=std)
    plt.title("V-Measures for 2 uniform labelings with various centers\n"
              "and fixed total number of samples to label %d." % n_samples)
    plt.show()

    # Experiment 2: labels_a keeps k_a clusters while labels_b sweeps k_range.
    k_a = 20
    scores = v_measures_fixed_k_a(n_samples=n_samples, k_a=k_a,
                                  k_b_range=k_range)
    mean = scores.mean(axis=1)
    std = scores.std(axis=1)
    plt.figure()
    plt.errorbar(k_range, mean, yerr=std)
    plt.title("V-Measures for 2 uniform labelings, one with various centers\n"
              "and fixed number of centers %d for the other and\n"
              "total number of samples to label %d." % (k_a, n_samples))
    plt.show()