Created
December 6, 2018 16:00
-
-
Save zyocum/062285ae1aa282d04d1ab00bc72ebe28 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Compute pair-wise cluster-level comparison metrics | |
These metrics are proposed by: | |
https://www.cs.umd.edu/class/spring2012/cmsc828L/Papers/MenestrinaVLDB10.pdf""" | |
import random | |
from string import ascii_lowercase as alphabet | |
from math import sqrt, log | |
from itertools import combinations, product, chain | |
def partition(items, seed=None): | |
shuffled = [item for item in items] | |
if seed: | |
random.seed(seed) | |
random.shuffle(shuffled) | |
cell = [] | |
while shuffled: | |
while random.randint(0, 1): | |
if shuffled: | |
cell.append(shuffled.pop()) | |
if cell: | |
yield tuple(cell) | |
cell = [] | |
def universe(*clusters): | |
return set(chain(*chain(*clusters))) | |
def intersection(*args): | |
return set.intersection(*({tuple(x) for x in arg} for arg in args)) | |
def k(r, s): | |
"""Compute pair-wise cluster-level comparison metric K""" | |
author_purities = [] | |
cluster_purities = [] | |
n = float(len(universe(r, s))) | |
for x, y in product(r, s): | |
numerator = float(len(intersection(x, y)) ** 2) | |
author_purities.append(numerator / float(len(x))) | |
cluster_purities.append(numerator / float(len(y))) | |
average_author_purities = sum(author_purities) / n | |
average_cluster_purities = sum(cluster_purities) / n | |
return sqrt(average_author_purities * average_cluster_purities) | |
def entropy(r, n): | |
return -1 * sum(len(x) / n * log(len(x) / n, 2) for x in r) | |
def information(r, s, n): | |
i = 0.0 | |
for pair in product(r, s): | |
x, y = map(set, pair) | |
overlap = len(intersection(x, y)) | |
if overlap > 0: | |
i += (overlap / n) * log((overlap * n) / (len(x) * len(y)), 2) | |
return i | |
def vi(r, s): | |
"""Compute the pair-wise Variation of Information (VI)""" | |
n = float(len(universe(r, s))) | |
return entropy(r, n) + entropy(s, n) - (2 * information(r, s, n)) | |
def demo(r, s): | |
print('R:', *list(sorted(x) for x in sorted(r)), sep=', ') | |
print('S:', *list(sorted(x) for x in sorted(s)), sep=', ') | |
print('K(R,S):', k(r, s)) | |
print('VI(R,S):', vi(r, s)) | |
if __name__ == '__main__': | |
a, b, c, d = 'abcd' | |
r = (a, b), (c,), (d,) | |
s = (a, b), (c, d) | |
demo(r, s) | |
#partitions = [tuple(partition(alphabet, seed)) for seed in alphabet] | |
#for r, s in combinations(partitions, 2): | |
# demo(r, s) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment