Skip to content

Instantly share code, notes, and snippets.

@dustalov
Last active January 2, 2018 15:54
Show Gist options
  • Save dustalov/df3c644ebe3367660e13a1f23ac200a7 to your computer and use it in GitHub Desktop.
Save dustalov/df3c644ebe3367660e13a1f23ac200a7 to your computer and use it in GitHub Desktop.
Normalized Modified Purity in Python.
#!/usr/bin/env python
# This script computes the normalized modified purity and inverse purity
# as described in this paper: https://aclweb.org/anthology/P14-1097.
# The program is currently a fairly rough translation of
# the evaluation-verb-classes.perl script provided by Daisuke Kawahara.
import argparse
import re
import sys
from collections import defaultdict
from math import log
# All patterns are raw strings so that regex escapes such as ``\d`` reach
# the regex engine untouched instead of being treated as (invalid) Python
# string escapes.

# A cluster line of the resource file: "Class <digits>: <verbs>".
CLUSTER = re.compile(r'^Class (\d+): (.+)')
# A verb token with a trailing "-<number>" suffix (digits and dots).
VERB = re.compile(r'^(.+)-([.\d]+)$')
# Column separator of the gold file: one or more tab characters.
TAB = re.compile(r'\t+')
# A ":<digits>" marker that is removed from verb tokens.
VALUE = re.compile(r':\d+')
# Command-line interface. Both input files are opened by argparse as
# UTF-8 text and are read further below.
parser = argparse.ArgumentParser(
    description='Evaluate verb clusters against gold-standard verb classes.')
parser.add_argument('-m', '--multi', action='store_true',
                    help='weight verbs that occur in several classes or clusters '
                         'instead of counting each occurrence as one')
# The gold file is read unconditionally, so it must always be supplied;
# without `required=True` a missing option surfaces later as a TypeError.
parser.add_argument('--gold', required=True,
                    type=argparse.FileType('r', encoding='UTF-8'),
                    help='tab-separated gold-standard classes')
parser.add_argument('resource', type=argparse.FileType('r', encoding='UTF-8'),
                    help='file with the clusters to evaluate')
args = parser.parse_args()
def _verb_table():
    """Create an empty verb -> weight table whose missing entries read as 0."""
    return defaultdict(int)


# Gold classes: class label -> (verb -> weight).
classes = defaultdict(_verb_table)
# Evaluated clusters: cluster id -> (verb -> weight).
clusters = defaultdict(_verb_table)
# Total weight of every verb over all gold classes.
verbs_in_class = _verb_table()
# Total weight of every verb over all clusters.
verbs_in_cluster = _verb_table()
def count_cluster_num(verb_hr, *, multi=None):
    """Return the size of one cluster (or gold class).

    Args:
        verb_hr: Mapping from verb to its weight.
        multi: If true, the size is the sum of the weights; otherwise it
            is the number of verbs. ``None`` (the default) falls back to
            the ``--multi`` command-line flag stored in ``args``.

    Returns:
        ``len(verb_hr)`` as an ``int`` when ``multi`` is false, otherwise
        the sum of all weights as a ``float``.
    """
    if multi is None:
        multi = args.multi
    if not multi:
        return len(verb_hr)
    # The float start value keeps the result a float even when the
    # mapping is empty or holds only integer weights.
    return sum(verb_hr.values(), 0.)
def evaluate_one_cluster(verb_hr, classes_hr, *, multi=None):
    """Return the largest overlap between one cluster and any class.

    Args:
        verb_hr: Mapping from verb to its weight for the cluster under
            evaluation.
        classes_hr: Mapping from class label to a container of verbs
            (only membership tests are performed on it).
        multi: If true, every shared verb contributes its weight from
            ``verb_hr``; otherwise it contributes 1. ``None`` (the
            default) falls back to the ``--multi`` flag in ``args``.

    Returns:
        The best overlap as a ``float``. The running maximum starts at 0
        rather than at a -1 sentinel, so an empty ``classes_hr`` yields
        0 instead of a negative count.
    """
    if multi is None:
        multi = args.multi
    best = 0.
    for members in classes_hr.values():
        if multi:
            count = sum((weight for verb, weight in verb_hr.items()
                         if verb in members), 0.)
        else:
            count = float(sum(1 for verb in verb_hr if verb in members))
        best = max(best, count)
    return best
# F_beta score
def calc_f(p, r, beta=1.):
    """Return the F_beta score of precision ``p`` and recall ``r``.

    Args:
        p: Precision-like value.
        r: Recall-like value.
        beta: Weight of ``r`` relative to ``p``; 1 gives the plain F1.

    Returns:
        ``(1 + beta**2) * p * r / (beta**2 * p + r)``, or 0 when the
        denominator is zero (e.g. ``p == r == 0``) instead of raising
        ``ZeroDivisionError``.
    """
    beta_sq = pow(beta, 2)
    denominator = beta_sq * p + r
    if denominator == 0:
        return 0.
    return (1 + beta_sq) * p * r / denominator
def entropy(cluster_hr):
    """Return the entropy (natural logarithm) of a clustering.

    Each cluster contributes ``-(size / N) * log(size / N)``, where
    ``size`` comes from ``count_cluster_num`` and ``N`` is the sum of
    all sizes.

    Args:
        cluster_hr: Mapping from cluster id to its verb -> weight mapping.

    Returns:
        The entropy as a ``float``; 0 when there is nothing to measure
        (no clusters, or a total size of zero).
    """
    # Sizes are computed once and reused for both the total and the terms.
    sizes = [count_cluster_num(members) for members in cluster_hr.values()]
    total = sum(sizes, 0.)
    if total == 0:
        return 0.
    score = 0.
    for size in sizes:
        # A zero-size cluster contributes nothing; skipping it also
        # avoids a math domain error from log(0).
        if size > 0:
            share = size / total
            score -= share * log(share)
    return score
def conditional_entropy(cluster_hr, class_hr):
    """Return the entropy of the classes given the clusters.

    For every cluster the overlaps with all classes are collected; each
    positive overlap adds ``-(overlap / N) * log(overlap / row_total)``,
    where ``row_total`` is the sum of that cluster's overlaps and ``N``
    is the summed size of all clusters.

    Args:
        cluster_hr: Mapping from cluster id to its verb -> weight mapping.
        class_hr: Mapping from class label to its verb -> weight mapping.

    Returns:
        The accumulated score (the integer 0 if no term was added).
    """
    total = 0
    for members in cluster_hr.values():
        total += count_cluster_num(members)

    result = 0
    for members in cluster_hr.values():
        # Overlaps of this cluster with every class, in class order.
        overlaps = [count_hash_overlap(members, class_members)
                    for class_members in class_hr.values()]
        row_total = 0
        for overlap in overlaps:
            row_total += overlap
        for overlap in overlaps:
            if overlap <= 0:
                continue
            result -= overlap / total * log(overlap / row_total)
    return result
def mutual_information(cluster_hr, class_hr):
    """Return the mutual information between clusters and classes.

    Every cluster/class pair with a positive overlap adds
    ``(overlap / N) * log(N * overlap / cluster_size / class_size)``,
    where ``N`` is the summed size of all clusters.

    Args:
        cluster_hr: Mapping from cluster id to its verb -> weight mapping.
        class_hr: Mapping from class label to its verb -> weight mapping.

    Returns:
        The accumulated score (the integer 0 if no pair overlaps).
    """
    total = 0
    for members in cluster_hr.values():
        total += count_cluster_num(members)

    info = 0
    for cluster_members in cluster_hr.values():
        cluster_size = count_cluster_num(cluster_members)
        for class_members in class_hr.values():
            shared = count_hash_overlap(cluster_members, class_members)
            if shared <= 0:
                continue
            class_size = count_cluster_num(class_members)
            info += shared / total * log(total * shared / cluster_size / class_size)
    return info
def count_hash_overlap(hash1_hr, hash2_hr, *, multi=None):
    """Return the overlap between two verb -> weight mappings.

    Args:
        hash1_hr: First mapping; its keys are the candidates.
        hash2_hr: Second mapping; supplies the weights of shared keys.
        multi: If true, the overlap is the sum of the ``hash2_hr``
            weights of the shared keys (so the result is not symmetric
            in its arguments); otherwise it is the number of shared
            keys. ``None`` (the default) falls back to the ``--multi``
            flag in ``args``.

    Returns:
        The overlap; the integer 0 when no key is shared.
    """
    if multi is None:
        multi = args.multi
    shared = [key for key in hash1_hr if key in hash2_hr]
    if not multi:
        return len(shared)
    return sum(hash2_hr[key] for key in shared)
# Load the gold standard. Each line holds three tab-separated columns:
# the class label, a column that is ignored, and the space-separated verbs.
for line in args.gold:
    line = line.rstrip()
    if not line:
        # A blank line (e.g. at the end of the file) has no columns and
        # would otherwise break the three-way unpacking below.
        continue
    klass, _, verbs_str = TAB.split(line, 2)
    for verb in verbs_str.split(' '):
        if not verb:
            # Repeated spaces yield empty tokens; they are not verbs.
            continue
        classes[klass][verb] += 1
        verbs_in_class[verb] += 1

# In multi mode every verb's weight is divided by its total number of
# occurrences across all classes.
if args.multi:
    for members in classes.values():
        for verb in members:
            members[verb] /= verbs_in_class[verb]
# Load the clusters to evaluate. Only lines matching CLUSTER are used;
# everything else in the resource file is skipped.
for line in args.resource:
    header = CLUSTER.match(line)
    if header is None:
        continue
    cluster, verbs_str = header.group(1, 2)
    for token in verbs_str.rstrip().split(' '):
        weighted = VERB.match(token)
        if weighted is not None:
            # "<verb>-<number>": the suffix is the verb's weight.
            verb = weighted.group(1)
            value = float(weighted.group(2))
        elif '-' in token:
            # A hyphen that is not part of a weight suffix is rejected.
            print('Hyphen found in the verb.', file=sys.stderr)
            sys.exit(1)
        else:
            verb, value = token, 1
        # Drop any ":<digits>" marker before the verb is used as a key.
        verb = VALUE.sub('', verb)
        clusters[cluster][verb] += value
        verbs_in_cluster[verb] += value

# In multi mode every verb's weight is divided by its total weight
# across all clusters.
if args.multi:
    for members in clusters.values():
        for verb in members:
            members[verb] /= verbs_in_cluster[verb]
# Entropy-based measures: entropies, (normalized) mutual information,
# homogeneity, completeness and their harmonic mean (V1).
cluster_num = len(clusters)
print('# of Clusters: {0}'.format(cluster_num))
cluster_entropy = entropy(clusters)
class_entropy = entropy(classes)
print('clu_e = %.5f' % cluster_entropy)
print('cla_e = %.5f' % class_entropy)
# The mutual information is computed once and reused for both lines.
mi = mutual_information(clusters, classes)
entropy_sum = cluster_entropy + class_entropy
# With both entropies at zero the normalization is undefined; report 1,
# the same convention used for homogeneity and completeness below.
nmi = 1 if entropy_sum == 0 else 2 * mi / entropy_sum
print('MI = %.5f' % mi)
print('NMI = %.5f' % nmi)
homogeneity = 1 if class_entropy == 0 else 1 - conditional_entropy(clusters, classes) / class_entropy
completeness = 1 if cluster_entropy == 0 else 1 - conditional_entropy(classes, clusters) / cluster_entropy
print('h = %.5f' % homogeneity)
print('c = %.5f' % completeness)
print('V1 = %.5f' % calc_f(homogeneity, completeness))
# Purity: for every cluster, the best overlap with any gold class.
# Modified purity ignores the contribution of single-verb clusters in
# the numerator while keeping the same denominator.
print('Cluster status:')
correct_sum = 0
modified_correct_sum = 0
all_sum = 0
# Cluster ids are digit strings; sort them by numeric value so that
# e.g. "2" is listed before "10".
for cluster in sorted(clusters, key=int):
    members = clusters[cluster]
    max_count = evaluate_one_cluster(members, classes)
    verb_num = count_cluster_num(members)
    correct_sum += max_count
    if len(members) > 1:
        modified_correct_sum += max_count
    all_sum += verb_num
    print('\t%s %.5f (%.1f / %.1f)' % (cluster, max_count / verb_num, max_count, verb_num))
# Without any clusters there is nothing to divide by; report 0.
purity = correct_sum / all_sum if all_sum else 0.
modified_purity = modified_correct_sum / all_sum if all_sum else 0.
print('purity = %.5f (%.1f / %.1f)' % (purity, correct_sum, all_sum))
print('modified purity = %.5f (%.1f / %.1f)' % (modified_purity, modified_correct_sum, all_sum))
# Inverse purity: for every gold class, the best overlap with any cluster.
print('Class status:')
correct_sum = all_sum = 0
for klass, members in classes.items():
    max_count = evaluate_one_cluster(members, clusters)
    verb_num = count_cluster_num(members)
    correct_sum += max_count
    all_sum += verb_num
    row = (klass, max_count / verb_num, max_count, verb_num)
    print('\t%s %.5f (%.1f / %.1f)' % row)
inverse_purity = correct_sum / all_sum
summary = (inverse_purity, correct_sum, all_sum)
print('inverse purity = %.5f (%.1f / %.1f)' % summary)
# F-scores combining (modified) purity with inverse purity. Each entry is
# (output template, precision-like value, beta); the score is computed
# right before its line is printed.
f_score_lines = (
    ('F1 (purity&inverse_purity) = %.5f', purity, 1.),
    ('F1 (modified_purity&inverse_purity) = %.5f', modified_purity, 1.),
    ('F0.5 (purity&inverse_purity) = %.5f', purity, 0.5),
    ('F0.5 (modified_purity&inverse_purity) = %.5f', modified_purity, 0.5),
)
for template, precision, beta in f_score_lines:
    print(template % calc_f(precision, inverse_purity, beta))

# One-line summary with all percentages.
summary_values = (
    cluster_num,
    purity * 100,
    modified_purity * 100,
    inverse_purity * 100,
    calc_f(modified_purity, inverse_purity) * 100,
    calc_f(modified_purity, inverse_purity, 0.5) * 100,
)
print('#%d %.5f %.5f %.5f %.5f %.5f' % summary_values)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment