Created
April 15, 2015 04:02
-
-
Save jonghwanhyeon/7175b0aa4332913f9640 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
# Input file names, grouped under the keys 'first' and 'second'. The script
# pairs them up by position: the n-th 'first' file is compared against the
# n-th 'second' file.
FILENAMES = {
    'first': ['shuffled_review_with_tag_{0}'.format(index) for index in range(8)],
    'second': ['shuffled_review_without_tag_{0}'.format(index) for index in range(8)],
}

# Side length of the square agreement table: 8 categories plus one extra
# slot for "neutral".
NUMBER_OF_CATEGORIES = 8 + 1
def read_entry(input_file):
    """Read the next three-line entry (key, content, tag) from *input_file*.

    Each line is stripped of surrounding whitespace. The content and tag
    lines are only consumed when the key line is non-empty.

    Args:
        input_file: An open text file (anything with ``readline()``).

    Returns:
        ``None`` if the key line is empty (end of file or a blank line),
        otherwise a ``(key, content, tag)`` tuple where ``tag`` is an int.
        An empty tag line yields ``-1``.

    Raises:
        ValueError: If the tag line is non-empty but not an integer.
    """
    def next_line():
        return input_file.readline().strip()

    key = next_line()
    if not key:
        return None

    content = next_line()
    raw_tag = next_line()
    # -1 is the stand-in for an entry that carries no tag.
    tag = int(raw_tag) if raw_tag else -1
    return (key, content, tag)
def compare_entries(first_file, second_file):
    """Tally how two parallel entry files agree on each entry's tag.

    Entries are read in lockstep from both files with ``read_entry``; the
    entries at the same position must carry the same key.

    Args:
        first_file: Open text file holding the first set of entries.
        second_file: Open text file holding the second set of entries.

    Returns:
        A ``(counts, not_matched_entries)`` tuple. ``counts`` is a
        ``NUMBER_OF_CATEGORIES`` x ``NUMBER_OF_CATEGORIES`` table where
        ``counts[a][b]`` is the number of entries tagged ``a`` in the first
        file and ``b`` in the second. ``not_matched_entries`` is the list of
        ``(first_entry, second_entry)`` pairs whose tags differ.

    Raises:
        SystemExit: If one file runs out of entries before the other, or if
            the keys at the same position differ.
        IndexError: If a tag falls outside the table.
    """
    counts = [[0] * NUMBER_OF_CATEGORIES for _ in range(NUMBER_OF_CATEGORIES)]
    not_matched_entries = []

    while True:
        first_entry = read_entry(first_file)
        second_entry = read_entry(second_file)

        # Both exhausted together: normal termination.
        if not first_entry and not second_entry:
            break
        # Only one exhausted: the files do not line up.
        if not first_entry or not second_entry:
            sys.exit('size of entry is not matched')

        first_key, first_tag = first_entry[0], first_entry[2]
        second_key, second_tag = second_entry[0], second_entry[2]

        if first_key != second_key:
            # A single parenthesised argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print('key1: {0}\nkey2: {1}'.format(first_key, second_key))
            sys.exit('key is not matched')

        # A missing tag is read as -1, which indexes the last row/column --
        # presumably the "neutral" slot; confirm against the data format.
        counts[first_tag][second_tag] += 1
        if first_tag != second_tag:
            not_matched_entries.append((first_entry, second_entry))

    return (counts, not_matched_entries)
def calculate_kappa(counts):
    """Return Cohen's kappa for a square two-rater agreement table.

    kappa = (p_a - p_e) / (1 - p_e), where ``p_a`` is the observed
    agreement (the diagonal mass of the normalised table) and ``p_e`` is the
    agreement expected by chance (for each category, the product of the
    first rater's and the second rater's marginal proportions, summed over
    categories).

    Args:
        counts: Square list of lists; ``counts[a][b]`` is the number of
            items the first rater put in category ``a`` and the second rater
            put in category ``b``.

    Returns:
        The kappa coefficient as a float.

    Raises:
        ZeroDivisionError: If the table holds no observations, or if chance
            agreement is exactly 1 (kappa is undefined in that case).
    """
    size = len(counts)
    total = sum(sum(row) for row in counts)

    # Normalise to proportions. Real lists (not map objects) are required
    # because the table is indexed below.
    p = [[float(cell) / total for cell in row] for row in counts]

    # Observed agreement: both raters chose the same category.
    p_a = sum(p[k][k] for k in range(size))

    # Chance agreement: sum over categories of row marginal * column marginal.
    row_marginals = [sum(row) for row in p]
    column_marginals = [sum(row[k] for row in p) for k in range(size)]
    p_e = sum(r * c for r, c in zip(row_marginals, column_marginals))

    return (p_a - p_e) / (1.0 - p_e)
if __name__ == '__main__':
    # Compare the input files pairwise and print one kappa per pair. This
    # runs once and only when executed as a script: the previous
    # `while True:` wrapper had no exit and repeated the whole run forever.
    for first_filename, second_filename in zip(FILENAMES['first'], FILENAMES['second']):
        with open(first_filename) as first_file, open(second_filename) as second_file:
            # The list of mismatching entries is not needed here.
            counts, _ = compare_entries(first_file, second_file)

        kappa = calculate_kappa(counts)
        print(kappa)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment