Skip to content

Instantly share code, notes, and snippets.

@jonghwanhyeon
Created April 15, 2015 04:02
Show Gist options
  • Save jonghwanhyeon/7175b0aa4332913f9640 to your computer and use it in GitHub Desktop.
Save jonghwanhyeon/7175b0aa4332913f9640 to your computer and use it in GitHub Desktop.
import sys
# Input files, paired by position: FILENAMES['first'][n] is compared against
# FILENAMES['second'][n]. Each list holds eight names, suffixed 0 through 7.
FILENAMES = {
    'first': ['shuffled_review_with_tag_%d' % index for index in range(8)],
    'second': ['shuffled_review_without_tag_%d' % index for index in range(8)],
}

# Size of the square agreement table: eight categories plus one extra slot
# for "neutral".
NUMBER_OF_CATEGORIES = 9
def read_entry(input_file):
    """Read one three-line entry (key, content, tag) from *input_file*.

    Each of the three lines is stripped of surrounding whitespace.

    Args:
        input_file: An open text file (or any object with ``readline()``).

    Returns:
        ``None`` when the key line is empty, which happens at end of file
        and also on a blank line. Otherwise a ``(key, content, tag)`` tuple
        where ``tag`` is the tag line converted to ``int``, or ``-1`` when
        the tag line is empty.

    Raises:
        ValueError: If the tag line is non-empty but not an integer.
    """
    key = input_file.readline().strip()
    if not key:
        return None

    content = input_file.readline().strip()
    tag_text = input_file.readline().strip()
    if not tag_text:
        # An empty tag line is reported as -1.
        return (key, content, -1)

    try:
        tag = int(tag_text)
    except ValueError:
        # Same exception type as a bare int() failure, but name the entry so
        # the offending record can be located in the input file.
        raise ValueError(
            'invalid tag {0!r} for key {1!r}'.format(tag_text, key))
    return (key, content, tag)
def compare_entries(first_file, second_file):
    """Tally how the tags of two entry files agree, entry by entry.

    Entries are read in lockstep from both files with ``read_entry``.

    Args:
        first_file: Open file supplying the first tag of each entry.
        second_file: Open file supplying the second tag of each entry.

    Returns:
        A ``(counts, not_matched_entries)`` tuple. ``counts`` is a
        ``NUMBER_OF_CATEGORIES`` x ``NUMBER_OF_CATEGORIES`` table where
        ``counts[a][b]`` is the number of entries tagged ``a`` in the first
        file and ``b`` in the second. ``not_matched_entries`` lists the
        ``(first_entry, second_entry)`` pairs whose tags differ.

    Raises:
        SystemExit: If one file runs out of entries before the other, or if
            the keys of a pair of entries differ.
    """
    counts = [[0] * NUMBER_OF_CATEGORIES for _ in range(NUMBER_OF_CATEGORIES)]
    not_matched_entries = []

    while True:
        first_entry = read_entry(first_file)
        second_entry = read_entry(second_file)

        if first_entry is None and second_entry is None:
            break
        if first_entry is None or second_entry is None:
            sys.exit('size of entry is not matched')

        first_key, first_tag = first_entry[0], first_entry[2]
        second_key, second_tag = second_entry[0], second_entry[2]

        if first_key != second_key:
            # A single-argument print(...) call behaves the same on
            # Python 2 and Python 3.
            print('key1: {0}\nkey2: {1}'.format(first_key, second_key))
            sys.exit('key is not matched')

        # A tag of -1 lands in the last row/column through negative indexing.
        counts[first_tag][second_tag] += 1
        if first_tag != second_tag:
            not_matched_entries.append((first_entry, second_entry))

    return (counts, not_matched_entries)
def calculate_kappa(counts):
    """Return Cohen's kappa for a square agreement table.

    Args:
        counts: Square list of lists where ``counts[a][b]`` is the number of
            items put in category ``a`` by the first rater and in category
            ``b`` by the second.

    Returns:
        ``(p_a - p_e) / (1 - p_e)`` as a float, where ``p_a`` is the observed
        agreement (sum of the diagonal proportions) and ``p_e`` is the
        agreement expected by chance (sum over categories of the row
        marginal times the column marginal).

    Raises:
        ZeroDivisionError: If the table contains no observations, or if
            ``p_e`` equals 1 (every item in one single cell), in which case
            kappa is undefined.
    """
    size = len(counts)
    total = sum(sum(row) for row in counts)
    # Lists (not lazy map objects) so the table can be indexed on Python 3.
    proportions = [[float(cell) / total for cell in row] for row in counts]

    p_a = sum(proportions[index][index] for index in range(size))

    # Chance agreement uses each rater's marginal distribution, not the
    # products of mirrored cells.
    row_marginals = [sum(row) for row in proportions]
    column_marginals = [sum(column) for column in zip(*proportions)]
    p_e = sum(row_share * column_share
              for row_share, column_share in zip(row_marginals, column_marginals))

    return (p_a - p_e) / (1.0 - p_e)
def main():
    """Print the kappa score of each pair of files in FILENAMES, once.

    The n-th name in ``FILENAMES['first']`` is compared with the n-th name
    in ``FILENAMES['second']``; one score is printed per pair.
    """
    for first_filename, second_filename in zip(FILENAMES['first'], FILENAMES['second']):
        with open(first_filename) as first_file, open(second_filename) as second_file:
            # The list of mismatching entries is not used here.
            counts, _ = compare_entries(first_file, second_file)
        print(calculate_kappa(counts))


# The pairs are processed a single time; an enclosing endless loop would
# re-read the same files and print the same scores forever.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment