Last active
June 30, 2017 12:53
-
-
Save minhlab/613e74b25fd13e9ddb50fbb2aacd5bf4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import Counter | |
| import numpy as np | |
| from scipy.stats import pearsonr | |
# Corpus/file locations.  Paths look relative to a project root — TODO confirm.
train = 'output/dep/penntree.jk.sd/train.mrg.dep'            # training parses: occurrence counts are taken from here
ref = 'output/dep/penntree.jk.sd/valid.mrg.dep'              # gold-standard validation parses
# NOTE(review): `sys` shadows the stdlib module name.  Harmless here because the
# script never does `import sys`, but rename if that import is ever added.
sys = 'output/dep/sd_parse-published-model_valid.conll'      # system (predicted) parses to score
out_path = 'output/occurrence-performance.npy'               # per-sentence metrics written at the end
count_path = 'output/occurrence-counts.npy'                  # raw occurrence-count array
# Tokens skipped when computing UAS/LAS (see the scoring loop below).
punctuations = { "``", "''", ".", ",", ":" }
def count_occurrences(path):
    """Count how often each token (column 1) appears in a tab-separated file.

    Lines with five or fewer tab-separated columns (e.g. blanks or separators)
    are skipped.  Progress is reported every 100k lines.
    """
    print('Counting occurrences...')
    counts = Counter()
    with open(path) as handle:
        # enumerate from 1 so the progress message shows a 1-based line count.
        for lines_seen, raw in enumerate(handle, start=1):
            cols = raw.strip().split('\t')
            if len(cols) > 5:
                counts[cols[1]] += 1
            if lines_seen % 100000 == 0:
                print('%s...' %(lines_seen))
    print('Counting occurrences... Done.')
    return counts
def iter_sents(path):
    """Yield sentences from a CoNLL-style file.

    Each sentence is a list of rows ``[token, head, label, global_line_no]``
    (columns 1, 6, 7 of the raw line).  Blank lines delimit sentences.  Note
    the raw line is split without stripping first, so the final field keeps
    its trailing newline — callers compare fields between two such files, so
    this is consistent on both sides.
    """
    pending = []
    with open(path) as handle:
        for line_no, raw in enumerate(handle):
            if not raw.strip():
                # Sentence boundary: flush whatever we have accumulated.
                if pending:
                    yield pending
                    pending = []
                continue
            cols = raw.split('\t')
            pending.append([cols[1]] + cols[6:8] + [line_no])
    # File may end without a trailing blank line.
    if pending:
        yield pending
if __name__ == '__main__':
    # --- 1. Occurrence counts from the training corpus -----------------------
    occ_counts = count_occurrences(train)
    # Persist just the count values (order follows Counter iteration order).
    occ_counts_arr = np.array([occ_counts[k] for k in occ_counts])
    with open(count_path, 'wb') as f:
        np.save(f, occ_counts_arr)
    print('Occurence counts written to %s' %count_path)

    # --- 2. Per-sentence statistics: compare gold (ref) vs. system parses ----
    data = []
    for sent1, sent2 in zip(iter_sents(ref), iter_sents(sys)):
        tok_count = 0
        rare_count = 0.0   # tokens whose training count is in (5, 20]
        occ_count = 0.0    # sum of training occurrence counts
        uas = 0.0          # unlabeled attachment: head matches
        las = 0.0          # labeled attachment: head AND label match
        assert len(sent1) == len(sent2)
        for row1, row2 in zip(sent1, sent2):
            # Same token and same global line number in both files.
            assert row1[0] == row2[0] and row1[3] == row2[3]
            if row1[0] not in punctuations:
                tok_count += 1
                occ_count += occ_counts[row1[0]]
                rare_count += (5 < occ_counts[row1[0]] <= 20)
                uas += (row1[1] == row2[1])
                las += (row1[1:3] == row2[1:3])
        uas /= tok_count
        las /= tok_count
        # row1[3] is the line number of the sentence's last token.
        data.append([rare_count, occ_count, uas, las, tok_count, row1[3]])
    data = np.array(data)
    # NOTE: `occ_counts` is rebound here from the Counter to a float column;
    # the Counter is no longer needed past this point.
    rare_counts, occ_counts, uas, las, lens = data[:,0], data[:,1], data[:,2], data[:,3], data[:,4]

    # --- 3. Correlations ------------------------------------------------------
    print('Correlation between rare (but not UNKN) counts and UAS: %f' %pearsonr(rare_counts, uas)[0])
    print('Correlation between rare (but not UNKN) counts and LAS: %f' %pearsonr(rare_counts, las)[0])
    print('Correlation between occurrence counts and UAS: %f' %pearsonr(occ_counts, uas)[0])
    print('Correlation between occurrence counts and LAS: %f' %pearsonr(occ_counts, las)[0])
    print('Correlation between lengths and UAS: %f' %pearsonr(lens, uas)[0])
    print('Correlation between lengths and LAS: %f' %pearsonr(lens, las)[0])

    mask0 = (uas < 0.9)
    print('After filtering out "easy" sentences:')
    print('Correlation between occurrence counts and UAS: %f'
          %pearsonr(occ_counts[mask0], uas[mask0])[0])
    for min_len in range(0, 40, 10):
        # BUG FIX: the original called np.logical_and(mask0, a, b) with three
        # positional arguments, but logical_and's third positional parameter is
        # `out=`, not another condition — so the upper bound (lens < min_len+10)
        # was silently ignored (it was used as the output buffer instead).
        # Combine all three conditions explicitly.
        mask = mask0 & (lens >= min_len) & (lens < min_len + 10)
        print('Correlation between occurrence counts and UAS (%d <= len < %d): %f'
              %(min_len, min_len+10, pearsonr(occ_counts[mask], uas[mask])[0]))
    mask = mask0 & (lens >= 40)
    print('Correlation between occurrence counts and UAS (len >= 40): %f'
          %(pearsonr(occ_counts[mask], uas[mask])[0]))

    # --- 4. Dump the per-sentence matrix -------------------------------------
    print('Sample data:')
    print(data[:10])
    with open(out_path, 'wb') as f:
        np.save(f, data)
    print('Data written to %s' %out_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.