Last active
June 30, 2017 12:53
-
-
Save minhlab/613e74b25fd13e9ddb50fbb2aacd5bf4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import Counter | |
| import numpy as np | |
| from scipy.stats import pearsonr | |
# Corpus/file locations.  Paths look relative to a project root — TODO confirm.
train = 'output/dep/penntree.jk.sd/train.mrg.dep'            # training parses: occurrence counts are taken from here
ref = 'output/dep/penntree.jk.sd/valid.mrg.dep'              # gold-standard validation parses
# NOTE(review): `sys` shadows the stdlib module name.  Harmless here because the
# script never does `import sys`, but rename if that import is ever added.
sys = 'output/dep/sd_parse-published-model_valid.conll'      # system (predicted) parses to score
out_path = 'output/occurrence-performance.npy'               # per-sentence metrics written at the end
count_path = 'output/occurrence-counts.npy'                  # raw occurrence-count array
# Tokens skipped when computing UAS/LAS (see the scoring loop below).
punctuations = { "``", "''", ".", ",", ":" }
def count_occurrences(path):
    """Count how often each token (column 1) appears in a tab-separated file.

    Lines with five or fewer tab-separated columns (e.g. blanks or separators)
    are skipped.  Progress is reported every 100k lines.
    """
    print('Counting occurrences...')
    counts = Counter()
    with open(path) as handle:
        # enumerate from 1 so the progress message shows a 1-based line count.
        for lines_seen, raw in enumerate(handle, start=1):
            cols = raw.strip().split('\t')
            if len(cols) > 5:
                counts[cols[1]] += 1
            if lines_seen % 100000 == 0:
                print('%s...' %(lines_seen))
    print('Counting occurrences... Done.')
    return counts
def iter_sents(path):
    """Yield sentences from a CoNLL-style file.

    Each sentence is a list of rows ``[token, head, label, global_line_no]``
    (columns 1, 6, 7 of the raw line).  Blank lines delimit sentences.  Note
    the raw line is split without stripping first, so the final field keeps
    its trailing newline — callers compare fields between two such files, so
    this is consistent on both sides.
    """
    pending = []
    with open(path) as handle:
        for line_no, raw in enumerate(handle):
            if not raw.strip():
                # Sentence boundary: flush whatever we have accumulated.
                if pending:
                    yield pending
                    pending = []
                continue
            cols = raw.split('\t')
            pending.append([cols[1]] + cols[6:8] + [line_no])
    # File may end without a trailing blank line.
    if pending:
        yield pending
if __name__ == '__main__':
    # --- 1. Occurrence counts from the training corpus -----------------------
    occ_counts = count_occurrences(train)
    # Persist just the count values (order follows Counter iteration order).
    occ_counts_arr = np.array([occ_counts[k] for k in occ_counts])
    with open(count_path, 'wb') as f:
        np.save(f, occ_counts_arr)
    print('Occurence counts written to %s' %count_path)

    # --- 2. Per-sentence statistics: compare gold (ref) vs. system parses ----
    data = []
    for sent1, sent2 in zip(iter_sents(ref), iter_sents(sys)):
        tok_count = 0
        rare_count = 0.0   # tokens whose training count is in (5, 20]
        occ_count = 0.0    # sum of training occurrence counts
        uas = 0.0          # unlabeled attachment: head matches
        las = 0.0          # labeled attachment: head AND label match
        assert len(sent1) == len(sent2)
        for row1, row2 in zip(sent1, sent2):
            # Same token and same global line number in both files.
            assert row1[0] == row2[0] and row1[3] == row2[3]
            if row1[0] not in punctuations:
                tok_count += 1
                occ_count += occ_counts[row1[0]]
                rare_count += (5 < occ_counts[row1[0]] <= 20)
                uas += (row1[1] == row2[1])
                las += (row1[1:3] == row2[1:3])
        uas /= tok_count
        las /= tok_count
        # row1[3] is the line number of the sentence's last token.
        data.append([rare_count, occ_count, uas, las, tok_count, row1[3]])
    data = np.array(data)
    # NOTE: `occ_counts` is rebound here from the Counter to a float column;
    # the Counter is no longer needed past this point.
    rare_counts, occ_counts, uas, las, lens = data[:,0], data[:,1], data[:,2], data[:,3], data[:,4]

    # --- 3. Correlations ------------------------------------------------------
    print('Correlation between rare (but not UNKN) counts and UAS: %f' %pearsonr(rare_counts, uas)[0])
    print('Correlation between rare (but not UNKN) counts and LAS: %f' %pearsonr(rare_counts, las)[0])
    print('Correlation between occurrence counts and UAS: %f' %pearsonr(occ_counts, uas)[0])
    print('Correlation between occurrence counts and LAS: %f' %pearsonr(occ_counts, las)[0])
    print('Correlation between lengths and UAS: %f' %pearsonr(lens, uas)[0])
    print('Correlation between lengths and LAS: %f' %pearsonr(lens, las)[0])

    mask0 = (uas < 0.9)
    print('After filtering out "easy" sentences:')
    print('Correlation between occurrence counts and UAS: %f'
          %pearsonr(occ_counts[mask0], uas[mask0])[0])
    for min_len in range(0, 40, 10):
        # BUG FIX: the original called np.logical_and(mask0, a, b) with three
        # positional arguments, but logical_and's third positional parameter is
        # `out=`, not another condition — so the upper bound (lens < min_len+10)
        # was silently ignored (it was used as the output buffer instead).
        # Combine all three conditions explicitly.
        mask = mask0 & (lens >= min_len) & (lens < min_len + 10)
        print('Correlation between occurrence counts and UAS (%d <= len < %d): %f'
              %(min_len, min_len+10, pearsonr(occ_counts[mask], uas[mask])[0]))
    mask = mask0 & (lens >= 40)
    print('Correlation between occurrence counts and UAS (len >= 40): %f'
          %(pearsonr(occ_counts[mask], uas[mask])[0]))

    # --- 4. Dump the per-sentence matrix -------------------------------------
    print('Sample data:')
    print(data[:10])
    with open(out_path, 'wb') as f:
        np.save(f, data)
    print('Data written to %s' %out_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.