Created
July 5, 2017 11:07
-
-
Save ozancaglayan/213f37a0bccbcc0b91e6e17b2ec261a9 to your computer and use it in GitHub Desktop.
mteval-v13a.pl Python3 port
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re | |
import math | |
import argparse | |
from collections import defaultdict | |
LOG_2 = math.log(2) | |
################################################################################### | |
# This is an exact reimplementation of mteval-v13a.pl without requiring SGM. | |
# | |
# Example: | |
# $ mteval-v13a -t model.beam12.detok -r newstest2017-tren-ref.en.txt -c | |
# | |
# BLEU(v13) = 16.09, 53.5/23.4/11.7/6.2 (ratio=0.927, hyp_len=62777, ref_len=67703) | |
################################################################################### | |
# Only single ref | |
def score_segment(tst_words, ref_words, ref_ngram_freqs, max_order): | |
# Create initial lists | |
match_cnt = [0 for i in range(max_order)] | |
tst_cnt = [0 for i in range(max_order)] | |
ref_cnt = [0 for i in range(max_order)] | |
tst_info = [0 for i in range(max_order)] | |
ref_info = [0 for i in range(max_order)] | |
ref_ngrams_max = {} | |
# Get the ngram counts for the test segment | |
tst_ngrams = words_to_ngrams(tst_words, max_order) | |
len_tst = len(tst_words) | |
for i in range(max_order): | |
tst_cnt[i] = (len_tst - i) if i < len_tst else 0 | |
########### | |
# Reference | |
########### | |
ref_ngrams = words_to_ngrams(ref_words, max_order) | |
len_ref = len(ref_words) | |
for ngram_words, freq in ref_ngrams.items(): | |
# Counts of ngrams for this sentence | |
ref_info[len(ngram_words) - 1] += ref_ngram_freqs[ngram_words] | |
# Update the maximum count of this ngram | |
# Shorter=>ref_ngrams_max[ngram_words] = max(ref_ngrams_max.get(ngram_words, -1), ref_ngrams[ngram_words]) | |
if ngram_words in ref_ngrams_max: | |
ref_ngrams_max[ngram_words] = max(ref_ngrams_max[ngram_words], freq) | |
else: | |
ref_ngrams_max[ngram_words] = freq | |
# Update reference ngram counts | |
for i in range(max_order): | |
ref_cnt[i] = (len_ref - i) if i < len_ref else 0 | |
for ngram_words, freq in tst_ngrams.items(): | |
if ngram_words in ref_ngrams_max: | |
m = min(freq, ref_ngrams_max[ngram_words]) | |
l = len(ngram_words) - 1 | |
tst_info[l] += ref_ngram_freqs[ngram_words] * m | |
match_cnt[l] += m | |
return len_ref, match_cnt, tst_cnt, ref_cnt, tst_info, ref_info | |
def score_system(ref_segs, tst_segs, max_order): | |
ref_ngram_freqs = compute_ngram_info(ref_segs, max_order) | |
# 0-based indexing in contrast to perl version | |
cum_match = [0 for i in range(max_order)] | |
cum_tst_cnt = [0 for i in range(max_order)] | |
cum_ref_cnt = [0 for i in range(max_order)] | |
cum_tst_info = [0 for i in range(max_order)] | |
cum_ref_info = [0 for i in range(max_order)] | |
cum_ref_len = 0 | |
# Score each segment and keep statistics | |
for tst, ref in zip(tst_segs, ref_segs): | |
ref_len, match_cnt, tst_cnt, ref_cnt, tst_info, ref_info = score_segment(tst, ref, ref_ngram_freqs, max_order) | |
# Sum ref length | |
cum_ref_len += ref_len | |
for i in range(max_order): | |
cum_match[i] += match_cnt[i] | |
cum_tst_cnt[i] += tst_cnt[i] | |
cum_ref_cnt[i] += ref_cnt[i] | |
cum_tst_info[i] += tst_info[i] | |
cum_ref_info[i] += ref_info[i] | |
# Compute length score | |
exp_len_score = math.exp(min(0, 1 - cum_ref_len / cum_tst_cnt[0])) \ | |
if cum_tst_cnt[0] > 0 else 0 | |
# For further length ratio computation | |
tst_vs_ref_ratio = (cum_tst_cnt[0], cum_ref_len, math.log(exp_len_score)) | |
return bleu_score(cum_ref_len, cum_match, cum_tst_cnt, exp_len_score, max_order), tst_vs_ref_ratio | |
def read_file(fileobj, tokenizer, is_cased): | |
"""Read simple plain text file.""" | |
sents = [] | |
for line in fileobj: | |
sents.append(tokenizer(line, is_cased)) | |
fileobj.close() | |
return sents | |
def words_to_ngrams(words, max_order): | |
"""Convert a sequence of words to an ngram count dict.""" | |
d = defaultdict(int) | |
# Iterate over word indices as start pointers | |
for i in range(len(words)): | |
# Sliding windows | |
for j in range(min(max_order, len(words) - i)): | |
# Increment counter, keep keys as tuples | |
d[tuple(words[i: i+j+1])] += 1 | |
return d | |
def compute_ngram_info(ref_segs, max_order): | |
tot_words = 0 | |
# Segment-wise frequencies | |
ngram_count = defaultdict(int) | |
ngram_info = {} | |
for words in ref_segs: | |
tot_words += len(words) | |
# Get frequencies and add them to ngramcpunt | |
for key, value in words_to_ngrams(words, max_order).items(): | |
ngram_count[key] += value | |
for ngram_words, freq in ngram_count.items(): | |
if len(ngram_words) == 1: | |
# ngram is unigram => corpus frequency | |
denom = tot_words | |
else: | |
# n-gram is n-gram => n-gram frequency | |
denom = ngram_count[ngram_words[:-1]] | |
ngram_info[ngram_words] = -math.log(freq / denom) / LOG_2 | |
return ngram_info | |
################################################################################################# | |
# Default method used to compute the BLEU score, using smoothing. | |
# The smoothing is computed by taking 1 / ( 2^k ), instead of 0, | |
# for each precision score whose matching n-gram count is null | |
# k is 1 for the first 'n' value for which the n-gram match count is null | |
# For example, if the text contains: | |
# - one 2-gram match | |
# - and (consequently) two 1-gram matches | |
# the n-gram count for each individual precision score would be: | |
# - n=1 => prec_count = 2 (two unigrams) | |
# - n=2 => prec_count = 1 (one bigram) | |
# - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) | |
# - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) | |
################################################################################################# | |
def bleu_score(ref_len, matching_ngrams, tst_ngrams, exp_len_score, max_order): | |
score = 0 | |
iscore = 0 | |
smooth = 1 | |
ind_scores = [] | |
cum_scores = [] | |
for i in range(max_order): | |
if tst_ngrams[i] == 0: | |
iscore = 0 | |
elif matching_ngrams[i] == 0: | |
smooth *= 2 | |
iscore = math.log(1 / (smooth * tst_ngrams[i])) | |
else: | |
iscore = math.log(matching_ngrams[i] / tst_ngrams[i]) | |
ind_scores.append(math.exp(iscore)) | |
score += iscore | |
cum_scores.append(math.exp(score / (i+1)) * exp_len_score) | |
return ind_scores, cum_scores | |
############################################################################# | |
# function returning the shortest reference length | |
# takes as input: | |
# - currentLength : the current (shortest) reference length | |
# - referenceSentenceLength : the current reference sentence length | |
# - candidateSentenceLength : the current candidate sentence length (unused) | |
############################################################################# | |
def brevity_penalty_shortest(cur_len, ref_len, cand_len): | |
return ref_len if ref_len < cur_len else cur_len | |
#################################################################################### | |
# function returning the closest reference length (to the candidate sentence length) | |
# takes as input: | |
# - currentLength: the current (closest) reference length. | |
# - candidateSentenceLength : the current reference sentence length | |
# - candidateSentenceLength : the current candidate sentence length | |
# when two reference sentences are at the same distance, it will return the shortest reference sentence length | |
# example of 4 iterations, given: | |
# - one candidate sentence containing 7 tokens | |
# - one reference translation containing 11 tokens | |
# - one reference translation containing 8 tokens | |
# - one reference translation containing 6 tokens | |
# - one reference translation containing 7 tokens | |
# the multiple invokations will return: | |
# - currentLength is set to 11 (outside of this function) | |
# - brevity_penalty_closest( 11, 8, 7 ) returns 8, since abs( 8 - 7 ) < abs( 11 - 7 ) | |
# - brevity_penalty_closest( 8, 6, 7 ) returns 6, since abs( 8 - 7 ) == abs( 6 - 7 ) AND 6 < 8 | |
# - brevity_penalty_closest( 7, 6, 7 ) returns 7, since abs( 7 - 7 ) < abs( 6 - 7 ) | |
################################################################################### | |
def brevity_penalty_closest(cur_len, ref_len, cand_len): | |
result = cur_len | |
if abs(cand_len - ref_len) <= abs(cand_len - cur_len): | |
if abs(cand_len - ref_len) == abs(cand_len - cur_len): | |
if cur_len > ref_len: | |
result = ref_len | |
else: | |
result = ref_len | |
return result | |
def tokenizer(s, is_cased): | |
s = s.strip() | |
# language-independent part: | |
if '<skipped>' in s: | |
s = re.sub('<skipped>', '', s) | |
# Disabled: These should not happen with plain text | |
#s = re.sub('-\n', '') | |
#s = re.sub('\n', ' ') | |
#s = re.sub('"', '"', s) | |
#s = re.sub('&', '&', s) | |
#s = re.sub('<', '<', s) | |
#s = re.sub('>', '>', s) | |
# language-dependent part (assuming Western languages): | |
s = " %s " % s | |
if not is_cased: | |
s = s.lower() | |
# tokenize punctuation | |
s = re.sub('([\{-\~\[-\` -\&\(-\+\:-\@\/])', ' \\1 ', s) | |
# tokenize period and comma unless preceded by a digit | |
s = re.sub('([^0-9])([\.,])', '\\1 \\2 ', s) | |
# tokenize period and comma unless followed by a digit | |
s = re.sub('([\.,])([^0-9])', ' \\1 \\2', s) | |
# tokenize dash when preceded by a digit | |
if '-' in s: | |
s = re.sub('([0-9])(-)', '\\1 \\2 ', s) | |
# Strip multiple spaces | |
# Strip trailing and leading spaces | |
return re.sub('\s+', ' ', s).strip().split() | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser(prog='mteval-v13a') | |
parser.add_argument('-c', '--cased' , action='store_true', help="Preserve upper-case alphabetic characters.") | |
parser.add_argument('-n', '--max-order' , type=int, default=4, help="Max n-gram order.") | |
parser.add_argument('-p', '--brevity-penalty' , type=str, default='closest', help="Brevity penalty: closest or shortest.") | |
parser.add_argument('-d', '--detailed' , action='store_true', help="Show detailed report.") | |
parser.add_argument('-t', '--tst-file' , type=argparse.FileType('r'), required=True, help="Test sentences file.") | |
parser.add_argument('-r', '--ref-file' , type=argparse.FileType('r'), required=True, help="Reference sentences file.") | |
args = parser.parse_args() | |
# Set BP function | |
# (Only useful when multiple references) | |
#bp_func = brevity_penalty_closest | |
#if args.brevity_penalty == 'shortest': | |
# bp_func = brevity_penalty_shortest | |
# Read (detokenized) files and tokenize them | |
ref_segs = read_file(args.ref_file, tokenizer, args.cased) | |
tst_segs = read_file(args.tst_file, tokenizer, args.cased) | |
assert(len(ref_segs) == len(tst_segs)) | |
(ind_scores, cum_scores), ratios = score_system(ref_segs, tst_segs, args.max_order) | |
print("BLEU(v13) = %.2f, %s (ratio=%.3f, hyp_len=%d, ref_len=%d)" % ( | |
cum_scores[3]*100, | |
"/".join([("%.1f" % (s*100)) for s in ind_scores[:4]]), | |
(ratios[0]/ratios[1]), ratios[0], ratios[1])) | |
if args.detailed: | |
print("-------------------------------------") | |
print("Individual N-gram scoring") | |
for i in range(args.max_order): | |
print(' %d-gram' % (i+1), end='') | |
print() | |
for i in range(args.max_order): | |
print(' ------', end=' ') | |
print() | |
for i in range(args.max_order): | |
print(' %2.4f' % ind_scores[i], end='') | |
print() | |
print("Cumulative N-gram scoring") | |
for i in range(args.max_order): | |
print(' %d-gram' % (i+1), end='') | |
print() | |
for i in range(args.max_order): | |
print(' ------', end='') | |
print() | |
for i in range(args.max_order): | |
print(' %2.4f' % cum_scores[i], end='') | |
print() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment