Simple multi-bleu utility using NLTK
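For example (the file names here are hypothetical), hypothesis sentences are piped in on <stdin> and scored against one or more reference files:

    python3 nltk-bleu.py ref.txt < hyp.txt
    python3 nltk-bleu.py --lc --tokenizer=toktok ref < hyp.txt  # uses ref0, ref1, ... if ref doesn't exist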
#!/usr/bin/env python3
# Copyright 2017 Michael Wayne Goodman <[email protected]>
# Licensed under the MIT license: https://opensource.org/licenses/MIT

import sys
import os
import gzip

import docopt

from nltk.translate import bleu_score
# NOTE: nltk.tokenize.moses was removed in NLTK 3.3 (the Moses tokenizer
# now lives in the separate sacremoses package), so this import assumes
# an older NLTK release.
from nltk.tokenize import (simple, treebank, moses, toktok)
USAGE = '''
Usage: nltk-bleu.py [options] REFERENCE [-]

Arguments:
  REFERENCE             path to the reference sentence file; if the path
                        doesn't exist but path0, path1, ... do exist,
                        use them in multi-reference mode
                        (hypothesis lines are read from <stdin>)

Options:
  -h, --help            display this help and exit
  --lc                  lowercase the references and hypotheses
  --smoothing-method=M  apply smoothing method M (0--7; 0 is no smoothing,
                        3 is NIST) [default: 3]
  --tokenizer=T         tokenize with T (simple, treebank, moses, toktok)
                        [default: moses]

For a description of the smoothing methods, see:
  http://www.nltk.org/api/nltk.translate.html
For a description of the tokenizers, see:
  http://www.nltk.org/api/nltk.tokenize.html
'''
_smoother = bleu_score.SmoothingFunction()
_smoothers = {
    '0': _smoother.method0,
    '1': _smoother.method1,
    '2': _smoother.method2,
    '3': _smoother.method3,
    '4': _smoother.method4,
    '5': _smoother.method5,
    '6': _smoother.method6,
    '7': _smoother.method7,
}

_tokenizers = {
    'simple': simple.SpaceTokenizer(),
    'treebank': treebank.TreebankWordTokenizer(),
    'moses': moses.MosesTokenizer(),
    'toktok': toktok.ToktokTokenizer()
}

def main():
    args = docopt.docopt(USAGE)
    smoother = _smoothers[args['--smoothing-method']]
    tokenizer = _tokenizers[args['--tokenizer']]
    refs = []
    for suffix in ref_suffixes(args['REFERENCE']):
        refs.append(
            read(args['REFERENCE'], suffix, args['--lc'], tokenizer)
        )
    if not refs:
        sys.exit('no reference file found at ' + args['REFERENCE'])
    reflen = len(refs[0])
    if any(len(reflist) != reflen for reflist in refs):
        sys.exit('reference files do not have the same number of lines')
    # transpose from one list per file to one tuple of references per sentence
    refs = list(zip(*refs))
    hyps = [prepare(line, args['--lc'], tokenizer) for line in sys.stdin]
    # corpus_bleu returns a score in [0, 1]
    score = bleu_score.corpus_bleu(refs, hyps, smoothing_function=smoother)
    print('BLEU: {:4.2f}'.format(score))

def ref_suffixes(stem):
    # yield '' if stem itself is a file; otherwise yield the numeric
    # suffixes of stem0, stem1, ... for multi-reference mode
    if os.path.isfile(stem):
        yield ''
    else:
        i = 0
        while os.path.isfile(stem + str(i)):
            yield str(i)
            i += 1

def read(stem, suffix, lowercase, tokenizer):
    openfile = gzip.open if stem.endswith('.gz') else open
    data = []
    # open in text mode ('rt') so gzip.open also yields str lines, not bytes
    with openfile(stem + suffix, 'rt') as f:
        data.extend(prepare(line, lowercase, tokenizer) for line in f)
    return data

def prepare(line, lowercase, tokenizer):
    if lowercase:
        line = line.lower()
    return tokenizer.tokenize(line)


if __name__ == '__main__':
    main()
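As a quick sanity check of the underlying API, here is a minimal sketch (toy data, not part of the gist) of the corpus_bleu call the script makes, on already-tokenized sentences:

    from nltk.translate import bleu_score

    # one list of references per sentence; a perfect match scores 1.0
    references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
    hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
    print(bleu_score.corpus_bleu(references, hypotheses))  # 1.0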