-
-
Save 8enmann/120f29589c340064c273c2cc5201c553 to your computer and use it in GitHub Desktop.
WikiText: Python 2 post-processing used on Moses-tokenized input
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding=utf8
"""WikiText post-processing for Moses-tokenized text (Python 2 script).

Reads lines from stdin, re-joins the ``<formula>`` placeholder, marks the
``,`` / ``.`` separators inside purely numeric tokens as ``@,@`` / ``@.@``,
and writes each line to stdout with a leading space and a trailing
``" \\n"``.
"""
import sys
import re

# A purely numeric token: digit runs joined by single ',' or '.' separators,
# with an optional trailing separator (e.g. "1,000", "3.14", "5.").
# Written without a nested quantifier so that a long digit run followed by a
# non-matching character fails in linear time instead of backtracking
# exponentially.
number_match_re = re.compile(r'^[0-9]+(?:[,.][0-9]+)*[,.]?$')
# Captures each separator so it can be re-emitted wrapped in '@' markers.
number_split_re = re.compile(r'([,.])')


def process_line(line):
    """Return the post-processed form of one input line.

    The line is split on whitespace; every token that is purely numeric has
    each ',' or '.' replaced by ' @,@ ' or ' @.@ '. Tokens are then joined
    with single spaces, preceded by one space and followed by ' \\n'.
    """
    # Fix a silly tokenization that was never intended
    line = line.replace('< formula >', '<formula>')
    tokens = []
    for token in line.split():
        if number_match_re.match(token):
            token = number_split_re.sub(r' @\1@ ', token)
        tokens.append(token)
    # Starting each line with a leading space is required.
    # Some systems replace \n with <eos> and assume, like in PTB, that
    # everything is space separated, hence the space before the newline too.
    return ' '.join([''] + tokens + ['\n'])


def main():
    """Filter stdin to stdout, one processed line per input line."""
    # Python 2 only: reload(sys) makes setdefaultencoding callable again so
    # the process-wide default encoding can be set to UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf8')
    for line in sys.stdin:
        sys.stdout.write(process_line(line))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Pseudocode for UNKs