Moses tokenizer with spans. Built upon Python's sacremoses port.
""" | |
Class that inherits MosesTokenizer and adds a method which returns the spans. Kinda flaky with the escape, unescape, | |
detokenize situation, so watch out! | |
""" | |
from sacremoses import MosesTokenizer, MosesDetokenizer | |
class MosesTokenizerSpans(MosesTokenizer): | |
def __init__(self, lang="en", custom_nonbreaking_prefixes_file=None): | |
MosesTokenizer.__init__(self, lang=lang, | |
custom_nonbreaking_prefixes_file=custom_nonbreaking_prefixes_file) | |
self.lang = lang | |
def span_tokenize( | |
self, | |
text, | |
aggressive_dash_splits=False, | |
escape=True, | |
protected_patterns=None, | |
): | |
# https://stackoverflow.com/a/35634472 | |
import re | |
detokenizer = MosesDetokenizer(lang=self.lang) | |
tokens = self.tokenize(text=text, aggressive_dash_splits=aggressive_dash_splits, | |
return_str=False, escape=escape, | |
protected_patterns=protected_patterns) | |
tail = text | |
accum = 0 | |
tokens_spans = [] | |
for token in tokens: | |
detokenized_token = detokenizer.detokenize(tokens=[token], | |
return_str=True, | |
unescape=escape) | |
escaped_token = re.escape(detokenized_token) | |
m = re.search(escaped_token, tail) | |
tok_start_pos, tok_end_pos = m.span() | |
sent_start_pos = accum + tok_start_pos | |
sent_end_pos = accum + tok_end_pos | |
accum += tok_end_pos | |
tail = tail[tok_end_pos:] | |
tokens_spans.append((detokenized_token, (sent_start_pos, sent_end_pos))) | |
return tokens_spans | |
# moses_tokenizer = MosesTokenizerSpans(lang="fr") | |
# print(moses_tokenizer.span_tokenize("Le chat noir.")) | |
# Out[1]: [('Le', (0, 2)), ('chat', (3, 7)), ('noir', (8, 12)), ('.', (12, 13))] |
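
A quick sanity check (a minimal sketch, not part of the original gist, assuming sacremoses is installed): since the spans index into the raw input string, each token returned with escape=False should equal the corresponding slice of the text.

# Sketch: verify that every span maps back onto the original text
moses_tokenizer = MosesTokenizerSpans(lang="fr")
text = "Le chat noir."
for token, (start, end) in moses_tokenizer.span_tokenize(text, escape=False):
    assert text[start:end] == token
    print(token, (start, end))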
from typing import Callable, List

from flair.data import Token
from sacremoses import MosesPunctNormalizer

# MosesTokenizerSpans is the span-aware tokenizer defined in the file above


def build_moses_tokenizer(tokenizer: MosesTokenizerSpans,
                          normalizer: MosesPunctNormalizer = None) -> Callable[[str], List[Token]]:
    """
    Wrap a sacremoses tokenizer (with spans) to build a tokenizer for the flair Sentence class.

    :param tokenizer: a MosesTokenizerSpans instance
    :param normalizer: an optional MosesPunctNormalizer applied to the text before tokenizing
    :return: a tokenizer function to provide to the Sentence class constructor
    """
    try:
        from sacremoses import MosesTokenizer
        from sacremoses import MosesPunctNormalizer
    except ImportError:
        raise ImportError(
            "Please install sacremoses before using the moses tokenizer; "
            "otherwise you can use segtok_tokenizer as an advanced tokenizer."
        )

    moses_tokenizer: MosesTokenizerSpans = tokenizer

    def tokenizer(text: str) -> List[Token]:
        if normalizer:
            text = normalizer.normalize(text=text)
        doc = moses_tokenizer.span_tokenize(text=text, escape=False)
        previous_token = None
        tokens: List[Token] = []
        for word, (start_pos, end_pos) in doc:
            token = Token(
                text=word, start_position=start_pos, whitespace_after=True
            )
            tokens.append(token)
            # If this token starts exactly where the previous one ends,
            # there is no whitespace between them
            if (previous_token is not None) and (
                token.start_pos
                == previous_token.start_pos + len(previous_token.text)
            ):
                previous_token.whitespace_after = False
            previous_token = token
        return tokens

    return tokenizer
def flair_moses_tokenizer():
    moses_tokenizer = MosesTokenizerSpans(lang="fr")
    moses_tokenizer = build_moses_tokenizer(tokenizer=moses_tokenizer)
    return moses_tokenizer
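
For reference, a usage sketch (not part of the original gist): as the build_moses_tokenizer docstring says, the wrapped function is meant to be provided to flair's Sentence constructor. This assumes a flair version where use_tokenizer accepts a callable directly; newer releases expect a Tokenizer object instead.

# Sketch, assuming a flair version whose Sentence accepts a callable tokenizer
from flair.data import Sentence

tokenizer_fn = flair_moses_tokenizer()
sentence = Sentence("Le chat noir.", use_tokenizer=tokenizer_fn)
for token in sentence:
    print(token.text, token.start_pos, token.whitespace_after)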