pmbaumgartner · January 10, 2022 15:49
diff --git a/cleaning_tokenizer.py b/cleaning_tokenizer.py
 from spacy.tokenizer import Tokenizer

 class CTLTokenizer(Tokenizer):
    """Tokenizer that cleans the input text before tokenizing it.

    The cleaning step lives in ``clean_string`` so that it can also be
    applied on its own, without running the tokenizer.
    """

    # Approach adapted from https://stackoverflow.com/a/58718664
    def __call__(self, string) -> "spacy.tokens.Doc":
        """Clean ``string`` and return the result of tokenizing it.

        The return annotation is written as a string so that defining this
        class does not require the name ``spacy`` to be bound: the import
        ``from spacy.tokenizer import Tokenizer`` only binds ``Tokenizer``,
        and an unquoted annotation is evaluated at definition time.
        """
        return super().__call__(self.clean_string(string))

    def clean_string(self, string: str) -> str:
        """Return ``string`` with a trailing period appended if it lacks one.

        You can call this to clean a string without tokenizing, e.g.::

            nlp.tokenizer.clean_string('Some example sentence')

        A string that already ends with "." is returned unchanged.
        """
        if not string.endswith("."):
            string = string + "."
        return string
	from spacy.tokenizer import Tokenizer

	class CTLTokenizer(Tokenizer):
	    """Tokenizer subclass that runs a string-cleaning pass first.

	    ``clean_string`` is exposed as a regular method so the cleaning can be
	    used independently of tokenization.
	    """

	    # Approach adapted from https://stackoverflow.com/a/58718664
	    def __call__(self, string) -> "spacy.tokens.Doc":
	        """Tokenize the cleaned form of ``string``.

	        The annotation is quoted because only ``Tokenizer`` is imported
	        next to this class; an unquoted ``spacy.tokens.Doc`` would be
	        evaluated when the method is defined and needs ``spacy`` bound.
	        """
	        cleaned = self.clean_string(string)
	        return super().__call__(cleaned)

	    def clean_string(self, string: str) -> str:
	        """Ensure ``string`` ends with a period.

	        You can call this to clean a string without tokenizing, e.g.::

	            nlp.tokenizer.clean_string('Some example sentence')

	        Returns ``string`` unchanged when it already ends with ".",
	        otherwise returns it with "." appended.
	        """
	        return string if string.endswith(".") else string + "."