Created
June 17, 2021 13:54
The sklearn tokenizer used in HashingVectorizer, CountVectorizer and TfidfVectorizer.
import re


# Method build_tokenizer from the _VectorizerMixin mixin, from which
# HashingVectorizer, CountVectorizer and TfidfVectorizer (through
# CountVectorizer) partially inherit.
# It is used to split a string into a sequence of tokens (only if analyzer == 'word').
def build_tokenizer(token_pattern: str = r"(?u)\b\w\w+\b"):
    """
    Return a function that splits a string into a sequence of tokens.

    Returns
    -------
    tokenizer: callable
        A function to split a string into a sequence of tokens.
    """
    return re.compile(token_pattern).findall


sentence = "This is sentence example."
tokenizer = build_tokenizer()
tokenizer(sentence)  # ['This', 'is', 'sentence', 'example']
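A few extra calls (a sketch, not part of the original gist) make the behavior of the default pattern `r"(?u)\b\w\w+\b"` concrete: tokens need at least two word characters, punctuation never joins a token, and the `(?u)` flag keeps `\w` Unicode-aware.

```python
import re

# Same default pattern as sklearn's build_tokenizer:
# two or more word characters between word boundaries, Unicode-aware.
tokenizer = re.compile(r"(?u)\b\w\w+\b").findall

# Single-character tokens ("I", "a") are dropped by the \w\w+ requirement.
print(tokenizer("I am a dev"))        # ['am', 'dev']

# Punctuation is never part of a token, so hyphenated words split apart.
print(tokenizer("state-of-the-art"))  # ['state', 'of', 'the', 'art']

# (?u) makes \w match Unicode letters as well as digits.
print(tokenizer("naïve café 42"))     # ['naïve', 'café', '42']
```

Note that dropping one-character tokens is a deliberate sklearn default; pass a custom `token_pattern` such as `r"(?u)\b\w+\b"` to keep them.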