Tokenize Top K Words
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re
from collections import Counter

from six import iteritems
from tqdm import tqdm

class TopKTokenizer(object):
    """Tokenize Top K Words"""

    def __init__(self, num_tokens, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                 lower=True, split=' ', unk_label=None):
        """Tokenize Top K Words

        Parameters
        ----------
        num_tokens : int
            Tokenize the `num_tokens - 1` most frequent words; all other words
            are treated as unknown and labeled with `unk_label`.
        filters : list or string, default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
            Characters to filter out, such as punctuation.
        lower : bool, default : True
            Whether to convert the text to lowercase.
        split : string, default : ' '
            Separator for word splitting.
        unk_label : string
            Label used for the "other" (out-of-vocabulary) words. If provided,
            occurrences of the label are skipped by the tokenizer during fitting.
            Notice that the text is cleaned before it is tokenized, so `unk_label`
            must not contain any of the `filters` characters if it is to be
            recognized by the tokenizer. The `unk_label` is used when
            back-transforming from tokens to words. By default, `<UNK>` is used
            as the label.

        Attributes
        ----------
        map : dict
            Mapping from words to tokens.
        inv_map : dict
            Mapping from tokens to words.
        num_unique_words : int
            Number of unique words found in the training set.
        """
        self.num_tokens = num_tokens
        if isinstance(filters, list):
            filters = "".join(filters)
        self.filters = '[' + filters + ']'
        self.lower = lower
        self.split = split
        # keep the label consistent with how the text is cleaned
        self.unk_label = unk_label.lower() if unk_label and lower else (unk_label or '<UNK>')
        self.map = {}
        self.inv_map = {}
        self.num_unique_words = 0

    def _clean(self, x):
        try:
            if isinstance(x, int) or isinstance(x, float):
                x = str(x)
            x = re.sub(self.filters, ' ', x)
            x = re.sub(' +', ' ', x).strip()
            if self.lower:
                x = x.lower()
            if len(self.split) > 0:
                return x.split(self.split)
            return list(x)
        except TypeError:
            return []

    def fit(self, X, show_progress=True):
        """Fit the tokenizer

        Parameters
        ----------
        X : list
            List of documents (strings).
        show_progress : bool
            Display progress bar.

        Returns
        -------
        self
        """
        cnt = Counter()
        for elem in tqdm(X, disable=not show_progress):
            for word in self._clean(elem):
                # skip the unknown label itself; the default '<UNK>' can never
                # appear here anyway, since '<' and '>' are filtered out
                if word != self.unk_label:
                    cnt[word] += 1
        self.num_unique_words = len(cnt)
        # token 0 is reserved for unknown words, so only the `num_tokens - 1`
        # most frequent words get tokens of their own (numbered from 1)
        for i, (w, c) in enumerate(cnt.most_common(self.num_tokens - 1)):
            self.map[w] = i + 1
        self.inv_map = {v: k for k, v in iteritems(self.map)}
        return self

    def transform(self, X, show_progress=True):
        """Transform words to tokens

        Parameters
        ----------
        X : list
            List of documents (strings).
        show_progress : bool
            Display progress bar.

        Returns
        -------
        list of lists
        """
        new_X = []
        for i, elem in tqdm(enumerate(X), disable=not show_progress,
                            total=len(X)):
            new_X.append([])
            for word in self._clean(elem):
                try:
                    new_X[i].append(self.map[word])
                except KeyError:
                    new_X[i].append(0)
        return new_X

    def fit_transform(self, X, show_progress=True):
        """Fit the tokenizer and transform words to tokens

        Parameters
        ----------
        X : list
            List of documents (strings).
        show_progress : bool
            Display progress bar.

        Returns
        -------
        list of lists
        """
        return self.fit(X, show_progress=show_progress) \
                   .transform(X, show_progress=show_progress)

    def back_transform(self, X, join=False):
        """Transform tokens to words

        Parameters
        ----------
        X : list
            List of tokens.
        join : bool, default : False
            Join the words into a string using the `split` text separator.

        Returns
        -------
        list of lists, or list of strings
        """
        new_X = []
        for i, elem in enumerate(X):
            new_X.append([])
            for token in elem:
                if token == 0:
                    new_X[i].append(self.unk_label)
                else:
                    new_X[i].append(self.inv_map[token])
            if join:
                new_X[i] = self.split.join(new_X[i])
        return new_X


if __name__ == '__main__':
    tkn = TopKTokenizer(12)
    X = [
        "loerm ipsum silvia dolor",
        "hello world",
        5,
        "[email protected]",
        "@realDonaldTrump #USA",
        "John Fitzgerald Kennedy",
    ]
    new_X = tkn.fit_transform(X, show_progress=False)
    assert len(tkn.map) + 1 == tkn.num_tokens
    [print(x) for x in X]
    print()
    [print(x) for x in new_X]
    print()
    [print(x) for x in tkn.back_transform(new_X)]
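
    # Extra demo (a small sketch using only the methods defined above): with
    # `join=True`, back_transform glues the recovered words back into strings
    # using the `split` separator, with unknown words shown as the unk_label.
    print()
    [print(x) for x in tkn.back_transform(new_X, join=True)]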