Tokenize Top K Words
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from six import iteritems
from tqdm import tqdm
from collections import Counter
import re

class TopKTokenizer(object):
    """Tokenize Top K Words"""

    def __init__(self, num_tokens, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                 lower=True, split=' ', unk_label=None):
        """Tokenize Top K Words

        Parameters
        ----------
        num_tokens : int
            Tokenize the `num_tokens - 1` most frequent words; all other words
            are treated as unknown and labeled as `unk_label`.
        filters : list or string, default : '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
            Characters to filter out, such as punctuation.
        lower : bool, default : True
            Whether to lowercase the text.
        split : string, default : ' '
            Separator for word splitting.
        unk_label : string
            Label used for out-of-vocabulary words. Words equal to `unk_label`
            are ignored when fitting the tokenizer. Notice that the text is
            first cleaned and then tokenized, so `unk_label` must not contain
            any of the `filters` characters if it is to be recognized by the
            tokenizer. `unk_label` is also used when back-transforming tokens
            to words. By default, `<UNK>` is used as the label.

        Attributes
        ----------
        map : dict
            Mapping from words to tokens.
        inv_map : dict
            Mapping from tokens to words.
        num_unique_words : int
            Number of unique words found in the training set.
        """
        self.num_tokens = num_tokens
        if isinstance(filters, list):
            filters = "".join(filters)
        # escape the filter characters so each one is matched literally
        # inside the regex character class (e.g. the backslash and ']')
        self.filters = '[' + re.escape(filters) + ']'
        self.lower = lower
        self.split = split
        self.unk_label = unk_label.lower() if unk_label else '<UNK>'
        self.map = {}
        self.inv_map = {}
        self.num_unique_words = 0

    def _clean(self, x):
        try:
            if isinstance(x, (int, float)):
                x = str(x)
            x = re.sub(self.filters, ' ', x)
            x = re.sub(' +', ' ', x).strip()
            if self.lower:
                x = x.lower()
            if len(self.split) > 0:
                return x.split(self.split)
            return list(x)
        except TypeError:
            return []

    def fit(self, X, show_progress=True):
        """Fit the tokenizer

        Parameters
        ----------
        X : list
            List of documents (strings).
        show_progress : bool
            Display progress bar.

        Returns
        -------
        self
        """
        cnt = Counter()
        for elem in tqdm(X, disable=not show_progress):
            for word in self._clean(elem):
                # the default '<UNK>' label will always differ due to cleaning
                if word != self.unk_label:
                    cnt[word] += 1
        self.num_unique_words = len(cnt)
        # token 0 is reserved for unknown words, so only the
        # `num_tokens - 1` most frequent words get their own token
        for i, (w, c) in enumerate(cnt.most_common(self.num_tokens - 1)):
            self.map[w] = i + 1
        self.inv_map = {v: k for k, v in iteritems(self.map)}
        return self

    def transform(self, X, show_progress=True):
        """Transform words to tokens

        Parameters
        ----------
        X : list
            List of documents (strings).
        show_progress : bool
            Display progress bar.

        Returns
        -------
        list of lists
        """
        new_X = []
        for i, elem in tqdm(enumerate(X), disable=not show_progress,
                            total=len(X)):
            new_X.append([])
            for word in self._clean(elem):
                try:
                    new_X[i].append(self.map[word])
                except KeyError:
                    # out-of-vocabulary words get the reserved token 0
                    new_X[i].append(0)
        return new_X

    def fit_transform(self, X, show_progress=True):
        """Fit the tokenizer and transform words to tokens

        Parameters
        ----------
        X : list
            List of documents (strings).
        show_progress : bool
            Display progress bar.

        Returns
        -------
        list of lists
        """
        return self.fit(X, show_progress=show_progress)\
                   .transform(X, show_progress=show_progress)

    def back_transform(self, X, join=False):
        """Transform tokens to words

        Parameters
        ----------
        X : list
            List of lists of tokens.
        join : bool, default : False
            Join the words into a string using the `split` separator.

        Returns
        -------
        list of lists, or list of strings
        """
        new_X = []
        for i, elem in enumerate(X):
            new_X.append([])
            for token in elem:
                if token == 0:
                    new_X[i].append(self.unk_label)
                else:
                    new_X[i].append(self.inv_map[token])
            if join:
                new_X[i] = self.split.join(new_X[i])
        return new_X

if __name__ == '__main__':
    tkn = TopKTokenizer(12)
    X = [
        "loerm ipsum silvia dolor",
        "hello world",
        5,
        "[email protected]",
        "@realDonaldTrump #USA",
        "John Fitzgerald Kennedy",
    ]
    new_X = tkn.fit_transform(X, show_progress=False)
    assert len(tkn.map) + 1 == tkn.num_tokens
    for x in X:
        print(x)
    print()
    for x in new_X:
        print(x)
    print()
    for x in tkn.back_transform(new_X):
        print(x)
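
A minimal usage sketch (not part of the gist itself): it assumes the class as defined above and shows out-of-vocabulary words mapping to the reserved token 0, and `back_transform(..., join=True)` rebuilding strings with the default `<UNK>` label. The `corpus`, `tokenizer`, `tokens`, and `restored` names are made up for illustration.

# usage sketch with an illustrative corpus (not from the gist):
# keep the 4 most frequent words, everything else becomes 0 / '<UNK>'
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
]
tokenizer = TopKTokenizer(5)
tokens = tokenizer.fit_transform(corpus, show_progress=False)
restored = tokenizer.back_transform(tokens, join=True)
for line in restored:
    # e.g. "the <UNK> sat on the <UNK>"; which of the count-1 words keeps
    # its own token depends on how Counter.most_common breaks ties
    print(line)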