Tokenize Top K Words
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re
from collections import Counter

from six import iteritems
from tqdm import tqdm

class TopKTokenizer(object):
    """Tokenize Top K Words"""

    def __init__(self, num_tokens, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                 lower=True, split=' ', unk_label=None):
        """Tokenize Top K Words

        Parameters
        ----------
        num_tokens : int
            Tokenize the `num_tokens - 1` most frequent words; all other words
            are treated as unknown and labeled with `unk_label`.
        filters : list or string, default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
            Characters to filter out, such as punctuation.
        lower : bool, default : True
            Whether to convert the text to lowercase.
        split : string, default : ' '
            Separator for word splitting.
        unk_label : string
            Label used for the "other" (out-of-vocabulary) words. If provided,
            occurrences of the label are skipped by the tokenizer during fitting.
            Notice that the text is cleaned before it is tokenized, so `unk_label`
            must not contain any of the `filters` characters if it is to be
            recognized by the tokenizer. The `unk_label` is used when
            back-transforming from tokens to words. By default, `<UNK>` is used
            as the label.

        Attributes
        ----------
        map : dict
            Mapping from words to tokens.
        inv_map : dict
            Mapping from tokens to words.
        num_unique_words : int
            Number of unique words found in the training set.
        """
        self.num_tokens = num_tokens
        if isinstance(filters, list):
            filters = "".join(filters)
        self.filters = '[' + filters + ']'
        self.lower = lower
        self.split = split
        # keep the label consistent with how the text is cleaned
        self.unk_label = unk_label.lower() if unk_label and lower else (unk_label or '<UNK>')
        self.map = {}
        self.inv_map = {}
        self.num_unique_words = 0

    def _clean(self, x):
        try:
            if isinstance(x, int) or isinstance(x, float):
                x = str(x)
            x = re.sub(self.filters, ' ', x)
            x = re.sub(' +', ' ', x).strip()
            if self.lower:
                x = x.lower()
            if len(self.split) > 0:
                return x.split(self.split)
            return list(x)
        except TypeError:
            return []

    def fit(self, X, show_progress=True):
        """Fit the tokenizer

        Parameters
        ----------
        X : list
            List of documents (strings).
        show_progress : bool
            Display progress bar.

        Returns
        -------
        self
        """
        cnt = Counter()
        for elem in tqdm(X, disable=not show_progress):
            for word in self._clean(elem):
                # skip the unknown label itself; the default '<UNK>' can never
                # appear here anyway, since '<' and '>' are filtered out
                if word != self.unk_label:
                    cnt[word] += 1
        self.num_unique_words = len(cnt)
        # token 0 is reserved for unknown words, so only the `num_tokens - 1`
        # most frequent words get tokens of their own (numbered from 1)
        for i, (w, c) in enumerate(cnt.most_common(self.num_tokens - 1)):
            self.map[w] = i + 1
        self.inv_map = {v: k for k, v in iteritems(self.map)}
        return self

    def transform(self, X, show_progress=True):
        """Transform words to tokens

        Parameters
        ----------
        X : list
            List of documents (strings).
        show_progress : bool
            Display progress bar.

        Returns
        -------
        list of lists
        """
        new_X = []
        for i, elem in tqdm(enumerate(X), disable=not show_progress,
                            total=len(X)):
            new_X.append([])
            for word in self._clean(elem):
                try:
                    new_X[i].append(self.map[word])
                except KeyError:
                    new_X[i].append(0)
        return new_X

    def fit_transform(self, X, show_progress=True):
        """Fit the tokenizer and transform words to tokens

        Parameters
        ----------
        X : list
            List of documents (strings).
        show_progress : bool
            Display progress bar.

        Returns
        -------
        list of lists
        """
        return self.fit(X, show_progress=show_progress) \
                   .transform(X, show_progress=show_progress)

    def back_transform(self, X, join=False):
        """Transform tokens to words

        Parameters
        ----------
        X : list
            List of tokens.
        join : bool, default : False
            Join the words into a string using the `split` text separator.

        Returns
        -------
        list of lists, or list of strings
        """
        new_X = []
        for i, elem in enumerate(X):
            new_X.append([])
            for token in elem:
                if token == 0:
                    new_X[i].append(self.unk_label)
                else:
                    new_X[i].append(self.inv_map[token])
            if join:
                new_X[i] = self.split.join(new_X[i])
        return new_X


if __name__ == '__main__':
    tkn = TopKTokenizer(12)
    X = [
        "loerm ipsum silvia dolor",
        "hello world",
        5,
        "[email protected]",
        "@realDonaldTrump #USA",
        "John Fitzgerald Kennedy",
    ]
    new_X = tkn.fit_transform(X, show_progress=False)
    assert len(tkn.map) + 1 == tkn.num_tokens
    [print(x) for x in X]
    print()
    [print(x) for x in new_X]
    print()
    [print(x) for x in tkn.back_transform(new_X)]
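
    # Extra demo (a small sketch using only the methods defined above): with
    # `join=True`, back_transform glues the recovered words back into strings
    # using the `split` separator, with unknown words shown as the unk_label.
    print()
    [print(x) for x in tkn.back_transform(new_X, join=True)]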