Skip to content

Instantly share code, notes, and snippets.

@twolodzko
Last active July 16, 2018 09:03
Show Gist options
  • Select an option

  • Save twolodzko/69fca83aba9c342da3e0ae26a93be2ec to your computer and use it in GitHub Desktop.

Select an option

Save twolodzko/69fca83aba9c342da3e0ae26a93be2ec to your computer and use it in GitHub Desktop.
Encode words according to popularity in the reference vocabulary
from collections import Counter
class PopularityEncoder(object):
    """Encode words as integer codes ordered by their frequency.

    Codes 0 and 1 are reserved: 0 is never produced by ``transform`` and
    1 stands for any word outside the fitted vocabulary. The most common
    word gets code 2, the next most common code 3, and so on.
    """

    def __init__(self, vocab_size=50000):
        """Create an encoder with a bounded vocabulary.

        Parameters
        ----------
        vocab_size : int
            Total number of codes, including the two reserved ones.
            Only the ``vocab_size - 2`` most common words are encoded,
            using codes ``2 .. vocab_size - 1``. Every other word is
            encoded as 1, and 0 is a reserved index.
        """
        # Stored as the number of *encodable* words (reserved codes excluded).
        self.vocab_size = vocab_size - 2

    def fit(self, X):
        """Fit the transformer.

        Parameters
        ----------
        X : array
            Array of words used to fit the transformer.

        Returns
        -------
        PopularityEncoder
            The fitted encoder itself, so calls can be chained.
        """
        vocab = Counter(X)
        most_common = vocab.most_common(self.vocab_size)
        # Offset by 2 to skip the reserved codes 0 and 1.
        self.word2idx = {
            word: rank + 2 for rank, (word, _count) in enumerate(most_common)
        }
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        return self

    def transform(self, X):
        """Transform words to encodings.

        Parameters
        ----------
        X : array
            Array of words to encode.

        Returns
        -------
        list of int
            One code per input word; words not seen during ``fit`` (or
            outside the most common ones) are encoded as 1.
        """
        lookup = self.word2idx
        return [lookup.get(item, 1) for item in X]

    def fit_transform(self, X):
        """Fit and transform words to encodings.

        Parameters
        ----------
        X : array
            Array of words to fit the transformer and to transform.

        Returns
        -------
        list of int
            Encodings of ``X`` under the vocabulary fitted on ``X``.
        """
        return self.fit(X).transform(X)

    def invert(self, X):
        """Backtransform encodings to words.

        Parameters
        ----------
        X : array
            Array of encodings to be decoded to words.

        Returns
        -------
        list of str
            The decoded words. The reserved codes 0 and 1, and any code
            that is not part of the fitted vocabulary, decode to
            ``'<UNK>'``.
        """
        # idx2word only holds codes >= 2, so a plain lookup with a default
        # covers the reserved codes and out-of-range codes alike instead of
        # raising KeyError on the latter.
        lookup = self.idx2word
        return [lookup.get(item, '<UNK>') for item in X]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment