Last active
July 16, 2018 09:03
-
-
Save twolodzko/69fca83aba9c342da3e0ae26a93be2ec to your computer and use it in GitHub Desktop.
Encode words according to popularity in the reference vocabulary
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import Counter | |
class PopularityEncoder(object):
    """Encode words as integers ranked by popularity in a reference corpus.

    The most frequent word seen by ``fit`` gets index 2, the next one
    index 3, and so on. Index 1 stands for any word outside the fitted
    vocabulary, and index 0 is reserved: this encoder never assigns it.
    """

    def __init__(self, vocab_size=50000):
        """Create an encoder with a bounded index space.

        Parameters
        ----------
        vocab_size : int
            Total size of the index space, including the two special
            indices. Only the ``vocab_size - 2`` most common words get
            their own index (2, 3, ...). Every other word is encoded
            as 1, and 0 is a reserved index.
        """
        # NOTE: the attribute stores the number of *encodable words*,
        # i.e. the requested size minus the two special indices (0 and 1).
        self.vocab_size = vocab_size - 2

    def fit(self, X):
        """Fit the transformer.

        Parameters
        ----------
        X : array
            Array of words used to fit the transformer.

        Returns
        -------
        PopularityEncoder
            The fitted encoder itself, so calls can be chained.
        """
        ranked = Counter(X).most_common(self.vocab_size)
        # Indices start at 2 because 0 and 1 are the special indices.
        self.word2idx = {word: rank + 2 for rank, (word, _) in enumerate(ranked)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        return self

    def transform(self, X):
        """Transform words to encodings.

        Parameters
        ----------
        X : array
            Array of words to encode.

        Returns
        -------
        list of int
            One index per input word; words missing from the fitted
            vocabulary are encoded as 1.
        """
        lookup = self.word2idx.get
        return [lookup(item, 1) for item in X]

    def fit_transform(self, X):
        """Fit and transform words to encodings.

        Parameters
        ----------
        X : array
            Array of words to fit the transformer and to transform.

        Returns
        -------
        list of int
            Encodings of ``X`` under the vocabulary fitted on ``X``.
        """
        return self.fit(X).transform(X)

    def invert(self, X):
        """Backtransform encodings to words.

        Parameters
        ----------
        X : array
            Array of encodings to be decoded to words.

        Returns
        -------
        list
            One entry per input index. Any index without a fitted word
            (0, 1, or a value beyond the fitted vocabulary) decodes to
            ``'<UNK>'`` instead of raising ``KeyError``.
        """
        # Fitted indices are always >= 2, so 0 and 1 are never keys here
        # and fall through to the default just like out-of-range values.
        lookup = self.idx2word.get
        return [lookup(item, '<UNK>') for item in X]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment