Keras Tokenizer Gist
# Keras Tokenizer lacks serialization. Therefore I created the below to address this without changing the API
# (since I don't know how long it'll take for keras to support it).
# The Tokenizer __init__ should be modified to take the word_stats dictionary as a kwarg,
# and a method added to the class to return the stats (a rough sketch of that idea follows below).
# Experimentally this works, but I am not sure of any nuances in the Tokenizer class.
import json

from keras.preprocessing.text import Tokenizer
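
# Rough sketch of the __init__ kwarg change proposed above; SerializableTokenizer,
# word_stats= and get_word_stats() are illustrative names, not part of the real keras API.
# The rebuild logic mirrors restore_tokenizer() further down.
class SerializableTokenizer(Tokenizer):
    def __init__(self, word_stats=None, **kwargs):
        super().__init__(**kwargs)
        if word_stats is not None:
            self.word_counts = dict(word_stats['word_counts'])
            self.word_docs = dict(word_stats['word_docs'])
            self.document_count = len(word_stats['word_docs'])
            # rebuild word_index the same way fit_on_texts() does (index 0 is reserved)
            wcounts = sorted(self.word_counts.items(), key=lambda x: (x[1], x[0]), reverse=True)
            sorted_voc = [wc[0] for wc in wcounts]
            self.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))
            if self.oov_token is not None and self.oov_token not in self.word_index:
                self.word_index[self.oov_token] = len(self.word_index) + 1
            self.index_docs = {self.word_index[w]: c for w, c in self.word_docs.items()}

    def get_word_stats(self):
        # same shape as export_tokenizer() returns below
        return {'word_counts': self.word_counts, 'word_docs': self.word_docs}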
def test_tokenizer():
    texts = ["It was the best of times, it was the worst of times, it was the age of wisdom",
             "it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, ",
             "it was the season of Light, it was the season of Darkness, it was the spring of hope, ",
             "it was the winter of despair, we had nothing before us, we were all going direct to Heaven, ",
             "we were all going direct the other way"]
    tokenizer = Tokenizer()
    # tokenizer.word_counts = dict()
    tokenizer.fit_on_texts(texts)
    print('tokenizer.word_index', tokenizer.word_index)
    print('tokenizer.word_counts', tokenizer.word_counts)
    hack_tokenizer(tokenizer)
    encoded_docs1 = tokenizer.texts_to_sequences(texts)
    word_stats1 = export_tokenizer(tokenizer)
    word_stats2 = json.loads(json.dumps(word_stats1, sort_keys=True))  # key order can be sorted or unsorted
    # print('word_stats1', word_stats1)
    # print('word_stats2', word_stats2)
    tokenizer2 = restore_tokenizer(word_stats2)
    encoded_docs2 = tokenizer2.texts_to_sequences(texts)
    print('encoded_docs1')
    for doc in encoded_docs1:
        print('\t', doc)
    print('encoded_docs2')
    for doc in encoded_docs2:
        print('\t', doc)
    print(encoded_docs1 == encoded_docs2)
def export_tokenizer(tokenizer):
    # export the raw fit statistics needed to rebuild the tokenizer
    return {'word_counts': tokenizer.word_counts, 'word_docs': tokenizer.word_docs}
def hack_tokenizer(tokenizer):  # show that OrderedDict is not needed, if x[0] is included in the lambda
    tokenizer.word_counts = dict(tokenizer.word_counts)  # bye OrderedDict!
    wcounts = list(tokenizer.word_counts.items())
    wcounts.sort(key=lambda x: (x[1], x[0]), reverse=True)
    sorted_voc = [wc[0] for wc in wcounts]
    # note that index 0 is reserved, never assigned to an existing word
    tokenizer.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))
    print('tokenizer.word_index', tokenizer.word_index)
    if tokenizer.oov_token is not None:
        i = tokenizer.word_index.get(tokenizer.oov_token)
        if i is None:
            tokenizer.word_index[tokenizer.oov_token] = len(tokenizer.word_index) + 1
    tokenizer.index_docs = {}
    for w, c in list(tokenizer.word_docs.items()):
        tokenizer.index_docs[tokenizer.word_index[w]] = c
def restore_tokenizer(word_stats):
    tokenizer = Tokenizer()
    tokenizer.word_counts = word_stats['word_counts']
    tokenizer.word_docs = word_stats['word_docs']
    tokenizer.document_count = len(word_stats['word_docs'])  # approximated from the word_docs size
    # from here on, this is taken from the original fit_on_texts(), except as noted
    wcounts = list(tokenizer.word_counts.items())
    wcounts.sort(key=lambda x: (x[1], x[0]), reverse=True)  # except for including x[0]
    sorted_voc = [wc[0] for wc in wcounts]
    # note that index 0 is reserved, never assigned to an existing word
    tokenizer.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))  # and except superfluous list()s removed
    if tokenizer.oov_token is not None:
        i = tokenizer.word_index.get(tokenizer.oov_token)
        if i is None:
            tokenizer.word_index[tokenizer.oov_token] = len(tokenizer.word_index) + 1
    tokenizer.index_docs = {}
    for w, c in list(tokenizer.word_docs.items()):
        tokenizer.index_docs[tokenizer.word_index[w]] = c
    return tokenizer
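
# Example of how the export/restore pair above might be used to persist a fitted
# tokenizer to disk; 'tokenizer.json' is just an illustrative filename.
if __name__ == '__main__':
    test_tokenizer()

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(["it was the best of times", "it was the worst of times"])

    with open('tokenizer.json', 'w') as f:
        json.dump(export_tokenizer(tokenizer), f)

    with open('tokenizer.json') as f:
        restored = restore_tokenizer(json.load(f))

    print(restored.texts_to_sequences(["the best of times"]))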