Keras Tokenizer Gist
# Keras Tokenizer lacks serialization. Therefore I created the below to address this without changing the API
# (since I don't know how long it'll take for keras to support it).
# The Tokenizer __init__ should be modified to take the word_stats dictionary as a kwarg,
# and a method added to the class to return the stats (a rough sketch of that idea follows below).
# Experimentally this works, but I am not sure of any nuances in the Tokenizer class.
import json

from keras.preprocessing.text import Tokenizer
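
# Rough sketch of the __init__ kwarg change proposed above; SerializableTokenizer,
# word_stats= and get_word_stats() are illustrative names, not part of the real keras API.
# The rebuild logic mirrors restore_tokenizer() further down.
class SerializableTokenizer(Tokenizer):
    def __init__(self, word_stats=None, **kwargs):
        super().__init__(**kwargs)
        if word_stats is not None:
            self.word_counts = dict(word_stats['word_counts'])
            self.word_docs = dict(word_stats['word_docs'])
            self.document_count = len(word_stats['word_docs'])
            # rebuild word_index the same way fit_on_texts() does (index 0 is reserved)
            wcounts = sorted(self.word_counts.items(), key=lambda x: (x[1], x[0]), reverse=True)
            sorted_voc = [wc[0] for wc in wcounts]
            self.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))
            if self.oov_token is not None and self.oov_token not in self.word_index:
                self.word_index[self.oov_token] = len(self.word_index) + 1
            self.index_docs = {self.word_index[w]: c for w, c in self.word_docs.items()}

    def get_word_stats(self):
        # same shape as export_tokenizer() returns below
        return {'word_counts': self.word_counts, 'word_docs': self.word_docs}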
def test_tokenizer():
    texts = ["It was the best of times, it was the worst of times, it was the age of wisdom",
             "it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, ",
             "it was the season of Light, it was the season of Darkness, it was the spring of hope, ",
             "it was the winter of despair, we had nothing before us, we were all going direct to Heaven, ",
             "we were all going direct the other way"]
    tokenizer = Tokenizer()
    # tokenizer.word_counts = dict()
    tokenizer.fit_on_texts(texts)
    print('tokenizer.word_index', tokenizer.word_index)
    print('tokenizer.word_counts', tokenizer.word_counts)
    hack_tokenizer(tokenizer)
    encoded_docs1 = tokenizer.texts_to_sequences(texts)
    word_stats1 = export_tokenizer(tokenizer)
    word_stats2 = json.loads(json.dumps(word_stats1, sort_keys=True))  # key order can be sorted or unsorted
    # print('word_stats1', word_stats1)
    # print('word_stats2', word_stats2)
    tokenizer2 = restore_tokenizer(word_stats2)
    encoded_docs2 = tokenizer2.texts_to_sequences(texts)
    print('encoded_docs1')
    for doc in encoded_docs1:
        print('\t', doc)
    print('encoded_docs2')
    for doc in encoded_docs2:
        print('\t', doc)
    print(encoded_docs1 == encoded_docs2)
def export_tokenizer(tokenizer):
    # export the raw fit statistics needed to rebuild the tokenizer
    return {'word_counts': tokenizer.word_counts, 'word_docs': tokenizer.word_docs}
def hack_tokenizer(tokenizer):  # show that OrderedDict is not needed, if x[0] is included in the lambda
    tokenizer.word_counts = dict(tokenizer.word_counts)  # bye OrderedDict!
    wcounts = list(tokenizer.word_counts.items())
    wcounts.sort(key=lambda x: (x[1], x[0]), reverse=True)
    sorted_voc = [wc[0] for wc in wcounts]
    # note that index 0 is reserved, never assigned to an existing word
    tokenizer.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))
    print('tokenizer.word_index', tokenizer.word_index)
    if tokenizer.oov_token is not None:
        i = tokenizer.word_index.get(tokenizer.oov_token)
        if i is None:
            tokenizer.word_index[tokenizer.oov_token] = len(tokenizer.word_index) + 1
    tokenizer.index_docs = {}
    for w, c in list(tokenizer.word_docs.items()):
        tokenizer.index_docs[tokenizer.word_index[w]] = c
def restore_tokenizer(word_stats):
    tokenizer = Tokenizer()
    tokenizer.word_counts = word_stats['word_counts']
    tokenizer.word_docs = word_stats['word_docs']
    tokenizer.document_count = len(word_stats['word_docs'])  # approximated from the word_docs size
    # from here on, this is taken from the original fit_on_texts(), except as noted
    wcounts = list(tokenizer.word_counts.items())
    wcounts.sort(key=lambda x: (x[1], x[0]), reverse=True)  # except for including x[0]
    sorted_voc = [wc[0] for wc in wcounts]
    # note that index 0 is reserved, never assigned to an existing word
    tokenizer.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))  # and except superfluous list()s removed
    if tokenizer.oov_token is not None:
        i = tokenizer.word_index.get(tokenizer.oov_token)
        if i is None:
            tokenizer.word_index[tokenizer.oov_token] = len(tokenizer.word_index) + 1
    tokenizer.index_docs = {}
    for w, c in list(tokenizer.word_docs.items()):
        tokenizer.index_docs[tokenizer.word_index[w]] = c
    return tokenizer
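
# Example of how the export/restore pair above might be used to persist a fitted
# tokenizer to disk; 'tokenizer.json' is just an illustrative filename.
if __name__ == '__main__':
    test_tokenizer()

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(["it was the best of times", "it was the worst of times"])

    with open('tokenizer.json', 'w') as f:
        json.dump(export_tokenizer(tokenizer), f)

    with open('tokenizer.json') as f:
        restored = restore_tokenizer(json.load(f))

    print(restored.texts_to_sequences(["the best of times"]))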