Skip to content

Instantly share code, notes, and snippets.

@pranjalAI
Created September 4, 2020 14:33
Show Gist options
  • Save pranjalAI/0c343e49b0fdfc3cf953d49a4ce446c3 to your computer and use it in GitHub Desktop.
def tokenized_data(questions, answers, VOCAB_SIZE, tokenizer):
    """Build seq2seq training arrays from question/answer texts.

    Tokenizes both text lists with the given (Keras-style) tokenizer,
    zero-pads each batch at the end ('post' padding) to its own max
    length, and one-hot encodes the decoder targets.

    Args:
        questions: list of question strings (encoder inputs).
        answers: list of answer strings; each is assumed to begin with a
            <start> token so it can be shifted for teacher forcing.
        VOCAB_SIZE: vocabulary size; token ids must be < VOCAB_SIZE.
        tokenizer: object exposing texts_to_sequences(list[str]) -> list[list[int]].

    Returns:
        [encoder_input_data, decoder_input_data, decoder_output_data, maxlen_answers]
        where encoder_input_data is (len(questions), maxlen_questions) int32,
        decoder_input_data is (len(answers), maxlen_answers) int32, and
        decoder_output_data is (len(answers), maxlen_answers, VOCAB_SIZE)
        one-hot float32 (answers shifted left by one token).
    """
    import numpy as np

    def _pad_post(seqs, maxlen):
        # Zero-pad each sequence at the end to `maxlen` (equivalent to
        # keras pad_sequences(..., padding='post') — no truncation needed
        # because maxlen is the batch maximum).
        out = np.zeros((len(seqs), maxlen), dtype=np.int32)
        for i, seq in enumerate(seqs):
            out[i, :len(seq)] = seq
        return out

    # encoder_input_data: padded question token ids
    tokenized_questions = tokenizer.texts_to_sequences(questions)
    # BUG FIX: original code padded with undefined name `maxlen`
    # (NameError); questions must be padded to maxlen_questions.
    maxlen_questions = max(len(x) for x in tokenized_questions)
    encoder_input_data = _pad_post(tokenized_questions, maxlen_questions)

    # decoder_input_data: padded answer token ids (still includes <start>)
    tokenized_answers = tokenizer.texts_to_sequences(answers)
    # BUG FIX: answers must be padded to maxlen_answers, not `maxlen`.
    maxlen_answers = max(len(x) for x in tokenized_answers)
    decoder_input_data = _pad_post(tokenized_answers, maxlen_answers)

    # decoder_output_data: answers shifted left by one (drop <start>),
    # padded to the same time dimension as decoder_input_data, then
    # one-hot encoded (equivalent to keras utils.to_categorical).
    shifted_answers = [seq[1:] for seq in tokenized_answers]
    padded_shifted = _pad_post(shifted_answers, maxlen_answers)
    decoder_output_data = np.eye(VOCAB_SIZE, dtype=np.float32)[padded_shifted]

    return [encoder_input_data, decoder_input_data, decoder_output_data, maxlen_answers]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment