@pranjalAI
Created September 4, 2020 14:35
import pandas as pd
from tensorflow.keras import preprocessing

# getFeatureVector, tokenized_data and max_features are defined elsewhere in the project.

def prepare_data(questions, answers):
    # Wrap the raw lists in DataFrames and tokenize each question.
    answers = pd.DataFrame(answers, columns=["Ans"])
    questions = pd.DataFrame(questions, columns=["Question"])
    questions["TokQues"] = questions["Question"].apply(getFeatureVector)
    answers = list(answers["Ans"])
    questions = list(questions["TokQues"])

    # Keep only the pairs whose answer is a string; report and drop the rest
    # so questions and answers stay aligned.
    answers_with_tags = []
    clean_questions = []
    for question, answer in zip(questions, answers):
        if isinstance(answer, str):
            answers_with_tags.append(answer)
            clean_questions.append(question)
        else:
            print(question, answer, type(answer))
    questions = clean_questions

    # Mark the start and end of every answer for the decoder.
    answers = ['<START> ' + answer + ' <END>' for answer in answers_with_tags]

    # Fit a single tokenizer on the questions and answers together.
    tokenizer = preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(questions + answers)
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    # embedding_matrix = emb_mat(nb_words)[0]
    # emb_vec = emb_mat(nb_words)[1]
    VOCAB_SIZE = len(tokenizer.word_index) + 1

    # Turn the text pairs into padded integer sequences for the seq2seq model.
    encoder_input_data, decoder_input_data, decoder_output_data, maxlen_answers = \
        tokenized_data(questions, answers, VOCAB_SIZE, tokenizer)
    return [encoder_input_data, decoder_input_data, decoder_output_data,
            maxlen_answers, nb_words, word_index, tokenizer]
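
For reference, a minimal sketch of how prepare_data might be called, using hypothetical toy question/answer lists and assuming getFeatureVector, tokenized_data and max_features are already defined as in the rest of the project:

# Hypothetical toy data; in the real project these come from the chatbot corpus.
questions = ["hi there", "how are you"]
answers = ["hello", "i am fine"]

(encoder_input_data, decoder_input_data, decoder_output_data,
 maxlen_answers, nb_words, word_index, tokenizer) = prepare_data(questions, answers)

print(nb_words, maxlen_answers)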