Skip to content

Instantly share code, notes, and snippets.

@choonkiattay
Created June 23, 2021 10:17
Show Gist options
  • Save choonkiattay/dc8def4ca94770cd2481b1b864448a3d to your computer and use it in GitHub Desktop.
Save choonkiattay/dc8def4ca94770cd2481b1b864448a3d to your computer and use it in GitHub Desktop.
Creates a sequence dataset for an RNN.
def dataset_gen(number_trans=2000, train_fraction=0.8):
    """Generate a toy next-letter dataset of six-letter alphabet windows.

    Each sample is a run of six consecutive uppercase letters (for example
    ``B C D E F G``), drawn at random from the windows that start at ``B``
    through ``J``. Samples are tokenized; the first five tokens of a sample
    are the model input and the sixth token is the one-hot encoded target.

    Sampling uses the module-level ``random`` generator, so seed it with
    ``random.seed(...)`` beforehand for reproducible output.

    Args:
        number_trans: Number of sequences to generate. Must be at least 1.
        train_fraction: Fraction of the sequences, taken from the front in
            generation order, that goes to the training split; the remainder
            forms the test split. Must be strictly between 0 and 1.

    Returns:
        A tuple ``(X, y, X_test, y_test, vocab_size, seq_length, output_dim,
        lines)`` where:

        * ``X`` / ``y`` are the training inputs (token ids, five per row)
          and their one-hot targets.
        * ``X_test`` / ``y_test`` are the same for the held-out rows.
        * ``vocab_size`` is the number of distinct tokens plus one.
        * ``seq_length`` is the number of input tokens per row.
        * ``output_dim`` is ``seq_length + 1``.
        * ``lines`` holds every generated sequence (train and test) as a
          space-separated string.

    Raises:
        ValueError: If ``number_trans`` or ``train_fraction`` is out of range.
    """
    if number_trans < 1:
        raise ValueError("number_trans must be a positive integer")
    if not 0.0 < train_fraction < 1.0:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    window_size = 6
    alphabet = [chr(code) for code in range(65, 91)]  # 'A'..'Z'
    windows = {
        start: alphabet[start:start + window_size]
        for start in range(len(alphabet) - window_size + 1)
    }

    # Only the windows keyed 1..9 are sampled (randint is inclusive on both
    # ends). Index directly so a missing key fails loudly instead of silently
    # producing a None row.
    samples = [windows[random.randint(1, 9)] for _ in range(number_trans)]

    # The tokenizer consumes one space-separated string per sequence.
    lines = [' '.join(sample) for sample in samples]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    sequences = np.array(tokenizer.texts_to_sequences(lines))

    # Rows before split_index train the model and rows from it onward are
    # held out, so the two splits never share a row.
    split_index = int(train_fraction * number_trans)
    X, y = sequences[:split_index, :-1], sequences[:split_index, -1]
    X_test, y_test = sequences[split_index:, :-1], sequences[split_index:, -1]

    # +1 because tokenizer word indices start at 1; index 0 is never assigned.
    vocab_size = len(tokenizer.word_index) + 1
    y = to_categorical(y, num_classes=vocab_size)
    y_test = to_categorical(y_test, num_classes=vocab_size)

    seq_length = X.shape[1]
    # NOTE(review): presumably consumed as an embedding width by the caller;
    # confirm against the model definition.
    output_dim = seq_length + 1
    return X, y, X_test, y_test, vocab_size, seq_length, output_dim, lines
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment