Skip to content

Instantly share code, notes, and snippets.

@a-agmon
Last active January 15, 2020 19:34
Show Gist options
  • Save a-agmon/59c9fe02c5d196f01d54da307398899e to your computer and use it in GitHub Desktop.
Save a-agmon/59c9fe02c5d196f01d54da307398899e to your computer and use it in GitHub Desktop.
# Character vocabulary used to turn character sequences into integer sequences.
# (this char index was written by Jason Brownlee from Machine Learning Mastery)
# A character's code is its position in char_index, so '0' is code 0,
# 'a'..'z' are 1..26, 'A'..'Z' are 27..52 and '1'..'9' are 53..61.
char_index = ''.join((
    '0abcdefghijklmnopqrstuvwxyz',
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    '123456789',
    '().,-/+=&$?@#!*:;_[]|%⸏{}\"\'',
    ' ',
    '\\',
))
# Forward (character -> code) and reverse (code -> character) lookup tables.
char_to_int = {c: i for i, c in enumerate(char_index)}
int_to_char = dict(enumerate(char_index))
from keras.preprocessing.sequence import pad_sequences
def encode_sequence_list(seqs, feat_n=0):
    """Convert character sequences into a post-padded array of integer codes.

    Each character is looked up in ``char_to_int``; a character that is not
    in that table raises ``KeyError``.

    Args:
        seqs: iterable of character sequences (e.g. strings).
        feat_n: when greater than 0, one extra row of ``feat_n`` zeros is
            appended after the encoded sequences before padding, so the
            result has ``len(seqs) + 1`` rows and is at least ``feat_n``
            columns wide.

    Returns:
        The result of ``pad_sequences(..., padding='post')`` over the encoded
        rows (shorter rows are padded at the end).
    """
    # NOTE(review): indentation was lost in this copy of the file; the feat_n
    # check is read as running once after all sequences are encoded — confirm.
    rows = [[char_to_int[ch] for ch in seq] for seq in seqs]
    if feat_n > 0:
        rows.append(np.zeros(feat_n))
    return pad_sequences(rows, padding='post')
def decode_sequence_list(seqs):
    """Map integer-coded sequences back to characters via ``int_to_char``.

    Args:
        seqs: iterable of sequences of integer codes.

    Returns:
        A list with one entry per input sequence; each entry is a list of
        single characters (the characters are not joined into a string).
        A code that is missing from ``int_to_char`` raises ``KeyError``.
    """
    return [[int_to_char[code] for code in row] for row in seqs]
# Using char_index, encode_sequence_list turns each string into a row of
# integer codes. For example, the string EBCA0OXO
# becomes [31 28 29 27 0 41 50 41] ('E'->31, 'B'->28, 'C'->29, 'A'->27, '0'->0, 'O'->41, 'X'->50).
# Encode every string in random_sequences (presumably built earlier; not shown here) into one integer array, one row per string
encoded_seqs = encode_sequence_list(random_sequences)
# mix everything up: np.random.shuffle reorders the rows in place
np.random.shuffle(encoded_seqs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment