Last active
January 15, 2020 19:34
-
-
Save a-agmon/59c9fe02c5d196f01d54da307398899e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Character vocabulary used to encode sequences as integers.
# (This char index was written by Jason Brownlee from Machine Learning Mastery.)
# A character's position in char_index is its integer code, so '0' -> 0,
# 'a' -> 1, ..., 'A' -> 27, and so on.
# NOTE(review): code 0 is also the default fill value of pad_sequences, so a
# literal '0' and padding look the same after encoding -- confirm intended.
char_index = ''.join([
    '0abcdefghijklmnopqrstuvwxyz',
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    '123456789',
    '().,-/+=&$?@#!*:;_[]|%⸏{}\"\'',
    ' ',
    '\\',
])
# Forward (char -> code) and reverse (code -> char) lookup tables.
char_to_int = {ch: pos for pos, ch in enumerate(char_index)}
int_to_char = dict(enumerate(char_index))
from keras.preprocessing.sequence import pad_sequences | |
# Function that converts char sequences to number sequences
# (it does a little more, but let's leave that for now)
def encode_sequence_list(seqs, feat_n=0):
    """Encode character sequences as integers and post-pad them.

    Each character of each sequence in ``seqs`` is looked up in the
    module-level ``char_to_int`` table; a character that is not in the
    table raises ``KeyError``.

    When ``feat_n`` is greater than zero, one extra row of ``feat_n`` zeros
    is appended after the encoded sequences before padding.
    NOTE(review): this looks intended to force a minimum padded width; it
    also adds one extra row to the result -- confirm callers account for it.

    Returns the result of ``pad_sequences(rows, padding='post')``.
    """
    rows = [[char_to_int[ch] for ch in seq] for seq in seqs]
    if feat_n > 0:
        rows.append(np.zeros(feat_n))
    return pad_sequences(rows, padding='post')
def decode_sequence_list(seqs):
    """Translate integer-code sequences back into characters.

    Each code in each sequence of ``seqs`` is looked up in the module-level
    ``int_to_char`` table; an unknown code raises ``KeyError``.

    Returns a list holding one list of single characters per input
    sequence (the characters are not joined into strings).
    """
    return [[int_to_char[code] for code in seq] for seq in seqs]
# Using char_index, the encode_sequence_list function
# will turn a string like this:  EBCA0OXO
# into an array like this:       [31 28 29 27  0 41 50 41]
# Encode every string in random_sequences (defined outside this excerpt)
# into one padded integer array, e.g. [[31, 28, 29], [27, 0, 41], ...]
encoded_seqs = encode_sequence_list(random_sequences)
# mix everything up (np.random.shuffle reorders the rows in place)
np.random.shuffle(encoded_seqs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment