This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import string | |
| from numpy import array, argmax, random, take | |
| import pandas as pd | |
| from keras.models import Sequential | |
| from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed | |
| from keras.preprocessing.text import Tokenizer | |
| from keras.callbacks import ModelCheckpoint | |
| from keras.preprocessing.sequence import pad_sequences | |
| from keras.models import load_model |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import string | |
| import re | |
| from numpy import array, argmax, random, take | |
| import pandas as pd | |
| from keras.models import Sequential | |
| from keras.layers import Dense, LSTM, Embedding, RepeatVector | |
| from keras.preprocessing.text import Tokenizer | |
| from keras.callbacks import ModelCheckpoint | |
| from keras.preprocessing.sequence import pad_sequences | |
| from keras.models import load_model |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function to read raw text file
def read_text(filename):
    """Return the full contents of a UTF-8 text file as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the whole file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pairing did not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# split text into sentences
def to_lines(text):
    """Split raw text into rows of tab-separated fields.

    Leading and trailing whitespace is stripped from the whole text first;
    the remainder is cut at every newline and each line at every tab.

    Args:
        text: The raw text to split.

    Returns:
        A list with one entry per line, each entry being the list of
        tab-separated fields found on that line.
    """
    return [row.split('\t') for row in text.strip().split('\n')]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove punctuation and convert text to lowercase.
# The translation table does not depend on the sentence, so build it once
# instead of once per sentence.
_punct_table = str.maketrans('', '', string.punctuation)
# Columns 0 and 1 of deu_eng hold the two sides of each sentence pair;
# both get the same cleanup, written back in place.
for _col in (0, 1):
    deu_eng[:, _col] = [
        s.translate(_punct_table).lower() for s in deu_eng[:, _col]
    ]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Number of whitespace-separated words in every sentence,
# one list per column of deu_eng (column 0 -> eng_l, column 1 -> deu_l).
eng_l = [len(sentence.split()) for sentence in deu_eng[:, 0]]
deu_l = [len(sentence.split()) for sentence in deu_eng[:, 1]]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function to build a tokenizer
def tokenization(lines):
    """Create a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
# prepare english tokenizer
# Length that encoded English sequences are padded/truncated to.
eng_length = 8
# Column 0 of deu_eng is the English side of the corpus.
eng_tokenizer = tokenization(deu_eng[:, 0])
# One more than the number of distinct words in the fitted word index.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Turn texts into integer sequences of a fixed length.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: The texts to encode.

    Returns:
        The result of ``pad_sequences`` applied to the integer-encoded
        texts, padded with 0 values at the end ('post').
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sklearn.model_selection import train_test_split | |
# Hold out 20% of the sentence pairs as the test set; the fixed
# random_state makes the split reproducible.
train, test = train_test_split(deu_eng, test_size=0.2, random_state=12)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# X arrays: column 1 encoded with deu_tokenizer, padded to deu_length
# (training split first, then the held-out split).
trainX = encode_sequences(deu_tokenizer, deu_length, train[:, 1])
testX = encode_sequences(deu_tokenizer, deu_length, test[:, 1])
# Y arrays: column 0 (English) encoded with eng_tokenizer, padded to
# eng_length.
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
OlderNewer