Created May 20, 2017 15:00
Mathematical expression learning experiment using a Seq2Seq approach
Number of samples = 100000
Symbols = signed integers in [-100, 100), arithmetic operations, brackets, empty space (for padding); see the encoding sketch below
Keras backend = Theano 0.9.0
Training hardware = Core i7, GeForce 960, 32 GB RAM
Training time = 5.4 hours / 5 epochs
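For context, a minimal, self-contained sketch of how the symbol table is one-hot encoded (scikit-learn only; the expression string is an arbitrary example). Each character becomes one 17-dimensional row, and shorter expressions are padded with the space symbol up to 20 time steps:

from sklearn.preprocessing import LabelBinarizer

symbols = [' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
           '+', '-', '/', '*', '(', ')']
lb = LabelBinarizer().fit(symbols)

# One row per character of the expression
encoded = lb.transform(list('(66+-19)'))
print(encoded.shape)                  # (8, 17)
print(lb.inverse_transform(encoded))  # ['(' '6' '6' '+' '-' '1' '9' ')']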
Test results (as expected, there are many errors due to the limited size of the training dataset):
-50/-68 = 0
(-96*85) = -7820
-(-17--82) = -63
-16*5 = -74
48*-60 = -2840
(66+-19) = 43
69+41 = 116
(-16-26) = -44
17/-11 = -2
-20-11 = -33
5+60 = 63
-(-81+62) = 29
(-60/-89) = 0
(45+21) = 62
-(91-39) = -44
-68*-12 = 778
(-92+-7) = -97
-(35*-91) = 3175
-(13+-89) = 70
-(-5/-38) = 0
-(83+54) = -145
-61-44 = -107
65*-82 = -5470
-(-99/-64) = -2
(-88--78) = -1
-12*-94 = 904
-22*5 = -120
-91*-69 = 6227
-40/90 = -1
68/-83 = -1
-40+-89 = -137
-62--14 = -44
-87--72 = -17
(82*-35) = -3870
(-71*65) = -4085
(-51-66) = -117
-(18/-79) = 1
(-23*46) = -1162
-6*98 = -578
-(-32/-5) = -7
-18*-4 = 72
98/19 = 5
-5-68 = -61
-(99--13) = 110
(99--6) = 117
-(65/-91) = 1
-29/99 = -1
-13/-64 = 0
39/-1 = 47
-(-11*-13) = -17
(-51-26) = -77
-89-15 = -104
-(-81+9) = 62
-44/-46 = 1
(-51+96) = 43
88--42 = 138
-82*-43 = 3774
-85+56 = -23
(2/-16) = -1
-88+28 = -64
-(42+72) = -118
(7+-48) = -43
(78--41) = 127
24--35 = 53
(-4--99) = 97
(-32/-89) = 0
-62-84 = -148
78+90 = 174
(-76-83) = -157
(-80--45) = -27
-95/-21 = 4
9*56 = 484
(-25*70) = -1450
(-36-78) = -116
-84+-32 = -114
(-69-70) = -139
-(58*-53) = 3474
(4/97) = 0
-27*-70 = 1870
-(-8--12) = -12
(-34--20) = -18
-48+-75 = -123
-(-40*42) = 2840
-81/56 = -2
-97/24 = -4
(64/-90) = -1
-7*-99 = 627
-46*47 = -2208
-22*80 = -1440
26+46 = 68
(8+50) = 53
(12+-85) = -77
(78+-47) = 29
-(-80*-75) = -5600
-(-38+-29) = 63
(-25*-45) = 1155
(88+60) = 144
37+-71 = -34
83+-36 = 43
(-14*-84) = 114
'''
Mathematical expression learning experiment
Giuseppe Bonaccorso (https://www.bonaccorso.eu)
Based on: http://machinelearningmastery.com/learn-add-numbers-seq2seq-recurrent-neural-networks/
'''
from __future__ import print_function

from keras.models import Sequential
from keras.layers import Dense, TimeDistributed, RepeatVector
from keras.layers.recurrent import LSTM
from sklearn.preprocessing import LabelBinarizer

import keras.backend as K
import numpy as np

# Set random seed (for reproducibility)
np.random.seed(1000)

# Mathematical symbols (the indices below refer to this list)
symbols = [' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '-', '/', '*', '(', ')']
operation_offset = 11
minus_symbol = 12
open_bracket = 15
closed_bracket = 16

# Number of training samples
nb_samples = 100000

# Sequence lengths (in flattened one-hot components: 20 time steps x 17 symbols)
input_sequence_length = 340
output_sequence_length = 340

# Binarize symbols
label_binarizer = LabelBinarizer()
label_binarizer.fit(symbols)

# Symbol length (size of a single one-hot vector)
symbol_length = len(label_binarizer.transform([symbols[0]])[0])

# Empty symbol (used for padding)
empty_symbol = label_binarizer.transform([symbols[0]])[0]

# Time steps
time_steps = int(input_sequence_length / symbol_length)


def expression_to_symbols(value):
    s = []
    for digit in str(value):
        s.append(digit)
    return label_binarizer.transform(np.array(s)).flatten()


def symbols_to_expression(expression):
    syms = ''
    for row in expression:
        syms += label_binarizer.inverse_transform(to_binary(row).reshape((1, symbol_length)))[0]
    return syms.strip()


def operation(op_type, a, b):
    # All four results are computed eagerly; b is never zero, so the division is safe
    ops = {
        0: a + b,
        1: a - b,
        2: int(a / b),
        3: a * b
    }
    return ops.get(op_type)


def generate_random_expression():
    # First term
    a = np.random.randint(-100, 100)

    # Second term (avoid zero for divisions)
    b = np.random.randint(1, 100)
    if binary_decision():
        b = -b

    # Operator
    op = np.random.randint(0, 4)
    result = operation(op, a, b)

    full_expression = (expression_to_symbols(a),
                       expression_to_symbols(symbols[op + operation_offset]),
                       expression_to_symbols(b))

    if binary_decision():
        # Insert brackets
        open_bracket_expression = (expression_to_symbols(symbols[open_bracket]),)

        if binary_decision():
            # Insert a minus in front of the expression
            open_bracket_expression = (expression_to_symbols(symbols[minus_symbol]),) + open_bracket_expression
            result *= -1

        full_expression = open_bracket_expression + full_expression
        full_expression += (expression_to_symbols(symbols[closed_bracket]),)

    x = pad(np.concatenate(full_expression), input_sequence_length).reshape(time_steps, symbol_length)
    r = pad(expression_to_symbols(result), output_sequence_length).reshape(time_steps, symbol_length)
    return x, r, result


def create_dataset(n_samples=5000):
    print('Creating dataset with %d samples' % n_samples)
    X = []
    Y = []

    for _ in range(n_samples):
        x, r, _ = generate_random_expression()
        X.append(x.astype(K.floatx()))
        Y.append(r.astype(K.floatx()))

    return np.array(X).astype(K.floatx()), np.array(Y).astype(K.floatx())


def binary_decision():
    return np.random.uniform(0, 1) < 0.5


def pad(x, sequence_length):
    # Append empty (space) symbols until the flattened sequence reaches sequence_length
    if len(x) < sequence_length:
        n = int((sequence_length - len(x)) / len(empty_symbol))
        for _ in range(n):
            x = np.concatenate((x, empty_symbol))
    return x


def to_binary(x):
    # Collapse a (possibly soft) output row to a one-hot vector via argmax
    v = np.argmax(x)
    z = np.zeros(shape=symbol_length)
    z[v] = 1.0
    return z


def make_expression(string_expression):
    # Encode a string expression as a (1, time_steps, symbol_length) input tensor
    s = []
    for digit in string_expression.strip():
        s.append(digit)
    return pad(label_binarizer.transform(np.array(s)).flatten(), input_sequence_length).\
        reshape(1, time_steps, symbol_length)


def create_model():
    # Seq2Seq: LSTM encoder -> repeated state vector -> LSTM decoder -> per-step softmax
    model = Sequential()
    model.add(LSTM(250, input_shape=(time_steps, symbol_length)))
    model.add(RepeatVector(time_steps))
    model.add(LSTM(100, return_sequences=True))
    model.add(TimeDistributed(Dense(symbol_length, activation='softmax')))

    # Compile model
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


if __name__ == '__main__':
    print('Expression learning experiment')

    print('Symbol table:')
    for symbol in symbols:
        print(symbol + ' -> ' + str(label_binarizer.transform([symbol])))

    # Create dataset
    print('Training model...')
    X, Y = create_dataset(n_samples=nb_samples)

    # Create model
    model = create_model()

    # Train model
    model.fit(X, Y, batch_size=1, epochs=5)

    # Test
    print('Test:')
    X_test, Y_test = create_dataset(n_samples=100)
    Y_pred = model.predict(X_test)

    for i, y in enumerate(Y_pred):
        print('%s = %s' % (symbols_to_expression(X_test[i]), symbols_to_expression(y)))
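Note that make_expression() is defined above but never called in the script; presumably it is meant for querying a trained model interactively. A minimal sketch of such a query, assuming the script has just been run in the same session (the expression string is arbitrary):

# Encode an expression, predict, and decode the softmax output
x = make_expression('(45+21)')   # shape (1, time_steps, symbol_length)
y = model.predict(x)[0]          # shape (time_steps, symbol_length)
print('(45+21) =', symbols_to_expression(y))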
Hi, I trained and modified (wrote an input method for) your LSTM script, and after training for two days I got the accuracy up to 98%, which seemed fine. On testing I noted that it works great with inputs in the range it trained on (-99 to 99), with nearly no error, but it produced totally wrong output for larger numbers (like 100*2)!
Any idea why? Is this inherent in the LSTM setup, i.e. is it only interpolating rather than learning the structure? I'll try to train it on bigger input numbers now; any idea whether it can generalize at all? The goal should be for it to work for arbitrarily large numbers, so that it has really learned the rules of calculating, right? Thank you!
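For reference, the out-of-range behaviour described above can be reproduced with the gist's own helpers; a minimal sketch (assumes the trained model from the script above; '100*2' is the hypothetical out-of-range input):

# Three-digit operands never occur in the training data ([-100, 100)),
# so this asks the model to extrapolate rather than interpolate
y = model.predict(make_expression('100*2'))[0]
print('100*2 =', symbols_to_expression(y))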