Grokking example, on the training dataset
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

def create_synthetic_data(input_dim=1024, num_samples=10000):
    # One-hot labels that are also used as inputs: the task is to memorise the identity map
    num_classes = input_dim
    y = np.random.randint(0, num_classes, size=(num_samples,))
    y = tf.keras.utils.to_categorical(y, num_classes)
    X = y
    return X, y

input_dim = 1024
num_samples = 10000
X, y = create_synthetic_data(input_dim=input_dim, num_samples=num_samples)

# Custom callback for detailed per-step logging
class DetailedLoggingCallback(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs=None):
        self.step_accuracy = []
        self.learning_rate = []

    def on_train_batch_begin(self, batch, logs=None):
        # Reset the metric at the start of each batch so every step is logged independently
        self.model.reset_metrics()

    def on_train_batch_end(self, batch, logs=None):
        # Log the metric at the end of each batch (note: learning_rate actually records
        # the optimizer iteration count, not a learning rate)
        self.step_accuracy.append(logs.get('accuracy'))
        self.learning_rate.append(self.model.optimizer.iterations.numpy())

activators = ['selu', 'linear', 'elu', 'tanh', 'leaky_relu', 'softsign', 'relu6', 'relu',
              'gelu', 'swish', 'softplus', 'sigmoid', 'hard_sigmoid', 'exponential']
# The shorter list below overrides the full one; only these activations are actually run
activators = ['selu', 'tanh', 'linear', 'relu', 'gelu']
dtypes = [tf.float32, tf.float16, tf.float64]

def create_model(input_dim, num_layers, activation='relu', dtype=tf.float32):
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(input_dim,), dtype=dtype))
    for _ in range(num_layers):
        model.add(tf.keras.layers.Dense(128, activation=activation, dtype=dtype))
    model.add(tf.keras.layers.Dense(input_dim, activation='softmax', dtype=dtype))  # sigmoid would be for the multi-label case
    return model

# plt.rcParams.update({'axes.facecolor': 'white', 'figure.facecolor': 'white'})
input_dim = 1024
nlayers = 12
for tipo in dtypes:
    for opt in ['RMSprop', 'Adam', 'SGD', 'Adagrad', 'Adadelta', 'Adamax', 'Nadam', 'Ftrl']:
        plt.figure(figsize=(12, 8))
        order = {}
        line = {}
        for act in activators:
            model = create_model(input_dim=input_dim, num_layers=nlayers, activation=act, dtype=tipo)
            model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
            detailed_logging_callback = DetailedLoggingCallback()
            model.fit(X, y, epochs=2 * nlayers * nlayers, batch_size=256, verbose=0,
                      callbacks=[detailed_logging_callback])
            smoothed_accuracy = detailed_logging_callback.step_accuracy
            lr = detailed_logging_callback.learning_rate
            plt.scatter(range(len(smoothed_accuracy)), smoothed_accuracy, label=f'{act}', s=1)
        plt.xlabel('Training Step')
        plt.ylabel('Accuracy')
        plt.title(f'Per-Step {tipo.name} {opt} Training Accuracy for {nlayers} Layers and Different Activators')
        plt.legend()
        plt.show()
The learning delay disappears, or at least decreases, if we do not use a bias in the embedding layer (here, the first Dense layer acting on the one-hot input).
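A minimal sketch of that variant, assuming "embedding layer" refers to the first Dense layer applied to the one-hot input; create_model_no_bias is a hypothetical helper, not part of the gist above:

def create_model_no_bias(input_dim, num_layers, activation='relu', dtype=tf.float32):
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(input_dim,), dtype=dtype))
    # First hidden layer without bias: with one-hot inputs it behaves as a pure embedding lookup
    model.add(tf.keras.layers.Dense(128, activation=activation, use_bias=False, dtype=dtype))
    for _ in range(num_layers - 1):
        model.add(tf.keras.layers.Dense(128, activation=activation, dtype=dtype))
    model.add(tf.keras.layers.Dense(input_dim, activation='softmax', dtype=dtype))
    return model

It can be dropped into the training loop above in place of create_model to compare the two behaviours.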
At least in Keras, it can be seen that gelu and relu have some tendency to exhibit delayed learning, but a peculiar thing is that the delay is exaggerated by the choice of optimiser. Compare the training of a gelu fully connected network of 12 layers when trained with RMSprop and Adam at their default settings.
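A minimal sketch of that comparison, reusing create_model, DetailedLoggingCallback and the synthetic X, y from the gist above; the epoch count and batch size simply mirror the main loop and are otherwise arbitrary choices:

nlayers = 12
plt.figure(figsize=(12, 8))
for opt in ['RMSprop', 'Adam']:
    # Same 12-layer gelu network, only the optimiser changes
    model = create_model(input_dim=input_dim, num_layers=nlayers, activation='gelu')
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    cb = DetailedLoggingCallback()
    model.fit(X, y, epochs=2 * nlayers * nlayers, batch_size=256, verbose=0, callbacks=[cb])
    plt.scatter(range(len(cb.step_accuracy)), cb.step_accuracy, label=opt, s=1)
plt.xlabel('Training Step')
plt.ylabel('Accuracy')
plt.title('gelu, 12 layers: RMSprop vs Adam (default settings)')
plt.legend()
plt.show()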