Skip to content

Instantly share code, notes, and snippets.

@arivero
Last active March 2, 2024 20:59
Show Gist options
  • Save arivero/155ebcff7f1a1cd27be0bd80dce739c2 to your computer and use it in GitHub Desktop.
Save arivero/155ebcff7f1a1cd27be0bd80dce739c2 to your computer and use it in GitHub Desktop.
Grokking example, observed on the training dataset
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
def create_synthetic_data(input_dim=1024, num_samples=10000):
    """Generate an identity-mapping dataset of one-hot vectors.

    Draws ``num_samples`` random class indices in ``[0, input_dim)`` and
    one-hot encodes them into ``input_dim`` columns. The encoded array is
    used as both the features and the targets.

    Args:
        input_dim: Number of classes, which is also the width of each vector.
        num_samples: Number of rows to generate.

    Returns:
        A tuple ``(X, y)``; both elements are the same one-hot array object.
    """
    class_ids = np.random.randint(0, input_dim, size=(num_samples,))
    one_hot = tf.keras.utils.to_categorical(class_ids, input_dim)
    # Features and targets are the very same array, not copies of each other.
    return one_hot, one_hot
# Dataset dimensions: one-hot width (= number of classes) and number of rows.
input_dim, num_samples = 1024, 10000
X, y = create_synthetic_data(num_samples=num_samples, input_dim=input_dim)
# Custom callback for detailed per-step logging
class DetailedLoggingCallback(tf.keras.callbacks.Callback):
    """Record one accuracy value per training batch.

    Attributes:
        step_accuracy: One entry per training batch holding that batch's
            ``'accuracy'`` log value (``None`` when the key is absent).
        learning_rate: One entry per training batch. Despite its name, it
            holds the optimizer's iteration counter, not the learning rate;
            the attribute name is kept so existing readers keep working.
    """

    def on_train_begin(self, logs=None):
        """Start every ``fit`` call with empty histories."""
        self.step_accuracy = []
        self.learning_rate = []

    def on_train_batch_begin(self, batch, logs=None):
        """Clear the model's metric state before the batch runs."""
        # Reset the metric at the start of each batch so the value reported
        # at batch end does not accumulate over earlier batches.
        self.model.reset_metrics()

    def on_train_batch_end(self, batch, logs=None):
        """Append this batch's accuracy and the optimizer iteration count."""
        # The hook signature allows ``logs`` to be None; fall back to an
        # empty mapping instead of raising AttributeError.
        logs = logs or {}
        # Log the metric at the end of each batch
        self.step_accuracy.append(logs.get('accuracy'))
        # NOTE: this is the optimizer step counter, not a learning rate.
        self.learning_rate.append(self.model.optimizer.iterations.numpy())
# Every activation name considered for the sweep. It used to be assigned to
# `activators` and immediately overwritten; it now has its own name.
all_activators = [
    'selu', 'linear', 'elu', 'tanh', 'leaky_relu', 'softsign', 'relu6',
    'relu', 'gelu', 'swish', 'softplus', 'sigmoid', 'hard_sigmoid',
    'exponential',
]
# Activations actually compared in each figure.
activators = ['selu', 'tanh', 'linear', 'relu', 'gelu']
# Layer dtypes iterated over by the outer experiment loop.
dtypes = [tf.float32, tf.float16, tf.float64]
def create_model(input_dim, num_layers, activation='relu', dtype=tf.float32, hidden_units=128):
    """Build a fully connected softmax classifier.

    The network is an input of width ``input_dim``, followed by
    ``num_layers`` hidden ``Dense`` layers of ``hidden_units`` units each,
    followed by a ``Dense`` softmax layer of width ``input_dim``.

    Args:
        input_dim: Width of the input vectors and of the softmax output.
        num_layers: Number of hidden ``Dense`` layers.
        activation: Activation passed to every hidden layer.
        dtype: dtype passed to the input and to every layer.
        hidden_units: Width of each hidden layer. Defaults to 128, the value
            that was previously hard-coded.

    Returns:
        The uncompiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(input_dim,), dtype=dtype))
    for _ in range(num_layers):
        model.add(tf.keras.layers.Dense(hidden_units, activation=activation, dtype=dtype))
    # Author's note: sigmoid would be the choice for "multiclass"
    # (presumably meaning multi-label targets; confirm with the author).
    model.add(tf.keras.layers.Dense(input_dim, activation='softmax', dtype=dtype))
    return model
# Uncomment to force white axes and figure backgrounds:
#plt.rcParams.update({'axes.facecolor': 'white', 'figure.facecolor': 'white'})

# Take the network width from the data itself so the model always matches
# the one-hot vectors, instead of repeating the literal 1024.
input_dim = X.shape[1]
nlayers = 12

# One figure per (dtype, optimizer) pair; each figure overlays the per-step
# training accuracy obtained with every activation in `activators`.
for tipo in dtypes:
    for opt in ['RMSprop', 'Adam', 'SGD', 'Adagrad', 'Adadelta', 'Adamax', 'Nadam', 'Ftrl']:
        plt.figure(figsize=(12, 8))
        for act in activators:
            # Many models are built in this one process; release the global
            # Keras state left by the previous one to keep memory bounded.
            tf.keras.backend.clear_session()
            model = create_model(input_dim=input_dim, num_layers=nlayers, activation=act, dtype=tipo)
            model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
            detailed_logging_callback = DetailedLoggingCallback()
            model.fit(
                X,
                y,
                epochs=2 * nlayers * nlayers,
                batch_size=256,
                verbose=0,
                callbacks=[detailed_logging_callback],
            )
            # Raw per-batch accuracies; no smoothing is applied.
            step_accuracy = detailed_logging_callback.step_accuracy
            plt.scatter(range(len(step_accuracy)), step_accuracy, label=f'{act}', s=1)
        plt.xlabel('Training Step')
        plt.ylabel('Accuracy')
        plt.title(f'Per-Step {tipo.name} {opt} Training Accuracy for {nlayers} Layers and Different Activators')
        plt.legend()
        plt.show()
@arivero
Copy link
Author

arivero commented Mar 2, 2024

The delay disappears, or at least decreases, if we do not use bias in the embedding layer.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment