2025-07-23T10:24:36.343588Z INFO burn_train::learner::train_val: Fitting the model:
OXIModel {
  encoder: ChessResNet {
    input_conv: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 9280}
    input_bn: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
    blocks: Vec<0..6> {
      0: BasicBlock {
        conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        se: SqueezeExcitation {
          fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520}
          fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576}
          params: 1096
        }
        params: 75464
      }
      1: BasicBlock {
        conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        se: SqueezeExcitation {
          fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520}
          fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576}
          params: 1096
        }
        params: 75464
      }
      2: BasicBlock {
        conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        se: SqueezeExcitation {
          fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520}
          fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576}
          params: 1096
        }
        params: 75464
      }
      3: BasicBlock {
        conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        se: SqueezeExcitation {
          fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520}
          fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576}
          params: 1096
        }
        params: 75464
      }
      4: BasicBlock {
        conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        se: SqueezeExcitation {
          fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520}
          fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576}
          params: 1096
        }
        params: 75464
      }
      5: BasicBlock {
        conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928}
        bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256}
        se: SqueezeExcitation {
          fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520}
          fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576}
          params: 1096
        }
        params: 75464
      }
    }
    policy_conv: Conv2d {stride: [1, 1], kernel_size: [1, 1], dilation: [1, 1], groups: 1, padding: Valid, params: 1300}
    policy_bn: BatchNorm {num_features: 20, momentum: 0.1, epsilon: 0.00001, params: 80}
    params: 463700
  }
  patch_embed: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
  pos_embed: ParamTensor {rank: 3, shape: [1, 64, 64], kind: float}
  transformer_blocks: Vec<0..6> {
    0: TransformerBlock {
      attention: EloAwareAttention {
        num_heads: 4
        head_dim: 16
        elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640}
        q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        dropout: Dropout {prob: 0.2}
        params: 21440
      }
      norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      mlp: MLP {
        fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320}
        fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256}
        dropout: Dropout {prob: 0.2}
        params: 16576
      }
      dropout: Dropout {prob: 0.2}
      params: 38272
    }
    1: TransformerBlock {
      attention: EloAwareAttention {
        num_heads: 4
        head_dim: 16
        elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640}
        q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        dropout: Dropout {prob: 0.2}
        params: 21440
      }
      norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      mlp: MLP {
        fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320}
        fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256}
        dropout: Dropout {prob: 0.2}
        params: 16576
      }
      dropout: Dropout {prob: 0.2}
      params: 38272
    }
    2: TransformerBlock {
      attention: EloAwareAttention {
        num_heads: 4
        head_dim: 16
        elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640}
        q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        dropout: Dropout {prob: 0.2}
        params: 21440
      }
      norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      mlp: MLP {
        fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320}
        fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256}
        dropout: Dropout {prob: 0.2}
        params: 16576
      }
      dropout: Dropout {prob: 0.2}
      params: 38272
    }
    3: TransformerBlock {
      attention: EloAwareAttention {
        num_heads: 4
        head_dim: 16
        elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640}
        q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        dropout: Dropout {prob: 0.2}
        params: 21440
      }
      norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      mlp: MLP {
        fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320}
        fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256}
        dropout: Dropout {prob: 0.2}
        params: 16576
      }
      dropout: Dropout {prob: 0.2}
      params: 38272
    }
    4: TransformerBlock {
      attention: EloAwareAttention {
        num_heads: 4
        head_dim: 16
        elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640}
        q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        dropout: Dropout {prob: 0.2}
        params: 21440
      }
      norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      mlp: MLP {
        fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320}
        fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256}
        dropout: Dropout {prob: 0.2}
        params: 16576
      }
      dropout: Dropout {prob: 0.2}
      params: 38272
    }
    5: TransformerBlock {
      attention: EloAwareAttention {
        num_heads: 4
        head_dim: 16
        elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640}
        q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
        dropout: Dropout {prob: 0.2}
        params: 21440
      }
      norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
      mlp: MLP {
        fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320}
        fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256}
        dropout: Dropout {prob: 0.2}
        params: 16576
      }
      dropout: Dropout {prob: 0.2}
      params: 38272
    }
  }
  norm: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128}
  policy_head: Linear {d_input: 1344, d_output: 4096, bias: true, params: 5509120}
  side_info_head: Linear {d_input: 64, d_output: 13, bias: true, params: 845}
  side_info_bce: BinaryCrossEntropyLoss {weights: None, smoothing: None, logits: true}
  policy_projection: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
  value_projection: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
  side_info_projection: Linear {d_input: 64, d_output: 64, bias: true, params: 4160}
  value_hidden: Linear {d_input: 64, d_output: 128, bias: true, params: 8320}
  value_head: Linear {d_input: 128, d_output: 3, bias: true, params: 387}
  value_dropout: Dropout {prob: 0.2}
  policy_uncertainty: ParamTensor {rank: 1, shape: [1], kind: float}
  value_uncertainty: ParamTensor {rank: 1, shape: [1], kind: float}
  side_info_uncertainty: ParamTensor {rank: 1, shape: [1], kind: float}
  params: 6232871
}
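
Note: the per-layer params figures in the dump follow from the standard parameter-count formulas. The BatchNorm count of 256 for 64 features implies the running mean and variance buffers are counted alongside the learnable scale and shift (4 values per feature), and the 9280-parameter input conv implies 16 input feature planes. A standalone plain-Rust sanity check (helper names are hypothetical, not from the gist's codebase) that reproduces the headline numbers:

    // Reproduce the parameter counts reported in the model dump above.
    fn conv2d(c_in: usize, c_out: usize, k: usize) -> usize {
        c_in * c_out * k * k + c_out // kernel weights + per-channel bias
    }
    fn linear(d_in: usize, d_out: usize) -> usize {
        d_in * d_out + d_out // weight matrix + bias vector
    }
    fn batch_norm(features: usize) -> usize {
        4 * features // gamma, beta, running mean, running variance
    }

    fn main() {
        // input_conv: 3x3 kernels, 64 output channels, 9280 params => 16 input planes.
        assert_eq!(conv2d(16, 64, 3), 9_280);

        // One BasicBlock: two 64->64 3x3 convs, two BatchNorms, one SE bottleneck.
        let se = linear(64, 8) + linear(8, 64); // 520 + 576 = 1096
        let basic_block = 2 * conv2d(64, 64, 3) + 2 * batch_norm(64) + se;
        assert_eq!(basic_block, 75_464);

        // Encoder: stem + 6 blocks + 1x1 policy conv (64 -> 20) + its BatchNorm.
        let encoder = conv2d(16, 64, 3) + batch_norm(64)
            + 6 * basic_block
            + conv2d(64, 20, 1) + batch_norm(20);
        assert_eq!(encoder, 463_700);

        // One TransformerBlock: Elo-aware attention (a 20x32 embedding table plus
        // five 64->64 projections), two LayerNorms, and a 64->128->64 MLP.
        let attention = 20 * 32 + 5 * linear(64, 64);
        assert_eq!(attention, 21_440);
        let transformer_block =
            attention + 2 * (2 * 64) + linear(64, 128) + linear(128, 64);
        assert_eq!(transformer_block, 38_272);

        // Full model, matching the final `params: 6232871` line.
        let total = encoder
            + linear(64, 64)                    // patch_embed
            + 64 * 64                           // pos_embed, shape [1, 64, 64]
            + 6 * transformer_block
            + 2 * 64                            // final LayerNorm
            + linear(1344, 4096)                // policy_head
            + linear(64, 13)                    // side_info_head
            + 3 * linear(64, 64)                // policy/value/side-info projections
            + linear(64, 128) + linear(128, 3)  // value_hidden + value_head
            + 3;                                // three scalar uncertainty parameters
        assert_eq!(total, 6_232_871);
        println!("all parameter counts match the dump");
    }
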
2025-07-23T10:24:36.812607Z INFO burn_train::learner::epoch: Executing training step for epoch 1 on devices [Cuda(0), Cuda(1), Cuda(2), Cuda(3), Cuda(4), Cuda(5), Cuda(6), Cuda(7)]
2025-07-23T10:24:38.817962Z INFO oxi::model: forward_classification: batch_size=54
2025-07-23T10:24:38.819140Z INFO cubecl_runtime::tune::tune_cache: Load autotune cache ...
2025-07-23T10:24:38.819217Z INFO cubecl_runtime::tune::tune_cache: Loaded 18 autotune cached entries
2025-07-23T10:24:39.196001Z INFO cubecl_runtime::tune::tune_cache: Load autotune cache ...
2025-07-23T10:24:39.196085Z INFO cubecl_runtime::tune::tune_cache: Loaded 58 autotune cached entries
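
For reference, the SqueezeExcitation module inside each BasicBlock above matches the standard squeeze-and-excitation channel gate (Hu et al., 2018): global-average-pool each channel, squeeze through the 64 -> 8 -> 64 bottleneck shown by fc1/fc2 with a ReLU then a sigmoid, and rescale the channels. A dependency-free sketch of that forward pass, assuming the usual formulation (this is not the gist's actual implementation, and the weights below are placeholders):

    // Squeeze-and-excitation over one [channels, height*width] feature map
    // stored as a flat Vec<f32>; the 64 -> 8 -> 64 shapes mirror fc1/fc2.
    fn relu(x: f32) -> f32 { x.max(0.0) }
    fn sigmoid(x: f32) -> f32 { 1.0 / (1.0 + (-x).exp()) }

    /// y = W x + b, with W stored row-major as [d_out][d_in].
    fn linear(x: &[f32], w: &[f32], b: &[f32], d_in: usize, d_out: usize) -> Vec<f32> {
        (0..d_out)
            .map(|o| b[o] + (0..d_in).map(|i| w[o * d_in + i] * x[i]).sum::<f32>())
            .collect()
    }

    fn squeeze_excitation(
        feat: &mut [f32], channels: usize, hw: usize,
        w1: &[f32], b1: &[f32], // fc1: channels -> channels/8
        w2: &[f32], b2: &[f32], // fc2: channels/8 -> channels
    ) {
        let bottleneck = channels / 8;
        // Squeeze: global average pool per channel.
        let pooled: Vec<f32> = (0..channels)
            .map(|c| feat[c * hw..(c + 1) * hw].iter().sum::<f32>() / hw as f32)
            .collect();
        // Excite: bottleneck MLP, ReLU then sigmoid gate.
        let hidden: Vec<f32> = linear(&pooled, w1, b1, channels, bottleneck)
            .into_iter().map(relu).collect();
        let gate: Vec<f32> = linear(&hidden, w2, b2, bottleneck, channels)
            .into_iter().map(sigmoid).collect();
        // Scale: reweight every spatial position of a channel by its gate.
        for c in 0..channels {
            for v in &mut feat[c * hw..(c + 1) * hw] {
                *v *= gate[c];
            }
        }
    }

    fn main() {
        let (channels, hw) = (64, 8 * 8); // 64 channels over an 8x8 board
        let mut feat = vec![1.0_f32; channels * hw];
        let (w1, b1) = (vec![0.01; 8 * 64], vec![0.0; 8]);
        let (w2, b2) = (vec![0.01; 64 * 8], vec![0.0; 64]);
        squeeze_excitation(&mut feat, channels, hw, &w1, &b1, &w2, &b2);
        println!("gated value: {}", feat[0]); // each channel scaled by ~sigmoid(0.0512)
    }
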
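Likewise, the three rank-1 *_uncertainty parameters at the bottom of the dump look like learned per-task log-variances for uncertainty-weighted multi-task loss balancing (Kendall et al., 2018). Whether OXIModel uses exactly that formula is an assumption; under it, the policy, value, and side-info losses would be combined roughly as follows:

    // Illustrative uncertainty-weighted loss combination (Kendall et al. 2018,
    // "Multi-Task Learning Using Uncertainty to Weigh Losses"). The dump only
    // shows that one scalar parameter exists per task head; this exact formula
    // is an assumption, and the loss values below are made up.
    fn weighted_total(losses: &[f32], log_vars: &[f32]) -> f32 {
        losses.iter().zip(log_vars)
            // exp(-s) down-weights high-uncertainty tasks; the +s regularizer
            // keeps s from growing without bound.
            .map(|(l, s)| (-s).exp() * l + s)
            .sum()
    }

    fn main() {
        let losses = [2.31_f32, 0.95, 0.40]; // policy, value, side-info
        let log_vars = [0.0_f32, 0.0, 0.0];  // the three learned scalars, at init
        println!("total loss: {:.4}", weighted_total(&losses, &log_vars));
        // At s = 0 every weight is 1, so training starts from a plain sum (3.66).
    }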