Created
July 23, 2025 11:22
-
-
Save marcusbuffett/6ce4165a701b8ae5f83fdf55487d07a3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| 2025-07-23T10:24:36.343588Z INFO burn_train::learner::train_val: Fitting the model: | |
| OXIModel { | |
| encoder: ChessResNet { | |
| input_conv: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 9280} | |
| input_bn: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| blocks: Vec<0..6> { | |
| 0: BasicBlock { | |
| conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| se: SqueezeExcitation { | |
| fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520} | |
| fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576} | |
| params: 1096 | |
| } | |
| params: 75464 | |
| } | |
| 1: BasicBlock { | |
| conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| se: SqueezeExcitation { | |
| fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520} | |
| fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576} | |
| params: 1096 | |
| } | |
| params: 75464 | |
| } | |
| 2: BasicBlock { | |
| conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| se: SqueezeExcitation { | |
| fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520} | |
| fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576} | |
| params: 1096 | |
| } | |
| params: 75464 | |
| } | |
| 3: BasicBlock { | |
| conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| se: SqueezeExcitation { | |
| fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520} | |
| fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576} | |
| params: 1096 | |
| } | |
| params: 75464 | |
| } | |
| 4: BasicBlock { | |
| conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| se: SqueezeExcitation { | |
| fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520} | |
| fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576} | |
| params: 1096 | |
| } | |
| params: 75464 | |
| } | |
| 5: BasicBlock { | |
| conv1: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn1: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| conv2: Conv2d {stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Same, params: 36928} | |
| bn2: BatchNorm {num_features: 64, momentum: 0.1, epsilon: 0.00001, params: 256} | |
| se: SqueezeExcitation { | |
| fc1: Linear {d_input: 64, d_output: 8, bias: true, params: 520} | |
| fc2: Linear {d_input: 8, d_output: 64, bias: true, params: 576} | |
| params: 1096 | |
| } | |
| params: 75464 | |
| } | |
| } | |
| policy_conv: Conv2d {stride: [1, 1], kernel_size: [1, 1], dilation: [1, 1], groups: 1, padding: Valid, params: 1300} | |
| policy_bn: BatchNorm {num_features: 20, momentum: 0.1, epsilon: 0.00001, params: 80} | |
| params: 463700 | |
| } | |
| patch_embed: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| pos_embed: ParamTensor {rank: 3, shape: [1, 64, 64], kind: float} | |
| transformer_blocks: Vec<0..6> { | |
| 0: TransformerBlock { | |
| attention: EloAwareAttention { | |
| num_heads: 4 | |
| head_dim: 16 | |
| elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640} | |
| q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| dropout: Dropout {prob: 0.2} | |
| params: 21440 | |
| } | |
| norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| mlp: MLP { | |
| fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320} | |
| fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256} | |
| dropout: Dropout {prob: 0.2} | |
| params: 16576 | |
| } | |
| dropout: Dropout {prob: 0.2} | |
| params: 38272 | |
| } | |
| 1: TransformerBlock { | |
| attention: EloAwareAttention { | |
| num_heads: 4 | |
| head_dim: 16 | |
| elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640} | |
| q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| dropout: Dropout {prob: 0.2} | |
| params: 21440 | |
| } | |
| norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| mlp: MLP { | |
| fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320} | |
| fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256} | |
| dropout: Dropout {prob: 0.2} | |
| params: 16576 | |
| } | |
| dropout: Dropout {prob: 0.2} | |
| params: 38272 | |
| } | |
| 2: TransformerBlock { | |
| attention: EloAwareAttention { | |
| num_heads: 4 | |
| head_dim: 16 | |
| elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640} | |
| q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| dropout: Dropout {prob: 0.2} | |
| params: 21440 | |
| } | |
| norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| mlp: MLP { | |
| fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320} | |
| fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256} | |
| dropout: Dropout {prob: 0.2} | |
| params: 16576 | |
| } | |
| dropout: Dropout {prob: 0.2} | |
| params: 38272 | |
| } | |
| 3: TransformerBlock { | |
| attention: EloAwareAttention { | |
| num_heads: 4 | |
| head_dim: 16 | |
| elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640} | |
| q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| dropout: Dropout {prob: 0.2} | |
| params: 21440 | |
| } | |
| norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| mlp: MLP { | |
| fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320} | |
| fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256} | |
| dropout: Dropout {prob: 0.2} | |
| params: 16576 | |
| } | |
| dropout: Dropout {prob: 0.2} | |
| params: 38272 | |
| } | |
| 4: TransformerBlock { | |
| attention: EloAwareAttention { | |
| num_heads: 4 | |
| head_dim: 16 | |
| elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640} | |
| q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| dropout: Dropout {prob: 0.2} | |
| params: 21440 | |
| } | |
| norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| mlp: MLP { | |
| fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320} | |
| fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256} | |
| dropout: Dropout {prob: 0.2} | |
| params: 16576 | |
| } | |
| dropout: Dropout {prob: 0.2} | |
| params: 38272 | |
| } | |
| 5: TransformerBlock { | |
| attention: EloAwareAttention { | |
| num_heads: 4 | |
| head_dim: 16 | |
| elo_embeddings: Embedding {n_embedding: 20, d_model: 32, params: 640} | |
| q_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| k_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| v_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| elo_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| out_proj: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| dropout: Dropout {prob: 0.2} | |
| params: 21440 | |
| } | |
| norm1: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| norm2: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| mlp: MLP { | |
| fc1: Linear {d_input: 64, d_output: 128, bias: true, params: 8320} | |
| fc2: Linear {d_input: 128, d_output: 64, bias: true, params: 8256} | |
| dropout: Dropout {prob: 0.2} | |
| params: 16576 | |
| } | |
| dropout: Dropout {prob: 0.2} | |
| params: 38272 | |
| } | |
| } | |
| norm: LayerNorm {d_model: 64, epsilon: 0.00001, params: 128} | |
| policy_head: Linear {d_input: 1344, d_output: 4096, bias: true, params: 5509120} | |
| side_info_head: Linear {d_input: 64, d_output: 13, bias: true, params: 845} | |
| side_info_bce: BinaryCrossEntropyLoss {weights: None, smoothing: None, logits: true} | |
| policy_projection: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| value_projection: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| side_info_projection: Linear {d_input: 64, d_output: 64, bias: true, params: 4160} | |
| value_hidden: Linear {d_input: 64, d_output: 128, bias: true, params: 8320} | |
| value_head: Linear {d_input: 128, d_output: 3, bias: true, params: 387} | |
| value_dropout: Dropout {prob: 0.2} | |
| policy_uncertainty: ParamTensor {rank: 1, shape: [1], kind: float} | |
| value_uncertainty: ParamTensor {rank: 1, shape: [1], kind: float} | |
| side_info_uncertainty: ParamTensor {rank: 1, shape: [1], kind: float} | |
| params: 6232871 | |
| } | |
| 2025-07-23T10:24:36.812607Z INFO burn_train::learner::epoch: Executing training step for epoch 1 on devices [Cuda(0), Cuda(1), Cuda(2), Cuda(3), Cuda(4), Cuda(5), Cuda(6), Cuda(7)] | |
| 2025-07-23T10:24:38.817962Z INFO oxi::model: forward_classification: batch_size=54 | |
| 2025-07-23T10:24:38.819140Z INFO cubecl_runtime::tune::tune_cache: Load autotune cache ... | |
| 2025-07-23T10:24:38.819217Z INFO cubecl_runtime::tune::tune_cache: Loaded 18 autotune cached entries | |
| 2025-07-23T10:24:39.196001Z INFO cubecl_runtime::tune::tune_cache: Load autotune cache ... | |
| 2025-07-23T10:24:39.196085Z INFO cubecl_runtime::tune::tune_cache: Loaded 58 autotune cached entries |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment