Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save tysam-code/fdb39b51c5b8f0e3f3ae91f673f51b57 to your computer and use it in GitHub Desktop.
Save tysam-code/fdb39b51c5b8f0e3f3ae91f673f51b57 to your computer and use it in GitHub Desktop.
diloco_nesterov_.7lr_.0_to_.9_momentum_1000_momentum_warmup_1-momentum_dampening_dampening_initial_step_bugfix_25_steps_all_run3.log
import os
import sys
with open(sys.argv[0]) as f:
code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import glob
import subprocess
import contextlib
from dataclasses import dataclass
import torch
torch.empty(1, device='cuda', requires_grad=True).backward()
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
# use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention
# -----------------------------------------------------------------------------
# Muon optimizer
@torch.compile
def zeropower_via_newtonschulz5(G, steps):
    """
    Approximately orthogonalize the 2D matrix G via a quintic Newton-Schulz iteration
    run entirely in bfloat16.

    The quintic coefficients are tuned to maximize the slope at zero rather than to
    converge all singular values exactly to 1; the result is roughly U S' V^T with
    S'_{ii} ~ Uniform(0.5, 1.5) (where U S V^T = G is the SVD), which empirically works
    as well as the true orthogonal factor U V^T for optimizer updates.

    Args:
        G: a 2D tensor (any float dtype; computed in bfloat16 internally).
        steps: number of Newton-Schulz iterations to run.
    Returns:
        A bfloat16 tensor of the same shape as G, approximately orthogonalized.
    """
    assert len(G.shape) == 2
    # quintic iteration coefficients, chosen for maximum slope at zero
    coef_a, coef_b, coef_c = 3.4445, -4.7750, 2.0315
    # work on the wide orientation so X @ X.T is the smaller Gram matrix
    tall = G.size(0) > G.size(1)
    X = G.bfloat16().T if tall else G.bfloat16()
    # Ensure spectral norm is at most 1 (required for NS convergence)
    X = X / (X.norm() + 1e-7)
    # Perform the NS iterations
    # (adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng)
    for _ in range(steps):
        gram = X @ X.T
        poly = coef_b * gram + coef_c * gram @ gram
        X = coef_a * X + poly @ X
    return X.T if tall else X
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
      parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        # torchrun supplies WORLD_SIZE/RANK; world_size sizes the per-rank gather
        # buffers (currently unused — the distributed path below is disabled)
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # group params by element count so each group shares one flat bf16 comm buffer per rank
        sizes = {p.numel() for p in params}
        param_groups = [dict(params=[p for p in params if p.numel() == size],
                             update_buffer=[torch.empty(size, device='cuda', dtype=torch.bfloat16) for _ in range(self.world_size)])
                        for size in sizes]
        super().__init__(param_groups, defaults)
    def step(self):
        # One optimizer step: EMA momentum on each 2D grad, Newton-Schulz
        # orthogonalization of the update, then a shape-scaled parameter update.
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            # The original async all_gather machinery, disabled by turning it into a
            # no-op string literal (kept for reference):
            """
            handle = None
            params_world = None
            def update_prev():
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            """
            # Single-GPU-only experiments, disabling comms silliness due to segfault stuff :(
            for base_i in range(len(params)): #[::self.world_size]:
                if True:
                #if base_i + rank < len(params):
                    # NOTE(review): params[base_i + self.rank] only stays in range when
                    # self.rank == 0, consistent with the single-GPU note above — confirm
                    # before re-enabling multi-rank runs.
                    p = params[base_i + self.rank]
                    g = p.grad
                    assert g is not None
                    state = self.state[p]
                    if 'momentum_buffer' not in state:
                        state['momentum_buffer'] = torch.zeros_like(g)
                    buf = state['momentum_buffer']
                    # EMA-style momentum: buf <- momentum*buf + (1-momentum)*g
                    buf.lerp_(g, 1 - momentum)
                    # Nesterov blends the raw grad toward the buffer (note: mutates p.grad in place)
                    g = g.lerp_(buf, momentum) if nesterov else buf
                    g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                    # step scaled up for tall matrices (fan-out > fan-in)
                    p.data.add_(g.view_as(p), alpha=-lr * max(1, p.size(0) / p.size(1)) ** 0.5)
                #else:
                #    g = update_buffers[rank]
                #update_prev() # async all_gather instead of sync all_reduce by @YouJiacheng
                #handle = dist.all_gather(update_buffers, g, async_op=True)
                #params_world = params[base_i : base_i + self.world_size]
            #update_prev()
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model
def norm(x):
return F.rms_norm(x, (x.size(-1),))
class CastedLinear(nn.Linear):
    """Bias-free linear layer whose weight is cast to the input's dtype at call time."""
    def __init__(self, in_features, out_features):
        # bias is never used in this codebase
        super().__init__(in_features, out_features, bias=False)
    def forward(self, x):
        # cast on the fly so a fp32 master weight can serve bf16 activations
        weight = self.weight.type_as(x)
        return F.linear(x, weight)
class Rotary(nn.Module):
def __init__(self, dim, max_seq_len=65536):
super().__init__()
# half-truncate RoPE by @YouJiacheng
angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=dim//4, dtype=torch.float32)
angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(dim//4)])
t = torch.arange(max_seq_len, dtype=torch.float32)
theta = torch.einsum('i,j -> ij', t, angular_freq)
self.cos = nn.Buffer(theta.cos(), persistent=False)
self.sin = nn.Buffer(theta.sin(), persistent=False)
def forward(self, x):
cos, sin = self.cos[None, :x.size(-3), None, :], self.sin[None, :x.size(-3), None, :]
x1, x2 = x.float().chunk(2, dim=-1)
y1 = x1 * cos + x2 * sin
y2 = x1 * (-sin) + x2 * cos
return torch.cat((y1, y2), 3).type_as(x)
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention built on FlexAttention, with QK-norm,
    rotary embeddings, and an optional token-value-embedding mixed into V."""
    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # learned mix between the fresh value projection and the token value embedding
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        head_dim = dim // num_heads
        self.rotary = Rotary(head_dim)
        self.c_proj = CastedLinear(dim, dim)
        # zero init suggested by @Grad62304977: the block starts as a pure residual
        self.c_proj.weight.data.zero_()
    def forward(self, x, ve, block_mask):
        batch, seq_len = x.size(0), x.size(1)
        assert batch == 1, 'Must use batch size = 1 for FlexAttention'
        head_shape = (batch, seq_len, self.num_heads, -1)
        q = self.c_q(x).view(*head_shape)
        k = self.c_k(x).view(*head_shape)
        v = self.c_v(x).view(*head_shape)
        if ve is None:
            # skip mid-layers token value embeddings by @YouJiacheng
            v = self.lambdas[0] * v
        else:
            # value-embedding mix by @KoszarskyB & @Grad62304977
            v = self.lambdas[0] * v + self.lambdas[1] * ve.view_as(v)
        q, k = norm(q), norm(k)  # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        attn_out = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        # re-assemble all head outputs side by side
        attn_out = attn_out.transpose(1, 2).contiguous().view_as(x)
        return self.c_proj(attn_out)
class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation."""
    def __init__(self, dim):
        super().__init__()
        hidden_dim = 4 * dim
        self.c_fc = CastedLinear(dim, hidden_dim)
        self.c_proj = CastedLinear(hidden_dim, dim)
        self.c_proj.weight.data.zero_()  # zero init suggested by @Grad62304977
    def forward(self, x):
        # ReLU^2 (https://arxiv.org/abs/2109.08668v2): ~1-2% better than GELU;
        # suggested by @SKYLINEZ007 and @Grad62304977
        hidden = self.c_fc(x)
        hidden = F.relu(hidden).square()
        return self.c_proj(hidden)
class Block(nn.Module):
    """Transformer block: optional attention sublayer plus an MLP sublayer, with a
    learned blend between the running stream and the embedding stream x0."""
    def __init__(self, model_dim, num_heads, use_attn=True):
        super().__init__()
        self.attn = CausalSelfAttention(model_dim, num_heads) if use_attn else None
        self.mlp = MLP(model_dim)
        # lambdas[0] weighs the residual stream, lambdas[1] the initial embedding x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))
    def forward(self, x, ve, x0, block_mask):
        stream = self.lambdas[0] * x + self.lambdas[1] * x0
        if self.attn is not None:
            stream = stream + self.attn(norm(stream), ve, block_mask)
        return stream + self.mlp(norm(stream))
class ValueEmbedding(nn.Module):
    """Three token value embeddings shared across 12 blocks in a 012...012 layout
    (by @YouJiacheng, improved on @leloykun's U-net structure): the first three and
    last three blocks reuse the same embeddings; the middle six get None."""
    def __init__(self, vocab_size, model_dim):
        super().__init__()
        self.embed = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
    def forward(self, inputs):
        first, second, third = (emb(inputs).bfloat16() for emb in self.embed)
        middle = [None] * 6
        return [first, second, third, *middle, first, second, third]
# -----------------------------------------------------------------------------
# The main GPT-2 model
class GPT(nn.Module):
    """GPT-2-style language model with a U-net block layout, token value embeddings,
    FlexAttention document-causal sliding-window masking, and logit softcapping.
    forward() returns the scalar cross-entropy loss directly."""
    def __init__(self, vocab_size, num_layers, num_heads, model_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, model_dim)
        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
        self.blocks = nn.ModuleList([Block(model_dim, num_heads, use_attn=(i != 7))
                                     for i in range(num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(vocab_size, model_dim)
        self.lm_head = CastedLinear(model_dim, vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977
        # U-net design by @brendanh0gan
        self.num_encoder_layers = num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))
    def forward(self, inputs, targets, sliding_window_num_blocks):
        # inputs/targets: flat 1-D token sequences; seq_len must be a multiple of
        # the 128-token FlexAttention block size
        BLOCK_SIZE = 128
        seq_len = len(inputs)
        assert seq_len % BLOCK_SIZE == 0
        total_num_blocks = seq_len // BLOCK_SIZE
        assert inputs.ndim == 1
        # document id per token: 50256 is the GPT-2 <|endoftext|> separator, so the
        # cumulative count of separators partitions the sequence into documents
        docs = (inputs == 50256).cumsum(0)
        # min/max document id within each 128-token block (first/last token of block)
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_causal(b, h, q_idx, kv_idx):
            # token-level mask: causal AND within the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask
        def dense_to_ordered(dense_mask):
            # convert a dense (q_block, kv_block) boolean mask into the
            # (counts, ordered indices) form BlockMask.from_kv_blocks expects
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()
        def create_doc_swc_block_mask(sliding_window_num_blocks):
            # build the block-level sliding-window + document + causal mask;
            # "full" blocks need no per-token mask_mod evaluation at runtime
            kv_idx = block_idx = torch.arange(total_num_blocks, dtype=torch.int32, device='cuda')
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm # block-wise sliding window by @YouJiacheng
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            # a block pair is "full" only when both blocks lie entirely in one shared document
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm & ~full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )
        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)
        x0 = norm(self.embed(inputs[None]).bfloat16()) # use of norm here by @Grad62304977
        x = x0
        ve = self.value_embeds(inputs)
        assert len(ve) == len(self.blocks)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]
        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        # (pop() pairs decoder layer i with the mirror-image encoder layer)
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)
        x = norm(x)
        logits = self.lm_head(x)
        logits = 15 * torch.tanh(logits / 15) # @Grad62304977 added tanh softcapping, @KoszarskyB reduced it from 30 to 15
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets)
        return loss
# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader
def _load_data_shard(path):
    """Load one .bin data shard: validate its 256-int32 header, then read the
    uint16 token stream into a pinned-memory tensor for fast host-to-device copies.

    Header layout: [0] magic number, [1] version, [2] claimed token count.
    Returns a 1-D uint16 tensor of tokens.
    """
    header = torch.from_file(path, False, 256, dtype=torch.int32)
    assert header[0] == 20240520, 'magic number mismatch in the data .bin file'
    assert header[1] == 1, 'unsupported version'
    claimed_tokens = int(header[2])  # number of tokens (claimed)
    with open(path, 'rb', buffering=0) as f:
        # allocate pinned up front — avoid pin_memory copy by @YouJiacheng
        tokens = torch.empty(claimed_tokens, dtype=torch.uint16, pin_memory=True)
        f.seek(256 * 4)  # skip past the header
        # read straight into the tensor's buffer — avoid bytes->array copy by @YouJiacheng
        bytes_read = f.readinto(tokens.numpy())
    assert bytes_read == 2 * claimed_tokens, 'number of tokens read does not match header'
    return tokens
class DistributedDataLoader:
    """Shard-cycling token loader: each rank slices its own contiguous chunk out of
    every global batch, advancing through the sorted shard files in a loop."""
    def __init__(self, filename_pattern):
        self.rank = int(os.environ['RANK'])
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.files = sorted(glob.glob(filename_pattern))
        self.reset()
    def reset(self):
        # advance() wraps -1 -> 0, loading the first shard
        self.current_shard = -1
        self.advance()
    def advance(self):
        # cycle to the next shard and start reading it from the beginning
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = 0
        self.tokens = _load_data_shard(self.files[self.current_shard])
    def next_batch(self, batch_size):
        # batch_size is the GLOBAL batch size in tokens, split evenly across ranks
        assert batch_size % self.world_size == 0
        device_batch_size = batch_size // self.world_size
        # load next shard if the current one cannot cover this batch (+1 for the target shift)
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        start = self.current_position + self.rank * device_batch_size
        chunk = self.tokens[start:start + device_batch_size + 1]
        self.current_position += batch_size
        # targets are the inputs shifted one token ahead
        inputs = chunk[:-1].to(device='cuda', dtype=torch.int32, non_blocking=True)
        targets = chunk[1:].to(device='cuda', dtype=torch.int64, non_blocking=True)
        return inputs, targets
# -----------------------------------------------------------------------------
# int main
@dataclass
class Hyperparameters:
    """Run configuration. Fields carry type annotations so @dataclass actually
    generates them (unannotated class attributes are invisible to dataclass:
    no __init__ parameters, no repr/eq — the decorator was previously inert)."""
    # data
    train_bin: str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    val_bin: str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization
    batch_size: int = 8*64*1024 # batch size in tokens
    max_device_batch_size: int = 64*1024 # batch size per device in tokens
    num_iterations: int = 1390 # number of iterations to run
    cooldown_frac: float = 0.4 # fraction of training spent cooling down the learning rate
    bf16_embeds: bool = True
    # evaluation and logging
    val_loss_every: int = 25 #125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    # implementation
    save_checkpoint: bool = False
args = Hyperparameters()
micro_bs = args.max_device_batch_size  # per-device micro-batch size in tokens
# set up DDP (distributed data parallel). torchrun sets this env variable
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
torch.cuda.set_device(local_rank)
dist.init_process_group(backend='nccl', device_id=torch.device(local_rank))
dist.barrier()
master_process = (rank == 0) # this process will do logging, checkpointing etc.
# begin logging: each run gets a unique uuid-named logfile under logs/
logfile = None
if master_process:
    run_id = uuid.uuid4()
    os.makedirs('logs', exist_ok=True)
    logfile = f'logs/{run_id}.txt'
    print(logfile)
def print0(s, console=False):
    """Append s to the master process's logfile, echoing to stdout when
    console=True. No-op on non-master ranks."""
    if not master_process:
        return
    with open(logfile, 'a') as f:
        if console:
            print(s)
        print(s, file=f)
# begin by printing this file (the Python code)
print0(code)
print0('='*100)
# log information about the hardware/software environment this is running on
print0(f'Running Python {sys.version}')
print0(f'Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}')
print0(subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout)
print0('='*100)
# load data
train_loader = DistributedDataLoader(args.train_bin)
val_loader = DistributedDataLoader(args.val_bin)
print0(f'Training dataloader files: {train_loader.files}')
print0(f'Validation dataloader files: {val_loader.files}')
print0('='*100)
# init model_opt dict, this will hold all of the separate models that we use here
outer_opt_lr = 0.7  # DiLoCo outer (Nesterov SGD) learning rate
outer_opt_momentum = .9  # DiLoCo outer momentum (warmup values set inside the loop)
models_opts_schedulers = []  # list of (replica_model, [optimizers], [schedulers])
num_models_to_simulate = 8  # simulated DiLoCo workers, one per micro-batch
#diloco_update_steps = 1
diloco_update_steps = 25 #10
# Steps to compile before copying out the model to its replicas
compile_steps = 15 #2 #20
# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
core_model = GPT(vocab_size=50304, num_layers=12, num_heads=6, model_dim=768)
core_model = core_model.cuda()
if args.bf16_embeds:
    for m in core_model.modules():
        if isinstance(m, nn.Embedding):
            m.bfloat16()
core_model = torch.compile(core_model)
#####ddp_model = DDP(model, device_ids=[local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
# Add outer Nesterov optimizer to the core model
outer_opt = torch.optim.SGD(core_model.parameters(), lr=outer_opt_lr, momentum=outer_opt_momentum, nesterov=True)
# Disabling bugfix for default diloco runs (kept as a no-op string literal):
"""
######################################################################################
# Set Initial Momentum To 0 in outer_opt (PyTorch bug w/ first step dampening)       #
######################################################################################
for parameter in core_model.parameters():
    parameter.grad = torch.zeros_like(parameter)
# Set outer opt momentum buffers (best to do this internally to avoid spaghetti code)
outer_opt.step()
core_model.zero_grad(set_to_none=True)
"""
print("Compiling model!")
# call model so it is properly built, before cloning
# (warmup forward/backward passes with dummy data to trigger torch.compile)
for _ in range(compile_steps):
    core_model.forward(torch.randint(0, 128, (1024*64,)).to(device='cuda', dtype=torch.long), torch.randint(0, 128, (1024*64,)).to(device='cuda', dtype=torch.long), torch.tensor([128], device='cuda', dtype=torch.long)).mean().backward()
    # Set gradients to none
    core_model.zero_grad(set_to_none=True)
print("Model compiled.")
# tmp dev import
import copy
# build one independent replica (model + inner optimizers + schedulers) per
# simulated DiLoCo worker; all start from the compiled core model's weights
for _ in range(num_models_to_simulate):
    # make model copy
    model_copy = copy.deepcopy(core_model)
    # collect the parameters to optimize
    hidden_matrix_params = [p for p in model_copy.blocks.parameters() if p.ndim == 2]
    embed_params = [model_copy.embed.weight, *model_copy.value_embeds.parameters()]
    scalar_params = [p for p in model_copy.parameters() if p.ndim < 2]
    head_params = [model_copy.lm_head.weight]
    # init the optimizer(s): Adam for embeddings/head/scalars, Muon for 2D hidden matrices
    optimizer1 = torch.optim.Adam([dict(params=embed_params, lr=0.6),
                                   dict(params=head_params, lr=0.008),
                                   dict(params=scalar_params, lr=0.04)],
                                  betas=(0.8, 0.95), fused=True)
    optimizer2 = Muon(hidden_matrix_params, lr=0.05, momentum=0.95)
    optimizers = [optimizer1, optimizer2]
    # learning rate schedule: stable then decay
    def get_lr(it):
        t = 1 - it / args.num_iterations # time remaining in training
        assert 1 >= t > 0
        # 1) constant lr for first part of training
        if t >= args.cooldown_frac:
            return 1.0
        # 2) then linear cooldown
        else:
            return t / args.cooldown_frac
    schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]
    models_opts_schedulers.append((model_copy, optimizers, schedulers))
# sliding window size schedule: linear increase over training in chunks of 128 from 128 -> 1792. By @fernbear.bsky.social
def get_sliding_window_blocks(it, total_iters=None):
    """Sliding-window size in 128-token blocks at step `it`.

    Linearly interpolates the window from 128 tokens (1 block) at the start of
    training up to 1856 tokens at the end, floored to whole blocks (so the final
    window is 14 blocks = 1792 tokens).

    total_iters defaults to args.num_iterations, keeping existing call sites
    unchanged; passing it explicitly makes the schedule reusable and testable.
    """
    if total_iters is None:
        total_iters = args.num_iterations
    x = it / total_iters # training progress
    assert 0 <= x <= 1
    return int(((1 - x) * 128 + x * 1856) // 128)
# window size lives on-device so the compiled graph can read it without recompiling
sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device='cuda')
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
train_steps = args.num_iterations
for step in range(train_steps + 1):
    last_step = (step == train_steps)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val
    sliding_window_num_blocks.copy_(get_sliding_window_blocks(step))
    # --------------- VALIDATION SECTION -----------------
    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches on the (synchronized) core model
        core_model.eval()
        val_loader.reset()
        val_loss = 0.0
        # calculate the number of steps to take in the val loop.
        val_batch_size = world_size * micro_bs
        assert args.val_tokens % val_batch_size == 0
        val_steps = args.val_tokens // val_batch_size
        for _ in range(val_steps):
            with torch.no_grad():
                inputs_val, targets_val = val_loader.next_batch(val_batch_size)
                val_loss += core_model(inputs_val, targets_val, sliding_window_num_blocks)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # logging
        print0(f'step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms', console=True)
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()
    if last_step:
        if master_process and args.save_checkpoint:
            # NOTE(review): `model` here is the loop variable left over from the
            # training section below and `optimizers` is from the replica-setup
            # loop — they come from different replicas. Only safe because
            # save_checkpoint defaults to False; confirm before enabling.
            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
            os.makedirs(f'logs/{run_id}', exist_ok=True)
            torch.save(log, f'logs/{run_id}/state_step{step:06d}.pt')
        # the last step only has the validation loop, so break to avoid training
        break
    # --------------- TRAINING SECTION -----------------
    #model.train()
    # set each model to train
    for model, _, _ in models_opts_schedulers:
        model.train()
    batch_size = args.batch_size
    assert batch_size % world_size == 0
    inputs_train, targets_train = train_loader.next_batch(batch_size)
    assert len(inputs_train) <= micro_bs or len(inputs_train) % micro_bs == 0
    assert batch_size//micro_bs == len(models_opts_schedulers), "Microbatchsize and number of model_opt pairs need to be equal in this experiment (functions may need to be written to support iteration over model pairs instead of indexing by microbatch idx)."
    for i, (micro_inputs_train, micro_targets_train) in enumerate(zip(inputs_train.split(micro_bs), targets_train.split(micro_bs))):
        # forward on distinct model — each replica trains on its own micro-batch
        models_opts_schedulers[i][0](micro_inputs_train, micro_targets_train, sliding_window_num_blocks).backward()
        #model(micro_inputs_train, micro_targets_train, sliding_window_num_blocks).backward()
    # momentum warmup for Muon: 0.85 -> 0.95 over the first 300 steps
    frac = min(step/300, 1)
    for model, opts, schedulers in models_opts_schedulers:
        # update momentum for muon in each group
        for group in opts[1].param_groups: #optimizer2.param_groups:
            group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
        # step the optimizers and schedulers
        for opt, sched in zip(opts, schedulers):
            opt.step()
            if step != train_steps-1:
                sched.step()
        # null the gradients
        model.zero_grad(set_to_none=True)
    #############################################
    # DiLoCo Outer Loop (Distributed) Updates   #
    #############################################
    # Update core model w/ updates from other models (optionally on different timescales for different parts, just simply 1 step per for now)
    # Zip all parameters together, so we can stack them then average them, then merge them to the core model
    if last_step or (step != 0 and step % diloco_update_steps == 0):
        models_group_params = [mos[0].parameters() for mos in models_opts_schedulers]
        models_grouped_params = zip(*models_group_params)
        ##################
        #    Momentum    #
        ##################
        # min == max here, so the outer momentum is effectively constant at .9
        outer_opt_momentum_warmup_steps = 1000 #300 #250 #500 #100 #300 #600 #300
        outer_opt_min_momentum = .9 #0. #.6 #.9 #.6 #.5 #.6
        outer_opt_max_momentum = .9 #.85 #.95 #.9 #.9
        frac = min(step/outer_opt_momentum_warmup_steps, 1)
        curr_outer_momentum = (1 - frac) * outer_opt_min_momentum + frac * outer_opt_max_momentum
        #################
        #   Dampening   #
        #################
        curr_dampening = 0.0 # tmp for now
        """
        # damping hparams
        dampening_steps = 300 #100 #300
        dampening_max = 0.6
        dampening_min = 0.0
        frac = min(step/dampening_steps, 1)
        curr_dampening = (1 - frac) * dampening_max + frac * dampening_min
        """
        # update momentum for each param group in outer opt
        for group in outer_opt.param_groups: #optimizer2.param_groups:
            group['momentum'] = curr_outer_momentum
            #group['dampening'] = curr_dampening
            ####group['dampening'] = 1. - curr_outer_momentum #curr_dampening
        for core_parameters, dist_parameters_list in zip(core_model.parameters(), models_grouped_params):
            # TODO: individual parameter schedules?
            # TMP hack
            #params_list = list(dist_parameters_list)
            #dist_parameters_list = params_list
            # Simulate grad creation: outer "gradient" is core minus mean of replicas
            grads_all = (core_parameters.data.unsqueeze(0) - torch.stack(dist_parameters_list, dim=0))
            core_parameters.grad = grads_all.mean(dim=0)
            # Simulate update # reduce_mean
            #parameters.data.add_(grads, alpha=-diloco_lr) # = torch.stack(dist_parameters_list, dim=0).mean(dim=0)
            # If this is the first outer step, PyTorch defaults to filling the momentum buffer with
            # the grad, which is a horribly-biased estimator of the state of the network over training
            # Here, to account for the momentum warmup-process removing this zero-debiasing operation,
            # we simply act as if the momentum buffer zero for the first step (i.e. simply averaging
            # the network weights), and then let momemtum and warmup do their things from there
            # W/ the nesterov step, this means halving the initial first grad
            """
            if step == 0: #diloco_update_steps:
                #outer_opt.state[core_parameters]['momentum_buffer'].data.zero_()
                core_parameters.grad.data.div_(2./(2.-dampening_max))
            """
            # outer_opt step update (per-parameter: only this parameter currently
            # has a grad, so step() moves just it; all others have grad None)
            outer_opt.step()
            outer_opt.zero_grad(set_to_none=True)
            # update model copies to reset to original value (would be done locally by each distributed worker)
            for dist_params in dist_parameters_list:
                dist_params.data.copy_(core_parameters.data)
            # Update core model for evals
            ####parameters.data = torch.stack(dist_parameters_list, dim=0).mean(dim=0)
            # Simulate broadcast back out (use this if not using distributed grads, but using the core model to sync instead)
            #[d_param.data.copy_(parameters.data) for d_param in dist_parameters_list]
    # logging
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f'step:{step+1}/{train_steps} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms', console=True)
print0(f'peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB')
dist.destroy_process_group()
====================================================================================================
Running Python 3.12.7 (main, Apr 29 2025, 18:46:21) [GCC 13.2.0]
Running PyTorch 2.7.0.dev20250310+cu126 compiled for CUDA 12.6
Tue Apr 29 22:39:03 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA H100 80GB HBM3 On | 00000000:8D:00.0 Off | 0 |
| N/A 48C P0 126W / 700W | 1180MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+
====================================================================================================
Training dataloader files: ['data/fineweb10B/fineweb_train_000001.bin', 'data/fineweb10B/fineweb_train_000002.bin', 'data/fineweb10B/fineweb_train_000003.bin', 'data/fineweb10B/fineweb_train_000004.bin', 'data/fineweb10B/fineweb_train_000005.bin', 'data/fineweb10B/fineweb_train_000006.bin', 'data/fineweb10B/fineweb_train_000007.bin', 'data/fineweb10B/fineweb_train_000008.bin']
Validation dataloader files: ['data/fineweb10B/fineweb_val_000000.bin']
====================================================================================================
step:0/1390 val_loss:10.8258 train_time:0ms step_avg:nanms
step:1/1390 train_time:119375ms step_avg:nanms
step:2/1390 train_time:120478ms step_avg:nanms
step:3/1390 train_time:121632ms step_avg:nanms
step:4/1390 train_time:122798ms step_avg:nanms
step:5/1390 train_time:123967ms step_avg:nanms
step:6/1390 train_time:125128ms step_avg:nanms
step:7/1390 train_time:126282ms step_avg:nanms
step:8/1390 train_time:127442ms step_avg:nanms
step:9/1390 train_time:128606ms step_avg:nanms
step:10/1390 train_time:129772ms step_avg:nanms
step:11/1390 train_time:1173ms step_avg:nanms
step:12/1390 train_time:2340ms step_avg:nanms
step:13/1390 train_time:3511ms step_avg:1170.39ms
step:14/1390 train_time:4677ms step_avg:1169.19ms
step:15/1390 train_time:5844ms step_avg:1168.77ms
step:16/1390 train_time:7011ms step_avg:1168.47ms
step:17/1390 train_time:8185ms step_avg:1169.35ms
step:18/1390 train_time:9355ms step_avg:1169.42ms
step:19/1390 train_time:10528ms step_avg:1169.73ms
step:20/1390 train_time:11695ms step_avg:1169.52ms
step:21/1390 train_time:12854ms step_avg:1168.53ms
step:22/1390 train_time:14028ms step_avg:1168.97ms
step:23/1390 train_time:15193ms step_avg:1168.73ms
step:24/1390 train_time:16353ms step_avg:1168.08ms
step:25/1390 train_time:17518ms step_avg:1167.89ms
step:25/1390 val_loss:10.8258 train_time:17519ms step_avg:1167.90ms
step:26/1390 train_time:18712ms step_avg:1169.49ms
step:27/1390 train_time:19891ms step_avg:1170.06ms
step:28/1390 train_time:21068ms step_avg:1170.47ms
step:29/1390 train_time:22237ms step_avg:1170.36ms
step:30/1390 train_time:23407ms step_avg:1170.33ms
step:31/1390 train_time:24566ms step_avg:1169.81ms
step:32/1390 train_time:25731ms step_avg:1169.60ms
step:33/1390 train_time:26892ms step_avg:1169.21ms
step:34/1390 train_time:28067ms step_avg:1169.48ms
step:35/1390 train_time:29228ms step_avg:1169.14ms
step:36/1390 train_time:30389ms step_avg:1168.81ms
step:37/1390 train_time:31555ms step_avg:1168.69ms
step:38/1390 train_time:32735ms step_avg:1169.11ms
step:39/1390 train_time:33920ms step_avg:1169.65ms
step:40/1390 train_time:35087ms step_avg:1169.55ms
step:41/1390 train_time:36245ms step_avg:1169.20ms
step:42/1390 train_time:37412ms step_avg:1169.13ms
step:43/1390 train_time:38591ms step_avg:1169.41ms
step:44/1390 train_time:39766ms step_avg:1169.58ms
step:45/1390 train_time:40931ms step_avg:1169.45ms
step:46/1390 train_time:42104ms step_avg:1169.57ms
step:47/1390 train_time:43269ms step_avg:1169.42ms
step:48/1390 train_time:44441ms step_avg:1169.49ms
step:49/1390 train_time:45608ms step_avg:1169.43ms
step:50/1390 train_time:46779ms step_avg:1169.47ms
step:50/1390 val_loss:6.6558 train_time:46779ms step_avg:1169.48ms
step:51/1390 train_time:47978ms step_avg:1170.20ms
step:52/1390 train_time:49173ms step_avg:1170.80ms
step:53/1390 train_time:50354ms step_avg:1171.03ms
step:54/1390 train_time:51528ms step_avg:1171.10ms
step:55/1390 train_time:52702ms step_avg:1171.17ms
step:56/1390 train_time:53872ms step_avg:1171.13ms
step:57/1390 train_time:55058ms step_avg:1171.45ms
step:58/1390 train_time:56226ms step_avg:1171.38ms
step:59/1390 train_time:57391ms step_avg:1171.25ms
step:60/1390 train_time:58565ms step_avg:1171.30ms
step:61/1390 train_time:59735ms step_avg:1171.28ms
step:62/1390 train_time:60915ms step_avg:1171.45ms
step:63/1390 train_time:62086ms step_avg:1171.43ms
step:64/1390 train_time:63263ms step_avg:1171.54ms
step:65/1390 train_time:64433ms step_avg:1171.50ms
step:66/1390 train_time:65611ms step_avg:1171.62ms
step:67/1390 train_time:66786ms step_avg:1171.69ms
step:68/1390 train_time:67952ms step_avg:1171.58ms
step:69/1390 train_time:69133ms step_avg:1171.74ms
step:70/1390 train_time:70312ms step_avg:1171.87ms
step:71/1390 train_time:71487ms step_avg:1171.93ms
step:72/1390 train_time:72667ms step_avg:1172.04ms
step:73/1390 train_time:73846ms step_avg:1172.16ms
step:74/1390 train_time:75025ms step_avg:1172.27ms
step:75/1390 train_time:76198ms step_avg:1172.28ms
step:75/1390 val_loss:5.9188 train_time:76199ms step_avg:1172.29ms
step:76/1390 train_time:77404ms step_avg:1172.78ms
step:77/1390 train_time:78593ms step_avg:1173.03ms
step:78/1390 train_time:79763ms step_avg:1172.99ms
step:79/1390 train_time:80950ms step_avg:1173.19ms
step:80/1390 train_time:82127ms step_avg:1173.25ms
step:81/1390 train_time:83307ms step_avg:1173.34ms
step:82/1390 train_time:84492ms step_avg:1173.50ms
step:83/1390 train_time:85665ms step_avg:1173.49ms
step:84/1390 train_time:86853ms step_avg:1173.69ms
step:85/1390 train_time:88030ms step_avg:1173.73ms
step:86/1390 train_time:89208ms step_avg:1173.80ms
step:87/1390 train_time:90392ms step_avg:1173.93ms
step:88/1390 train_time:91570ms step_avg:1173.98ms
step:89/1390 train_time:92749ms step_avg:1174.04ms
step:90/1390 train_time:93932ms step_avg:1174.15ms
step:91/1390 train_time:95121ms step_avg:1174.33ms
step:92/1390 train_time:96312ms step_avg:1174.53ms
step:93/1390 train_time:97486ms step_avg:1174.53ms
step:94/1390 train_time:98668ms step_avg:1174.62ms
step:95/1390 train_time:99849ms step_avg:1174.69ms
step:96/1390 train_time:101026ms step_avg:1174.72ms
step:97/1390 train_time:102211ms step_avg:1174.83ms
step:98/1390 train_time:103385ms step_avg:1174.83ms
step:99/1390 train_time:104571ms step_avg:1174.96ms
step:100/1390 train_time:105749ms step_avg:1174.99ms
step:100/1390 val_loss:5.3295 train_time:105749ms step_avg:1174.99ms
step:101/1390 train_time:106952ms step_avg:1175.30ms
step:102/1390 train_time:108141ms step_avg:1175.45ms
step:103/1390 train_time:109325ms step_avg:1175.54ms
step:104/1390 train_time:110533ms step_avg:1175.88ms
step:105/1390 train_time:111735ms step_avg:1176.16ms
step:106/1390 train_time:112939ms step_avg:1176.45ms
step:107/1390 train_time:114143ms step_avg:1176.73ms
step:108/1390 train_time:115352ms step_avg:1177.06ms
step:109/1390 train_time:116566ms step_avg:1177.44ms
step:110/1390 train_time:117775ms step_avg:1177.75ms
step:111/1390 train_time:118992ms step_avg:1178.14ms
step:112/1390 train_time:120199ms step_avg:1178.42ms
step:113/1390 train_time:121403ms step_avg:1178.67ms
step:114/1390 train_time:122607ms step_avg:1178.91ms
step:115/1390 train_time:123810ms step_avg:1179.15ms
step:116/1390 train_time:125021ms step_avg:1179.45ms
step:117/1390 train_time:126230ms step_avg:1179.72ms
step:118/1390 train_time:127442ms step_avg:1180.02ms
step:119/1390 train_time:128656ms step_avg:1180.33ms
step:120/1390 train_time:129869ms step_avg:1180.62ms
step:121/1390 train_time:131085ms step_avg:1180.94ms
step:122/1390 train_time:132299ms step_avg:1181.24ms
step:123/1390 train_time:133503ms step_avg:1181.44ms
step:124/1390 train_time:134709ms step_avg:1181.65ms
step:125/1390 train_time:135917ms step_avg:1181.88ms
step:125/1390 val_loss:5.0053 train_time:135917ms step_avg:1181.89ms
step:126/1390 train_time:137146ms step_avg:1182.29ms
step:127/1390 train_time:138360ms step_avg:1182.57ms
step:128/1390 train_time:139565ms step_avg:1182.75ms
step:129/1390 train_time:140773ms step_avg:1182.96ms
step:130/1390 train_time:141979ms step_avg:1183.16ms
step:131/1390 train_time:143183ms step_avg:1183.33ms
step:132/1390 train_time:144389ms step_avg:1183.52ms
step:133/1390 train_time:145595ms step_avg:1183.70ms
step:134/1390 train_time:146806ms step_avg:1183.92ms
step:135/1390 train_time:148016ms step_avg:1184.13ms
step:136/1390 train_time:149228ms step_avg:1184.35ms
step:137/1390 train_time:150437ms step_avg:1184.54ms
step:138/1390 train_time:151650ms step_avg:1184.77ms
step:139/1390 train_time:152861ms step_avg:1184.97ms
step:140/1390 train_time:154079ms step_avg:1185.22ms
step:141/1390 train_time:155286ms step_avg:1185.39ms
step:142/1390 train_time:156494ms step_avg:1185.56ms
step:143/1390 train_time:157698ms step_avg:1185.70ms
step:144/1390 train_time:158904ms step_avg:1185.85ms
step:145/1390 train_time:160115ms step_avg:1186.03ms
step:146/1390 train_time:161325ms step_avg:1186.21ms
step:147/1390 train_time:162532ms step_avg:1186.36ms
step:148/1390 train_time:163740ms step_avg:1186.52ms
step:149/1390 train_time:164946ms step_avg:1186.66ms
step:150/1390 train_time:166154ms step_avg:1186.82ms
step:150/1390 val_loss:4.7989 train_time:166154ms step_avg:1186.82ms
step:151/1390 train_time:167381ms step_avg:1187.10ms
step:152/1390 train_time:168596ms step_avg:1187.30ms
step:153/1390 train_time:169800ms step_avg:1187.41ms
step:154/1390 train_time:171004ms step_avg:1187.53ms
step:155/1390 train_time:172215ms step_avg:1187.69ms
step:156/1390 train_time:173420ms step_avg:1187.81ms
step:157/1390 train_time:174622ms step_avg:1187.90ms
step:158/1390 train_time:175823ms step_avg:1188.00ms
step:159/1390 train_time:177042ms step_avg:1188.20ms
step:160/1390 train_time:178249ms step_avg:1188.33ms
step:161/1390 train_time:179453ms step_avg:1188.43ms
step:162/1390 train_time:180656ms step_avg:1188.53ms
step:163/1390 train_time:181865ms step_avg:1188.66ms
step:164/1390 train_time:183073ms step_avg:1188.79ms
step:165/1390 train_time:184286ms step_avg:1188.94ms
step:166/1390 train_time:185495ms step_avg:1189.07ms
step:167/1390 train_time:186704ms step_avg:1189.20ms
step:168/1390 train_time:187912ms step_avg:1189.32ms
step:169/1390 train_time:189123ms step_avg:1189.45ms
step:170/1390 train_time:190335ms step_avg:1189.59ms
step:171/1390 train_time:191543ms step_avg:1189.71ms
step:172/1390 train_time:192750ms step_avg:1189.82ms
step:173/1390 train_time:193962ms step_avg:1189.95ms
step:174/1390 train_time:195175ms step_avg:1190.09ms
step:175/1390 train_time:196378ms step_avg:1190.17ms
step:175/1390 val_loss:4.6435 train_time:196378ms step_avg:1190.17ms
step:176/1390 train_time:197615ms step_avg:1190.45ms
step:177/1390 train_time:198836ms step_avg:1190.64ms
step:178/1390 train_time:200047ms step_avg:1190.76ms
step:179/1390 train_time:201259ms step_avg:1190.88ms
step:180/1390 train_time:202468ms step_avg:1190.99ms
step:181/1390 train_time:203673ms step_avg:1191.07ms
step:182/1390 train_time:204885ms step_avg:1191.19ms
step:183/1390 train_time:206096ms step_avg:1191.30ms
step:184/1390 train_time:207306ms step_avg:1191.42ms
step:185/1390 train_time:208513ms step_avg:1191.50ms
step:186/1390 train_time:209726ms step_avg:1191.63ms
step:187/1390 train_time:210940ms step_avg:1191.75ms
step:188/1390 train_time:212153ms step_avg:1191.87ms
step:189/1390 train_time:213359ms step_avg:1191.95ms
step:190/1390 train_time:214567ms step_avg:1192.04ms
step:191/1390 train_time:215811ms step_avg:1192.32ms
step:192/1390 train_time:217029ms step_avg:1192.47ms
step:193/1390 train_time:218241ms step_avg:1192.57ms
step:194/1390 train_time:219448ms step_avg:1192.65ms
step:195/1390 train_time:220657ms step_avg:1192.74ms
step:196/1390 train_time:221863ms step_avg:1192.81ms
step:197/1390 train_time:223077ms step_avg:1192.92ms
step:198/1390 train_time:224290ms step_avg:1193.03ms
step:199/1390 train_time:225496ms step_avg:1193.10ms
step:200/1390 train_time:226704ms step_avg:1193.18ms
step:200/1390 val_loss:4.5155 train_time:226704ms step_avg:1193.18ms
step:201/1390 train_time:227942ms step_avg:1193.41ms
step:202/1390 train_time:229172ms step_avg:1193.60ms
step:203/1390 train_time:230388ms step_avg:1193.72ms
step:204/1390 train_time:231600ms step_avg:1193.81ms
step:205/1390 train_time:232809ms step_avg:1193.89ms
step:206/1390 train_time:234016ms step_avg:1193.96ms
step:207/1390 train_time:235244ms step_avg:1194.13ms
step:208/1390 train_time:236471ms step_avg:1194.30ms
step:209/1390 train_time:237702ms step_avg:1194.48ms
step:210/1390 train_time:238944ms step_avg:1194.72ms
step:211/1390 train_time:240175ms step_avg:1194.90ms
step:212/1390 train_time:241409ms step_avg:1195.09ms
step:213/1390 train_time:242638ms step_avg:1195.26ms
step:214/1390 train_time:243863ms step_avg:1195.41ms
step:215/1390 train_time:245097ms step_avg:1195.60ms
step:216/1390 train_time:246324ms step_avg:1195.75ms
step:217/1390 train_time:247556ms step_avg:1195.92ms
step:218/1390 train_time:248789ms step_avg:1196.10ms
step:219/1390 train_time:250021ms step_avg:1196.27ms
step:220/1390 train_time:251248ms step_avg:1196.42ms
step:221/1390 train_time:252478ms step_avg:1196.58ms
step:222/1390 train_time:253707ms step_avg:1196.73ms
step:223/1390 train_time:254943ms step_avg:1196.91ms
step:224/1390 train_time:256177ms step_avg:1197.09ms
step:225/1390 train_time:257404ms step_avg:1197.23ms
step:225/1390 val_loss:4.3912 train_time:257405ms step_avg:1197.23ms
step:226/1390 train_time:258663ms step_avg:1197.51ms
step:227/1390 train_time:259898ms step_avg:1197.69ms
step:228/1390 train_time:261132ms step_avg:1197.85ms
step:229/1390 train_time:262360ms step_avg:1197.99ms
step:230/1390 train_time:263586ms step_avg:1198.12ms
step:231/1390 train_time:264814ms step_avg:1198.25ms
step:232/1390 train_time:266045ms step_avg:1198.40ms
step:233/1390 train_time:267278ms step_avg:1198.56ms
step:234/1390 train_time:268509ms step_avg:1198.70ms
step:235/1390 train_time:269736ms step_avg:1198.83ms
step:236/1390 train_time:270964ms step_avg:1198.96ms
step:237/1390 train_time:272193ms step_avg:1199.09ms
step:238/1390 train_time:273415ms step_avg:1199.19ms
step:239/1390 train_time:274642ms step_avg:1199.31ms
step:240/1390 train_time:275870ms step_avg:1199.44ms
step:241/1390 train_time:277104ms step_avg:1199.58ms
step:242/1390 train_time:278330ms step_avg:1199.70ms
step:243/1390 train_time:279554ms step_avg:1199.80ms
step:244/1390 train_time:280786ms step_avg:1199.94ms
step:245/1390 train_time:282020ms step_avg:1200.08ms
step:246/1390 train_time:283248ms step_avg:1200.20ms
step:247/1390 train_time:284481ms step_avg:1200.34ms
step:248/1390 train_time:285712ms step_avg:1200.47ms
step:249/1390 train_time:286942ms step_avg:1200.59ms
step:250/1390 train_time:288167ms step_avg:1200.70ms
step:250/1390 val_loss:4.3000 train_time:288167ms step_avg:1200.70ms
step:251/1390 train_time:289420ms step_avg:1200.91ms
step:252/1390 train_time:290664ms step_avg:1201.09ms
step:253/1390 train_time:291894ms step_avg:1201.21ms
step:254/1390 train_time:293125ms step_avg:1201.33ms
step:255/1390 train_time:294355ms step_avg:1201.45ms
step:256/1390 train_time:295584ms step_avg:1201.56ms
step:257/1390 train_time:296818ms step_avg:1201.69ms
step:258/1390 train_time:298049ms step_avg:1201.81ms
step:259/1390 train_time:299278ms step_avg:1201.92ms
step:260/1390 train_time:300523ms step_avg:1202.09ms
step:261/1390 train_time:301761ms step_avg:1202.23ms
step:262/1390 train_time:302992ms step_avg:1202.35ms
step:263/1390 train_time:304225ms step_avg:1202.47ms
step:264/1390 train_time:305462ms step_avg:1202.61ms
step:265/1390 train_time:306686ms step_avg:1202.69ms
step:266/1390 train_time:307915ms step_avg:1202.79ms
step:267/1390 train_time:309145ms step_avg:1202.90ms
step:268/1390 train_time:310379ms step_avg:1203.02ms
step:269/1390 train_time:311620ms step_avg:1203.17ms
step:270/1390 train_time:312853ms step_avg:1203.28ms
step:271/1390 train_time:314087ms step_avg:1203.40ms
step:272/1390 train_time:315314ms step_avg:1203.49ms
step:273/1390 train_time:316543ms step_avg:1203.59ms
step:274/1390 train_time:317777ms step_avg:1203.70ms
step:275/1390 train_time:319016ms step_avg:1203.83ms
step:275/1390 val_loss:4.2273 train_time:319016ms step_avg:1203.84ms
step:276/1390 train_time:320262ms step_avg:1203.99ms
step:277/1390 train_time:321496ms step_avg:1204.11ms
step:278/1390 train_time:322718ms step_avg:1204.17ms
step:279/1390 train_time:323934ms step_avg:1204.22ms
step:280/1390 train_time:325156ms step_avg:1204.28ms
step:281/1390 train_time:326381ms step_avg:1204.36ms
step:282/1390 train_time:327600ms step_avg:1204.41ms
step:283/1390 train_time:328818ms step_avg:1204.46ms
step:284/1390 train_time:330042ms step_avg:1204.53ms
step:285/1390 train_time:331264ms step_avg:1204.60ms
step:286/1390 train_time:332476ms step_avg:1204.62ms
step:287/1390 train_time:333698ms step_avg:1204.69ms
step:288/1390 train_time:334918ms step_avg:1204.74ms
step:289/1390 train_time:336143ms step_avg:1204.81ms
step:290/1390 train_time:337363ms step_avg:1204.87ms
step:291/1390 train_time:338584ms step_avg:1204.93ms
step:292/1390 train_time:339819ms step_avg:1205.03ms
step:293/1390 train_time:341043ms step_avg:1205.10ms
step:294/1390 train_time:342265ms step_avg:1205.16ms
step:295/1390 train_time:343493ms step_avg:1205.24ms
step:296/1390 train_time:344712ms step_avg:1205.29ms
step:297/1390 train_time:345931ms step_avg:1205.33ms
step:298/1390 train_time:347152ms step_avg:1205.39ms
step:299/1390 train_time:348375ms step_avg:1205.45ms
step:300/1390 train_time:349602ms step_avg:1205.52ms
step:300/1390 val_loss:4.1725 train_time:349602ms step_avg:1205.52ms
step:301/1390 train_time:350852ms step_avg:1205.68ms
step:302/1390 train_time:352083ms step_avg:1205.76ms
step:303/1390 train_time:353312ms step_avg:1205.84ms
step:304/1390 train_time:354532ms step_avg:1205.89ms
step:305/1390 train_time:355754ms step_avg:1205.94ms
step:306/1390 train_time:356978ms step_avg:1206.01ms
step:307/1390 train_time:358207ms step_avg:1206.08ms
step:308/1390 train_time:359426ms step_avg:1206.13ms
step:309/1390 train_time:360651ms step_avg:1206.19ms
step:310/1390 train_time:361886ms step_avg:1206.29ms
step:311/1390 train_time:363124ms step_avg:1206.39ms
step:312/1390 train_time:364360ms step_avg:1206.49ms
step:313/1390 train_time:365604ms step_avg:1206.61ms
step:314/1390 train_time:366850ms step_avg:1206.74ms
step:315/1390 train_time:368088ms step_avg:1206.85ms
step:316/1390 train_time:369327ms step_avg:1206.95ms
step:317/1390 train_time:370564ms step_avg:1207.05ms
step:318/1390 train_time:371804ms step_avg:1207.16ms
step:319/1390 train_time:373047ms step_avg:1207.27ms
step:320/1390 train_time:374282ms step_avg:1207.36ms
step:321/1390 train_time:375518ms step_avg:1207.45ms
step:322/1390 train_time:376756ms step_avg:1207.55ms
step:323/1390 train_time:377992ms step_avg:1207.64ms
step:324/1390 train_time:379233ms step_avg:1207.75ms
step:325/1390 train_time:380471ms step_avg:1207.84ms
step:325/1390 val_loss:4.1125 train_time:380471ms step_avg:1207.85ms
step:326/1390 train_time:381745ms step_avg:1208.05ms
step:327/1390 train_time:383000ms step_avg:1208.20ms
step:328/1390 train_time:384238ms step_avg:1208.29ms
step:329/1390 train_time:385474ms step_avg:1208.38ms
step:330/1390 train_time:386712ms step_avg:1208.47ms
step:331/1390 train_time:387951ms step_avg:1208.57ms
step:332/1390 train_time:389190ms step_avg:1208.66ms
step:333/1390 train_time:390425ms step_avg:1208.75ms
step:334/1390 train_time:391670ms step_avg:1208.86ms
step:335/1390 train_time:392911ms step_avg:1208.96ms
step:336/1390 train_time:394152ms step_avg:1209.06ms
step:337/1390 train_time:395401ms step_avg:1209.18ms
step:338/1390 train_time:396646ms step_avg:1209.29ms
step:339/1390 train_time:397884ms step_avg:1209.37ms
step:340/1390 train_time:399126ms step_avg:1209.47ms
step:341/1390 train_time:400369ms step_avg:1209.58ms
step:342/1390 train_time:401610ms step_avg:1209.67ms
step:343/1390 train_time:402851ms step_avg:1209.76ms
step:344/1390 train_time:404092ms step_avg:1209.86ms
step:345/1390 train_time:405333ms step_avg:1209.95ms
step:346/1390 train_time:406575ms step_avg:1210.05ms
step:347/1390 train_time:407815ms step_avg:1210.13ms
step:348/1390 train_time:409056ms step_avg:1210.23ms
step:349/1390 train_time:410294ms step_avg:1210.31ms
step:350/1390 train_time:411531ms step_avg:1210.39ms
step:350/1390 val_loss:4.0659 train_time:411531ms step_avg:1210.39ms
step:351/1390 train_time:412797ms step_avg:1210.55ms
step:352/1390 train_time:414050ms step_avg:1210.67ms
step:353/1390 train_time:415293ms step_avg:1210.77ms
step:354/1390 train_time:416538ms step_avg:1210.87ms
step:355/1390 train_time:417778ms step_avg:1210.95ms
step:356/1390 train_time:419021ms step_avg:1211.04ms
step:357/1390 train_time:420270ms step_avg:1211.15ms
step:358/1390 train_time:421521ms step_avg:1211.27ms
step:359/1390 train_time:422769ms step_avg:1211.37ms
step:360/1390 train_time:424005ms step_avg:1211.44ms
step:361/1390 train_time:425248ms step_avg:1211.53ms
step:362/1390 train_time:426489ms step_avg:1211.62ms
step:363/1390 train_time:427733ms step_avg:1211.71ms
step:364/1390 train_time:428973ms step_avg:1211.79ms
step:365/1390 train_time:430211ms step_avg:1211.86ms
step:366/1390 train_time:431456ms step_avg:1211.96ms
step:367/1390 train_time:432701ms step_avg:1212.05ms
step:368/1390 train_time:433944ms step_avg:1212.13ms
step:369/1390 train_time:435190ms step_avg:1212.23ms
step:370/1390 train_time:436434ms step_avg:1212.32ms
step:371/1390 train_time:437674ms step_avg:1212.39ms
step:372/1390 train_time:438915ms step_avg:1212.47ms
step:373/1390 train_time:440156ms step_avg:1212.55ms
step:374/1390 train_time:441406ms step_avg:1212.65ms
step:375/1390 train_time:442653ms step_avg:1212.75ms
step:375/1390 val_loss:4.0286 train_time:442653ms step_avg:1212.75ms
step:376/1390 train_time:443919ms step_avg:1212.89ms
step:377/1390 train_time:445176ms step_avg:1213.01ms
step:378/1390 train_time:446409ms step_avg:1213.07ms
step:379/1390 train_time:447652ms step_avg:1213.15ms
step:380/1390 train_time:448896ms step_avg:1213.23ms
step:381/1390 train_time:450167ms step_avg:1213.39ms
step:382/1390 train_time:451406ms step_avg:1213.46ms
step:383/1390 train_time:452648ms step_avg:1213.53ms
step:384/1390 train_time:453886ms step_avg:1213.60ms
step:385/1390 train_time:455124ms step_avg:1213.66ms
step:386/1390 train_time:456367ms step_avg:1213.74ms
step:387/1390 train_time:457608ms step_avg:1213.82ms
step:388/1390 train_time:458851ms step_avg:1213.89ms
step:389/1390 train_time:460096ms step_avg:1213.97ms
step:390/1390 train_time:461337ms step_avg:1214.04ms
step:391/1390 train_time:462574ms step_avg:1214.11ms
step:392/1390 train_time:463810ms step_avg:1214.16ms
step:393/1390 train_time:465053ms step_avg:1214.24ms
step:394/1390 train_time:466291ms step_avg:1214.30ms
step:395/1390 train_time:467531ms step_avg:1214.37ms
step:396/1390 train_time:468772ms step_avg:1214.43ms
step:397/1390 train_time:470012ms step_avg:1214.50ms
step:398/1390 train_time:471249ms step_avg:1214.56ms
step:399/1390 train_time:472488ms step_avg:1214.62ms
step:400/1390 train_time:473731ms step_avg:1214.69ms
step:400/1390 val_loss:3.9967 train_time:473731ms step_avg:1214.70ms
step:401/1390 train_time:474997ms step_avg:1214.83ms
step:402/1390 train_time:476249ms step_avg:1214.92ms
step:403/1390 train_time:477490ms step_avg:1214.99ms
step:404/1390 train_time:478728ms step_avg:1215.05ms
step:405/1390 train_time:479983ms step_avg:1215.15ms
step:406/1390 train_time:481227ms step_avg:1215.22ms
step:407/1390 train_time:482475ms step_avg:1215.30ms
step:408/1390 train_time:483714ms step_avg:1215.36ms
step:409/1390 train_time:484950ms step_avg:1215.41ms
step:410/1390 train_time:486190ms step_avg:1215.47ms
step:411/1390 train_time:487429ms step_avg:1215.53ms
step:412/1390 train_time:488676ms step_avg:1215.61ms
step:413/1390 train_time:489933ms step_avg:1215.71ms
step:414/1390 train_time:491192ms step_avg:1215.82ms
step:415/1390 train_time:492443ms step_avg:1215.91ms
step:416/1390 train_time:493705ms step_avg:1216.02ms
step:417/1390 train_time:494975ms step_avg:1216.15ms
step:418/1390 train_time:496230ms step_avg:1216.25ms
step:419/1390 train_time:497483ms step_avg:1216.34ms
step:420/1390 train_time:498731ms step_avg:1216.42ms
step:421/1390 train_time:499990ms step_avg:1216.52ms
step:422/1390 train_time:501248ms step_avg:1216.62ms
step:423/1390 train_time:502497ms step_avg:1216.70ms
step:424/1390 train_time:503750ms step_avg:1216.79ms
step:425/1390 train_time:505002ms step_avg:1216.87ms
step:425/1390 val_loss:3.9622 train_time:505002ms step_avg:1216.87ms
step:426/1390 train_time:506278ms step_avg:1217.01ms
step:427/1390 train_time:507545ms step_avg:1217.13ms
step:428/1390 train_time:508804ms step_avg:1217.23ms
step:429/1390 train_time:510068ms step_avg:1217.35ms
step:430/1390 train_time:511321ms step_avg:1217.43ms
step:431/1390 train_time:512588ms step_avg:1217.55ms
step:432/1390 train_time:513840ms step_avg:1217.63ms
step:433/1390 train_time:515098ms step_avg:1217.73ms
step:434/1390 train_time:516358ms step_avg:1217.83ms
step:435/1390 train_time:517621ms step_avg:1217.93ms
step:436/1390 train_time:518877ms step_avg:1218.02ms
step:437/1390 train_time:520137ms step_avg:1218.12ms
step:438/1390 train_time:521387ms step_avg:1218.19ms
step:439/1390 train_time:522641ms step_avg:1218.28ms
step:440/1390 train_time:523885ms step_avg:1218.34ms
step:441/1390 train_time:525138ms step_avg:1218.42ms
step:442/1390 train_time:526392ms step_avg:1218.50ms
step:443/1390 train_time:527647ms step_avg:1218.58ms
step:444/1390 train_time:528898ms step_avg:1218.66ms
step:445/1390 train_time:530150ms step_avg:1218.74ms
step:446/1390 train_time:531402ms step_avg:1218.81ms
step:447/1390 train_time:532671ms step_avg:1218.93ms
step:448/1390 train_time:533933ms step_avg:1219.03ms
step:449/1390 train_time:535184ms step_avg:1219.10ms
step:450/1390 train_time:536440ms step_avg:1219.18ms
step:450/1390 val_loss:3.9371 train_time:536441ms step_avg:1219.18ms
step:451/1390 train_time:537716ms step_avg:1219.31ms
step:452/1390 train_time:538979ms step_avg:1219.41ms
step:453/1390 train_time:540237ms step_avg:1219.50ms
step:454/1390 train_time:541498ms step_avg:1219.59ms
step:455/1390 train_time:542755ms step_avg:1219.67ms
step:456/1390 train_time:544012ms step_avg:1219.76ms
step:457/1390 train_time:545260ms step_avg:1219.82ms
step:458/1390 train_time:546516ms step_avg:1219.90ms
step:459/1390 train_time:547773ms step_avg:1219.98ms
step:460/1390 train_time:549031ms step_avg:1220.07ms
step:461/1390 train_time:550279ms step_avg:1220.13ms
step:462/1390 train_time:551537ms step_avg:1220.22ms
step:463/1390 train_time:552793ms step_avg:1220.29ms
step:464/1390 train_time:554044ms step_avg:1220.36ms
step:465/1390 train_time:555312ms step_avg:1220.47ms
step:466/1390 train_time:556563ms step_avg:1220.53ms
step:467/1390 train_time:557818ms step_avg:1220.61ms
step:468/1390 train_time:559077ms step_avg:1220.69ms
step:469/1390 train_time:560336ms step_avg:1220.78ms
step:470/1390 train_time:561588ms step_avg:1220.84ms
step:471/1390 train_time:562835ms step_avg:1220.90ms
step:472/1390 train_time:564077ms step_avg:1220.95ms
step:473/1390 train_time:565327ms step_avg:1221.01ms
step:474/1390 train_time:566582ms step_avg:1221.08ms
step:475/1390 train_time:567839ms step_avg:1221.16ms
step:475/1390 val_loss:3.9150 train_time:567839ms step_avg:1221.16ms
step:476/1390 train_time:569127ms step_avg:1221.30ms
step:477/1390 train_time:570404ms step_avg:1221.42ms
step:478/1390 train_time:571654ms step_avg:1221.48ms
step:479/1390 train_time:572906ms step_avg:1221.55ms
step:480/1390 train_time:574159ms step_avg:1221.61ms
step:481/1390 train_time:575407ms step_avg:1221.67ms
step:482/1390 train_time:576657ms step_avg:1221.73ms
step:483/1390 train_time:577904ms step_avg:1221.79ms
step:484/1390 train_time:579169ms step_avg:1221.88ms
step:485/1390 train_time:580425ms step_avg:1221.95ms
step:486/1390 train_time:581691ms step_avg:1222.04ms
step:487/1390 train_time:582942ms step_avg:1222.10ms
step:488/1390 train_time:584195ms step_avg:1222.17ms
step:489/1390 train_time:585442ms step_avg:1222.22ms
step:490/1390 train_time:586698ms step_avg:1222.29ms
step:491/1390 train_time:587950ms step_avg:1222.35ms
step:492/1390 train_time:589200ms step_avg:1222.41ms
step:493/1390 train_time:590449ms step_avg:1222.46ms
step:494/1390 train_time:591706ms step_avg:1222.53ms
step:495/1390 train_time:592964ms step_avg:1222.61ms
step:496/1390 train_time:594207ms step_avg:1222.65ms
step:497/1390 train_time:595453ms step_avg:1222.70ms
step:498/1390 train_time:596696ms step_avg:1222.74ms
step:499/1390 train_time:597939ms step_avg:1222.78ms
step:500/1390 train_time:599184ms step_avg:1222.82ms
step:500/1390 val_loss:3.8995 train_time:599184ms step_avg:1222.82ms
step:501/1390 train_time:600459ms step_avg:1222.93ms
step:502/1390 train_time:601713ms step_avg:1222.99ms
step:503/1390 train_time:602956ms step_avg:1223.03ms
step:504/1390 train_time:604205ms step_avg:1223.09ms
step:505/1390 train_time:605446ms step_avg:1223.12ms
step:506/1390 train_time:606688ms step_avg:1223.16ms
step:507/1390 train_time:607937ms step_avg:1223.21ms
step:508/1390 train_time:609173ms step_avg:1223.24ms
step:509/1390 train_time:610413ms step_avg:1223.27ms
step:510/1390 train_time:611650ms step_avg:1223.30ms
step:511/1390 train_time:612894ms step_avg:1223.34ms
step:512/1390 train_time:614141ms step_avg:1223.39ms
step:513/1390 train_time:615389ms step_avg:1223.44ms
step:514/1390 train_time:616634ms step_avg:1223.48ms
step:515/1390 train_time:617895ms step_avg:1223.55ms
step:516/1390 train_time:619148ms step_avg:1223.61ms
step:517/1390 train_time:620398ms step_avg:1223.66ms
step:518/1390 train_time:621652ms step_avg:1223.73ms
step:519/1390 train_time:622912ms step_avg:1223.80ms
step:520/1390 train_time:624163ms step_avg:1223.85ms
step:521/1390 train_time:625433ms step_avg:1223.94ms
step:522/1390 train_time:626687ms step_avg:1224.00ms
step:523/1390 train_time:627950ms step_avg:1224.07ms
step:524/1390 train_time:629205ms step_avg:1224.13ms
step:525/1390 train_time:630465ms step_avg:1224.20ms
step:525/1390 val_loss:3.8785 train_time:630465ms step_avg:1224.20ms
step:526/1390 train_time:631747ms step_avg:1224.32ms
step:527/1390 train_time:633020ms step_avg:1224.41ms
step:528/1390 train_time:634274ms step_avg:1224.47ms
step:529/1390 train_time:635539ms step_avg:1224.54ms
step:530/1390 train_time:636798ms step_avg:1224.61ms
step:531/1390 train_time:638055ms step_avg:1224.67ms
step:532/1390 train_time:639308ms step_avg:1224.73ms
step:533/1390 train_time:640575ms step_avg:1224.81ms
step:534/1390 train_time:641832ms step_avg:1224.87ms
step:535/1390 train_time:643095ms step_avg:1224.94ms
step:536/1390 train_time:644355ms step_avg:1225.01ms
step:537/1390 train_time:645618ms step_avg:1225.08ms
step:538/1390 train_time:646884ms step_avg:1225.16ms
step:539/1390 train_time:648150ms step_avg:1225.24ms
step:540/1390 train_time:649411ms step_avg:1225.30ms
step:541/1390 train_time:650675ms step_avg:1225.38ms
step:542/1390 train_time:651931ms step_avg:1225.43ms
step:543/1390 train_time:653194ms step_avg:1225.51ms
step:544/1390 train_time:654457ms step_avg:1225.57ms
step:545/1390 train_time:655719ms step_avg:1225.64ms
step:546/1390 train_time:656981ms step_avg:1225.71ms
step:547/1390 train_time:658246ms step_avg:1225.78ms
step:548/1390 train_time:659514ms step_avg:1225.86ms
step:549/1390 train_time:660776ms step_avg:1225.93ms
step:550/1390 train_time:662040ms step_avg:1226.00ms
step:550/1390 val_loss:3.8629 train_time:662040ms step_avg:1226.00ms
step:551/1390 train_time:663326ms step_avg:1226.11ms
step:552/1390 train_time:664604ms step_avg:1226.21ms
step:553/1390 train_time:665873ms step_avg:1226.29ms
step:554/1390 train_time:667133ms step_avg:1226.35ms
step:555/1390 train_time:668403ms step_avg:1226.43ms
step:556/1390 train_time:669669ms step_avg:1226.50ms
step:557/1390 train_time:670931ms step_avg:1226.56ms
step:558/1390 train_time:672191ms step_avg:1226.63ms
step:559/1390 train_time:673453ms step_avg:1226.69ms
step:560/1390 train_time:674713ms step_avg:1226.75ms
step:561/1390 train_time:675977ms step_avg:1226.82ms
step:562/1390 train_time:677239ms step_avg:1226.88ms
step:563/1390 train_time:678510ms step_avg:1226.96ms
step:564/1390 train_time:679779ms step_avg:1227.04ms
step:565/1390 train_time:681045ms step_avg:1227.11ms
step:566/1390 train_time:682308ms step_avg:1227.17ms
step:567/1390 train_time:683565ms step_avg:1227.23ms
step:568/1390 train_time:684828ms step_avg:1227.29ms
step:569/1390 train_time:686083ms step_avg:1227.34ms
step:570/1390 train_time:687350ms step_avg:1227.41ms
step:571/1390 train_time:688647ms step_avg:1227.53ms
step:572/1390 train_time:689907ms step_avg:1227.59ms
step:573/1390 train_time:691171ms step_avg:1227.66ms
step:574/1390 train_time:692444ms step_avg:1227.74ms
step:575/1390 train_time:693713ms step_avg:1227.81ms
step:575/1390 val_loss:3.8504 train_time:693713ms step_avg:1227.81ms
step:576/1390 train_time:694999ms step_avg:1227.91ms
step:577/1390 train_time:696276ms step_avg:1228.00ms
step:578/1390 train_time:697532ms step_avg:1228.05ms
step:579/1390 train_time:698802ms step_avg:1228.12ms
step:580/1390 train_time:700076ms step_avg:1228.20ms
step:581/1390 train_time:701344ms step_avg:1228.27ms
step:582/1390 train_time:702607ms step_avg:1228.33ms
step:583/1390 train_time:703877ms step_avg:1228.41ms
step:584/1390 train_time:705135ms step_avg:1228.46ms
step:585/1390 train_time:706400ms step_avg:1228.52ms
step:586/1390 train_time:707670ms step_avg:1228.59ms
step:587/1390 train_time:708946ms step_avg:1228.68ms
step:588/1390 train_time:710211ms step_avg:1228.74ms
step:589/1390 train_time:711477ms step_avg:1228.80ms
step:590/1390 train_time:712744ms step_avg:1228.87ms
step:591/1390 train_time:714018ms step_avg:1228.95ms
step:592/1390 train_time:715279ms step_avg:1229.00ms
step:593/1390 train_time:716548ms step_avg:1229.07ms
step:594/1390 train_time:717811ms step_avg:1229.13ms
step:595/1390 train_time:719080ms step_avg:1229.20ms
step:596/1390 train_time:720359ms step_avg:1229.28ms
step:597/1390 train_time:721620ms step_avg:1229.34ms
step:598/1390 train_time:722890ms step_avg:1229.40ms
step:599/1390 train_time:724163ms step_avg:1229.48ms
step:600/1390 train_time:725435ms step_avg:1229.55ms
step:600/1390 val_loss:3.8351 train_time:725435ms step_avg:1229.55ms
step:601/1390 train_time:726724ms step_avg:1229.65ms
step:602/1390 train_time:727999ms step_avg:1229.73ms
step:603/1390 train_time:729262ms step_avg:1229.78ms
step:604/1390 train_time:730526ms step_avg:1229.84ms
step:605/1390 train_time:731797ms step_avg:1229.91ms
step:606/1390 train_time:733064ms step_avg:1229.97ms
step:607/1390 train_time:734320ms step_avg:1230.02ms
step:608/1390 train_time:735588ms step_avg:1230.08ms
step:609/1390 train_time:736855ms step_avg:1230.14ms
step:610/1390 train_time:738120ms step_avg:1230.20ms
step:611/1390 train_time:739388ms step_avg:1230.26ms
step:612/1390 train_time:740653ms step_avg:1230.32ms
step:613/1390 train_time:741918ms step_avg:1230.38ms
step:614/1390 train_time:743178ms step_avg:1230.43ms
step:615/1390 train_time:744444ms step_avg:1230.49ms
step:616/1390 train_time:745702ms step_avg:1230.53ms
step:617/1390 train_time:746962ms step_avg:1230.58ms
step:618/1390 train_time:748222ms step_avg:1230.63ms
step:619/1390 train_time:749505ms step_avg:1230.71ms
step:620/1390 train_time:750780ms step_avg:1230.79ms
step:621/1390 train_time:752058ms step_avg:1230.86ms
step:622/1390 train_time:753345ms step_avg:1230.96ms
step:623/1390 train_time:754634ms step_avg:1231.05ms
step:624/1390 train_time:755904ms step_avg:1231.11ms
step:625/1390 train_time:757187ms step_avg:1231.20ms
step:625/1390 val_loss:3.8192 train_time:757187ms step_avg:1231.20ms
step:626/1390 train_time:758485ms step_avg:1231.31ms
step:627/1390 train_time:759777ms step_avg:1231.41ms
step:628/1390 train_time:761067ms step_avg:1231.50ms
step:629/1390 train_time:762340ms step_avg:1231.57ms
step:630/1390 train_time:763616ms step_avg:1231.64ms
step:631/1390 train_time:764889ms step_avg:1231.70ms
step:632/1390 train_time:766171ms step_avg:1231.79ms
step:633/1390 train_time:767447ms step_avg:1231.86ms
step:634/1390 train_time:768728ms step_avg:1231.94ms
step:635/1390 train_time:770001ms step_avg:1232.00ms
step:636/1390 train_time:771273ms step_avg:1232.06ms
step:637/1390 train_time:772544ms step_avg:1232.13ms
step:638/1390 train_time:773832ms step_avg:1232.22ms
step:639/1390 train_time:775109ms step_avg:1232.29ms
step:640/1390 train_time:776383ms step_avg:1232.35ms
step:641/1390 train_time:777668ms step_avg:1232.44ms
step:642/1390 train_time:778952ms step_avg:1232.52ms
step:643/1390 train_time:780227ms step_avg:1232.59ms
step:644/1390 train_time:781504ms step_avg:1232.66ms
step:645/1390 train_time:782790ms step_avg:1232.74ms
step:646/1390 train_time:784068ms step_avg:1232.81ms
step:647/1390 train_time:785358ms step_avg:1232.90ms
step:648/1390 train_time:786639ms step_avg:1232.98ms
step:649/1390 train_time:787928ms step_avg:1233.06ms
step:650/1390 train_time:789203ms step_avg:1233.13ms
step:650/1390 val_loss:3.8050 train_time:789203ms step_avg:1233.13ms
step:651/1390 train_time:790516ms step_avg:1233.25ms
step:652/1390 train_time:791808ms step_avg:1233.35ms
step:653/1390 train_time:793084ms step_avg:1233.41ms
step:654/1390 train_time:794360ms step_avg:1233.48ms
step:655/1390 train_time:795630ms step_avg:1233.53ms
step:656/1390 train_time:796913ms step_avg:1233.61ms
step:657/1390 train_time:798202ms step_avg:1233.70ms
step:658/1390 train_time:799480ms step_avg:1233.77ms
step:659/1390 train_time:800756ms step_avg:1233.83ms
step:660/1390 train_time:802034ms step_avg:1233.90ms
step:661/1390 train_time:803310ms step_avg:1233.96ms
step:662/1390 train_time:804590ms step_avg:1234.03ms
step:663/1390 train_time:805874ms step_avg:1234.11ms
step:664/1390 train_time:807148ms step_avg:1234.17ms
step:665/1390 train_time:808431ms step_avg:1234.25ms
step:666/1390 train_time:809716ms step_avg:1234.32ms
step:667/1390 train_time:810994ms step_avg:1234.39ms
step:668/1390 train_time:812268ms step_avg:1234.45ms
step:669/1390 train_time:813546ms step_avg:1234.52ms
step:670/1390 train_time:814818ms step_avg:1234.57ms
step:671/1390 train_time:816091ms step_avg:1234.63ms
step:672/1390 train_time:817366ms step_avg:1234.69ms
step:673/1390 train_time:818645ms step_avg:1234.76ms
step:674/1390 train_time:819920ms step_avg:1234.82ms
step:675/1390 train_time:821201ms step_avg:1234.89ms
step:675/1390 val_loss:3.7901 train_time:821201ms step_avg:1234.89ms
step:676/1390 train_time:822504ms step_avg:1234.99ms
step:677/1390 train_time:823777ms step_avg:1235.05ms
step:678/1390 train_time:825038ms step_avg:1235.09ms
step:679/1390 train_time:826317ms step_avg:1235.15ms
step:680/1390 train_time:827583ms step_avg:1235.20ms
step:681/1390 train_time:828851ms step_avg:1235.25ms
step:682/1390 train_time:830120ms step_avg:1235.30ms
step:683/1390 train_time:831387ms step_avg:1235.34ms
step:684/1390 train_time:832660ms step_avg:1235.40ms
step:685/1390 train_time:833933ms step_avg:1235.46ms
step:686/1390 train_time:835197ms step_avg:1235.50ms
step:687/1390 train_time:836461ms step_avg:1235.54ms
step:688/1390 train_time:837732ms step_avg:1235.59ms
step:689/1390 train_time:839004ms step_avg:1235.65ms
step:690/1390 train_time:840277ms step_avg:1235.70ms
step:691/1390 train_time:841544ms step_avg:1235.75ms
step:692/1390 train_time:842812ms step_avg:1235.79ms
step:693/1390 train_time:844081ms step_avg:1235.84ms
step:694/1390 train_time:845344ms step_avg:1235.88ms
step:695/1390 train_time:846610ms step_avg:1235.93ms
step:696/1390 train_time:847879ms step_avg:1235.98ms
step:697/1390 train_time:849155ms step_avg:1236.03ms
step:698/1390 train_time:850421ms step_avg:1236.08ms
step:699/1390 train_time:851696ms step_avg:1236.13ms
step:700/1390 train_time:852963ms step_avg:1236.18ms
step:700/1390 val_loss:3.7752 train_time:852963ms step_avg:1236.18ms
step:701/1390 train_time:854255ms step_avg:1236.26ms
step:702/1390 train_time:855537ms step_avg:1236.33ms
step:703/1390 train_time:856810ms step_avg:1236.38ms
step:704/1390 train_time:858080ms step_avg:1236.43ms
step:705/1390 train_time:859358ms step_avg:1236.49ms
step:706/1390 train_time:860625ms step_avg:1236.53ms
step:707/1390 train_time:861903ms step_avg:1236.59ms
step:708/1390 train_time:863171ms step_avg:1236.64ms
step:709/1390 train_time:864444ms step_avg:1236.69ms
step:710/1390 train_time:865719ms step_avg:1236.74ms
step:711/1390 train_time:866991ms step_avg:1236.79ms
step:712/1390 train_time:868258ms step_avg:1236.83ms
step:713/1390 train_time:869524ms step_avg:1236.88ms
step:714/1390 train_time:870789ms step_avg:1236.92ms
step:715/1390 train_time:872060ms step_avg:1236.96ms
step:716/1390 train_time:873334ms step_avg:1237.02ms
step:717/1390 train_time:874603ms step_avg:1237.06ms
step:718/1390 train_time:875869ms step_avg:1237.10ms
step:719/1390 train_time:877143ms step_avg:1237.16ms
step:720/1390 train_time:878410ms step_avg:1237.20ms
step:721/1390 train_time:879684ms step_avg:1237.25ms
step:722/1390 train_time:880962ms step_avg:1237.31ms
step:723/1390 train_time:882241ms step_avg:1237.36ms
step:724/1390 train_time:883530ms step_avg:1237.44ms
step:725/1390 train_time:884820ms step_avg:1237.51ms
step:725/1390 val_loss:3.7612 train_time:884821ms step_avg:1237.51ms
step:726/1390 train_time:886139ms step_avg:1237.62ms
step:727/1390 train_time:887431ms step_avg:1237.70ms
step:728/1390 train_time:888709ms step_avg:1237.76ms
step:729/1390 train_time:889995ms step_avg:1237.82ms
step:730/1390 train_time:891284ms step_avg:1237.89ms
step:731/1390 train_time:892562ms step_avg:1237.95ms
step:732/1390 train_time:893839ms step_avg:1238.00ms
step:733/1390 train_time:895116ms step_avg:1238.06ms
step:734/1390 train_time:896399ms step_avg:1238.12ms
step:735/1390 train_time:897675ms step_avg:1238.17ms
step:736/1390 train_time:898963ms step_avg:1238.24ms
step:737/1390 train_time:900246ms step_avg:1238.30ms
step:738/1390 train_time:901524ms step_avg:1238.36ms
step:739/1390 train_time:902805ms step_avg:1238.42ms
step:740/1390 train_time:904094ms step_avg:1238.48ms
step:741/1390 train_time:905379ms step_avg:1238.55ms
step:742/1390 train_time:906658ms step_avg:1238.60ms
step:743/1390 train_time:907935ms step_avg:1238.66ms
step:744/1390 train_time:909223ms step_avg:1238.72ms
step:745/1390 train_time:910510ms step_avg:1238.79ms
step:746/1390 train_time:911784ms step_avg:1238.84ms
step:747/1390 train_time:913057ms step_avg:1238.88ms
step:748/1390 train_time:914347ms step_avg:1238.95ms
step:749/1390 train_time:915630ms step_avg:1239.01ms
step:750/1390 train_time:916920ms step_avg:1239.08ms
step:750/1390 val_loss:3.7500 train_time:916920ms step_avg:1239.08ms
step:751/1390 train_time:918238ms step_avg:1239.19ms
step:752/1390 train_time:919527ms step_avg:1239.26ms
step:753/1390 train_time:920810ms step_avg:1239.31ms
step:754/1390 train_time:922094ms step_avg:1239.37ms
step:755/1390 train_time:923382ms step_avg:1239.44ms
step:756/1390 train_time:924681ms step_avg:1239.52ms
step:757/1390 train_time:925968ms step_avg:1239.58ms
step:758/1390 train_time:927238ms step_avg:1239.62ms
step:759/1390 train_time:928525ms step_avg:1239.69ms
step:760/1390 train_time:929816ms step_avg:1239.75ms
step:761/1390 train_time:931128ms step_avg:1239.85ms
step:762/1390 train_time:932412ms step_avg:1239.91ms
step:763/1390 train_time:933692ms step_avg:1239.96ms
step:764/1390 train_time:934982ms step_avg:1240.03ms
step:765/1390 train_time:936268ms step_avg:1240.09ms
step:766/1390 train_time:937562ms step_avg:1240.16ms
step:767/1390 train_time:938853ms step_avg:1240.23ms
step:768/1390 train_time:940148ms step_avg:1240.30ms
step:769/1390 train_time:941434ms step_avg:1240.36ms
step:770/1390 train_time:942713ms step_avg:1240.41ms
step:771/1390 train_time:943998ms step_avg:1240.47ms
step:772/1390 train_time:945288ms step_avg:1240.54ms
step:773/1390 train_time:946576ms step_avg:1240.60ms
step:774/1390 train_time:947856ms step_avg:1240.65ms
step:775/1390 train_time:949134ms step_avg:1240.70ms
step:775/1390 val_loss:3.7354 train_time:949135ms step_avg:1240.70ms
step:776/1390 train_time:950449ms step_avg:1240.80ms
step:777/1390 train_time:951740ms step_avg:1240.86ms
step:778/1390 train_time:953027ms step_avg:1240.92ms
step:779/1390 train_time:954316ms step_avg:1240.98ms
step:780/1390 train_time:955607ms step_avg:1241.05ms
step:781/1390 train_time:956894ms step_avg:1241.11ms
step:782/1390 train_time:958180ms step_avg:1241.17ms
step:783/1390 train_time:959460ms step_avg:1241.22ms
step:784/1390 train_time:960740ms step_avg:1241.27ms
step:785/1390 train_time:962029ms step_avg:1241.33ms
step:786/1390 train_time:963310ms step_avg:1241.38ms
step:787/1390 train_time:964591ms step_avg:1241.43ms
step:788/1390 train_time:965873ms step_avg:1241.48ms
step:789/1390 train_time:967158ms step_avg:1241.54ms
step:790/1390 train_time:968441ms step_avg:1241.59ms
step:791/1390 train_time:969733ms step_avg:1241.66ms
step:792/1390 train_time:971021ms step_avg:1241.72ms
step:793/1390 train_time:972312ms step_avg:1241.78ms
step:794/1390 train_time:973613ms step_avg:1241.85ms
step:795/1390 train_time:974897ms step_avg:1241.91ms
step:796/1390 train_time:976187ms step_avg:1241.97ms
step:797/1390 train_time:977478ms step_avg:1242.03ms
step:798/1390 train_time:978766ms step_avg:1242.09ms
step:799/1390 train_time:980056ms step_avg:1242.15ms
step:800/1390 train_time:981347ms step_avg:1242.21ms
step:800/1390 val_loss:3.7223 train_time:981347ms step_avg:1242.21ms
step:801/1390 train_time:982657ms step_avg:1242.30ms
step:802/1390 train_time:983945ms step_avg:1242.35ms
step:803/1390 train_time:985226ms step_avg:1242.40ms
step:804/1390 train_time:986514ms step_avg:1242.46ms
step:805/1390 train_time:987793ms step_avg:1242.51ms
step:806/1390 train_time:989068ms step_avg:1242.55ms
step:807/1390 train_time:990346ms step_avg:1242.59ms
step:808/1390 train_time:991626ms step_avg:1242.64ms
step:809/1390 train_time:992910ms step_avg:1242.69ms
step:810/1390 train_time:994187ms step_avg:1242.73ms
step:811/1390 train_time:995465ms step_avg:1242.78ms
step:812/1390 train_time:996745ms step_avg:1242.82ms
step:813/1390 train_time:998029ms step_avg:1242.88ms
step:814/1390 train_time:999314ms step_avg:1242.93ms
step:815/1390 train_time:1000591ms step_avg:1242.97ms
step:816/1390 train_time:1001871ms step_avg:1243.02ms
step:817/1390 train_time:1003147ms step_avg:1243.06ms
step:818/1390 train_time:1004429ms step_avg:1243.10ms
step:819/1390 train_time:1005710ms step_avg:1243.15ms
step:820/1390 train_time:1006988ms step_avg:1243.20ms
step:821/1390 train_time:1008270ms step_avg:1243.24ms
step:822/1390 train_time:1009546ms step_avg:1243.28ms
step:823/1390 train_time:1010825ms step_avg:1243.33ms
step:824/1390 train_time:1012105ms step_avg:1243.37ms
step:825/1390 train_time:1013406ms step_avg:1243.44ms
step:825/1390 val_loss:3.7085 train_time:1013406ms step_avg:1243.44ms
step:826/1390 train_time:1014719ms step_avg:1243.53ms
step:827/1390 train_time:1016029ms step_avg:1243.61ms
step:828/1390 train_time:1017313ms step_avg:1243.66ms
step:829/1390 train_time:1018606ms step_avg:1243.72ms
step:830/1390 train_time:1019905ms step_avg:1243.79ms
step:831/1390 train_time:1021203ms step_avg:1243.85ms
step:832/1390 train_time:1022496ms step_avg:1243.91ms
step:833/1390 train_time:1023801ms step_avg:1243.99ms
step:834/1390 train_time:1025093ms step_avg:1244.04ms
step:835/1390 train_time:1026392ms step_avg:1244.11ms
step:836/1390 train_time:1027683ms step_avg:1244.17ms
step:837/1390 train_time:1028979ms step_avg:1244.23ms
step:838/1390 train_time:1030270ms step_avg:1244.29ms
step:839/1390 train_time:1031556ms step_avg:1244.34ms
step:840/1390 train_time:1032846ms step_avg:1244.39ms
step:841/1390 train_time:1034134ms step_avg:1244.45ms
step:842/1390 train_time:1035429ms step_avg:1244.51ms
step:843/1390 train_time:1036715ms step_avg:1244.56ms
step:844/1390 train_time:1038002ms step_avg:1244.61ms
step:845/1390 train_time:1039300ms step_avg:1244.67ms
step:846/1390 train_time:1040588ms step_avg:1244.72ms
step:847/1390 train_time:1041881ms step_avg:1244.78ms
step:848/1390 train_time:1043175ms step_avg:1244.84ms
step:849/1390 train_time:1044472ms step_avg:1244.90ms
step:850/1390 train_time:1045768ms step_avg:1244.96ms
step:850/1390 val_loss:3.6973 train_time:1045768ms step_avg:1244.96ms
step:851/1390 train_time:1047085ms step_avg:1245.05ms
step:852/1390 train_time:1048392ms step_avg:1245.12ms
step:853/1390 train_time:1049675ms step_avg:1245.17ms
step:854/1390 train_time:1050962ms step_avg:1245.22ms
step:855/1390 train_time:1052243ms step_avg:1245.26ms
step:856/1390 train_time:1053538ms step_avg:1245.32ms
step:857/1390 train_time:1054839ms step_avg:1245.38ms
step:858/1390 train_time:1056137ms step_avg:1245.44ms
step:859/1390 train_time:1057437ms step_avg:1245.51ms
step:860/1390 train_time:1058729ms step_avg:1245.56ms
step:861/1390 train_time:1060021ms step_avg:1245.62ms
step:862/1390 train_time:1061319ms step_avg:1245.68ms
step:863/1390 train_time:1062620ms step_avg:1245.74ms
step:864/1390 train_time:1063905ms step_avg:1245.79ms
step:865/1390 train_time:1065206ms step_avg:1245.85ms
step:866/1390 train_time:1066500ms step_avg:1245.91ms
step:867/1390 train_time:1067787ms step_avg:1245.96ms
step:868/1390 train_time:1069075ms step_avg:1246.01ms
step:869/1390 train_time:1070369ms step_avg:1246.06ms
step:870/1390 train_time:1071663ms step_avg:1246.12ms
step:871/1390 train_time:1072944ms step_avg:1246.16ms
step:872/1390 train_time:1074239ms step_avg:1246.22ms
step:873/1390 train_time:1075519ms step_avg:1246.26ms
step:874/1390 train_time:1076820ms step_avg:1246.32ms
step:875/1390 train_time:1078112ms step_avg:1246.37ms
step:875/1390 val_loss:3.6852 train_time:1078113ms step_avg:1246.37ms
step:876/1390 train_time:1079434ms step_avg:1246.46ms
step:877/1390 train_time:1080736ms step_avg:1246.52ms
step:878/1390 train_time:1082028ms step_avg:1246.58ms
step:879/1390 train_time:1083326ms step_avg:1246.64ms
step:880/1390 train_time:1084608ms step_avg:1246.68ms
step:881/1390 train_time:1085893ms step_avg:1246.72ms
step:882/1390 train_time:1087178ms step_avg:1246.76ms
step:883/1390 train_time:1088466ms step_avg:1246.81ms
step:884/1390 train_time:1089759ms step_avg:1246.86ms
step:885/1390 train_time:1091052ms step_avg:1246.92ms
step:886/1390 train_time:1092340ms step_avg:1246.96ms
step:887/1390 train_time:1093634ms step_avg:1247.02ms
step:888/1390 train_time:1094927ms step_avg:1247.07ms
step:889/1390 train_time:1096225ms step_avg:1247.13ms
step:890/1390 train_time:1097517ms step_avg:1247.18ms
step:891/1390 train_time:1098811ms step_avg:1247.23ms
step:892/1390 train_time:1100106ms step_avg:1247.29ms
step:893/1390 train_time:1101391ms step_avg:1247.33ms
step:894/1390 train_time:1102684ms step_avg:1247.38ms
step:895/1390 train_time:1103970ms step_avg:1247.42ms
step:896/1390 train_time:1105261ms step_avg:1247.47ms
step:897/1390 train_time:1106549ms step_avg:1247.52ms
step:898/1390 train_time:1107849ms step_avg:1247.58ms
step:899/1390 train_time:1109135ms step_avg:1247.62ms
step:900/1390 train_time:1110432ms step_avg:1247.68ms
step:900/1390 val_loss:3.6754 train_time:1110432ms step_avg:1247.68ms
step:901/1390 train_time:1111744ms step_avg:1247.75ms
step:902/1390 train_time:1113048ms step_avg:1247.81ms
step:903/1390 train_time:1114336ms step_avg:1247.86ms
step:904/1390 train_time:1115623ms step_avg:1247.90ms
step:905/1390 train_time:1116913ms step_avg:1247.95ms
step:906/1390 train_time:1118213ms step_avg:1248.01ms
step:907/1390 train_time:1119501ms step_avg:1248.05ms
step:908/1390 train_time:1120788ms step_avg:1248.09ms
step:909/1390 train_time:1122094ms step_avg:1248.16ms
step:910/1390 train_time:1123391ms step_avg:1248.21ms
step:911/1390 train_time:1124676ms step_avg:1248.25ms
step:912/1390 train_time:1125970ms step_avg:1248.30ms
step:913/1390 train_time:1127259ms step_avg:1248.35ms
step:914/1390 train_time:1128558ms step_avg:1248.41ms
step:915/1390 train_time:1129851ms step_avg:1248.45ms
step:916/1390 train_time:1131141ms step_avg:1248.50ms
step:917/1390 train_time:1132423ms step_avg:1248.54ms
step:918/1390 train_time:1133723ms step_avg:1248.59ms
step:919/1390 train_time:1135026ms step_avg:1248.65ms
step:920/1390 train_time:1136315ms step_avg:1248.70ms
step:921/1390 train_time:1137607ms step_avg:1248.75ms
step:922/1390 train_time:1138891ms step_avg:1248.78ms
step:923/1390 train_time:1140180ms step_avg:1248.83ms
step:924/1390 train_time:1141478ms step_avg:1248.88ms
step:925/1390 train_time:1142773ms step_avg:1248.93ms
step:925/1390 val_loss:3.6647 train_time:1142773ms step_avg:1248.93ms
step:926/1390 train_time:1144098ms step_avg:1249.02ms
step:927/1390 train_time:1145406ms step_avg:1249.08ms
step:928/1390 train_time:1146707ms step_avg:1249.14ms
step:929/1390 train_time:1148007ms step_avg:1249.19ms
step:930/1390 train_time:1149311ms step_avg:1249.25ms
step:931/1390 train_time:1150615ms step_avg:1249.31ms
step:932/1390 train_time:1151923ms step_avg:1249.37ms
step:933/1390 train_time:1153227ms step_avg:1249.43ms
step:934/1390 train_time:1154526ms step_avg:1249.49ms
step:935/1390 train_time:1155826ms step_avg:1249.54ms
step:936/1390 train_time:1157139ms step_avg:1249.61ms
step:937/1390 train_time:1158437ms step_avg:1249.66ms
step:938/1390 train_time:1159726ms step_avg:1249.71ms
step:939/1390 train_time:1161043ms step_avg:1249.78ms
step:940/1390 train_time:1162334ms step_avg:1249.82ms
step:941/1390 train_time:1163635ms step_avg:1249.88ms
step:942/1390 train_time:1164943ms step_avg:1249.94ms
step:943/1390 train_time:1166242ms step_avg:1249.99ms
step:944/1390 train_time:1167547ms step_avg:1250.05ms
step:945/1390 train_time:1168849ms step_avg:1250.11ms
step:946/1390 train_time:1170145ms step_avg:1250.16ms
step:947/1390 train_time:1171450ms step_avg:1250.21ms
step:948/1390 train_time:1172758ms step_avg:1250.28ms
step:949/1390 train_time:1174060ms step_avg:1250.33ms
step:950/1390 train_time:1175369ms step_avg:1250.39ms
step:950/1390 val_loss:3.6546 train_time:1175369ms step_avg:1250.39ms
step:951/1390 train_time:1176721ms step_avg:1250.50ms
step:952/1390 train_time:1178034ms step_avg:1250.57ms
step:953/1390 train_time:1179336ms step_avg:1250.62ms
step:954/1390 train_time:1180632ms step_avg:1250.67ms
step:955/1390 train_time:1181937ms step_avg:1250.73ms
step:956/1390 train_time:1183238ms step_avg:1250.78ms
step:957/1390 train_time:1184539ms step_avg:1250.83ms
step:958/1390 train_time:1185847ms step_avg:1250.89ms
step:959/1390 train_time:1187163ms step_avg:1250.96ms
step:960/1390 train_time:1188468ms step_avg:1251.02ms
step:961/1390 train_time:1189777ms step_avg:1251.08ms
step:962/1390 train_time:1191083ms step_avg:1251.14ms
step:963/1390 train_time:1192374ms step_avg:1251.18ms
step:964/1390 train_time:1193677ms step_avg:1251.23ms
step:965/1390 train_time:1194975ms step_avg:1251.28ms
step:966/1390 train_time:1196278ms step_avg:1251.34ms
step:967/1390 train_time:1197564ms step_avg:1251.37ms
step:968/1390 train_time:1198871ms step_avg:1251.43ms
step:969/1390 train_time:1200167ms step_avg:1251.48ms
step:970/1390 train_time:1201465ms step_avg:1251.53ms
step:971/1390 train_time:1202759ms step_avg:1251.57ms
step:972/1390 train_time:1204061ms step_avg:1251.62ms
step:973/1390 train_time:1205356ms step_avg:1251.67ms
step:974/1390 train_time:1206652ms step_avg:1251.71ms
step:975/1390 train_time:1207957ms step_avg:1251.77ms
step:975/1390 val_loss:3.6452 train_time:1207957ms step_avg:1251.77ms
step:976/1390 train_time:1209279ms step_avg:1251.84ms
step:977/1390 train_time:1210590ms step_avg:1251.90ms
step:978/1390 train_time:1211887ms step_avg:1251.95ms
step:979/1390 train_time:1213189ms step_avg:1252.00ms
step:980/1390 train_time:1214483ms step_avg:1252.04ms
step:981/1390 train_time:1215777ms step_avg:1252.09ms
step:982/1390 train_time:1217068ms step_avg:1252.13ms
step:983/1390 train_time:1218376ms step_avg:1252.18ms
step:984/1390 train_time:1219681ms step_avg:1252.24ms
step:985/1390 train_time:1220982ms step_avg:1252.29ms
step:986/1390 train_time:1222277ms step_avg:1252.33ms
step:987/1390 train_time:1223577ms step_avg:1252.38ms
step:988/1390 train_time:1224871ms step_avg:1252.42ms
step:989/1390 train_time:1226176ms step_avg:1252.48ms
step:990/1390 train_time:1227469ms step_avg:1252.52ms
step:991/1390 train_time:1228767ms step_avg:1252.57ms
step:992/1390 train_time:1230079ms step_avg:1252.63ms
step:993/1390 train_time:1231373ms step_avg:1252.67ms
step:994/1390 train_time:1232665ms step_avg:1252.71ms
step:995/1390 train_time:1233959ms step_avg:1252.75ms
step:996/1390 train_time:1235247ms step_avg:1252.79ms
step:997/1390 train_time:1236535ms step_avg:1252.82ms
step:998/1390 train_time:1237833ms step_avg:1252.87ms
step:999/1390 train_time:1239124ms step_avg:1252.91ms
step:1000/1390 train_time:1240432ms step_avg:1252.96ms
step:1000/1390 val_loss:3.6361 train_time:1240432ms step_avg:1252.96ms
step:1001/1390 train_time:1241764ms step_avg:1253.04ms
step:1002/1390 train_time:1243075ms step_avg:1253.10ms
step:1003/1390 train_time:1244377ms step_avg:1253.15ms
step:1004/1390 train_time:1245681ms step_avg:1253.20ms
step:1005/1390 train_time:1246973ms step_avg:1253.24ms
step:1006/1390 train_time:1248270ms step_avg:1253.28ms
step:1007/1390 train_time:1249564ms step_avg:1253.32ms
step:1008/1390 train_time:1250869ms step_avg:1253.38ms
step:1009/1390 train_time:1252165ms step_avg:1253.42ms
step:1010/1390 train_time:1253469ms step_avg:1253.47ms
step:1011/1390 train_time:1254760ms step_avg:1253.51ms
step:1012/1390 train_time:1256061ms step_avg:1253.55ms
step:1013/1390 train_time:1257360ms step_avg:1253.60ms
step:1014/1390 train_time:1258658ms step_avg:1253.64ms
step:1015/1390 train_time:1259951ms step_avg:1253.68ms
step:1016/1390 train_time:1261254ms step_avg:1253.73ms
step:1017/1390 train_time:1262548ms step_avg:1253.77ms
step:1018/1390 train_time:1263851ms step_avg:1253.82ms
step:1019/1390 train_time:1265148ms step_avg:1253.86ms
step:1020/1390 train_time:1266443ms step_avg:1253.90ms
step:1021/1390 train_time:1267732ms step_avg:1253.94ms
step:1022/1390 train_time:1269033ms step_avg:1253.99ms
step:1023/1390 train_time:1270325ms step_avg:1254.02ms
step:1024/1390 train_time:1271631ms step_avg:1254.07ms
step:1025/1390 train_time:1272926ms step_avg:1254.11ms
step:1025/1390 val_loss:3.6269 train_time:1272926ms step_avg:1254.11ms
step:1026/1390 train_time:1274244ms step_avg:1254.18ms
step:1027/1390 train_time:1275568ms step_avg:1254.25ms
step:1028/1390 train_time:1276875ms step_avg:1254.30ms
step:1029/1390 train_time:1278172ms step_avg:1254.34ms
step:1030/1390 train_time:1279460ms step_avg:1254.37ms
step:1031/1390 train_time:1280757ms step_avg:1254.41ms
step:1032/1390 train_time:1282059ms step_avg:1254.46ms
step:1033/1390 train_time:1283375ms step_avg:1254.52ms
step:1034/1390 train_time:1284695ms step_avg:1254.59ms
step:1035/1390 train_time:1285991ms step_avg:1254.63ms
step:1036/1390 train_time:1287299ms step_avg:1254.68ms
step:1037/1390 train_time:1288615ms step_avg:1254.74ms
step:1038/1390 train_time:1289917ms step_avg:1254.78ms
step:1039/1390 train_time:1291226ms step_avg:1254.84ms
step:1040/1390 train_time:1292538ms step_avg:1254.89ms
step:1041/1390 train_time:1293847ms step_avg:1254.94ms
step:1042/1390 train_time:1295154ms step_avg:1254.99ms
step:1043/1390 train_time:1296463ms step_avg:1255.05ms
step:1044/1390 train_time:1297786ms step_avg:1255.11ms
step:1045/1390 train_time:1299099ms step_avg:1255.17ms
step:1046/1390 train_time:1300394ms step_avg:1255.21ms
step:1047/1390 train_time:1301696ms step_avg:1255.25ms
step:1048/1390 train_time:1303000ms step_avg:1255.30ms
step:1049/1390 train_time:1304298ms step_avg:1255.34ms
step:1050/1390 train_time:1305609ms step_avg:1255.39ms
step:1050/1390 val_loss:3.6178 train_time:1305610ms step_avg:1255.39ms
step:1051/1390 train_time:1306935ms step_avg:1255.46ms
step:1052/1390 train_time:1308239ms step_avg:1255.51ms
step:1053/1390 train_time:1309539ms step_avg:1255.55ms
step:1054/1390 train_time:1310844ms step_avg:1255.60ms
step:1055/1390 train_time:1312135ms step_avg:1255.63ms
step:1056/1390 train_time:1313429ms step_avg:1255.67ms
step:1057/1390 train_time:1314743ms step_avg:1255.72ms
step:1058/1390 train_time:1316046ms step_avg:1255.77ms
step:1059/1390 train_time:1317346ms step_avg:1255.81ms
step:1060/1390 train_time:1318645ms step_avg:1255.85ms
step:1061/1390 train_time:1319950ms step_avg:1255.90ms
step:1062/1390 train_time:1321239ms step_avg:1255.93ms
step:1063/1390 train_time:1322548ms step_avg:1255.98ms
step:1064/1390 train_time:1323842ms step_avg:1256.02ms
step:1065/1390 train_time:1325160ms step_avg:1256.08ms
step:1066/1390 train_time:1326475ms step_avg:1256.13ms
step:1067/1390 train_time:1327779ms step_avg:1256.18ms
step:1068/1390 train_time:1329080ms step_avg:1256.22ms
step:1069/1390 train_time:1330381ms step_avg:1256.26ms
step:1070/1390 train_time:1331687ms step_avg:1256.31ms
step:1071/1390 train_time:1332986ms step_avg:1256.35ms
step:1072/1390 train_time:1334276ms step_avg:1256.38ms
step:1073/1390 train_time:1335581ms step_avg:1256.43ms
step:1074/1390 train_time:1336892ms step_avg:1256.48ms
step:1075/1390 train_time:1338189ms step_avg:1256.52ms
step:1075/1390 val_loss:3.6107 train_time:1338189ms step_avg:1256.52ms
step:1076/1390 train_time:1339507ms step_avg:1256.57ms
step:1077/1390 train_time:1340820ms step_avg:1256.63ms
step:1078/1390 train_time:1342126ms step_avg:1256.67ms
step:1079/1390 train_time:1343431ms step_avg:1256.72ms
step:1080/1390 train_time:1344742ms step_avg:1256.77ms
step:1081/1390 train_time:1346042ms step_avg:1256.81ms
step:1082/1390 train_time:1347349ms step_avg:1256.86ms
step:1083/1390 train_time:1348657ms step_avg:1256.90ms
step:1084/1390 train_time:1349962ms step_avg:1256.95ms
step:1085/1390 train_time:1351272ms step_avg:1257.00ms
step:1086/1390 train_time:1352589ms step_avg:1257.05ms
step:1087/1390 train_time:1353895ms step_avg:1257.10ms
step:1088/1390 train_time:1355217ms step_avg:1257.16ms
step:1089/1390 train_time:1356524ms step_avg:1257.20ms
step:1090/1390 train_time:1357828ms step_avg:1257.25ms
step:1091/1390 train_time:1359140ms step_avg:1257.30ms
step:1092/1390 train_time:1360448ms step_avg:1257.35ms
step:1093/1390 train_time:1361752ms step_avg:1257.39ms
step:1094/1390 train_time:1363056ms step_avg:1257.43ms
step:1095/1390 train_time:1364367ms step_avg:1257.48ms
step:1096/1390 train_time:1365676ms step_avg:1257.53ms
step:1097/1390 train_time:1366977ms step_avg:1257.57ms
step:1098/1390 train_time:1368278ms step_avg:1257.61ms
step:1099/1390 train_time:1369582ms step_avg:1257.65ms
step:1100/1390 train_time:1370873ms step_avg:1257.68ms
step:1100/1390 val_loss:3.6030 train_time:1370873ms step_avg:1257.68ms
step:1101/1390 train_time:1372213ms step_avg:1257.76ms
step:1102/1390 train_time:1373539ms step_avg:1257.82ms
step:1103/1390 train_time:1374839ms step_avg:1257.86ms
step:1104/1390 train_time:1376144ms step_avg:1257.90ms
step:1105/1390 train_time:1377458ms step_avg:1257.95ms
step:1106/1390 train_time:1378765ms step_avg:1258.00ms
step:1107/1390 train_time:1380080ms step_avg:1258.05ms
step:1108/1390 train_time:1381379ms step_avg:1258.09ms
step:1109/1390 train_time:1382686ms step_avg:1258.13ms
step:1110/1390 train_time:1383990ms step_avg:1258.17ms
step:1111/1390 train_time:1385288ms step_avg:1258.21ms
step:1112/1390 train_time:1386581ms step_avg:1258.24ms
step:1113/1390 train_time:1387882ms step_avg:1258.28ms
step:1114/1390 train_time:1389182ms step_avg:1258.32ms
step:1115/1390 train_time:1390483ms step_avg:1258.36ms
step:1116/1390 train_time:1391786ms step_avg:1258.40ms
step:1117/1390 train_time:1393112ms step_avg:1258.46ms
step:1118/1390 train_time:1394406ms step_avg:1258.49ms
step:1119/1390 train_time:1395705ms step_avg:1258.53ms
step:1120/1390 train_time:1397016ms step_avg:1258.57ms
step:1121/1390 train_time:1398325ms step_avg:1258.62ms
step:1122/1390 train_time:1399626ms step_avg:1258.66ms
step:1123/1390 train_time:1400925ms step_avg:1258.69ms
step:1124/1390 train_time:1402221ms step_avg:1258.73ms
step:1125/1390 train_time:1403548ms step_avg:1258.79ms
step:1125/1390 val_loss:3.5962 train_time:1403548ms step_avg:1258.79ms
step:1126/1390 train_time:1404886ms step_avg:1258.86ms
step:1127/1390 train_time:1406201ms step_avg:1258.91ms
step:1128/1390 train_time:1407504ms step_avg:1258.95ms
step:1129/1390 train_time:1408799ms step_avg:1258.98ms
step:1130/1390 train_time:1410105ms step_avg:1259.02ms
step:1131/1390 train_time:1411408ms step_avg:1259.06ms
step:1132/1390 train_time:1412703ms step_avg:1259.09ms
step:1133/1390 train_time:1414008ms step_avg:1259.13ms
step:1134/1390 train_time:1415323ms step_avg:1259.18ms
step:1135/1390 train_time:1416650ms step_avg:1259.24ms
step:1136/1390 train_time:1417949ms step_avg:1259.28ms
step:1137/1390 train_time:1419257ms step_avg:1259.32ms
step:1138/1390 train_time:1420568ms step_avg:1259.37ms
step:1139/1390 train_time:1421884ms step_avg:1259.42ms
step:1140/1390 train_time:1423184ms step_avg:1259.45ms
step:1141/1390 train_time:1424518ms step_avg:1259.52ms
step:1142/1390 train_time:1425833ms step_avg:1259.57ms
step:1143/1390 train_time:1427140ms step_avg:1259.61ms
step:1144/1390 train_time:1428455ms step_avg:1259.66ms
step:1145/1390 train_time:1429767ms step_avg:1259.71ms
step:1146/1390 train_time:1431075ms step_avg:1259.75ms
step:1147/1390 train_time:1432383ms step_avg:1259.79ms
step:1148/1390 train_time:1433691ms step_avg:1259.83ms
step:1149/1390 train_time:1435014ms step_avg:1259.89ms
step:1150/1390 train_time:1436330ms step_avg:1259.94ms
step:1150/1390 val_loss:3.5904 train_time:1436331ms step_avg:1259.94ms
step:1151/1390 train_time:1437666ms step_avg:1260.00ms
step:1152/1390 train_time:1438986ms step_avg:1260.06ms
step:1153/1390 train_time:1440298ms step_avg:1260.10ms
step:1154/1390 train_time:1441610ms step_avg:1260.15ms
step:1155/1390 train_time:1442936ms step_avg:1260.21ms
step:1156/1390 train_time:1444255ms step_avg:1260.26ms
step:1157/1390 train_time:1445569ms step_avg:1260.30ms
step:1158/1390 train_time:1446882ms step_avg:1260.35ms
step:1159/1390 train_time:1448199ms step_avg:1260.40ms
step:1160/1390 train_time:1449501ms step_avg:1260.44ms
step:1161/1390 train_time:1450820ms step_avg:1260.49ms
step:1162/1390 train_time:1452116ms step_avg:1260.52ms
step:1163/1390 train_time:1453433ms step_avg:1260.57ms
step:1164/1390 train_time:1454743ms step_avg:1260.61ms
step:1165/1390 train_time:1456057ms step_avg:1260.66ms
step:1166/1390 train_time:1457362ms step_avg:1260.69ms
step:1167/1390 train_time:1458683ms step_avg:1260.75ms
step:1168/1390 train_time:1459995ms step_avg:1260.79ms
step:1169/1390 train_time:1461304ms step_avg:1260.83ms
step:1170/1390 train_time:1462611ms step_avg:1260.87ms
step:1171/1390 train_time:1463921ms step_avg:1260.91ms
step:1172/1390 train_time:1465229ms step_avg:1260.95ms
step:1173/1390 train_time:1466559ms step_avg:1261.01ms
step:1174/1390 train_time:1467869ms step_avg:1261.06ms
step:1175/1390 train_time:1469186ms step_avg:1261.10ms
step:1175/1390 val_loss:3.5835 train_time:1469186ms step_avg:1261.10ms
step:1176/1390 train_time:1470540ms step_avg:1261.18ms
step:1177/1390 train_time:1471862ms step_avg:1261.24ms
step:1178/1390 train_time:1473167ms step_avg:1261.27ms
step:1179/1390 train_time:1474496ms step_avg:1261.33ms
step:1180/1390 train_time:1475811ms step_avg:1261.38ms
step:1181/1390 train_time:1477119ms step_avg:1261.42ms
step:1182/1390 train_time:1478444ms step_avg:1261.47ms
step:1183/1390 train_time:1479751ms step_avg:1261.51ms
step:1184/1390 train_time:1481067ms step_avg:1261.56ms
step:1185/1390 train_time:1482380ms step_avg:1261.60ms
step:1186/1390 train_time:1483722ms step_avg:1261.67ms
step:1187/1390 train_time:1485033ms step_avg:1261.71ms
step:1188/1390 train_time:1486341ms step_avg:1261.75ms
step:1189/1390 train_time:1487649ms step_avg:1261.79ms
step:1190/1390 train_time:1488963ms step_avg:1261.83ms
step:1191/1390 train_time:1490271ms step_avg:1261.87ms
step:1192/1390 train_time:1491572ms step_avg:1261.90ms
step:1193/1390 train_time:1492877ms step_avg:1261.94ms
step:1194/1390 train_time:1494195ms step_avg:1261.99ms
step:1195/1390 train_time:1495504ms step_avg:1262.03ms
step:1196/1390 train_time:1496820ms step_avg:1262.07ms
step:1197/1390 train_time:1498145ms step_avg:1262.13ms
step:1198/1390 train_time:1499460ms step_avg:1262.17ms
step:1199/1390 train_time:1500765ms step_avg:1262.21ms
step:1200/1390 train_time:1502065ms step_avg:1262.24ms
step:1200/1390 val_loss:3.5771 train_time:1502065ms step_avg:1262.24ms
step:1201/1390 train_time:1503429ms step_avg:1262.32ms
step:1202/1390 train_time:1504770ms step_avg:1262.39ms
step:1203/1390 train_time:1506083ms step_avg:1262.43ms
step:1204/1390 train_time:1507393ms step_avg:1262.47ms
step:1205/1390 train_time:1508703ms step_avg:1262.51ms
step:1206/1390 train_time:1510020ms step_avg:1262.56ms
step:1207/1390 train_time:1511337ms step_avg:1262.60ms
step:1208/1390 train_time:1512649ms step_avg:1262.64ms
step:1209/1390 train_time:1513983ms step_avg:1262.70ms
step:1210/1390 train_time:1515303ms step_avg:1262.75ms
step:1211/1390 train_time:1516634ms step_avg:1262.81ms
step:1212/1390 train_time:1517939ms step_avg:1262.84ms
step:1213/1390 train_time:1519257ms step_avg:1262.89ms
step:1214/1390 train_time:1520578ms step_avg:1262.94ms
step:1215/1390 train_time:1521880ms step_avg:1262.97ms
step:1216/1390 train_time:1523194ms step_avg:1263.01ms
step:1217/1390 train_time:1524494ms step_avg:1263.04ms
step:1218/1390 train_time:1525792ms step_avg:1263.07ms
step:1219/1390 train_time:1527109ms step_avg:1263.12ms
step:1220/1390 train_time:1528415ms step_avg:1263.15ms
step:1221/1390 train_time:1529721ms step_avg:1263.19ms
step:1222/1390 train_time:1531027ms step_avg:1263.22ms
step:1223/1390 train_time:1532343ms step_avg:1263.27ms
step:1224/1390 train_time:1533681ms step_avg:1263.33ms
step:1225/1390 train_time:1534993ms step_avg:1263.37ms
step:1225/1390 val_loss:3.5716 train_time:1534994ms step_avg:1263.37ms
step:1226/1390 train_time:1536345ms step_avg:1263.44ms
step:1227/1390 train_time:1537678ms step_avg:1263.50ms
step:1228/1390 train_time:1538991ms step_avg:1263.54ms
step:1229/1390 train_time:1540321ms step_avg:1263.59ms
step:1230/1390 train_time:1541632ms step_avg:1263.63ms
step:1231/1390 train_time:1542961ms step_avg:1263.69ms
step:1232/1390 train_time:1544275ms step_avg:1263.73ms
step:1233/1390 train_time:1545585ms step_avg:1263.76ms
step:1234/1390 train_time:1546890ms step_avg:1263.80ms
step:1235/1390 train_time:1548199ms step_avg:1263.84ms
step:1236/1390 train_time:1549501ms step_avg:1263.87ms
step:1237/1390 train_time:1550836ms step_avg:1263.93ms
step:1238/1390 train_time:1552150ms step_avg:1263.97ms
step:1239/1390 train_time:1553463ms step_avg:1264.01ms
step:1240/1390 train_time:1554792ms step_avg:1264.06ms
step:1241/1390 train_time:1556103ms step_avg:1264.10ms
step:1242/1390 train_time:1557415ms step_avg:1264.14ms
step:1243/1390 train_time:1558732ms step_avg:1264.18ms
step:1244/1390 train_time:1560043ms step_avg:1264.22ms
step:1245/1390 train_time:1561348ms step_avg:1264.25ms
step:1246/1390 train_time:1562661ms step_avg:1264.29ms
step:1247/1390 train_time:1563961ms step_avg:1264.32ms
step:1248/1390 train_time:1565260ms step_avg:1264.35ms
step:1249/1390 train_time:1566568ms step_avg:1264.38ms
step:1250/1390 train_time:1567891ms step_avg:1264.43ms
step:1250/1390 val_loss:3.5659 train_time:1567891ms step_avg:1264.43ms
step:1251/1390 train_time:1569235ms step_avg:1264.49ms
step:1252/1390 train_time:1570537ms step_avg:1264.52ms
step:1253/1390 train_time:1571837ms step_avg:1264.55ms
step:1254/1390 train_time:1573170ms step_avg:1264.61ms
step:1255/1390 train_time:1574481ms step_avg:1264.64ms
step:1256/1390 train_time:1575790ms step_avg:1264.68ms
step:1257/1390 train_time:1577106ms step_avg:1264.72ms
step:1258/1390 train_time:1578437ms step_avg:1264.77ms
step:1259/1390 train_time:1579741ms step_avg:1264.80ms
step:1260/1390 train_time:1581044ms step_avg:1264.84ms
step:1261/1390 train_time:1582394ms step_avg:1264.90ms
step:1262/1390 train_time:1583712ms step_avg:1264.95ms
step:1263/1390 train_time:1585023ms step_avg:1264.98ms
step:1264/1390 train_time:1586325ms step_avg:1265.01ms
step:1265/1390 train_time:1587648ms step_avg:1265.06ms
step:1266/1390 train_time:1588958ms step_avg:1265.09ms
step:1267/1390 train_time:1590287ms step_avg:1265.14ms
step:1268/1390 train_time:1591608ms step_avg:1265.19ms
step:1269/1390 train_time:1592931ms step_avg:1265.24ms
step:1270/1390 train_time:1594236ms step_avg:1265.27ms
step:1271/1390 train_time:1595546ms step_avg:1265.30ms
step:1272/1390 train_time:1596850ms step_avg:1265.33ms
step:1273/1390 train_time:1598158ms step_avg:1265.37ms
step:1274/1390 train_time:1599471ms step_avg:1265.40ms
step:1275/1390 train_time:1600785ms step_avg:1265.44ms
step:1275/1390 val_loss:3.5614 train_time:1600786ms step_avg:1265.44ms
step:1276/1390 train_time:1602131ms step_avg:1265.51ms
step:1277/1390 train_time:1603455ms step_avg:1265.55ms
step:1278/1390 train_time:1604772ms step_avg:1265.59ms
step:1279/1390 train_time:1606107ms step_avg:1265.65ms
step:1280/1390 train_time:1607427ms step_avg:1265.69ms
step:1281/1390 train_time:1608735ms step_avg:1265.72ms
step:1282/1390 train_time:1610038ms step_avg:1265.75ms
step:1283/1390 train_time:1611363ms step_avg:1265.80ms
step:1284/1390 train_time:1612670ms step_avg:1265.83ms
step:1285/1390 train_time:1613988ms step_avg:1265.87ms
step:1286/1390 train_time:1615310ms step_avg:1265.92ms
step:1287/1390 train_time:1616617ms step_avg:1265.95ms
step:1288/1390 train_time:1617929ms step_avg:1265.98ms
step:1289/1390 train_time:1619267ms step_avg:1266.04ms
step:1290/1390 train_time:1620602ms step_avg:1266.10ms
step:1291/1390 train_time:1621918ms step_avg:1266.13ms
step:1292/1390 train_time:1623231ms step_avg:1266.17ms
step:1293/1390 train_time:1624542ms step_avg:1266.21ms
step:1294/1390 train_time:1625856ms step_avg:1266.24ms
step:1295/1390 train_time:1627177ms step_avg:1266.29ms
step:1296/1390 train_time:1628510ms step_avg:1266.34ms
step:1297/1390 train_time:1629817ms step_avg:1266.37ms
step:1298/1390 train_time:1631124ms step_avg:1266.40ms
step:1299/1390 train_time:1632440ms step_avg:1266.44ms
step:1300/1390 train_time:1633763ms step_avg:1266.48ms
step:1300/1390 val_loss:3.5575 train_time:1633764ms step_avg:1266.48ms
step:1301/1390 train_time:1635104ms step_avg:1266.54ms
step:1302/1390 train_time:1636429ms step_avg:1266.59ms
step:1303/1390 train_time:1637762ms step_avg:1266.64ms
step:1304/1390 train_time:1639085ms step_avg:1266.68ms
step:1305/1390 train_time:1640405ms step_avg:1266.72ms
step:1306/1390 train_time:1641724ms step_avg:1266.76ms
step:1307/1390 train_time:1643047ms step_avg:1266.81ms
step:1308/1390 train_time:1644366ms step_avg:1266.85ms
step:1309/1390 train_time:1645683ms step_avg:1266.88ms
step:1310/1390 train_time:1647005ms step_avg:1266.93ms
step:1311/1390 train_time:1648319ms step_avg:1266.96ms
step:1312/1390 train_time:1649648ms step_avg:1267.01ms
step:1313/1390 train_time:1650951ms step_avg:1267.04ms
step:1314/1390 train_time:1652271ms step_avg:1267.08ms
step:1315/1390 train_time:1653588ms step_avg:1267.12ms
step:1316/1390 train_time:1654901ms step_avg:1267.15ms
step:1317/1390 train_time:1656212ms step_avg:1267.19ms
step:1318/1390 train_time:1657529ms step_avg:1267.22ms
step:1319/1390 train_time:1658849ms step_avg:1267.26ms
step:1320/1390 train_time:1660160ms step_avg:1267.30ms
step:1321/1390 train_time:1661489ms step_avg:1267.35ms
step:1322/1390 train_time:1662795ms step_avg:1267.37ms
step:1323/1390 train_time:1664105ms step_avg:1267.41ms
step:1324/1390 train_time:1665418ms step_avg:1267.44ms
step:1325/1390 train_time:1666757ms step_avg:1267.50ms
step:1325/1390 val_loss:3.5543 train_time:1666757ms step_avg:1267.50ms
step:1326/1390 train_time:1668102ms step_avg:1267.55ms
step:1327/1390 train_time:1669422ms step_avg:1267.59ms
step:1328/1390 train_time:1670757ms step_avg:1267.65ms
step:1329/1390 train_time:1672068ms step_avg:1267.68ms
step:1330/1390 train_time:1673402ms step_avg:1267.73ms
step:1331/1390 train_time:1674753ms step_avg:1267.79ms
step:1332/1390 train_time:1676076ms step_avg:1267.83ms
step:1333/1390 train_time:1677390ms step_avg:1267.87ms
step:1334/1390 train_time:1678702ms step_avg:1267.90ms
step:1335/1390 train_time:1680019ms step_avg:1267.94ms
step:1336/1390 train_time:1681344ms step_avg:1267.98ms
step:1337/1390 train_time:1682656ms step_avg:1268.01ms
step:1338/1390 train_time:1683977ms step_avg:1268.05ms
step:1339/1390 train_time:1685303ms step_avg:1268.10ms
step:1340/1390 train_time:1686614ms step_avg:1268.13ms
step:1341/1390 train_time:1687956ms step_avg:1268.19ms
step:1342/1390 train_time:1689263ms step_avg:1268.22ms
step:1343/1390 train_time:1690578ms step_avg:1268.25ms
step:1344/1390 train_time:1691910ms step_avg:1268.30ms
step:1345/1390 train_time:1693232ms step_avg:1268.34ms
step:1346/1390 train_time:1694569ms step_avg:1268.39ms
step:1347/1390 train_time:1695895ms step_avg:1268.43ms
step:1348/1390 train_time:1697224ms step_avg:1268.48ms
step:1349/1390 train_time:1698529ms step_avg:1268.51ms
step:1350/1390 train_time:1699862ms step_avg:1268.55ms
step:1350/1390 val_loss:3.5522 train_time:1699862ms step_avg:1268.55ms
step:1351/1390 train_time:1701224ms step_avg:1268.62ms
step:1352/1390 train_time:1702570ms step_avg:1268.68ms
step:1353/1390 train_time:1703905ms step_avg:1268.73ms
step:1354/1390 train_time:1705230ms step_avg:1268.77ms
step:1355/1390 train_time:1706543ms step_avg:1268.81ms
step:1356/1390 train_time:1707883ms step_avg:1268.86ms
step:1357/1390 train_time:1709215ms step_avg:1268.90ms
step:1358/1390 train_time:1710529ms step_avg:1268.94ms
step:1359/1390 train_time:1711850ms step_avg:1268.98ms
step:1360/1390 train_time:1713177ms step_avg:1269.02ms
step:1361/1390 train_time:1714510ms step_avg:1269.07ms
step:1362/1390 train_time:1715841ms step_avg:1269.11ms
step:1363/1390 train_time:1717172ms step_avg:1269.16ms
step:1364/1390 train_time:1718472ms step_avg:1269.18ms
step:1365/1390 train_time:1719789ms step_avg:1269.22ms
step:1366/1390 train_time:1721120ms step_avg:1269.26ms
step:1367/1390 train_time:1722437ms step_avg:1269.30ms
step:1368/1390 train_time:1723775ms step_avg:1269.35ms
step:1369/1390 train_time:1725123ms step_avg:1269.41ms
step:1370/1390 train_time:1726445ms step_avg:1269.45ms
step:1371/1390 train_time:1727775ms step_avg:1269.49ms
step:1372/1390 train_time:1729095ms step_avg:1269.53ms
step:1373/1390 train_time:1730432ms step_avg:1269.58ms
step:1374/1390 train_time:1731747ms step_avg:1269.61ms
step:1375/1390 train_time:1733060ms step_avg:1269.64ms
step:1375/1390 val_loss:3.5515 train_time:1733061ms step_avg:1269.64ms
step:1376/1390 train_time:1734420ms step_avg:1269.71ms
step:1377/1390 train_time:1735755ms step_avg:1269.75ms
step:1378/1390 train_time:1737081ms step_avg:1269.80ms
step:1379/1390 train_time:1738399ms step_avg:1269.83ms
step:1380/1390 train_time:1739728ms step_avg:1269.87ms
step:1381/1390 train_time:1741070ms step_avg:1269.93ms
step:1382/1390 train_time:1742382ms step_avg:1269.96ms
step:1383/1390 train_time:1743727ms step_avg:1270.01ms
step:1384/1390 train_time:1745042ms step_avg:1270.04ms
step:1385/1390 train_time:1746361ms step_avg:1270.08ms
step:1386/1390 train_time:1747683ms step_avg:1270.12ms
step:1387/1390 train_time:1748993ms step_avg:1270.15ms
step:1388/1390 train_time:1750296ms step_avg:1270.17ms
step:1389/1390 train_time:1751607ms step_avg:1270.20ms
step:1390/1390 train_time:1752918ms step_avg:1270.23ms
step:1390/1390 val_loss:3.5530 train_time:1752918ms step_avg:1270.23ms
peak memory consumption: 56226 MiB
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment