Created
May 14, 2025 22:39
-
-
Save tysam-code/1c3c6ce933b1746cdcf19c6e6d7cd873 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
with open(sys.argv[0]) as f: | |
code = f.read() # read the code of this file ASAP, for logging | |
import uuid | |
import time | |
import glob | |
import subprocess | |
import contextlib | |
from dataclasses import dataclass | |
import torch | |
torch.empty(1, device='cuda', requires_grad=True).backward() | |
from torch import nn | |
import torch.nn.functional as F | |
import torch.distributed as dist | |
from torch.nn.parallel import DistributedDataParallel as DDP | |
# use of FlexAttention contributed by @KoszarskyB | |
from torch.nn.attention.flex_attention import BlockMask, flex_attention | |
# ----------------------------------------------------------------------------- | |
# Muon optimizer | |
@torch.compile | |
def zeropower_via_newtonschulz5(G, steps): | |
""" | |
Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a | |
quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose | |
of minimizing steps, it turns out to be empirically effective to keep increasing the slope at | |
zero even beyond the point where the iteration no longer converges all the way to one everywhere | |
on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T | |
where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model | |
performance at all relative to UV^T, where USV^T = G is the SVD. | |
""" | |
assert len(G.shape) == 2 | |
a, b, c = (3.4445, -4.7750, 2.0315) | |
X = G.bfloat16() | |
if G.size(0) > G.size(1): | |
X = X.T | |
# Ensure spectral norm is at most 1 | |
X = X / (X.norm() + 1e-7) | |
# Perform the NS iterations | |
for _ in range(steps): | |
A = X @ X.T | |
B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng | |
X = a * X + B @ X | |
if G.size(0) > G.size(1): | |
X = X.T | |
return X | |
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz
    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.
    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).
    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        # distributed topology comes from torchrun's environment variables
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        assert all(isinstance(p, torch.Tensor) for p in params)
        sizes = {p.numel() for p in params}
        # one param group per distinct parameter size; each group gets one bf16
        # gather buffer per rank (only used by the disabled distributed path below)
        param_groups = [dict(params=[p for p in params if p.numel() == size],
                             update_buffer=[torch.empty(size, device='cuda', dtype=torch.bfloat16) for _ in range(self.world_size)])
                        for size in sizes]
        super().__init__(param_groups, defaults)

    def step(self):
        # NOTE(review): the async all_gather machinery below is disabled (turned into a
        # no-op string literal); this step() currently runs a single-process path only.
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            """
            handle = None
            params_world = None
            def update_prev():
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            """
            # Single-GPU-only experiments, disabling comms silliness due to segfault stuff :(
            for base_i in range(len(params)): #[::self.world_size]:
                if True:
                #if base_i + rank < len(params):
                    # NOTE(review): with rank > 0 this index would run past the end of
                    # params — assumes single-process (rank 0) execution; confirm
                    # before re-enabling the distributed path
                    p = params[base_i + self.rank]
                    g = p.grad
                    assert g is not None
                    state = self.state[p]
                    # lazily create the per-parameter momentum buffer
                    if 'momentum_buffer' not in state:
                        state['momentum_buffer'] = torch.zeros_like(g)
                    buf = state['momentum_buffer']
                    # buf <- momentum * buf + (1 - momentum) * g  (EMA-style momentum)
                    buf.lerp_(g, 1 - momentum)
                    # Nesterov: blend the raw grad toward the buffer; else use buffer directly
                    g = g.lerp_(buf, momentum) if nesterov else buf
                    g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                    # scale step by sqrt(max(1, rows/cols)) to keep update RMS consistent
                    # across parameter shapes
                    p.data.add_(g.view_as(p), alpha=-lr * max(1, p.size(0) / p.size(1)) ** 0.5)
                #else:
                #    g = update_buffers[rank]
                #update_prev() # async all_gather instead of sync all_reduce by @YouJiacheng
                #handle = dist.all_gather(update_buffers, g, async_op=True)
                #params_world = params[base_i : base_i + self.world_size]
            #update_prev()
# ----------------------------------------------------------------------------- | |
# PyTorch nn.Module definitions for the GPT-2 model | |
def norm(x): | |
return F.rms_norm(x, (x.size(-1),)) | |
class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype at call time,
    so a single fp32 master weight can serve bf16 activations."""
    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        weight = self.weight.type_as(x)
        return F.linear(x, weight)
class Rotary(nn.Module): | |
def __init__(self, dim, max_seq_len=65536): | |
super().__init__() | |
# half-truncate RoPE by @YouJiacheng | |
angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=dim//4, dtype=torch.float32) | |
angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(dim//4)]) | |
t = torch.arange(max_seq_len, dtype=torch.float32) | |
theta = torch.einsum('i,j -> ij', t, angular_freq) | |
self.cos = nn.Buffer(theta.cos(), persistent=False) | |
self.sin = nn.Buffer(theta.sin(), persistent=False) | |
def forward(self, x): | |
cos, sin = self.cos[None, :x.size(-3), None, :], self.sin[None, :x.size(-3), None, :] | |
x1, x2 = x.float().chunk(2, dim=-1) | |
y1 = x1 * cos + x2 * sin | |
y2 = x1 * (-sin) + x2 * cos | |
return torch.cat((y1, y2), 3).type_as(x) | |
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention using FlexAttention, with QK-norm, rotary
    embeddings, and a learnable mix-in of token value embeddings (`ve`)."""
    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # learnable blend between this layer's values and the token value embedding
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(dim // num_heads) # dim // num_heads = head_dim
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x, ve, block_mask):
        batch, seq_len = x.size(0), x.size(1)
        assert batch == 1, 'Must use batch size = 1 for FlexAttention'
        q = self.c_q(x).view(batch, seq_len, self.num_heads, -1)
        k = self.c_k(x).view(batch, seq_len, self.num_heads, -1)
        v = self.c_v(x).view(batch, seq_len, self.num_heads, -1)
        if ve is None: # skip mid-layers token value embeddings by @YouJiacheng
            v = self.lambdas[0] * v
        else:
            # @KoszarskyB & @Grad62304977
            v = self.lambdas[0] * v + self.lambdas[1] * ve.view_as(v)
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        out = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        # re-assemble all head outputs side by side
        out = out.transpose(1, 2).contiguous().view_as(x)
        return self.c_proj(out)
class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with a squared-ReLU activation
    (https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by
    @SKYLINEZ007 and @Grad62304977)."""
    def __init__(self, dim):
        super().__init__()
        hidden = 4 * dim
        self.c_fc = CastedLinear(dim, hidden)
        self.c_proj = CastedLinear(hidden, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        hidden = F.relu(self.c_fc(x)).square()
        return self.c_proj(hidden)
class Block(nn.Module):
    """Transformer block: optional attention sublayer plus MLP, each with a pre-norm
    residual. The input stream is first blended with the embedding residual x0 via
    learnable lambdas."""
    def __init__(self, model_dim, num_heads, use_attn=True):
        super().__init__()
        self.attn = CausalSelfAttention(model_dim, num_heads) if use_attn else None
        self.mlp = MLP(model_dim)
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, ve, x0, block_mask):
        # learnable blend of the current stream with the embedding residual x0
        x = self.lambdas[0] * x + self.lambdas[1] * x0
        if self.attn is not None:
            x = x + self.attn(norm(x), ve, block_mask)
        x = x + self.mlp(norm(x))
        return x
class ValueEmbedding(nn.Module):
    """Three token value embeddings shared across a 12-layer stack in a
    0,1,2,...,0,1,2 layout: the first and last three layers get embeddings,
    the middle six get None."""
    def __init__(self, vocab_size, model_dim):
        super().__init__()
        self.embed = nn.ModuleList(nn.Embedding(vocab_size, model_dim) for _ in range(3))

    def forward(self, inputs):
        # 012 ... 012 structure on token value embeddings by @YouJiacheng,
        # improved on @leloykun's U-net structure
        e0, e1, e2 = (emb(inputs).bfloat16() for emb in self.embed)
        return [e0, e1, e2, None, None, None, None, None, None, e0, e1, e2]
# ----------------------------------------------------------------------------- | |
# The main GPT-2 model | |
class GPT(nn.Module):
    """GPT-2-style transformer with a U-net block layout (learnable encoder->decoder
    skip connections), token value embeddings, FlexAttention with a document-causal
    sliding-window block mask, and tanh logit softcapping. forward() expects flat 1-D
    token sequences (effective batch size 1) and returns the cross-entropy loss."""
    def __init__(self, vocab_size, num_layers, num_heads, model_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, model_dim)
        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
        self.blocks = nn.ModuleList([Block(model_dim, num_heads, use_attn=(i != 7))
                                     for i in range(num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(vocab_size, model_dim)
        self.lm_head = CastedLinear(model_dim, vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977
        # U-net design by @brendanh0gan
        self.num_encoder_layers = num_layers // 2 # Half of the layers for encoder
        self.num_decoder_layers = num_layers - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

    def forward(self, inputs, targets, sliding_window_num_blocks):
        """Return the mean next-token cross-entropy loss.

        inputs/targets: 1-D int token tensors of equal length (a multiple of 128);
        sliding_window_num_blocks: scalar int32 tensor, attention window in 128-token blocks.
        """
        BLOCK_SIZE = 128
        seq_len = len(inputs)
        assert seq_len % BLOCK_SIZE == 0
        total_num_blocks = seq_len // BLOCK_SIZE
        assert inputs.ndim == 1
        # 50256 is the GPT-2 <|endoftext|> separator: cumsum assigns each position a document id
        docs = (inputs == 50256).cumsum(0)
        # per-128-token-block first/last document id, for block-level mask construction
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()
        def document_causal(b, h, q_idx, kv_idx):
            # token-level mask_mod: attend only causally and within the same document
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask
        def dense_to_ordered(dense_mask):
            # convert a dense (q_block, kv_block) boolean mask into FlexAttention's
            # compressed (num_blocks, indices) representation
            num_blocks = dense_mask.sum(dim=-1, dtype=torch.int32)
            indices = dense_mask.argsort(dim=-1, descending=True, stable=True).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()
        def create_doc_swc_block_mask(sliding_window_num_blocks):
            # build the block-sparse mask = causal x sliding-window x same-document,
            # split into partial blocks (need mask_mod) and fully-dense blocks
            kv_idx = block_idx = torch.arange(total_num_blocks, dtype=torch.int32, device='cuda')
            q_idx = block_idx[:, None]
            causal_bm = q_idx >= kv_idx
            causal_full_bm = q_idx > kv_idx
            window_bm = q_idx - kv_idx < sliding_window_num_blocks
            window_full_bm = window_bm # block-wise sliding window by @YouJiacheng
            # document_bm = (docs_low[q_idx] <= docs_high[kv_idx]) & (docs_low[kv_idx] <= docs_high[q_idx])
            document_bm = (docs_low[:, None] <= docs_high) & (docs_low <= docs_high[:, None])
            document_full_bm = (docs_low[:, None] == docs_high) & (docs_low == docs_high[:, None])
            nonzero_bm = causal_bm & window_bm & document_bm
            full_bm = causal_full_bm & window_full_bm & document_full_bm
            kv_num_blocks, kv_indices = dense_to_ordered(nonzero_bm & ~full_bm)
            full_kv_num_blocks, full_kv_indices = dense_to_ordered(full_bm)
            return BlockMask.from_kv_blocks(
                kv_num_blocks,
                kv_indices,
                full_kv_num_blocks,
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )
        block_mask = create_doc_swc_block_mask(sliding_window_num_blocks)
        x0 = norm(self.embed(inputs[None]).bfloat16()) # use of norm here by @Grad62304977
        x = x0
        ve = self.value_embeds(inputs)
        assert len(ve) == len(self.blocks)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]
        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)
        x = norm(x)
        logits = self.lm_head(x)
        logits = 15 * torch.tanh(logits / 15) # @Grad62304977 added tanh softcapping, @KoszarskyB reduced it from 30 to 15
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets)
        return loss
# ----------------------------------------------------------------------------- | |
# Our own simple Distributed Data Loader | |
def _load_data_shard(path):
    """Read one token shard (.bin) into a pinned uint16 CPU tensor.

    File layout: 256 int32 header words [magic, version, token_count, ...]
    followed by token_count uint16 tokens.
    """
    # validate the header without touching the payload
    header = torch.from_file(path, False, 256, dtype=torch.int32)
    assert header[0] == 20240520, 'magic number mismatch in the data .bin file'
    assert header[1] == 1, 'unsupported version'
    num_tokens = int(header[2]) # number of tokens (claimed)
    # pre-allocate a pinned buffer and read straight into it: avoids both the
    # pin_memory copy and a bytes->array copy (by @YouJiacheng)
    tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
    with open(path, 'rb', buffering=0) as f:
        f.seek(256 * 4)
        nbytes = f.readinto(tokens.numpy())
    assert nbytes == 2 * num_tokens, 'number of tokens read does not match header'
    return tokens
class DistributedDataLoader:
    """Shard-cycling token loader: every rank walks the same shard sequence and
    slices out its own contiguous chunk of each global batch."""
    def __init__(self, filename_pattern):
        self.rank = int(os.environ['RANK'])
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.files = sorted(glob.glob(filename_pattern))
        self.reset()

    def reset(self):
        # advance() increments first, so -1 makes the first shard index 0
        self.current_shard = -1
        self.advance()

    def advance(self):
        # wrap around the shard list indefinitely
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = 0
        self.tokens = _load_data_shard(self.files[self.current_shard])

    def next_batch(self, batch_size):
        assert batch_size % self.world_size == 0
        device_batch_size = batch_size // self.world_size
        # move to the next shard when this one cannot supply a full batch
        # (+1 accounts for the one-token shift between inputs and targets)
        if self.current_position + batch_size + 1 >= len(self.tokens):
            self.advance()
        offset = self.current_position + self.rank * device_batch_size
        chunk = self.tokens[offset:offset + device_batch_size + 1]
        self.current_position += batch_size
        # targets are inputs shifted by one token
        inputs = chunk[:-1].to(device='cuda', dtype=torch.int32, non_blocking=True)
        targets = chunk[1:].to(device='cuda', dtype=torch.int64, non_blocking=True)
        return inputs, targets
# ----------------------------------------------------------------------------- | |
# int main | |
@dataclass
class Hyperparameters:
    """Run configuration.

    Fix: the original attributes had no type annotations, so @dataclass generated no
    fields at all (dataclasses only pick up annotated class variables) and the decorator
    was inert. Annotations make these real fields, enabling keyword overrides
    (e.g. Hyperparameters(num_iterations=10)), a useful __repr__, and equality —
    while Hyperparameters() with all defaults behaves exactly as before.
    """
    # data
    train_bin: str = 'data/fineweb10B/fineweb_train_*.bin' # input .bin to train on
    val_bin: str = 'data/fineweb10B/fineweb_val_*.bin' # input .bin to eval validation loss on
    # optimization
    batch_size: int = 8*64*1024 # batch size in tokens
    max_device_batch_size: int = 64*1024 # batch size per device in tokens
    num_iterations: int = 1390 # number of iterations to run
    cooldown_frac: float = 0.4 # fraction of training spent cooling down the learning rate
    bf16_embeds: bool = True # cast nn.Embedding weights to bfloat16
    # evaluation and logging
    val_loss_every: int = 25 #125 # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    # implementation
    save_checkpoint: bool = False # save model/optimizer state at the final step
args = Hyperparameters()
micro_bs = args.max_device_batch_size
# set up DDP (distributed data parallel). torchrun sets this env variable
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
torch.cuda.set_device(local_rank)
dist.init_process_group(backend='nccl', device_id=torch.device(local_rank))
dist.barrier()
master_process = (rank == 0) # this process will do logging, checkpointing etc.
# begin logging: each run gets a fresh uuid-named logfile under logs/
logfile = None
if master_process:
    run_id = uuid.uuid4()
    os.makedirs('logs', exist_ok=True)
    logfile = f'logs/{run_id}.txt'
    print(logfile)
def print0(s, console=False):
    """Append s to the master rank's logfile; also echo to stdout when console=True.

    No-op on non-master ranks (reads module-level `master_process` and `logfile`).
    """
    if not master_process:
        return
    with open(logfile, 'a') as f:
        if console:
            print(s)
        print(s, file=f)
# begin by printing this file (the Python code) to the log for reproducibility
print0(code)
print0('='*100)
# log information about the hardware/software environment this is running on
print0(f'Running Python {sys.version}')
print0(f'Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}')
print0(subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout)
print0('='*100)
# load data
train_loader = DistributedDataLoader(args.train_bin)
val_loader = DistributedDataLoader(args.val_bin)
print0(f'Training dataloader files: {train_loader.files}')
print0(f'Validation dataloader files: {val_loader.files}')
print0('='*100)
# init model_opt dict, this will hold all of the separate models that we use here
# DiLoCo outer-loop hyperparameters (single-process simulation of distributed workers)
outer_opt_lr = 0.7 # outer SGD learning rate
outer_opt_momentum = .9 # initial outer SGD momentum (later overwritten by a warmup schedule)
models_opts_schedulers = [] # list of (model_replica, [optimizers], [schedulers]) tuples
num_models_to_simulate = 8 # number of simulated DiLoCo workers, one per microbatch
#diloco_update_steps = 1
diloco_update_steps = 25 #10  # inner steps between outer (DiLoCo) synchronizations
# Steps to compile before copying out the model to its replicas
compile_steps = 15 #2 #20
# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
core_model = GPT(vocab_size=50304, num_layers=12, num_heads=6, model_dim=768)
core_model = core_model.cuda()
if args.bf16_embeds:
    # cast only the embedding tables to bf16; other weights stay fp32
    for m in core_model.modules():
        if isinstance(m, nn.Embedding):
            m.bfloat16()
core_model = torch.compile(core_model)
#####ddp_model = DDP(model, device_ids=[local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
# Add outer Nesterov optimizer to the core model (the DiLoCo outer optimizer)
outer_opt = torch.optim.SGD(core_model.parameters(), lr=outer_opt_lr, momentum=outer_opt_momentum, nesterov=True)
######################################################################################
#     Set Initial Momentum To 0 in outer_opt (PyTorch bug w/ first step dampening)   #
######################################################################################
# feed one all-zeros gradient step so SGD allocates momentum buffers filled with zeros,
# instead of seeding them with the first real grad on the first outer step
for parameter in core_model.parameters():
    parameter.grad = torch.zeros_like(parameter)
# Set outer opt momentum buffers (best to do this internally to avoid spaghetti code)
outer_opt.step()
core_model.zero_grad(set_to_none=True)
print("Compiling model!")
# call model so it is properly built, before cloning: warm up torch.compile with dummy
# data so the compiled artifacts are shared by all deep-copied replicas
for _ in range(compile_steps):
    core_model.forward(torch.randint(0, 128, (1024*64,)).to(device='cuda', dtype=torch.long), torch.randint(0, 128, (1024*64,)).to(device='cuda', dtype=torch.long), torch.tensor([128], device='cuda', dtype=torch.long)).mean().backward()
# Set gradients to none
core_model.zero_grad(set_to_none=True)
print("Model compiled.")
# tmp dev import
import copy
# create one independent replica (simulated DiLoCo worker) of the core model,
# each with its own Adam + Muon inner optimizers and LR schedulers
for _ in range(num_models_to_simulate):
    # make model copy
    model_copy = copy.deepcopy(core_model)
    # collect the parameters to optimize: 2-D hidden matrices go to Muon,
    # embeddings/head/scalars go to Adam (per Muon's usage guidance)
    hidden_matrix_params = [p for p in model_copy.blocks.parameters() if p.ndim == 2]
    embed_params = [model_copy.embed.weight, *model_copy.value_embeds.parameters()]
    scalar_params = [p for p in model_copy.parameters() if p.ndim < 2]
    head_params = [model_copy.lm_head.weight]
    # init the optimizer(s)
    optimizer1 = torch.optim.Adam([dict(params=embed_params, lr=0.6),
                                   dict(params=head_params, lr=0.008),
                                   dict(params=scalar_params, lr=0.04)],
                                  betas=(0.8, 0.95), fused=True)
    optimizer2 = Muon(hidden_matrix_params, lr=0.05, momentum=0.95)
    optimizers = [optimizer1, optimizer2]
    # learning rate schedule: stable then decay
    def get_lr(it):
        t = 1 - it / args.num_iterations # time remaining in training
        assert 1 >= t > 0
        # 1) constant lr for first part of training
        if t >= args.cooldown_frac:
            return 1.0
        # 2) then linear cooldown
        else:
            return t / args.cooldown_frac
    schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]
    models_opts_schedulers.append((model_copy, optimizers, schedulers))
# NOTE(review): this reuses `get_lr` leaked from the loop above; the scheduler itself
# appears unused later (the outer lr is set manually each sync) — confirm
outer_opt_scheduler = torch.optim.lr_scheduler.LambdaLR(outer_opt, get_lr)
# sliding window size schedule: linear increase over training in chunks of 128 from 128 -> 1792. By @fernbear.bsky.social
def get_sliding_window_blocks(it):
    """Return the sliding-window size in 128-token blocks at training step `it`,
    linearly interpolated over the run (reads the module-level `args`)."""
    progress = it / args.num_iterations # training progress
    assert 0 <= progress <= 1
    window_tokens = (1 - progress) * 128 + progress * 1856
    return int(window_tokens // 128)
# scalar tensor holding the current attention-window size (in 128-token blocks)
sliding_window_num_blocks = torch.tensor(1, dtype=torch.int32, device='cuda')
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
train_steps = args.num_iterations
for step in range(train_steps + 1):
    last_step = (step == train_steps)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val
    # grow the attention window linearly over training
    sliding_window_num_blocks.copy_(get_sliding_window_blocks(step))
    # --------------- VALIDATION SECTION -----------------
    if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        # NOTE(review): validation uses core_model (the DiLoCo "global" model), which
        # is only synced with the replicas every diloco_update_steps steps — confirm
        core_model.eval()
        val_loader.reset()
        val_loss = 0.0
        # calculate the number of steps to take in the val loop.
        val_batch_size = world_size * micro_bs
        assert args.val_tokens % val_batch_size == 0
        val_steps = args.val_tokens // val_batch_size
        for _ in range(val_steps):
            with torch.no_grad():
                inputs_val, targets_val = val_loader.next_batch(val_batch_size)
                val_loss += core_model(inputs_val, targets_val, sliding_window_num_blocks)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # logging
        print0(f'step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms', console=True)
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()
    if last_step:
        if master_process and args.save_checkpoint:
            # NOTE(review): `model` and `optimizers` here are loop-leaked variables that
            # refer to the LAST replica, not core_model — confirm this is intended
            log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
            os.makedirs(f'logs/{run_id}', exist_ok=True)
            torch.save(log, f'logs/{run_id}/state_step{step:06d}.pt')
        # the last step only has the validation loop, so break to avoid training
        break
    # --------------- TRAINING SECTION -----------------
    #model.train()
    # set each model to train
    for model, _, _ in models_opts_schedulers:
        model.train()
    batch_size = args.batch_size
    assert batch_size % world_size == 0
    inputs_train, targets_train = train_loader.next_batch(batch_size)
    assert len(inputs_train) <= micro_bs or len(inputs_train) % micro_bs == 0
    assert batch_size//micro_bs == len(models_opts_schedulers), "Microbatchsize and number of model_opt pairs need to be equal in this experiment (functions may need to be written to support iteration over model pairs instead of indexing by microbatch idx)."
    # each microbatch goes to a distinct replica, simulating one DiLoCo worker each
    for i, (micro_inputs_train, micro_targets_train) in enumerate(zip(inputs_train.split(micro_bs), targets_train.split(micro_bs))):
        # forward on distinct model
        models_opts_schedulers[i][0](micro_inputs_train, micro_targets_train, sliding_window_num_blocks).backward()
        #model(micro_inputs_train, micro_targets_train, sliding_window_num_blocks).backward()
    # momentum warmup for Muon: ramp 0.85 -> 0.95 over the first 300 steps
    frac = min(step/300, 1)
    for model, opts, schedulers in models_opts_schedulers:
        # update momentum for muon in each group (opts[1] is the Muon optimizer)
        for group in opts[1].param_groups: #optimizer2.param_groups:
            group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
        # step the optimizers and schedulers
        for opt, sched in zip(opts, schedulers):
            opt.step()
            if step != train_steps-1:
                sched.step()
        # null the gradients
        model.zero_grad(set_to_none=True)
    #############################################
    #  DiLoCo Outer Loop (Distributed) Updates  #
    #############################################
    # Update core model w/ updates from other models (optionally on different timescales for different parts, just simply 1 step per for now)
    # Zip all parameters together, so we can stack them then average them, then merge them to the core model
    if last_step or (step != 0 and step % diloco_update_steps == 0):
        models_group_params = [mos[0].parameters() for mos in models_opts_schedulers]
        # iterator yielding, per core parameter, the tuple of replica parameters
        models_grouped_params = zip(*models_group_params)
        ##################
        #    Momentum    #
        ##################
        # warm the outer momentum from 0 up to 0.9 over the first 1000 steps
        outer_opt_momentum_warmup_steps = 1000 #300 #250 #500 #100 #300 #600 #300
        outer_opt_min_momentum = 0. #.6 #.9 #.6 #.5 #.6
        outer_opt_max_momentum = .9 #.85 #.95 #.9 #.9
        frac = min(step/outer_opt_momentum_warmup_steps, 1)
        curr_outer_momentum = (1 - frac) * outer_opt_min_momentum + frac * outer_opt_max_momentum
        #################
        #   Dampening   #
        #################
        curr_dampening = 0.0 # tmp for now
        """
        # damping hparams
        dampening_steps = 300 #100 #300
        dampening_max = 0.6
        dampening_min = 0.0
        frac = min(step/dampening_steps, 1)
        curr_dampening = (1 - frac) * dampening_max + frac * dampening_min
        """
        # update momentum for each param group in outer opt
        for group in outer_opt.param_groups: #optimizer2.param_groups:
            group['lr'] = outer_opt_lr * get_lr(step)
            group['momentum'] = curr_outer_momentum
            #group['dampening'] = curr_dampening
            # dampening tied to momentum -> the momentum buffer behaves like an EMA of outer grads
            group['dampening'] = 1. - curr_outer_momentum #curr_dampening
        for core_parameters, dist_parameters_list in zip(core_model.parameters(), models_grouped_params):
            # TODO: individual parameter schedules?
            # TMP hack
            #params_list = list(dist_parameters_list)
            #dist_parameters_list = params_list
            # Simulate grad creation:
            # outer "gradient" = displacement of each replica from the core weights, averaged
            grads_all = (core_parameters.data.unsqueeze(0) - torch.stack(dist_parameters_list, dim=0))
            core_parameters.grad = grads_all.mean(dim=0)
            # Simulate update # reduce_mean
            #parameters.data.add_(grads, alpha=-diloco_lr) # = torch.stack(dist_parameters_list, dim=0).mean(dim=0)
            # If this is the first outer step, PyTorch defaults to filling the momentum buffer with
            # the grad, which is a horribly-biased estimator of the state of the network over training
            # Here, to account for the momentum warmup-process removing this zero-debiasing operation,
            # we simply act as if the momentum buffer zero for the first step (i.e. simply averaging
            # the network weights), and then let momemtum and warmup do their things from there
            # W/ the nesterov step, this means halving the initial first grad
            """
            if step == 0: #diloco_update_steps:
                #outer_opt.state[core_parameters]['momentum_buffer'].data.zero_()
                core_parameters.grad.data.div_(2./(2.-dampening_max))
            """
            # outer_opt step update
            # NOTE(review): stepping inside the per-parameter loop relies on
            # zero_grad(set_to_none=True) below so each step() only touches the one
            # parameter whose .grad was just set — confirm this reading of the flow
            outer_opt.step()
            outer_opt.zero_grad(set_to_none=True)
            # update model copies to reset to original value (would be done locally by each distributed worker)
            for dist_params in dist_parameters_list:
                dist_params.data.copy_(core_parameters.data)
            # Update core model for evals
            ####parameters.data = torch.stack(dist_parameters_list, dim=0).mean(dim=0)
            # Simulate broadcast back out (use this if not using distributed grads, but using the core model to sync instead)
            #[d_param.data.copy_(parameters.data) for d_param in dist_parameters_list]
    # logging
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f'step:{step+1}/{train_steps} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms', console=True)
print0(f'peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB')
dist.destroy_process_group()
==================================================================================================== | |
Running Python 3.12.7 (main, May 13 2025, 19:58:20) [GCC 13.2.0] | |
Running PyTorch 2.8.0.dev20250510+cu126 compiled for CUDA 12.6 | |
Tue May 13 21:51:24 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 560.35.03 Driver Version: 560.35.03 CUDA Version: 12.6 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 PCIe On | 00000000:01:00.0 Off | 0 | | |
| N/A 33C P0 79W / 350W | 1104MiB / 81559MiB | 3% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| 0 N/A N/A 30 C /usr/local/bin/python 0MiB | | |
+-----------------------------------------------------------------------------------------+ | |
==================================================================================================== | |
Training dataloader files: ['data/fineweb10B/fineweb_train_000001.bin', 'data/fineweb10B/fineweb_train_000002.bin', 'data/fineweb10B/fineweb_train_000003.bin', 'data/fineweb10B/fineweb_train_000004.bin', 'data/fineweb10B/fineweb_train_000005.bin', 'data/fineweb10B/fineweb_train_000006.bin', 'data/fineweb10B/fineweb_train_000007.bin', 'data/fineweb10B/fineweb_train_000008.bin'] | |
Validation dataloader files: ['data/fineweb10B/fineweb_val_000000.bin'] | |
==================================================================================================== | |
step:0/1390 val_loss:10.8258 train_time:0ms step_avg:nanms | |
step:1/1390 train_time:68715ms step_avg:nanms | |
step:2/1390 train_time:70393ms step_avg:nanms | |
step:3/1390 train_time:72158ms step_avg:nanms | |
step:4/1390 train_time:73942ms step_avg:nanms | |
step:5/1390 train_time:75711ms step_avg:nanms | |
step:6/1390 train_time:77513ms step_avg:nanms | |
step:7/1390 train_time:79298ms step_avg:nanms | |
step:8/1390 train_time:81079ms step_avg:nanms | |
step:9/1390 train_time:82860ms step_avg:nanms | |
step:10/1390 train_time:84639ms step_avg:nanms | |
step:11/1390 train_time:1766ms step_avg:nanms | |
step:12/1390 train_time:3544ms step_avg:nanms | |
step:13/1390 train_time:5322ms step_avg:1774.07ms | |
step:14/1390 train_time:7112ms step_avg:1777.95ms | |
step:15/1390 train_time:8895ms step_avg:1778.94ms | |
step:16/1390 train_time:10678ms step_avg:1779.75ms | |
step:17/1390 train_time:12469ms step_avg:1781.32ms | |
step:18/1390 train_time:14261ms step_avg:1782.61ms | |
step:19/1390 train_time:16058ms step_avg:1784.18ms | |
step:20/1390 train_time:17839ms step_avg:1783.89ms | |
step:21/1390 train_time:19618ms step_avg:1783.49ms | |
step:22/1390 train_time:21408ms step_avg:1784.02ms | |
step:23/1390 train_time:23194ms step_avg:1784.12ms | |
step:24/1390 train_time:24984ms step_avg:1784.59ms | |
step:25/1390 train_time:26789ms step_avg:1785.93ms | |
step:25/1390 val_loss:10.8258 train_time:26789ms step_avg:1785.95ms | |
step:26/1390 train_time:28643ms step_avg:1790.17ms | |
step:27/1390 train_time:30456ms step_avg:1791.51ms | |
step:28/1390 train_time:32252ms step_avg:1791.78ms | |
step:29/1390 train_time:34043ms step_avg:1791.72ms | |
step:30/1390 train_time:35830ms step_avg:1791.51ms | |
step:31/1390 train_time:37636ms step_avg:1792.20ms | |
step:32/1390 train_time:39429ms step_avg:1792.23ms | |
step:33/1390 train_time:41222ms step_avg:1792.25ms | |
step:34/1390 train_time:43032ms step_avg:1792.99ms | |
step:35/1390 train_time:44833ms step_avg:1793.34ms | |
step:36/1390 train_time:46631ms step_avg:1793.48ms | |
step:37/1390 train_time:48434ms step_avg:1793.85ms | |
step:38/1390 train_time:50242ms step_avg:1794.34ms | |
step:39/1390 train_time:52050ms step_avg:1794.81ms | |
step:40/1390 train_time:53854ms step_avg:1795.12ms | |
step:41/1390 train_time:55657ms step_avg:1795.40ms | |
step:42/1390 train_time:57454ms step_avg:1795.44ms | |
step:43/1390 train_time:59251ms step_avg:1795.48ms | |
step:44/1390 train_time:61049ms step_avg:1795.57ms | |
step:45/1390 train_time:62858ms step_avg:1795.94ms | |
step:46/1390 train_time:64655ms step_avg:1795.97ms | |
step:47/1390 train_time:66455ms step_avg:1796.08ms | |
step:48/1390 train_time:68257ms step_avg:1796.23ms | |
step:49/1390 train_time:70061ms step_avg:1796.43ms | |
step:50/1390 train_time:71862ms step_avg:1796.54ms | |
step:50/1390 val_loss:6.2260 train_time:71862ms step_avg:1796.55ms | |
step:51/1390 train_time:73705ms step_avg:1797.68ms | |
step:52/1390 train_time:75521ms step_avg:1798.11ms | |
step:53/1390 train_time:77317ms step_avg:1798.07ms | |
step:54/1390 train_time:79114ms step_avg:1798.04ms | |
step:55/1390 train_time:80914ms step_avg:1798.08ms | |
step:56/1390 train_time:82715ms step_avg:1798.16ms | |
step:57/1390 train_time:84519ms step_avg:1798.27ms | |
step:58/1390 train_time:86321ms step_avg:1798.35ms | |
step:59/1390 train_time:88119ms step_avg:1798.35ms | |
step:60/1390 train_time:89914ms step_avg:1798.28ms | |
step:61/1390 train_time:91711ms step_avg:1798.26ms | |
step:62/1390 train_time:93509ms step_avg:1798.25ms | |
step:63/1390 train_time:95305ms step_avg:1798.21ms | |
step:64/1390 train_time:97100ms step_avg:1798.16ms | |
step:65/1390 train_time:98904ms step_avg:1798.25ms | |
step:66/1390 train_time:100704ms step_avg:1798.29ms | |
step:67/1390 train_time:102512ms step_avg:1798.45ms | |
step:68/1390 train_time:104318ms step_avg:1798.58ms | |
step:69/1390 train_time:106124ms step_avg:1798.72ms | |
step:70/1390 train_time:107926ms step_avg:1798.77ms | |
step:71/1390 train_time:109739ms step_avg:1799.00ms | |
step:72/1390 train_time:111546ms step_avg:1799.13ms | |
step:73/1390 train_time:113352ms step_avg:1799.24ms | |
step:74/1390 train_time:115156ms step_avg:1799.31ms | |
step:75/1390 train_time:116960ms step_avg:1799.38ms | |
step:75/1390 val_loss:5.7637 train_time:116960ms step_avg:1799.38ms | |
step:76/1390 train_time:118801ms step_avg:1800.01ms | |
step:77/1390 train_time:120610ms step_avg:1800.15ms | |
step:78/1390 train_time:122403ms step_avg:1800.04ms | |
step:79/1390 train_time:124196ms step_avg:1799.95ms | |
step:80/1390 train_time:125996ms step_avg:1799.94ms | |
step:81/1390 train_time:127797ms step_avg:1799.96ms | |
step:82/1390 train_time:129599ms step_avg:1799.99ms | |
step:83/1390 train_time:131407ms step_avg:1800.10ms | |
step:84/1390 train_time:133234ms step_avg:1800.45ms | |
step:85/1390 train_time:135024ms step_avg:1800.31ms | |
step:86/1390 train_time:136832ms step_avg:1800.42ms | |
step:87/1390 train_time:138623ms step_avg:1800.30ms | |
step:88/1390 train_time:140424ms step_avg:1800.31ms | |
step:89/1390 train_time:142230ms step_avg:1800.38ms | |
step:90/1390 train_time:144034ms step_avg:1800.42ms | |
step:91/1390 train_time:145829ms step_avg:1800.35ms | |
step:92/1390 train_time:147633ms step_avg:1800.40ms | |
step:93/1390 train_time:149430ms step_avg:1800.36ms | |
step:94/1390 train_time:151234ms step_avg:1800.41ms | |
step:95/1390 train_time:153046ms step_avg:1800.54ms | |
step:96/1390 train_time:154854ms step_avg:1800.63ms | |
step:97/1390 train_time:156655ms step_avg:1800.64ms | |
step:98/1390 train_time:158469ms step_avg:1800.79ms | |
step:99/1390 train_time:160273ms step_avg:1800.82ms | |
step:100/1390 train_time:162078ms step_avg:1800.87ms | |
step:100/1390 val_loss:5.4354 train_time:162079ms step_avg:1800.87ms | |
step:101/1390 train_time:163914ms step_avg:1801.25ms | |
step:102/1390 train_time:165723ms step_avg:1801.33ms | |
step:103/1390 train_time:167516ms step_avg:1801.24ms | |
step:104/1390 train_time:169375ms step_avg:1801.86ms | |
step:105/1390 train_time:171205ms step_avg:1802.16ms | |
step:106/1390 train_time:173041ms step_avg:1802.51ms | |
step:107/1390 train_time:174869ms step_avg:1802.77ms | |
step:108/1390 train_time:176688ms step_avg:1802.94ms | |
step:109/1390 train_time:178517ms step_avg:1803.20ms | |
step:110/1390 train_time:180347ms step_avg:1803.47ms | |
step:111/1390 train_time:182199ms step_avg:1803.95ms | |
step:112/1390 train_time:184042ms step_avg:1804.34ms | |
step:113/1390 train_time:185908ms step_avg:1804.93ms | |
step:114/1390 train_time:187742ms step_avg:1805.21ms | |
step:115/1390 train_time:189592ms step_avg:1805.63ms | |
step:116/1390 train_time:191432ms step_avg:1805.96ms | |
step:117/1390 train_time:193272ms step_avg:1806.28ms | |
step:118/1390 train_time:195110ms step_avg:1806.58ms | |
step:119/1390 train_time:196943ms step_avg:1806.81ms | |
step:120/1390 train_time:198789ms step_avg:1807.17ms | |
step:121/1390 train_time:200615ms step_avg:1807.34ms | |
step:122/1390 train_time:202455ms step_avg:1807.63ms | |
step:123/1390 train_time:204294ms step_avg:1807.91ms | |
step:124/1390 train_time:206137ms step_avg:1808.22ms | |
step:125/1390 train_time:207982ms step_avg:1808.54ms | |
step:125/1390 val_loss:5.1573 train_time:207982ms step_avg:1808.54ms | |
step:126/1390 train_time:209861ms step_avg:1809.15ms | |
step:127/1390 train_time:211732ms step_avg:1809.68ms | |
step:128/1390 train_time:213563ms step_avg:1809.86ms | |
step:129/1390 train_time:215411ms step_avg:1810.17ms | |
step:130/1390 train_time:217253ms step_avg:1810.44ms | |
step:131/1390 train_time:219145ms step_avg:1811.12ms | |
step:132/1390 train_time:220982ms step_avg:1811.33ms | |
step:133/1390 train_time:222823ms step_avg:1811.57ms | |
step:134/1390 train_time:224663ms step_avg:1811.80ms | |
step:135/1390 train_time:226515ms step_avg:1812.12ms | |
step:136/1390 train_time:228358ms step_avg:1812.36ms | |
step:137/1390 train_time:230200ms step_avg:1812.60ms | |
step:138/1390 train_time:232053ms step_avg:1812.91ms | |
step:139/1390 train_time:233893ms step_avg:1813.12ms | |
step:140/1390 train_time:235738ms step_avg:1813.37ms | |
step:141/1390 train_time:237583ms step_avg:1813.61ms | |
step:142/1390 train_time:239425ms step_avg:1813.83ms | |
step:143/1390 train_time:241271ms step_avg:1814.07ms | |
step:144/1390 train_time:243112ms step_avg:1814.27ms | |
step:145/1390 train_time:244949ms step_avg:1814.43ms | |
step:146/1390 train_time:246794ms step_avg:1814.66ms | |
step:147/1390 train_time:248636ms step_avg:1814.86ms | |
step:148/1390 train_time:250492ms step_avg:1815.16ms | |
step:149/1390 train_time:252349ms step_avg:1815.46ms | |
step:150/1390 train_time:254201ms step_avg:1815.72ms | |
step:150/1390 val_loss:4.9263 train_time:254201ms step_avg:1815.72ms | |
step:151/1390 train_time:256090ms step_avg:1816.24ms | |
step:152/1390 train_time:257949ms step_avg:1816.54ms | |
step:153/1390 train_time:259780ms step_avg:1816.64ms | |
step:154/1390 train_time:261632ms step_avg:1816.89ms | |
step:155/1390 train_time:263467ms step_avg:1817.01ms | |
step:156/1390 train_time:265324ms step_avg:1817.29ms | |
step:157/1390 train_time:267165ms step_avg:1817.45ms | |
step:158/1390 train_time:269002ms step_avg:1817.58ms | |
step:159/1390 train_time:270853ms step_avg:1817.80ms | |
step:160/1390 train_time:272706ms step_avg:1818.04ms | |
step:161/1390 train_time:274553ms step_avg:1818.23ms | |
step:162/1390 train_time:276405ms step_avg:1818.45ms | |
step:163/1390 train_time:278253ms step_avg:1818.65ms | |
step:164/1390 train_time:280103ms step_avg:1818.85ms | |
step:165/1390 train_time:281961ms step_avg:1819.10ms | |
step:166/1390 train_time:283808ms step_avg:1819.28ms | |
step:167/1390 train_time:285657ms step_avg:1819.47ms | |
step:168/1390 train_time:287508ms step_avg:1819.67ms | |
step:169/1390 train_time:289356ms step_avg:1819.85ms | |
step:170/1390 train_time:291211ms step_avg:1820.07ms | |
step:171/1390 train_time:293055ms step_avg:1820.22ms | |
step:172/1390 train_time:294903ms step_avg:1820.39ms | |
step:173/1390 train_time:296759ms step_avg:1820.61ms | |
step:174/1390 train_time:298608ms step_avg:1820.78ms | |
step:175/1390 train_time:300460ms step_avg:1820.97ms | |
step:175/1390 val_loss:4.7409 train_time:300460ms step_avg:1820.97ms | |
step:176/1390 train_time:302353ms step_avg:1821.40ms | |
step:177/1390 train_time:304220ms step_avg:1821.67ms | |
step:178/1390 train_time:306080ms step_avg:1821.90ms | |
step:179/1390 train_time:307941ms step_avg:1822.13ms | |
step:180/1390 train_time:309787ms step_avg:1822.28ms | |
step:181/1390 train_time:311630ms step_avg:1822.40ms | |
step:182/1390 train_time:313473ms step_avg:1822.52ms | |
step:183/1390 train_time:315320ms step_avg:1822.66ms | |
step:184/1390 train_time:317184ms step_avg:1822.90ms | |
step:185/1390 train_time:319036ms step_avg:1823.06ms | |
step:186/1390 train_time:320891ms step_avg:1823.24ms | |
step:187/1390 train_time:322735ms step_avg:1823.36ms | |
step:188/1390 train_time:324582ms step_avg:1823.49ms | |
step:189/1390 train_time:326429ms step_avg:1823.62ms | |
step:190/1390 train_time:328280ms step_avg:1823.78ms | |
step:191/1390 train_time:330159ms step_avg:1824.08ms | |
step:192/1390 train_time:332004ms step_avg:1824.20ms | |
step:193/1390 train_time:333855ms step_avg:1824.35ms | |
step:194/1390 train_time:335690ms step_avg:1824.40ms | |
step:195/1390 train_time:337543ms step_avg:1824.56ms | |
step:196/1390 train_time:339377ms step_avg:1824.61ms | |
step:197/1390 train_time:341226ms step_avg:1824.74ms | |
step:198/1390 train_time:343065ms step_avg:1824.81ms | |
step:199/1390 train_time:344902ms step_avg:1824.88ms | |
step:200/1390 train_time:346753ms step_avg:1825.01ms | |
step:200/1390 val_loss:4.5919 train_time:346753ms step_avg:1825.02ms | |
step:201/1390 train_time:348649ms step_avg:1825.39ms | |
step:202/1390 train_time:350513ms step_avg:1825.59ms | |
step:203/1390 train_time:352360ms step_avg:1825.70ms | |
step:204/1390 train_time:354206ms step_avg:1825.80ms | |
step:205/1390 train_time:356062ms step_avg:1825.96ms | |
step:206/1390 train_time:357915ms step_avg:1826.09ms | |
step:207/1390 train_time:359813ms step_avg:1826.46ms | |
step:208/1390 train_time:361702ms step_avg:1826.78ms | |
step:209/1390 train_time:363596ms step_avg:1827.12ms | |
step:210/1390 train_time:365471ms step_avg:1827.35ms | |
step:211/1390 train_time:367372ms step_avg:1827.72ms | |
step:212/1390 train_time:369244ms step_avg:1827.94ms | |
step:213/1390 train_time:371128ms step_avg:1828.22ms | |
step:214/1390 train_time:373008ms step_avg:1828.47ms | |
step:215/1390 train_time:374886ms step_avg:1828.71ms | |
step:216/1390 train_time:376763ms step_avg:1828.95ms | |
step:217/1390 train_time:378648ms step_avg:1829.22ms | |
step:218/1390 train_time:380542ms step_avg:1829.53ms | |
step:219/1390 train_time:382416ms step_avg:1829.74ms | |
step:220/1390 train_time:384291ms step_avg:1829.96ms | |
step:221/1390 train_time:386163ms step_avg:1830.16ms | |
step:222/1390 train_time:388042ms step_avg:1830.38ms | |
step:223/1390 train_time:389916ms step_avg:1830.59ms | |
step:224/1390 train_time:391796ms step_avg:1830.82ms | |
step:225/1390 train_time:393674ms step_avg:1831.04ms | |
step:225/1390 val_loss:4.4554 train_time:393674ms step_avg:1831.04ms | |
step:226/1390 train_time:395601ms step_avg:1831.48ms | |
step:227/1390 train_time:397503ms step_avg:1831.81ms | |
step:228/1390 train_time:399385ms step_avg:1832.04ms | |
step:229/1390 train_time:401249ms step_avg:1832.19ms | |
step:230/1390 train_time:403107ms step_avg:1832.30ms | |
step:231/1390 train_time:404971ms step_avg:1832.45ms | |
step:232/1390 train_time:406834ms step_avg:1832.59ms | |
step:233/1390 train_time:408708ms step_avg:1832.77ms | |
step:234/1390 train_time:410581ms step_avg:1832.95ms | |
step:235/1390 train_time:412456ms step_avg:1833.14ms | |
step:236/1390 train_time:414349ms step_avg:1833.40ms | |
step:237/1390 train_time:416234ms step_avg:1833.63ms | |
step:238/1390 train_time:418111ms step_avg:1833.82ms | |
step:239/1390 train_time:419976ms step_avg:1833.96ms | |
step:240/1390 train_time:421840ms step_avg:1834.09ms | |
step:241/1390 train_time:423704ms step_avg:1834.22ms | |
step:242/1390 train_time:425577ms step_avg:1834.38ms | |
step:243/1390 train_time:427452ms step_avg:1834.56ms | |
step:244/1390 train_time:429319ms step_avg:1834.70ms | |
step:245/1390 train_time:431184ms step_avg:1834.83ms | |
step:246/1390 train_time:433050ms step_avg:1834.96ms | |
step:247/1390 train_time:434920ms step_avg:1835.11ms | |
step:248/1390 train_time:436792ms step_avg:1835.26ms | |
step:249/1390 train_time:438672ms step_avg:1835.45ms | |
step:250/1390 train_time:440544ms step_avg:1835.60ms | |
step:250/1390 val_loss:4.3579 train_time:440544ms step_avg:1835.60ms | |
step:251/1390 train_time:442465ms step_avg:1835.95ms | |
step:252/1390 train_time:444358ms step_avg:1836.19ms | |
step:253/1390 train_time:446226ms step_avg:1836.32ms | |
step:254/1390 train_time:448105ms step_avg:1836.50ms | |
step:255/1390 train_time:449971ms step_avg:1836.62ms | |
step:256/1390 train_time:451834ms step_avg:1836.72ms | |
step:257/1390 train_time:453702ms step_avg:1836.85ms | |
step:258/1390 train_time:455595ms step_avg:1837.07ms | |
step:259/1390 train_time:457459ms step_avg:1837.19ms | |
step:260/1390 train_time:459323ms step_avg:1837.29ms | |
step:261/1390 train_time:461192ms step_avg:1837.42ms | |
step:262/1390 train_time:463070ms step_avg:1837.58ms | |
step:263/1390 train_time:464934ms step_avg:1837.69ms | |
step:264/1390 train_time:466795ms step_avg:1837.78ms | |
step:265/1390 train_time:468651ms step_avg:1837.85ms | |
step:266/1390 train_time:470509ms step_avg:1837.92ms | |
step:267/1390 train_time:472369ms step_avg:1838.01ms | |
step:268/1390 train_time:474229ms step_avg:1838.10ms | |
step:269/1390 train_time:476097ms step_avg:1838.21ms | |
step:270/1390 train_time:477962ms step_avg:1838.32ms | |
step:271/1390 train_time:479828ms step_avg:1838.42ms | |
step:272/1390 train_time:481695ms step_avg:1838.53ms | |
step:273/1390 train_time:483557ms step_avg:1838.62ms | |
step:274/1390 train_time:485418ms step_avg:1838.70ms | |
step:275/1390 train_time:487279ms step_avg:1838.79ms | |
step:275/1390 val_loss:4.2822 train_time:487279ms step_avg:1838.79ms | |
step:276/1390 train_time:489199ms step_avg:1839.09ms | |
step:277/1390 train_time:491087ms step_avg:1839.28ms | |
step:278/1390 train_time:492955ms step_avg:1839.38ms | |
step:279/1390 train_time:494820ms step_avg:1839.48ms | |
step:280/1390 train_time:496679ms step_avg:1839.55ms | |
step:281/1390 train_time:498536ms step_avg:1839.62ms | |
step:282/1390 train_time:500404ms step_avg:1839.72ms | |
step:283/1390 train_time:502267ms step_avg:1839.80ms | |
step:284/1390 train_time:504132ms step_avg:1839.90ms | |
step:285/1390 train_time:506010ms step_avg:1840.04ms | |
step:286/1390 train_time:507927ms step_avg:1840.32ms | |
step:287/1390 train_time:509792ms step_avg:1840.41ms | |
step:288/1390 train_time:511655ms step_avg:1840.49ms | |
step:289/1390 train_time:513518ms step_avg:1840.57ms | |
step:290/1390 train_time:515429ms step_avg:1840.82ms | |
step:291/1390 train_time:517297ms step_avg:1840.91ms | |
step:292/1390 train_time:519166ms step_avg:1841.01ms | |
step:293/1390 train_time:521035ms step_avg:1841.11ms | |
step:294/1390 train_time:522901ms step_avg:1841.20ms | |
step:295/1390 train_time:524760ms step_avg:1841.26ms | |
step:296/1390 train_time:526617ms step_avg:1841.32ms | |
step:297/1390 train_time:528490ms step_avg:1841.43ms | |
step:298/1390 train_time:530352ms step_avg:1841.50ms | |
step:299/1390 train_time:532226ms step_avg:1841.61ms | |
step:300/1390 train_time:534102ms step_avg:1841.73ms | |
step:300/1390 val_loss:4.2202 train_time:534102ms step_avg:1841.73ms | |
step:301/1390 train_time:536016ms step_avg:1841.98ms | |
step:302/1390 train_time:537889ms step_avg:1842.08ms | |
step:303/1390 train_time:539741ms step_avg:1842.12ms | |
step:304/1390 train_time:541600ms step_avg:1842.18ms | |
step:305/1390 train_time:543477ms step_avg:1842.29ms | |
step:306/1390 train_time:545349ms step_avg:1842.40ms | |
step:307/1390 train_time:547225ms step_avg:1842.51ms | |
step:308/1390 train_time:549090ms step_avg:1842.58ms | |
step:309/1390 train_time:550951ms step_avg:1842.65ms | |
step:310/1390 train_time:552829ms step_avg:1842.76ms | |
step:311/1390 train_time:554727ms step_avg:1842.95ms | |
step:312/1390 train_time:556629ms step_avg:1843.14ms | |
step:313/1390 train_time:558522ms step_avg:1843.31ms | |
step:314/1390 train_time:560417ms step_avg:1843.48ms | |
step:315/1390 train_time:562309ms step_avg:1843.64ms | |
step:316/1390 train_time:564199ms step_avg:1843.79ms | |
step:317/1390 train_time:566121ms step_avg:1844.04ms | |
step:318/1390 train_time:568021ms step_avg:1844.22ms | |
step:319/1390 train_time:569917ms step_avg:1844.39ms | |
step:320/1390 train_time:571808ms step_avg:1844.54ms | |
step:321/1390 train_time:573692ms step_avg:1844.67ms | |
step:322/1390 train_time:575594ms step_avg:1844.85ms | |
step:323/1390 train_time:577485ms step_avg:1845.00ms | |
step:324/1390 train_time:579377ms step_avg:1845.15ms | |
step:325/1390 train_time:581273ms step_avg:1845.31ms | |
step:325/1390 val_loss:4.1504 train_time:581274ms step_avg:1845.31ms | |
step:326/1390 train_time:583235ms step_avg:1845.68ms | |
step:327/1390 train_time:585138ms step_avg:1845.86ms | |
step:328/1390 train_time:587019ms step_avg:1845.97ms | |
step:329/1390 train_time:588901ms step_avg:1846.09ms | |
step:330/1390 train_time:590791ms step_avg:1846.22ms | |
step:331/1390 train_time:592694ms step_avg:1846.40ms | |
step:332/1390 train_time:594580ms step_avg:1846.52ms | |
step:333/1390 train_time:596477ms step_avg:1846.68ms | |
step:334/1390 train_time:598375ms step_avg:1846.84ms | |
step:335/1390 train_time:600266ms step_avg:1846.97ms | |
step:336/1390 train_time:602164ms step_avg:1847.13ms | |
step:337/1390 train_time:604053ms step_avg:1847.26ms | |
step:338/1390 train_time:605942ms step_avg:1847.38ms | |
step:339/1390 train_time:607830ms step_avg:1847.51ms | |
step:340/1390 train_time:609723ms step_avg:1847.65ms | |
step:341/1390 train_time:611622ms step_avg:1847.80ms | |
step:342/1390 train_time:613505ms step_avg:1847.91ms | |
step:343/1390 train_time:615398ms step_avg:1848.04ms | |
step:344/1390 train_time:617290ms step_avg:1848.17ms | |
step:345/1390 train_time:619187ms step_avg:1848.32ms | |
step:346/1390 train_time:621083ms step_avg:1848.46ms | |
step:347/1390 train_time:622978ms step_avg:1848.60ms | |
step:348/1390 train_time:624859ms step_avg:1848.70ms | |
step:349/1390 train_time:626736ms step_avg:1848.78ms | |
step:350/1390 train_time:628616ms step_avg:1848.87ms | |
step:350/1390 val_loss:4.0984 train_time:628617ms step_avg:1848.87ms | |
step:351/1390 train_time:630557ms step_avg:1849.14ms | |
step:352/1390 train_time:632484ms step_avg:1849.37ms | |
step:353/1390 train_time:634362ms step_avg:1849.45ms | |
step:354/1390 train_time:636248ms step_avg:1849.56ms | |
step:355/1390 train_time:638145ms step_avg:1849.70ms | |
step:356/1390 train_time:640045ms step_avg:1849.84ms | |
step:357/1390 train_time:641945ms step_avg:1849.99ms | |
step:358/1390 train_time:643848ms step_avg:1850.14ms | |
step:359/1390 train_time:645747ms step_avg:1850.28ms | |
step:360/1390 train_time:647644ms step_avg:1850.41ms | |
step:361/1390 train_time:649531ms step_avg:1850.52ms | |
step:362/1390 train_time:651422ms step_avg:1850.63ms | |
step:363/1390 train_time:653302ms step_avg:1850.71ms | |
step:364/1390 train_time:655190ms step_avg:1850.82ms | |
step:365/1390 train_time:657090ms step_avg:1850.96ms | |
step:366/1390 train_time:658984ms step_avg:1851.08ms | |
step:367/1390 train_time:660874ms step_avg:1851.19ms | |
step:368/1390 train_time:662793ms step_avg:1851.38ms | |
step:369/1390 train_time:664690ms step_avg:1851.51ms | |
step:370/1390 train_time:666585ms step_avg:1851.62ms | |
step:371/1390 train_time:668471ms step_avg:1851.72ms | |
step:372/1390 train_time:670359ms step_avg:1851.82ms | |
step:373/1390 train_time:672245ms step_avg:1851.91ms | |
step:374/1390 train_time:674128ms step_avg:1852.00ms | |
step:375/1390 train_time:676016ms step_avg:1852.10ms | |
step:375/1390 val_loss:4.0589 train_time:676016ms step_avg:1852.10ms | |
step:376/1390 train_time:677963ms step_avg:1852.36ms | |
step:377/1390 train_time:679869ms step_avg:1852.50ms | |
step:378/1390 train_time:681746ms step_avg:1852.57ms | |
step:379/1390 train_time:683627ms step_avg:1852.65ms | |
step:380/1390 train_time:685514ms step_avg:1852.74ms | |
step:381/1390 train_time:687431ms step_avg:1852.91ms | |
step:382/1390 train_time:689306ms step_avg:1852.97ms | |
step:383/1390 train_time:691198ms step_avg:1853.08ms | |
step:384/1390 train_time:693094ms step_avg:1853.19ms | |
step:385/1390 train_time:694991ms step_avg:1853.31ms | |
step:386/1390 train_time:696885ms step_avg:1853.42ms | |
step:387/1390 train_time:698772ms step_avg:1853.51ms | |
step:388/1390 train_time:700675ms step_avg:1853.64ms | |
step:389/1390 train_time:702572ms step_avg:1853.75ms | |
step:390/1390 train_time:704490ms step_avg:1853.92ms | |
step:391/1390 train_time:706372ms step_avg:1853.99ms | |
step:392/1390 train_time:708260ms step_avg:1854.08ms | |
step:393/1390 train_time:710143ms step_avg:1854.16ms | |
step:394/1390 train_time:712071ms step_avg:1854.35ms | |
step:395/1390 train_time:713943ms step_avg:1854.40ms | |
step:396/1390 train_time:715824ms step_avg:1854.47ms | |
step:397/1390 train_time:717711ms step_avg:1854.55ms | |
step:398/1390 train_time:719608ms step_avg:1854.66ms | |
step:399/1390 train_time:721487ms step_avg:1854.72ms | |
step:400/1390 train_time:723366ms step_avg:1854.78ms | |
step:400/1390 val_loss:4.0158 train_time:723366ms step_avg:1854.78ms | |
step:401/1390 train_time:725305ms step_avg:1855.00ms | |
step:402/1390 train_time:727207ms step_avg:1855.12ms | |
step:403/1390 train_time:729093ms step_avg:1855.20ms | |
step:404/1390 train_time:731002ms step_avg:1855.33ms | |
step:405/1390 train_time:732902ms step_avg:1855.45ms | |
step:406/1390 train_time:734793ms step_avg:1855.54ms | |
step:407/1390 train_time:736702ms step_avg:1855.67ms | |
step:408/1390 train_time:738604ms step_avg:1855.79ms | |
step:409/1390 train_time:740514ms step_avg:1855.92ms | |
step:410/1390 train_time:742405ms step_avg:1856.01ms | |
step:411/1390 train_time:744291ms step_avg:1856.09ms | |
step:412/1390 train_time:746173ms step_avg:1856.15ms | |
step:413/1390 train_time:748078ms step_avg:1856.27ms | |
step:414/1390 train_time:749986ms step_avg:1856.40ms | |
step:415/1390 train_time:751904ms step_avg:1856.55ms | |
step:416/1390 train_time:753854ms step_avg:1856.78ms | |
step:417/1390 train_time:755762ms step_avg:1856.91ms | |
step:418/1390 train_time:757685ms step_avg:1857.07ms | |
step:419/1390 train_time:759592ms step_avg:1857.19ms | |
step:420/1390 train_time:761536ms step_avg:1857.40ms | |
step:421/1390 train_time:763438ms step_avg:1857.51ms | |
step:422/1390 train_time:765355ms step_avg:1857.66ms | |
step:423/1390 train_time:767265ms step_avg:1857.79ms | |
step:424/1390 train_time:769180ms step_avg:1857.92ms | |
step:425/1390 train_time:771108ms step_avg:1858.09ms | |
step:425/1390 val_loss:3.9710 train_time:771108ms step_avg:1858.09ms | |
step:426/1390 train_time:773070ms step_avg:1858.34ms | |
step:427/1390 train_time:775010ms step_avg:1858.54ms | |
step:428/1390 train_time:776917ms step_avg:1858.65ms | |
step:429/1390 train_time:778830ms step_avg:1858.78ms | |
step:430/1390 train_time:780731ms step_avg:1858.88ms | |
step:431/1390 train_time:782654ms step_avg:1859.04ms | |
step:432/1390 train_time:784559ms step_avg:1859.15ms | |
step:433/1390 train_time:786477ms step_avg:1859.28ms | |
step:434/1390 train_time:788400ms step_avg:1859.43ms | |
step:435/1390 train_time:790313ms step_avg:1859.56ms | |
step:436/1390 train_time:792229ms step_avg:1859.69ms | |
step:437/1390 train_time:794143ms step_avg:1859.82ms | |
step:438/1390 train_time:796050ms step_avg:1859.93ms | |
step:439/1390 train_time:797950ms step_avg:1860.02ms | |
step:440/1390 train_time:799851ms step_avg:1860.12ms | |
step:441/1390 train_time:801772ms step_avg:1860.26ms | |
step:442/1390 train_time:803687ms step_avg:1860.39ms | |
step:443/1390 train_time:805613ms step_avg:1860.54ms | |
step:444/1390 train_time:807535ms step_avg:1860.68ms | |
step:445/1390 train_time:809445ms step_avg:1860.79ms | |
step:446/1390 train_time:811353ms step_avg:1860.90ms | |
step:447/1390 train_time:813252ms step_avg:1860.99ms | |
step:448/1390 train_time:815165ms step_avg:1861.11ms | |
step:449/1390 train_time:817082ms step_avg:1861.24ms | |
step:450/1390 train_time:819008ms step_avg:1861.38ms | |
step:450/1390 val_loss:3.9411 train_time:819008ms step_avg:1861.38ms | |
step:451/1390 train_time:820960ms step_avg:1861.59ms | |
step:452/1390 train_time:822898ms step_avg:1861.76ms | |
step:453/1390 train_time:824818ms step_avg:1861.89ms | |
step:454/1390 train_time:826737ms step_avg:1862.02ms | |
step:455/1390 train_time:828641ms step_avg:1862.11ms | |
step:456/1390 train_time:830555ms step_avg:1862.23ms | |
step:457/1390 train_time:832490ms step_avg:1862.39ms | |
step:458/1390 train_time:834410ms step_avg:1862.52ms | |
step:459/1390 train_time:836317ms step_avg:1862.62ms | |
step:460/1390 train_time:838240ms step_avg:1862.76ms | |
step:461/1390 train_time:840141ms step_avg:1862.84ms | |
step:462/1390 train_time:842052ms step_avg:1862.95ms | |
step:463/1390 train_time:843958ms step_avg:1863.04ms | |
step:464/1390 train_time:845869ms step_avg:1863.15ms | |
step:465/1390 train_time:847775ms step_avg:1863.24ms | |
step:466/1390 train_time:849686ms step_avg:1863.35ms | |
step:467/1390 train_time:851604ms step_avg:1863.47ms | |
step:468/1390 train_time:853521ms step_avg:1863.58ms | |
step:469/1390 train_time:855433ms step_avg:1863.69ms | |
step:470/1390 train_time:857348ms step_avg:1863.80ms | |
step:471/1390 train_time:859283ms step_avg:1863.95ms | |
step:472/1390 train_time:861198ms step_avg:1864.07ms | |
step:473/1390 train_time:863114ms step_avg:1864.18ms | |
step:474/1390 train_time:865024ms step_avg:1864.28ms | |
step:475/1390 train_time:866925ms step_avg:1864.36ms | |
step:475/1390 val_loss:3.9094 train_time:866926ms step_avg:1864.36ms | |
step:476/1390 train_time:868881ms step_avg:1864.55ms | |
step:477/1390 train_time:870810ms step_avg:1864.69ms | |
step:478/1390 train_time:872726ms step_avg:1864.80ms | |
step:479/1390 train_time:874638ms step_avg:1864.90ms | |
step:480/1390 train_time:876539ms step_avg:1864.98ms | |
step:481/1390 train_time:878464ms step_avg:1865.10ms | |
step:482/1390 train_time:880381ms step_avg:1865.21ms | |
step:483/1390 train_time:882299ms step_avg:1865.33ms | |
step:484/1390 train_time:884202ms step_avg:1865.41ms | |
step:485/1390 train_time:886110ms step_avg:1865.49ms | |
step:486/1390 train_time:888026ms step_avg:1865.60ms | |
step:487/1390 train_time:889952ms step_avg:1865.73ms | |
step:488/1390 train_time:891865ms step_avg:1865.83ms | |
step:489/1390 train_time:893775ms step_avg:1865.92ms | |
step:490/1390 train_time:895684ms step_avg:1866.01ms | |
step:491/1390 train_time:897607ms step_avg:1866.13ms | |
step:492/1390 train_time:899538ms step_avg:1866.26ms | |
step:493/1390 train_time:901467ms step_avg:1866.39ms | |
step:494/1390 train_time:903386ms step_avg:1866.50ms | |
step:495/1390 train_time:905320ms step_avg:1866.64ms | |
step:496/1390 train_time:907241ms step_avg:1866.75ms | |
step:497/1390 train_time:909146ms step_avg:1866.83ms | |
step:498/1390 train_time:911054ms step_avg:1866.91ms | |
step:499/1390 train_time:912952ms step_avg:1866.98ms | |
step:500/1390 train_time:914866ms step_avg:1867.07ms | |
step:500/1390 val_loss:3.8861 train_time:914866ms step_avg:1867.07ms | |
step:501/1390 train_time:916823ms step_avg:1867.26ms | |
step:502/1390 train_time:918751ms step_avg:1867.38ms | |
step:503/1390 train_time:920665ms step_avg:1867.48ms | |
step:504/1390 train_time:922574ms step_avg:1867.56ms | |
step:505/1390 train_time:924491ms step_avg:1867.66ms | |
step:506/1390 train_time:926407ms step_avg:1867.76ms | |
step:507/1390 train_time:928310ms step_avg:1867.83ms | |
step:508/1390 train_time:930231ms step_avg:1867.93ms | |
step:509/1390 train_time:932123ms step_avg:1867.98ms | |
step:510/1390 train_time:934030ms step_avg:1868.06ms | |
step:511/1390 train_time:935964ms step_avg:1868.19ms | |
step:512/1390 train_time:937872ms step_avg:1868.27ms | |
step:513/1390 train_time:939781ms step_avg:1868.35ms | |
step:514/1390 train_time:941689ms step_avg:1868.43ms | |
step:515/1390 train_time:943602ms step_avg:1868.52ms | |
step:516/1390 train_time:945528ms step_avg:1868.63ms | |
step:517/1390 train_time:947456ms step_avg:1868.75ms | |
step:518/1390 train_time:949395ms step_avg:1868.89ms | |
step:519/1390 train_time:951319ms step_avg:1869.00ms | |
step:520/1390 train_time:953239ms step_avg:1869.10ms | |
step:521/1390 train_time:955157ms step_avg:1869.19ms | |
step:522/1390 train_time:957090ms step_avg:1869.32ms | |
step:523/1390 train_time:959039ms step_avg:1869.47ms | |
step:524/1390 train_time:960962ms step_avg:1869.58ms | |
step:525/1390 train_time:962884ms step_avg:1869.68ms | |
step:525/1390 val_loss:3.8543 train_time:962884ms step_avg:1869.68ms | |
step:526/1390 train_time:964854ms step_avg:1869.87ms | |
step:527/1390 train_time:966799ms step_avg:1870.02ms | |
step:528/1390 train_time:968717ms step_avg:1870.11ms | |
step:529/1390 train_time:970641ms step_avg:1870.21ms | |
step:530/1390 train_time:972580ms step_avg:1870.35ms | |
step:531/1390 train_time:974528ms step_avg:1870.50ms | |
step:532/1390 train_time:976455ms step_avg:1870.60ms | |
step:533/1390 train_time:978379ms step_avg:1870.71ms | |
step:534/1390 train_time:980294ms step_avg:1870.79ms | |
step:535/1390 train_time:982222ms step_avg:1870.90ms | |
step:536/1390 train_time:984153ms step_avg:1871.01ms | |
step:537/1390 train_time:986086ms step_avg:1871.13ms | |
step:538/1390 train_time:988016ms step_avg:1871.24ms | |
step:539/1390 train_time:989933ms step_avg:1871.33ms | |
step:540/1390 train_time:991859ms step_avg:1871.43ms | |
step:541/1390 train_time:993790ms step_avg:1871.54ms | |
step:542/1390 train_time:995716ms step_avg:1871.65ms | |
step:543/1390 train_time:997639ms step_avg:1871.74ms | |
step:544/1390 train_time:999566ms step_avg:1871.85ms | |
step:545/1390 train_time:1001494ms step_avg:1871.95ms | |
step:546/1390 train_time:1003451ms step_avg:1872.11ms | |
step:547/1390 train_time:1005385ms step_avg:1872.23ms | |
step:548/1390 train_time:1007314ms step_avg:1872.33ms | |
step:549/1390 train_time:1009239ms step_avg:1872.43ms | |
step:550/1390 train_time:1011175ms step_avg:1872.55ms | |
step:550/1390 val_loss:3.8319 train_time:1011176ms step_avg:1872.55ms | |
step:551/1390 train_time:1013154ms step_avg:1872.74ms | |
step:552/1390 train_time:1015089ms step_avg:1872.86ms | |
step:553/1390 train_time:1017013ms step_avg:1872.95ms | |
step:554/1390 train_time:1018949ms step_avg:1873.07ms | |
step:555/1390 train_time:1020879ms step_avg:1873.17ms | |
step:556/1390 train_time:1022813ms step_avg:1873.28ms | |
step:557/1390 train_time:1024734ms step_avg:1873.37ms | |
step:558/1390 train_time:1026666ms step_avg:1873.48ms | |
step:559/1390 train_time:1028600ms step_avg:1873.59ms | |
step:560/1390 train_time:1030525ms step_avg:1873.68ms | |
step:561/1390 train_time:1032448ms step_avg:1873.77ms | |
step:562/1390 train_time:1034372ms step_avg:1873.86ms | |
step:563/1390 train_time:1036289ms step_avg:1873.94ms | |
step:564/1390 train_time:1038203ms step_avg:1874.01ms | |
step:565/1390 train_time:1040133ms step_avg:1874.11ms | |
step:566/1390 train_time:1042067ms step_avg:1874.22ms | |
step:567/1390 train_time:1043993ms step_avg:1874.31ms | |
step:568/1390 train_time:1045919ms step_avg:1874.41ms | |
step:569/1390 train_time:1047869ms step_avg:1874.54ms | |
step:570/1390 train_time:1049798ms step_avg:1874.64ms | |
step:571/1390 train_time:1051784ms step_avg:1874.84ms | |
step:572/1390 train_time:1053733ms step_avg:1874.97ms | |
step:573/1390 train_time:1055688ms step_avg:1875.11ms | |
step:574/1390 train_time:1057613ms step_avg:1875.20ms | |
step:575/1390 train_time:1059539ms step_avg:1875.29ms | |
step:575/1390 val_loss:3.8128 train_time:1059539ms step_avg:1875.29ms | |
step:576/1390 train_time:1061506ms step_avg:1875.45ms | |
step:577/1390 train_time:1063443ms step_avg:1875.56ms | |
step:578/1390 train_time:1065362ms step_avg:1875.64ms | |
step:579/1390 train_time:1067288ms step_avg:1875.73ms | |
step:580/1390 train_time:1069217ms step_avg:1875.82ms | |
step:581/1390 train_time:1071154ms step_avg:1875.93ms | |
step:582/1390 train_time:1073088ms step_avg:1876.03ms | |
step:583/1390 train_time:1075016ms step_avg:1876.12ms | |
step:584/1390 train_time:1076938ms step_avg:1876.20ms | |
step:585/1390 train_time:1078874ms step_avg:1876.30ms | |
step:586/1390 train_time:1080810ms step_avg:1876.41ms | |
step:587/1390 train_time:1082738ms step_avg:1876.49ms | |
step:588/1390 train_time:1084657ms step_avg:1876.57ms | |
step:589/1390 train_time:1086575ms step_avg:1876.64ms | |
step:590/1390 train_time:1088507ms step_avg:1876.74ms | |
step:591/1390 train_time:1090440ms step_avg:1876.83ms | |
step:592/1390 train_time:1092379ms step_avg:1876.94ms | |
step:593/1390 train_time:1094312ms step_avg:1877.04ms | |
step:594/1390 train_time:1096229ms step_avg:1877.10ms | |
step:595/1390 train_time:1098150ms step_avg:1877.18ms | |
step:596/1390 train_time:1100081ms step_avg:1877.27ms | |
step:597/1390 train_time:1102016ms step_avg:1877.37ms | |
step:598/1390 train_time:1103960ms step_avg:1877.48ms | |
step:599/1390 train_time:1105897ms step_avg:1877.58ms | |
step:600/1390 train_time:1107825ms step_avg:1877.67ms | |
step:600/1390 val_loss:3.7907 train_time:1107825ms step_avg:1877.67ms | |
step:601/1390 train_time:1109805ms step_avg:1877.84ms | |
step:602/1390 train_time:1111766ms step_avg:1877.98ms | |
step:603/1390 train_time:1113700ms step_avg:1878.08ms | |
step:604/1390 train_time:1115631ms step_avg:1878.17ms | |
step:605/1390 train_time:1117552ms step_avg:1878.24ms | |
step:606/1390 train_time:1119488ms step_avg:1878.33ms | |
step:607/1390 train_time:1121426ms step_avg:1878.44ms | |
step:608/1390 train_time:1123357ms step_avg:1878.52ms | |
step:609/1390 train_time:1125290ms step_avg:1878.61ms | |
step:610/1390 train_time:1127211ms step_avg:1878.69ms | |
step:611/1390 train_time:1129132ms step_avg:1878.75ms | |
step:612/1390 train_time:1131064ms step_avg:1878.84ms | |
step:613/1390 train_time:1132994ms step_avg:1878.93ms | |
step:614/1390 train_time:1134936ms step_avg:1879.03ms | |
step:615/1390 train_time:1136866ms step_avg:1879.12ms | |
step:616/1390 train_time:1138793ms step_avg:1879.20ms | |
step:617/1390 train_time:1140728ms step_avg:1879.29ms | |
step:618/1390 train_time:1142656ms step_avg:1879.37ms | |
step:619/1390 train_time:1144617ms step_avg:1879.50ms | |
step:620/1390 train_time:1146573ms step_avg:1879.63ms | |
step:621/1390 train_time:1148515ms step_avg:1879.73ms | |
step:622/1390 train_time:1150472ms step_avg:1879.86ms | |
step:623/1390 train_time:1152414ms step_avg:1879.96ms | |
step:624/1390 train_time:1154354ms step_avg:1880.06ms | |
step:625/1390 train_time:1156310ms step_avg:1880.18ms | |
step:625/1390 val_loss:3.7681 train_time:1156310ms step_avg:1880.18ms | |
step:626/1390 train_time:1158299ms step_avg:1880.36ms | |
step:627/1390 train_time:1160263ms step_avg:1880.49ms | |
step:628/1390 train_time:1162202ms step_avg:1880.59ms | |
step:629/1390 train_time:1164143ms step_avg:1880.68ms | |
step:630/1390 train_time:1166077ms step_avg:1880.77ms | |
step:631/1390 train_time:1168008ms step_avg:1880.85ms | |
step:632/1390 train_time:1169954ms step_avg:1880.95ms | |
step:633/1390 train_time:1171901ms step_avg:1881.06ms | |
step:634/1390 train_time:1173845ms step_avg:1881.16ms | |
step:635/1390 train_time:1175783ms step_avg:1881.25ms | |
step:636/1390 train_time:1177730ms step_avg:1881.36ms | |
step:637/1390 train_time:1179684ms step_avg:1881.47ms | |
step:638/1390 train_time:1181620ms step_avg:1881.56ms | |
step:639/1390 train_time:1183556ms step_avg:1881.65ms | |
step:640/1390 train_time:1185509ms step_avg:1881.76ms | |
step:641/1390 train_time:1187443ms step_avg:1881.84ms | |
step:642/1390 train_time:1189383ms step_avg:1881.94ms | |
step:643/1390 train_time:1191311ms step_avg:1882.01ms | |
step:644/1390 train_time:1193264ms step_avg:1882.12ms | |
step:645/1390 train_time:1195208ms step_avg:1882.22ms | |
step:646/1390 train_time:1197137ms step_avg:1882.29ms | |
step:647/1390 train_time:1199093ms step_avg:1882.41ms | |
step:648/1390 train_time:1201052ms step_avg:1882.53ms | |
step:649/1390 train_time:1203010ms step_avg:1882.64ms | |
step:650/1390 train_time:1204955ms step_avg:1882.74ms | |
step:650/1390 val_loss:3.7511 train_time:1204955ms step_avg:1882.74ms | |
step:651/1390 train_time:1206937ms step_avg:1882.90ms | |
step:652/1390 train_time:1208904ms step_avg:1883.03ms | |
step:653/1390 train_time:1210851ms step_avg:1883.13ms | |
step:654/1390 train_time:1212792ms step_avg:1883.22ms | |
step:655/1390 train_time:1214735ms step_avg:1883.31ms | |
step:656/1390 train_time:1216680ms step_avg:1883.41ms | |
step:657/1390 train_time:1218623ms step_avg:1883.50ms | |
step:658/1390 train_time:1220553ms step_avg:1883.57ms | |
step:659/1390 train_time:1222490ms step_avg:1883.65ms | |
step:660/1390 train_time:1224438ms step_avg:1883.75ms | |
step:661/1390 train_time:1226370ms step_avg:1883.82ms | |
step:662/1390 train_time:1228310ms step_avg:1883.91ms | |
step:663/1390 train_time:1230261ms step_avg:1884.01ms | |
step:664/1390 train_time:1232202ms step_avg:1884.10ms | |
step:665/1390 train_time:1234140ms step_avg:1884.18ms | |
step:666/1390 train_time:1236078ms step_avg:1884.27ms | |
step:667/1390 train_time:1238015ms step_avg:1884.35ms | |
step:668/1390 train_time:1239946ms step_avg:1884.42ms | |
step:669/1390 train_time:1241899ms step_avg:1884.52ms | |
step:670/1390 train_time:1243846ms step_avg:1884.62ms | |
step:671/1390 train_time:1245786ms step_avg:1884.70ms | |
step:672/1390 train_time:1247736ms step_avg:1884.80ms | |
step:673/1390 train_time:1249675ms step_avg:1884.88ms | |
step:674/1390 train_time:1251622ms step_avg:1884.97ms | |
step:675/1390 train_time:1253553ms step_avg:1885.04ms | |
step:675/1390 val_loss:3.7364 train_time:1253553ms step_avg:1885.04ms | |
step:676/1390 train_time:1255538ms step_avg:1885.19ms | |
step:677/1390 train_time:1257503ms step_avg:1885.31ms | |
step:678/1390 train_time:1259436ms step_avg:1885.38ms | |
step:679/1390 train_time:1261372ms step_avg:1885.46ms | |
step:680/1390 train_time:1263315ms step_avg:1885.54ms | |
step:681/1390 train_time:1265252ms step_avg:1885.62ms | |
step:682/1390 train_time:1267204ms step_avg:1885.72ms | |
step:683/1390 train_time:1269144ms step_avg:1885.80ms | |
step:684/1390 train_time:1271108ms step_avg:1885.92ms | |
step:685/1390 train_time:1273051ms step_avg:1886.00ms | |
step:686/1390 train_time:1274996ms step_avg:1886.09ms | |
step:687/1390 train_time:1276931ms step_avg:1886.16ms | |
step:688/1390 train_time:1278878ms step_avg:1886.25ms | |
step:689/1390 train_time:1280839ms step_avg:1886.36ms | |
step:690/1390 train_time:1282768ms step_avg:1886.42ms | |
step:691/1390 train_time:1284703ms step_avg:1886.50ms | |
step:692/1390 train_time:1286642ms step_avg:1886.57ms | |
step:693/1390 train_time:1288578ms step_avg:1886.64ms | |
step:694/1390 train_time:1290507ms step_avg:1886.71ms | |
step:695/1390 train_time:1292451ms step_avg:1886.79ms | |
step:696/1390 train_time:1294394ms step_avg:1886.87ms | |
step:697/1390 train_time:1296372ms step_avg:1887.01ms | |
step:698/1390 train_time:1298301ms step_avg:1887.07ms | |
step:699/1390 train_time:1300254ms step_avg:1887.16ms | |
step:700/1390 train_time:1302196ms step_avg:1887.24ms | |
step:700/1390 val_loss:3.7213 train_time:1302196ms step_avg:1887.24ms | |
step:701/1390 train_time:1304177ms step_avg:1887.38ms | |
step:702/1390 train_time:1306137ms step_avg:1887.48ms | |
step:703/1390 train_time:1308072ms step_avg:1887.55ms | |
step:704/1390 train_time:1310020ms step_avg:1887.64ms | |
step:705/1390 train_time:1311968ms step_avg:1887.72ms | |
step:706/1390 train_time:1313897ms step_avg:1887.78ms | |
step:707/1390 train_time:1315836ms step_avg:1887.86ms | |
step:708/1390 train_time:1317784ms step_avg:1887.94ms | |
step:709/1390 train_time:1319727ms step_avg:1888.02ms | |
step:710/1390 train_time:1321663ms step_avg:1888.09ms | |
step:711/1390 train_time:1323613ms step_avg:1888.18ms | |
step:712/1390 train_time:1325558ms step_avg:1888.26ms | |
step:713/1390 train_time:1327485ms step_avg:1888.31ms | |
step:714/1390 train_time:1329419ms step_avg:1888.38ms | |
step:715/1390 train_time:1331359ms step_avg:1888.45ms | |
step:716/1390 train_time:1333296ms step_avg:1888.52ms | |
step:717/1390 train_time:1335234ms step_avg:1888.59ms | |
step:718/1390 train_time:1337166ms step_avg:1888.65ms | |
step:719/1390 train_time:1339127ms step_avg:1888.75ms | |
step:720/1390 train_time:1341097ms step_avg:1888.87ms | |
step:721/1390 train_time:1343043ms step_avg:1888.95ms | |
step:722/1390 train_time:1345013ms step_avg:1889.06ms | |
step:723/1390 train_time:1346980ms step_avg:1889.17ms | |
step:724/1390 train_time:1348955ms step_avg:1889.29ms | |
step:725/1390 train_time:1350917ms step_avg:1889.39ms | |
step:725/1390 val_loss:3.7047 train_time:1350918ms step_avg:1889.40ms | |
step:726/1390 train_time:1352925ms step_avg:1889.56ms | |
step:727/1390 train_time:1354896ms step_avg:1889.67ms | |
step:728/1390 train_time:1356855ms step_avg:1889.77ms | |
step:729/1390 train_time:1358827ms step_avg:1889.88ms | |
step:730/1390 train_time:1360778ms step_avg:1889.97ms | |
step:731/1390 train_time:1362726ms step_avg:1890.05ms | |
step:732/1390 train_time:1364681ms step_avg:1890.14ms | |
step:733/1390 train_time:1366615ms step_avg:1890.20ms | |
step:734/1390 train_time:1368581ms step_avg:1890.31ms | |
step:735/1390 train_time:1370541ms step_avg:1890.40ms | |
step:736/1390 train_time:1372498ms step_avg:1890.49ms | |
step:737/1390 train_time:1374441ms step_avg:1890.57ms | |
step:738/1390 train_time:1376402ms step_avg:1890.66ms | |
step:739/1390 train_time:1378344ms step_avg:1890.73ms | |
step:740/1390 train_time:1380306ms step_avg:1890.83ms | |
step:741/1390 train_time:1382261ms step_avg:1890.92ms | |
step:742/1390 train_time:1384216ms step_avg:1891.00ms | |
step:743/1390 train_time:1386172ms step_avg:1891.09ms | |
step:744/1390 train_time:1388145ms step_avg:1891.21ms | |
step:745/1390 train_time:1390106ms step_avg:1891.30ms | |
step:746/1390 train_time:1392045ms step_avg:1891.37ms | |
step:747/1390 train_time:1394010ms step_avg:1891.47ms | |
step:748/1390 train_time:1395978ms step_avg:1891.57ms | |
step:749/1390 train_time:1397937ms step_avg:1891.66ms | |
step:750/1390 train_time:1399906ms step_avg:1891.76ms | |
step:750/1390 val_loss:3.6925 train_time:1399906ms step_avg:1891.77ms | |
step:751/1390 train_time:1401917ms step_avg:1891.93ms | |
step:752/1390 train_time:1403876ms step_avg:1892.02ms | |
step:753/1390 train_time:1405833ms step_avg:1892.10ms | |
step:754/1390 train_time:1407790ms step_avg:1892.19ms | |
step:755/1390 train_time:1409735ms step_avg:1892.26ms | |
step:756/1390 train_time:1411696ms step_avg:1892.35ms | |
step:757/1390 train_time:1413671ms step_avg:1892.47ms | |
step:758/1390 train_time:1415617ms step_avg:1892.54ms | |
step:759/1390 train_time:1417572ms step_avg:1892.62ms | |
step:760/1390 train_time:1419519ms step_avg:1892.69ms | |
step:761/1390 train_time:1421515ms step_avg:1892.83ms | |
step:762/1390 train_time:1423466ms step_avg:1892.91ms | |
step:763/1390 train_time:1425416ms step_avg:1892.98ms | |
step:764/1390 train_time:1427371ms step_avg:1893.06ms | |
step:765/1390 train_time:1429331ms step_avg:1893.15ms | |
step:766/1390 train_time:1431289ms step_avg:1893.24ms | |
step:767/1390 train_time:1433253ms step_avg:1893.33ms | |
step:768/1390 train_time:1435218ms step_avg:1893.43ms | |
step:769/1390 train_time:1437177ms step_avg:1893.51ms | |
step:770/1390 train_time:1439139ms step_avg:1893.60ms | |
step:771/1390 train_time:1441088ms step_avg:1893.68ms | |
step:772/1390 train_time:1443057ms step_avg:1893.78ms | |
step:773/1390 train_time:1445014ms step_avg:1893.86ms | |
step:774/1390 train_time:1446980ms step_avg:1893.95ms | |
step:775/1390 train_time:1448922ms step_avg:1894.02ms | |
step:775/1390 val_loss:3.6836 train_time:1448922ms step_avg:1894.02ms | |
step:776/1390 train_time:1450921ms step_avg:1894.15ms | |
step:777/1390 train_time:1452877ms step_avg:1894.23ms | |
step:778/1390 train_time:1454831ms step_avg:1894.31ms | |
step:779/1390 train_time:1456798ms step_avg:1894.41ms | |
step:780/1390 train_time:1458754ms step_avg:1894.49ms | |
step:781/1390 train_time:1460703ms step_avg:1894.56ms | |
step:782/1390 train_time:1462647ms step_avg:1894.62ms | |
step:783/1390 train_time:1464600ms step_avg:1894.70ms | |
step:784/1390 train_time:1466556ms step_avg:1894.78ms | |
step:785/1390 train_time:1468513ms step_avg:1894.86ms | |
step:786/1390 train_time:1470456ms step_avg:1894.92ms | |
step:787/1390 train_time:1472420ms step_avg:1895.01ms | |
step:788/1390 train_time:1474372ms step_avg:1895.08ms | |
step:789/1390 train_time:1476337ms step_avg:1895.17ms | |
step:790/1390 train_time:1478282ms step_avg:1895.23ms | |
step:791/1390 train_time:1480244ms step_avg:1895.32ms | |
step:792/1390 train_time:1482197ms step_avg:1895.39ms | |
step:793/1390 train_time:1484155ms step_avg:1895.47ms | |
step:794/1390 train_time:1486131ms step_avg:1895.58ms | |
step:795/1390 train_time:1488086ms step_avg:1895.65ms | |
step:796/1390 train_time:1490050ms step_avg:1895.74ms | |
step:797/1390 train_time:1492036ms step_avg:1895.85ms | |
step:798/1390 train_time:1493998ms step_avg:1895.94ms | |
step:799/1390 train_time:1495955ms step_avg:1896.01ms | |
step:800/1390 train_time:1497908ms step_avg:1896.09ms | |
step:800/1390 val_loss:3.6724 train_time:1497909ms step_avg:1896.09ms | |
step:801/1390 train_time:1499903ms step_avg:1896.21ms | |
step:802/1390 train_time:1501873ms step_avg:1896.30ms | |
step:803/1390 train_time:1503828ms step_avg:1896.38ms | |
step:804/1390 train_time:1505791ms step_avg:1896.46ms | |
step:805/1390 train_time:1507739ms step_avg:1896.53ms | |
step:806/1390 train_time:1509681ms step_avg:1896.58ms | |
step:807/1390 train_time:1511634ms step_avg:1896.65ms | |
step:808/1390 train_time:1513580ms step_avg:1896.72ms | |
step:809/1390 train_time:1515532ms step_avg:1896.79ms | |
step:810/1390 train_time:1517470ms step_avg:1896.84ms | |
step:811/1390 train_time:1519423ms step_avg:1896.91ms | |
step:812/1390 train_time:1521382ms step_avg:1896.99ms | |
step:813/1390 train_time:1523337ms step_avg:1897.06ms | |
step:814/1390 train_time:1525283ms step_avg:1897.12ms | |
step:815/1390 train_time:1527223ms step_avg:1897.17ms | |
step:816/1390 train_time:1529181ms step_avg:1897.25ms | |
step:817/1390 train_time:1531147ms step_avg:1897.33ms | |
step:818/1390 train_time:1533096ms step_avg:1897.40ms | |
step:819/1390 train_time:1535046ms step_avg:1897.46ms | |
step:820/1390 train_time:1537009ms step_avg:1897.54ms | |
step:821/1390 train_time:1538955ms step_avg:1897.60ms | |
step:822/1390 train_time:1540924ms step_avg:1897.69ms | |
step:823/1390 train_time:1542865ms step_avg:1897.74ms | |
step:824/1390 train_time:1544821ms step_avg:1897.81ms | |
step:825/1390 train_time:1546809ms step_avg:1897.93ms | |
step:825/1390 val_loss:3.6598 train_time:1546810ms step_avg:1897.93ms | |
step:826/1390 train_time:1548821ms step_avg:1898.07ms | |
step:827/1390 train_time:1550806ms step_avg:1898.17ms | |
step:828/1390 train_time:1552770ms step_avg:1898.25ms | |
step:829/1390 train_time:1554733ms step_avg:1898.33ms | |
step:830/1390 train_time:1556708ms step_avg:1898.42ms | |
step:831/1390 train_time:1558692ms step_avg:1898.53ms | |
step:832/1390 train_time:1560643ms step_avg:1898.59ms | |
step:833/1390 train_time:1562614ms step_avg:1898.68ms | |
step:834/1390 train_time:1564576ms step_avg:1898.76ms | |
step:835/1390 train_time:1566559ms step_avg:1898.86ms | |
step:836/1390 train_time:1568528ms step_avg:1898.94ms | |
step:837/1390 train_time:1570484ms step_avg:1899.01ms | |
step:838/1390 train_time:1572451ms step_avg:1899.10ms | |
step:839/1390 train_time:1574416ms step_avg:1899.17ms | |
step:840/1390 train_time:1576380ms step_avg:1899.25ms | |
step:841/1390 train_time:1578345ms step_avg:1899.33ms | |
step:842/1390 train_time:1580324ms step_avg:1899.43ms | |
step:843/1390 train_time:1582286ms step_avg:1899.50ms | |
step:844/1390 train_time:1584252ms step_avg:1899.58ms | |
step:845/1390 train_time:1586232ms step_avg:1899.68ms | |
step:846/1390 train_time:1588190ms step_avg:1899.75ms | |
step:847/1390 train_time:1590155ms step_avg:1899.83ms | |
step:848/1390 train_time:1592149ms step_avg:1899.94ms | |
step:849/1390 train_time:1594112ms step_avg:1900.01ms | |
step:850/1390 train_time:1596085ms step_avg:1900.10ms | |
step:850/1390 val_loss:3.6513 train_time:1596085ms step_avg:1900.10ms | |
step:851/1390 train_time:1598088ms step_avg:1900.22ms | |
step:852/1390 train_time:1600062ms step_avg:1900.31ms | |
step:853/1390 train_time:1602032ms step_avg:1900.39ms | |
step:854/1390 train_time:1603997ms step_avg:1900.47ms | |
step:855/1390 train_time:1605956ms step_avg:1900.54ms | |
step:856/1390 train_time:1607933ms step_avg:1900.63ms | |
step:857/1390 train_time:1609908ms step_avg:1900.72ms | |
step:858/1390 train_time:1611887ms step_avg:1900.81ms | |
step:859/1390 train_time:1613854ms step_avg:1900.89ms | |
step:860/1390 train_time:1615813ms step_avg:1900.96ms | |
step:861/1390 train_time:1617789ms step_avg:1901.04ms | |
step:862/1390 train_time:1619763ms step_avg:1901.13ms | |
step:863/1390 train_time:1621738ms step_avg:1901.22ms | |
step:864/1390 train_time:1623703ms step_avg:1901.29ms | |
step:865/1390 train_time:1625693ms step_avg:1901.39ms | |
step:866/1390 train_time:1627665ms step_avg:1901.48ms | |
step:867/1390 train_time:1629616ms step_avg:1901.54ms | |
step:868/1390 train_time:1631596ms step_avg:1901.63ms | |
step:869/1390 train_time:1633605ms step_avg:1901.75ms | |
step:870/1390 train_time:1635564ms step_avg:1901.82ms | |
step:871/1390 train_time:1637520ms step_avg:1901.88ms | |
step:872/1390 train_time:1639489ms step_avg:1901.96ms | |
step:873/1390 train_time:1641489ms step_avg:1902.07ms | |
step:874/1390 train_time:1643468ms step_avg:1902.16ms | |
step:875/1390 train_time:1645443ms step_avg:1902.25ms | |
step:875/1390 val_loss:3.6424 train_time:1645443ms step_avg:1902.25ms | |
step:876/1390 train_time:1647458ms step_avg:1902.38ms | |
step:877/1390 train_time:1649445ms step_avg:1902.47ms | |
step:878/1390 train_time:1651412ms step_avg:1902.55ms | |
step:879/1390 train_time:1653382ms step_avg:1902.63ms | |
step:880/1390 train_time:1655334ms step_avg:1902.68ms | |
step:881/1390 train_time:1657310ms step_avg:1902.77ms | |
step:882/1390 train_time:1659279ms step_avg:1902.84ms | |
step:883/1390 train_time:1661248ms step_avg:1902.92ms | |
step:884/1390 train_time:1663214ms step_avg:1902.99ms | |
step:885/1390 train_time:1665187ms step_avg:1903.07ms | |
step:886/1390 train_time:1667152ms step_avg:1903.14ms | |
step:887/1390 train_time:1669120ms step_avg:1903.22ms | |
step:888/1390 train_time:1671104ms step_avg:1903.31ms | |
step:889/1390 train_time:1673064ms step_avg:1903.37ms | |
step:890/1390 train_time:1675017ms step_avg:1903.43ms | |
step:891/1390 train_time:1676993ms step_avg:1903.51ms | |
step:892/1390 train_time:1678962ms step_avg:1903.58ms | |
step:893/1390 train_time:1680919ms step_avg:1903.65ms | |
step:894/1390 train_time:1682899ms step_avg:1903.73ms | |
step:895/1390 train_time:1684863ms step_avg:1903.80ms | |
step:896/1390 train_time:1686843ms step_avg:1903.89ms | |
step:897/1390 train_time:1688814ms step_avg:1903.96ms | |
step:898/1390 train_time:1690799ms step_avg:1904.05ms | |
step:899/1390 train_time:1692754ms step_avg:1904.11ms | |
step:900/1390 train_time:1694734ms step_avg:1904.20ms | |
step:900/1390 val_loss:3.6300 train_time:1694734ms step_avg:1904.20ms | |
step:901/1390 train_time:1696741ms step_avg:1904.31ms | |
step:902/1390 train_time:1698720ms step_avg:1904.39ms | |
step:903/1390 train_time:1700687ms step_avg:1904.46ms | |
step:904/1390 train_time:1702652ms step_avg:1904.53ms | |
step:905/1390 train_time:1704607ms step_avg:1904.59ms | |
step:906/1390 train_time:1706585ms step_avg:1904.67ms | |
step:907/1390 train_time:1708549ms step_avg:1904.74ms | |
step:908/1390 train_time:1710502ms step_avg:1904.79ms | |
step:909/1390 train_time:1712488ms step_avg:1904.88ms | |
step:910/1390 train_time:1714465ms step_avg:1904.96ms | |
step:911/1390 train_time:1716434ms step_avg:1905.03ms | |
step:912/1390 train_time:1718399ms step_avg:1905.10ms | |
step:913/1390 train_time:1720365ms step_avg:1905.17ms | |
step:914/1390 train_time:1722338ms step_avg:1905.24ms | |
step:915/1390 train_time:1724315ms step_avg:1905.32ms | |
step:916/1390 train_time:1726287ms step_avg:1905.39ms | |
step:917/1390 train_time:1728241ms step_avg:1905.45ms | |
step:918/1390 train_time:1730221ms step_avg:1905.53ms | |
step:919/1390 train_time:1732179ms step_avg:1905.59ms | |
step:920/1390 train_time:1734131ms step_avg:1905.64ms | |
step:921/1390 train_time:1736105ms step_avg:1905.71ms | |
step:922/1390 train_time:1738076ms step_avg:1905.78ms | |
step:923/1390 train_time:1740034ms step_avg:1905.84ms | |
step:924/1390 train_time:1742002ms step_avg:1905.91ms | |
step:925/1390 train_time:1743952ms step_avg:1905.96ms | |
step:925/1390 val_loss:3.6151 train_time:1743953ms step_avg:1905.96ms | |
step:926/1390 train_time:1745974ms step_avg:1906.09ms | |
step:927/1390 train_time:1747954ms step_avg:1906.17ms | |
step:928/1390 train_time:1749931ms step_avg:1906.24ms | |
step:929/1390 train_time:1751906ms step_avg:1906.32ms | |
step:930/1390 train_time:1753893ms step_avg:1906.41ms | |
step:931/1390 train_time:1755866ms step_avg:1906.48ms | |
step:932/1390 train_time:1757852ms step_avg:1906.56ms | |
step:933/1390 train_time:1759835ms step_avg:1906.65ms | |
step:934/1390 train_time:1761814ms step_avg:1906.73ms | |
step:935/1390 train_time:1763801ms step_avg:1906.81ms | |
step:936/1390 train_time:1765782ms step_avg:1906.89ms | |
step:937/1390 train_time:1767759ms step_avg:1906.97ms | |
step:938/1390 train_time:1769729ms step_avg:1907.04ms | |
step:939/1390 train_time:1771724ms step_avg:1907.13ms | |
step:940/1390 train_time:1773706ms step_avg:1907.21ms | |
step:941/1390 train_time:1775686ms step_avg:1907.29ms | |
step:942/1390 train_time:1777661ms step_avg:1907.36ms | |
step:943/1390 train_time:1779658ms step_avg:1907.46ms | |
step:944/1390 train_time:1781657ms step_avg:1907.56ms | |
step:945/1390 train_time:1783628ms step_avg:1907.62ms | |
step:946/1390 train_time:1785607ms step_avg:1907.70ms | |
step:947/1390 train_time:1787618ms step_avg:1907.81ms | |
step:948/1390 train_time:1789616ms step_avg:1907.91ms | |
step:949/1390 train_time:1791595ms step_avg:1907.98ms | |
step:950/1390 train_time:1793570ms step_avg:1908.05ms | |
step:950/1390 val_loss:3.6006 train_time:1793570ms step_avg:1908.05ms | |
step:951/1390 train_time:1795731ms step_avg:1908.32ms | |
step:952/1390 train_time:1797738ms step_avg:1908.43ms | |
step:953/1390 train_time:1799706ms step_avg:1908.49ms | |
step:954/1390 train_time:1801680ms step_avg:1908.56ms | |
step:955/1390 train_time:1803672ms step_avg:1908.65ms | |
step:956/1390 train_time:1805651ms step_avg:1908.72ms | |
step:957/1390 train_time:1807698ms step_avg:1908.87ms | |
step:958/1390 train_time:1809677ms step_avg:1908.94ms | |
step:959/1390 train_time:1811680ms step_avg:1909.04ms | |
step:960/1390 train_time:1813655ms step_avg:1909.11ms | |
step:961/1390 train_time:1815631ms step_avg:1909.18ms | |
step:962/1390 train_time:1817619ms step_avg:1909.26ms | |
step:963/1390 train_time:1819588ms step_avg:1909.33ms | |
step:964/1390 train_time:1821568ms step_avg:1909.40ms | |
step:965/1390 train_time:1823539ms step_avg:1909.47ms | |
step:966/1390 train_time:1825530ms step_avg:1909.55ms | |
step:967/1390 train_time:1827483ms step_avg:1909.60ms | |
step:968/1390 train_time:1829477ms step_avg:1909.68ms | |
step:969/1390 train_time:1831467ms step_avg:1909.77ms | |
step:970/1390 train_time:1833447ms step_avg:1909.84ms | |
step:971/1390 train_time:1835445ms step_avg:1909.93ms | |
step:972/1390 train_time:1837439ms step_avg:1910.02ms | |
step:973/1390 train_time:1839410ms step_avg:1910.08ms | |
step:974/1390 train_time:1841386ms step_avg:1910.15ms | |
step:975/1390 train_time:1843387ms step_avg:1910.25ms | |
step:975/1390 val_loss:3.5887 train_time:1843387ms step_avg:1910.25ms | |
step:976/1390 train_time:1845403ms step_avg:1910.36ms | |
step:977/1390 train_time:1847396ms step_avg:1910.44ms | |
step:978/1390 train_time:1849355ms step_avg:1910.49ms | |
step:979/1390 train_time:1851351ms step_avg:1910.58ms | |
step:980/1390 train_time:1853322ms step_avg:1910.64ms | |
step:981/1390 train_time:1855282ms step_avg:1910.69ms | |
step:982/1390 train_time:1857250ms step_avg:1910.75ms | |
step:983/1390 train_time:1859229ms step_avg:1910.82ms | |
step:984/1390 train_time:1861216ms step_avg:1910.90ms | |
step:985/1390 train_time:1863199ms step_avg:1910.97ms | |
step:986/1390 train_time:1865173ms step_avg:1911.04ms | |
step:987/1390 train_time:1867149ms step_avg:1911.10ms | |
step:988/1390 train_time:1869122ms step_avg:1911.17ms | |
step:989/1390 train_time:1871110ms step_avg:1911.25ms | |
step:990/1390 train_time:1873095ms step_avg:1911.32ms | |
step:991/1390 train_time:1875078ms step_avg:1911.39ms | |
step:992/1390 train_time:1877115ms step_avg:1911.52ms | |
step:993/1390 train_time:1879100ms step_avg:1911.60ms | |
step:994/1390 train_time:1881081ms step_avg:1911.67ms | |
step:995/1390 train_time:1883056ms step_avg:1911.73ms | |
step:996/1390 train_time:1885071ms step_avg:1911.84ms | |
step:997/1390 train_time:1887058ms step_avg:1911.91ms | |
step:998/1390 train_time:1889041ms step_avg:1911.98ms | |
step:999/1390 train_time:1891008ms step_avg:1912.04ms | |
step:1000/1390 train_time:1892987ms step_avg:1912.11ms | |
step:1000/1390 val_loss:3.5765 train_time:1892987ms step_avg:1912.11ms | |
step:1001/1390 train_time:1895016ms step_avg:1912.23ms | |
step:1002/1390 train_time:1897020ms step_avg:1912.32ms | |
step:1003/1390 train_time:1898998ms step_avg:1912.38ms | |
step:1004/1390 train_time:1900990ms step_avg:1912.46ms | |
step:1005/1390 train_time:1902959ms step_avg:1912.52ms | |
step:1006/1390 train_time:1904923ms step_avg:1912.57ms | |
step:1007/1390 train_time:1906903ms step_avg:1912.64ms | |
step:1008/1390 train_time:1908891ms step_avg:1912.72ms | |
step:1009/1390 train_time:1910873ms step_avg:1912.79ms | |
step:1010/1390 train_time:1912842ms step_avg:1912.84ms | |
step:1011/1390 train_time:1914823ms step_avg:1912.91ms | |
step:1012/1390 train_time:1916813ms step_avg:1912.99ms | |
step:1013/1390 train_time:1918779ms step_avg:1913.04ms | |
step:1014/1390 train_time:1920744ms step_avg:1913.09ms | |
step:1015/1390 train_time:1922729ms step_avg:1913.16ms | |
step:1016/1390 train_time:1924714ms step_avg:1913.23ms | |
step:1017/1390 train_time:1926692ms step_avg:1913.30ms | |
step:1018/1390 train_time:1928684ms step_avg:1913.38ms | |
step:1019/1390 train_time:1930664ms step_avg:1913.44ms | |
step:1020/1390 train_time:1932640ms step_avg:1913.51ms | |
step:1021/1390 train_time:1934596ms step_avg:1913.55ms | |
step:1022/1390 train_time:1936611ms step_avg:1913.65ms | |
step:1023/1390 train_time:1938585ms step_avg:1913.71ms | |
step:1024/1390 train_time:1940560ms step_avg:1913.77ms | |
step:1025/1390 train_time:1942527ms step_avg:1913.82ms | |
step:1025/1390 val_loss:3.5656 train_time:1942527ms step_avg:1913.82ms | |
step:1026/1390 train_time:1944549ms step_avg:1913.93ms | |
step:1027/1390 train_time:1946561ms step_avg:1914.02ms | |
step:1028/1390 train_time:1948549ms step_avg:1914.10ms | |
step:1029/1390 train_time:1950537ms step_avg:1914.17ms | |
step:1030/1390 train_time:1952514ms step_avg:1914.23ms | |
step:1031/1390 train_time:1954498ms step_avg:1914.30ms | |
step:1032/1390 train_time:1956477ms step_avg:1914.36ms | |
step:1033/1390 train_time:1958495ms step_avg:1914.46ms | |
step:1034/1390 train_time:1960502ms step_avg:1914.55ms | |
step:1035/1390 train_time:1962486ms step_avg:1914.62ms | |
step:1036/1390 train_time:1964487ms step_avg:1914.71ms | |
step:1037/1390 train_time:1966480ms step_avg:1914.78ms | |
step:1038/1390 train_time:1968461ms step_avg:1914.85ms | |
step:1039/1390 train_time:1970448ms step_avg:1914.92ms | |
step:1040/1390 train_time:1972444ms step_avg:1914.99ms | |
step:1041/1390 train_time:1974465ms step_avg:1915.10ms | |
step:1042/1390 train_time:1976447ms step_avg:1915.16ms | |
step:1043/1390 train_time:1978446ms step_avg:1915.24ms | |
step:1044/1390 train_time:1980452ms step_avg:1915.33ms | |
step:1045/1390 train_time:1982441ms step_avg:1915.40ms | |
step:1046/1390 train_time:1984415ms step_avg:1915.46ms | |
step:1047/1390 train_time:1986420ms step_avg:1915.55ms | |
step:1048/1390 train_time:1988398ms step_avg:1915.61ms | |
step:1049/1390 train_time:1990390ms step_avg:1915.68ms | |
step:1050/1390 train_time:1992386ms step_avg:1915.76ms | |
step:1050/1390 val_loss:3.5542 train_time:1992386ms step_avg:1915.76ms | |
step:1051/1390 train_time:1994424ms step_avg:1915.87ms | |
step:1052/1390 train_time:1996415ms step_avg:1915.95ms | |
step:1053/1390 train_time:1998397ms step_avg:1916.01ms | |
step:1054/1390 train_time:2000401ms step_avg:1916.09ms | |
step:1055/1390 train_time:2002386ms step_avg:1916.16ms | |
step:1056/1390 train_time:2004369ms step_avg:1916.22ms | |
step:1057/1390 train_time:2006370ms step_avg:1916.30ms | |
step:1058/1390 train_time:2008364ms step_avg:1916.38ms | |
step:1059/1390 train_time:2010366ms step_avg:1916.46ms | |
step:1060/1390 train_time:2012353ms step_avg:1916.53ms | |
step:1061/1390 train_time:2014345ms step_avg:1916.60ms | |
step:1062/1390 train_time:2016323ms step_avg:1916.66ms | |
step:1063/1390 train_time:2018304ms step_avg:1916.72ms | |
step:1064/1390 train_time:2020296ms step_avg:1916.79ms | |
step:1065/1390 train_time:2022292ms step_avg:1916.86ms | |
step:1066/1390 train_time:2024284ms step_avg:1916.94ms | |
step:1067/1390 train_time:2026275ms step_avg:1917.01ms | |
step:1068/1390 train_time:2028266ms step_avg:1917.08ms | |
step:1069/1390 train_time:2030259ms step_avg:1917.15ms | |
step:1070/1390 train_time:2032261ms step_avg:1917.23ms | |
step:1071/1390 train_time:2034253ms step_avg:1917.30ms | |
step:1072/1390 train_time:2036226ms step_avg:1917.35ms | |
step:1073/1390 train_time:2038208ms step_avg:1917.41ms | |
step:1074/1390 train_time:2040209ms step_avg:1917.49ms | |
step:1075/1390 train_time:2042203ms step_avg:1917.56ms | |
step:1075/1390 val_loss:3.5443 train_time:2042203ms step_avg:1917.56ms | |
step:1076/1390 train_time:2044225ms step_avg:1917.66ms | |
step:1077/1390 train_time:2046244ms step_avg:1917.75ms | |
step:1078/1390 train_time:2048249ms step_avg:1917.84ms | |
step:1079/1390 train_time:2050249ms step_avg:1917.91ms | |
step:1080/1390 train_time:2052257ms step_avg:1918.00ms | |
step:1081/1390 train_time:2054235ms step_avg:1918.05ms | |
step:1082/1390 train_time:2056245ms step_avg:1918.14ms | |
step:1083/1390 train_time:2058254ms step_avg:1918.22ms | |
step:1084/1390 train_time:2060240ms step_avg:1918.29ms | |
step:1085/1390 train_time:2062248ms step_avg:1918.37ms | |
step:1086/1390 train_time:2064249ms step_avg:1918.45ms | |
step:1087/1390 train_time:2066242ms step_avg:1918.52ms | |
step:1088/1390 train_time:2068236ms step_avg:1918.59ms | |
step:1089/1390 train_time:2070248ms step_avg:1918.67ms | |
step:1090/1390 train_time:2072251ms step_avg:1918.75ms | |
step:1091/1390 train_time:2074245ms step_avg:1918.82ms | |
step:1092/1390 train_time:2076249ms step_avg:1918.90ms | |
step:1093/1390 train_time:2078252ms step_avg:1918.98ms | |
step:1094/1390 train_time:2080234ms step_avg:1919.04ms | |
step:1095/1390 train_time:2082255ms step_avg:1919.13ms | |
step:1096/1390 train_time:2084249ms step_avg:1919.20ms | |
step:1097/1390 train_time:2086229ms step_avg:1919.25ms | |
step:1098/1390 train_time:2088224ms step_avg:1919.32ms | |
step:1099/1390 train_time:2090224ms step_avg:1919.40ms | |
step:1100/1390 train_time:2092222ms step_avg:1919.47ms | |
step:1100/1390 val_loss:3.5342 train_time:2092222ms step_avg:1919.47ms | |
step:1101/1390 train_time:2094258ms step_avg:1919.58ms | |
step:1102/1390 train_time:2096280ms step_avg:1919.67ms | |
step:1103/1390 train_time:2098270ms step_avg:1919.73ms | |
step:1104/1390 train_time:2100258ms step_avg:1919.80ms | |
step:1105/1390 train_time:2102266ms step_avg:1919.88ms | |
step:1106/1390 train_time:2104260ms step_avg:1919.95ms | |
step:1107/1390 train_time:2106257ms step_avg:1920.02ms | |
step:1108/1390 train_time:2108250ms step_avg:1920.08ms | |
step:1109/1390 train_time:2110252ms step_avg:1920.16ms | |
step:1110/1390 train_time:2112235ms step_avg:1920.21ms | |
step:1111/1390 train_time:2114217ms step_avg:1920.27ms | |
step:1112/1390 train_time:2116207ms step_avg:1920.33ms | |
step:1113/1390 train_time:2118196ms step_avg:1920.40ms | |
step:1114/1390 train_time:2120222ms step_avg:1920.49ms | |
step:1115/1390 train_time:2122218ms step_avg:1920.56ms | |
step:1116/1390 train_time:2124205ms step_avg:1920.62ms | |
step:1117/1390 train_time:2126239ms step_avg:1920.72ms | |
step:1118/1390 train_time:2128237ms step_avg:1920.79ms | |
step:1119/1390 train_time:2130224ms step_avg:1920.85ms | |
step:1120/1390 train_time:2132233ms step_avg:1920.93ms | |
step:1121/1390 train_time:2134215ms step_avg:1920.99ms | |
step:1122/1390 train_time:2136201ms step_avg:1921.04ms | |
step:1123/1390 train_time:2138200ms step_avg:1921.11ms | |
step:1124/1390 train_time:2140161ms step_avg:1921.15ms | |
step:1125/1390 train_time:2142167ms step_avg:1921.23ms | |
step:1125/1390 val_loss:3.5243 train_time:2142167ms step_avg:1921.23ms | |
step:1126/1390 train_time:2144219ms step_avg:1921.34ms | |
step:1127/1390 train_time:2146239ms step_avg:1921.43ms | |
step:1128/1390 train_time:2148242ms step_avg:1921.50ms | |
step:1129/1390 train_time:2150238ms step_avg:1921.57ms | |
step:1130/1390 train_time:2152238ms step_avg:1921.64ms | |
step:1131/1390 train_time:2154243ms step_avg:1921.72ms | |
step:1132/1390 train_time:2156230ms step_avg:1921.77ms | |
step:1133/1390 train_time:2158225ms step_avg:1921.84ms | |
step:1134/1390 train_time:2160239ms step_avg:1921.92ms | |
step:1135/1390 train_time:2162250ms step_avg:1922.00ms | |
step:1136/1390 train_time:2164244ms step_avg:1922.06ms | |
step:1137/1390 train_time:2166254ms step_avg:1922.14ms | |
step:1138/1390 train_time:2168261ms step_avg:1922.22ms | |
step:1139/1390 train_time:2170292ms step_avg:1922.31ms | |
step:1140/1390 train_time:2172279ms step_avg:1922.37ms | |
step:1141/1390 train_time:2174318ms step_avg:1922.47ms | |
step:1142/1390 train_time:2176335ms step_avg:1922.56ms | |
step:1143/1390 train_time:2178327ms step_avg:1922.62ms | |
step:1144/1390 train_time:2180336ms step_avg:1922.70ms | |
step:1145/1390 train_time:2182330ms step_avg:1922.76ms | |
step:1146/1390 train_time:2184335ms step_avg:1922.83ms | |
step:1147/1390 train_time:2186342ms step_avg:1922.90ms | |
step:1148/1390 train_time:2188318ms step_avg:1922.95ms | |
step:1149/1390 train_time:2190330ms step_avg:1923.03ms | |
step:1150/1390 train_time:2192325ms step_avg:1923.09ms | |
step:1150/1390 val_loss:3.5150 train_time:2192325ms step_avg:1923.09ms | |
step:1151/1390 train_time:2194373ms step_avg:1923.20ms | |
step:1152/1390 train_time:2196404ms step_avg:1923.30ms | |
step:1153/1390 train_time:2198408ms step_avg:1923.37ms | |
step:1154/1390 train_time:2200396ms step_avg:1923.42ms | |
step:1155/1390 train_time:2202427ms step_avg:1923.52ms | |
step:1156/1390 train_time:2204434ms step_avg:1923.59ms | |
step:1157/1390 train_time:2206433ms step_avg:1923.66ms | |
step:1158/1390 train_time:2208436ms step_avg:1923.72ms | |
step:1159/1390 train_time:2210430ms step_avg:1923.79ms | |
step:1160/1390 train_time:2212431ms step_avg:1923.85ms | |
step:1161/1390 train_time:2214437ms step_avg:1923.92ms | |
step:1162/1390 train_time:2216429ms step_avg:1923.98ms | |
step:1163/1390 train_time:2218438ms step_avg:1924.06ms | |
step:1164/1390 train_time:2220433ms step_avg:1924.12ms | |
step:1165/1390 train_time:2222442ms step_avg:1924.19ms | |
step:1166/1390 train_time:2224445ms step_avg:1924.26ms | |
step:1167/1390 train_time:2226456ms step_avg:1924.34ms | |
step:1168/1390 train_time:2228456ms step_avg:1924.40ms | |
step:1169/1390 train_time:2230441ms step_avg:1924.45ms | |
step:1170/1390 train_time:2232445ms step_avg:1924.52ms | |
step:1171/1390 train_time:2234445ms step_avg:1924.59ms | |
step:1172/1390 train_time:2236423ms step_avg:1924.63ms | |
step:1173/1390 train_time:2238454ms step_avg:1924.72ms | |
step:1174/1390 train_time:2240444ms step_avg:1924.78ms | |
step:1175/1390 train_time:2242463ms step_avg:1924.86ms | |
step:1175/1390 val_loss:3.5063 train_time:2242463ms step_avg:1924.86ms | |
step:1176/1390 train_time:2244514ms step_avg:1924.97ms | |
step:1177/1390 train_time:2246533ms step_avg:1925.05ms | |
step:1178/1390 train_time:2248520ms step_avg:1925.10ms | |
step:1179/1390 train_time:2250548ms step_avg:1925.19ms | |
step:1180/1390 train_time:2252555ms step_avg:1925.26ms | |
step:1181/1390 train_time:2254546ms step_avg:1925.32ms | |
step:1182/1390 train_time:2256561ms step_avg:1925.39ms | |
step:1183/1390 train_time:2258559ms step_avg:1925.46ms | |
step:1184/1390 train_time:2260568ms step_avg:1925.53ms | |
step:1185/1390 train_time:2262569ms step_avg:1925.59ms | |
step:1186/1390 train_time:2264614ms step_avg:1925.69ms | |
step:1187/1390 train_time:2266621ms step_avg:1925.76ms | |
step:1188/1390 train_time:2268623ms step_avg:1925.83ms | |
step:1189/1390 train_time:2270633ms step_avg:1925.90ms | |
step:1190/1390 train_time:2272627ms step_avg:1925.96ms | |
step:1191/1390 train_time:2274630ms step_avg:1926.02ms | |
step:1192/1390 train_time:2276620ms step_avg:1926.07ms | |
step:1193/1390 train_time:2278603ms step_avg:1926.12ms | |
step:1194/1390 train_time:2280611ms step_avg:1926.19ms | |
step:1195/1390 train_time:2282615ms step_avg:1926.26ms | |
step:1196/1390 train_time:2284635ms step_avg:1926.34ms | |
step:1197/1390 train_time:2286654ms step_avg:1926.41ms | |
step:1198/1390 train_time:2288656ms step_avg:1926.48ms | |
step:1199/1390 train_time:2290660ms step_avg:1926.54ms | |
step:1200/1390 train_time:2292642ms step_avg:1926.59ms | |
step:1200/1390 val_loss:3.4981 train_time:2292642ms step_avg:1926.59ms | |
step:1201/1390 train_time:2294728ms step_avg:1926.72ms | |
step:1202/1390 train_time:2296778ms step_avg:1926.83ms | |
step:1203/1390 train_time:2298774ms step_avg:1926.89ms | |
step:1204/1390 train_time:2300776ms step_avg:1926.95ms | |
step:1205/1390 train_time:2302790ms step_avg:1927.02ms | |
step:1206/1390 train_time:2304794ms step_avg:1927.09ms | |
step:1207/1390 train_time:2306808ms step_avg:1927.16ms | |
step:1208/1390 train_time:2308809ms step_avg:1927.22ms | |
step:1209/1390 train_time:2310825ms step_avg:1927.29ms | |
step:1210/1390 train_time:2312842ms step_avg:1927.37ms | |
step:1211/1390 train_time:2314861ms step_avg:1927.44ms | |
step:1212/1390 train_time:2316861ms step_avg:1927.50ms | |
step:1213/1390 train_time:2318892ms step_avg:1927.59ms | |
step:1214/1390 train_time:2320912ms step_avg:1927.67ms | |
step:1215/1390 train_time:2322894ms step_avg:1927.71ms | |
step:1216/1390 train_time:2324892ms step_avg:1927.77ms | |
step:1217/1390 train_time:2326894ms step_avg:1927.83ms | |
step:1218/1390 train_time:2328886ms step_avg:1927.89ms | |
step:1219/1390 train_time:2330884ms step_avg:1927.94ms | |
step:1220/1390 train_time:2332885ms step_avg:1928.00ms | |
step:1221/1390 train_time:2334895ms step_avg:1928.07ms | |
step:1222/1390 train_time:2336890ms step_avg:1928.13ms | |
step:1223/1390 train_time:2338899ms step_avg:1928.19ms | |
step:1224/1390 train_time:2340938ms step_avg:1928.28ms | |
step:1225/1390 train_time:2342928ms step_avg:1928.34ms | |
step:1225/1390 val_loss:3.4907 train_time:2342928ms step_avg:1928.34ms | |
step:1226/1390 train_time:2344985ms step_avg:1928.44ms | |
step:1227/1390 train_time:2347020ms step_avg:1928.53ms | |
step:1228/1390 train_time:2349018ms step_avg:1928.59ms | |
step:1229/1390 train_time:2351041ms step_avg:1928.66ms | |
step:1230/1390 train_time:2353055ms step_avg:1928.73ms | |
step:1231/1390 train_time:2355083ms step_avg:1928.81ms | |
step:1232/1390 train_time:2357075ms step_avg:1928.87ms | |
step:1233/1390 train_time:2359059ms step_avg:1928.91ms | |
step:1234/1390 train_time:2361063ms step_avg:1928.97ms | |
step:1235/1390 train_time:2363092ms step_avg:1929.05ms | |
step:1236/1390 train_time:2365101ms step_avg:1929.12ms | |
step:1237/1390 train_time:2367157ms step_avg:1929.22ms | |
step:1238/1390 train_time:2369214ms step_avg:1929.33ms | |
step:1239/1390 train_time:2371259ms step_avg:1929.42ms | |
step:1240/1390 train_time:2373300ms step_avg:1929.51ms | |
step:1241/1390 train_time:2375313ms step_avg:1929.58ms | |
step:1242/1390 train_time:2377336ms step_avg:1929.66ms | |
step:1243/1390 train_time:2379356ms step_avg:1929.73ms | |
step:1244/1390 train_time:2381365ms step_avg:1929.79ms | |
step:1245/1390 train_time:2383366ms step_avg:1929.85ms | |
step:1246/1390 train_time:2385387ms step_avg:1929.93ms | |
step:1247/1390 train_time:2387390ms step_avg:1929.98ms | |
step:1248/1390 train_time:2389381ms step_avg:1930.03ms | |
step:1249/1390 train_time:2391380ms step_avg:1930.09ms | |
step:1250/1390 train_time:2393406ms step_avg:1930.17ms | |
step:1250/1390 val_loss:3.4833 train_time:2393406ms step_avg:1930.17ms | |
step:1251/1390 train_time:2395477ms step_avg:1930.28ms | |
step:1252/1390 train_time:2397486ms step_avg:1930.34ms | |
step:1253/1390 train_time:2399478ms step_avg:1930.39ms | |
step:1254/1390 train_time:2401525ms step_avg:1930.49ms | |
step:1255/1390 train_time:2403516ms step_avg:1930.53ms | |
step:1256/1390 train_time:2405516ms step_avg:1930.59ms | |
step:1257/1390 train_time:2407528ms step_avg:1930.66ms | |
step:1258/1390 train_time:2409569ms step_avg:1930.74ms | |
step:1259/1390 train_time:2411584ms step_avg:1930.81ms | |
step:1260/1390 train_time:2413596ms step_avg:1930.88ms | |
step:1261/1390 train_time:2415648ms step_avg:1930.97ms | |
step:1262/1390 train_time:2417667ms step_avg:1931.04ms | |
step:1263/1390 train_time:2419684ms step_avg:1931.11ms | |
step:1264/1390 train_time:2421681ms step_avg:1931.16ms | |
step:1265/1390 train_time:2423719ms step_avg:1931.25ms | |
step:1266/1390 train_time:2425728ms step_avg:1931.31ms | |
step:1267/1390 train_time:2427744ms step_avg:1931.38ms | |
step:1268/1390 train_time:2429772ms step_avg:1931.46ms | |
step:1269/1390 train_time:2431801ms step_avg:1931.53ms | |
step:1270/1390 train_time:2433789ms step_avg:1931.58ms | |
step:1271/1390 train_time:2435797ms step_avg:1931.64ms | |
step:1272/1390 train_time:2437798ms step_avg:1931.69ms | |
step:1273/1390 train_time:2439785ms step_avg:1931.74ms | |
step:1274/1390 train_time:2441809ms step_avg:1931.81ms | |
step:1275/1390 train_time:2443821ms step_avg:1931.87ms | |
step:1275/1390 val_loss:3.4776 train_time:2443821ms step_avg:1931.87ms | |
step:1276/1390 train_time:2445866ms step_avg:1931.96ms | |
step:1277/1390 train_time:2447901ms step_avg:1932.05ms | |
step:1278/1390 train_time:2449925ms step_avg:1932.12ms | |
step:1279/1390 train_time:2451965ms step_avg:1932.20ms | |
step:1280/1390 train_time:2453988ms step_avg:1932.27ms | |
step:1281/1390 train_time:2455972ms step_avg:1932.31ms | |
step:1282/1390 train_time:2457962ms step_avg:1932.36ms | |
step:1283/1390 train_time:2460001ms step_avg:1932.44ms | |
step:1284/1390 train_time:2462005ms step_avg:1932.50ms | |
step:1285/1390 train_time:2464007ms step_avg:1932.55ms | |
step:1286/1390 train_time:2466014ms step_avg:1932.61ms | |
step:1287/1390 train_time:2468037ms step_avg:1932.68ms | |
step:1288/1390 train_time:2470062ms step_avg:1932.76ms | |
step:1289/1390 train_time:2472119ms step_avg:1932.85ms | |
step:1290/1390 train_time:2474157ms step_avg:1932.94ms | |
step:1291/1390 train_time:2476176ms step_avg:1933.00ms | |
step:1292/1390 train_time:2478182ms step_avg:1933.06ms | |
step:1293/1390 train_time:2480184ms step_avg:1933.11ms | |
step:1294/1390 train_time:2482207ms step_avg:1933.18ms | |
step:1295/1390 train_time:2484217ms step_avg:1933.24ms | |
step:1296/1390 train_time:2486251ms step_avg:1933.32ms | |
step:1297/1390 train_time:2488252ms step_avg:1933.37ms | |
step:1298/1390 train_time:2490256ms step_avg:1933.43ms | |
step:1299/1390 train_time:2492260ms step_avg:1933.48ms | |
step:1300/1390 train_time:2494277ms step_avg:1933.55ms | |
step:1300/1390 val_loss:3.4731 train_time:2494278ms step_avg:1933.55ms | |
step:1301/1390 train_time:2496328ms step_avg:1933.64ms | |
step:1302/1390 train_time:2498375ms step_avg:1933.73ms | |
step:1303/1390 train_time:2500419ms step_avg:1933.81ms | |
step:1304/1390 train_time:2502432ms step_avg:1933.87ms | |
step:1305/1390 train_time:2504444ms step_avg:1933.93ms | |
step:1306/1390 train_time:2506464ms step_avg:1934.00ms | |
step:1307/1390 train_time:2508500ms step_avg:1934.08ms | |
step:1308/1390 train_time:2510522ms step_avg:1934.15ms | |
step:1309/1390 train_time:2512532ms step_avg:1934.20ms | |
step:1310/1390 train_time:2514537ms step_avg:1934.26ms | |
step:1311/1390 train_time:2516565ms step_avg:1934.33ms | |
step:1312/1390 train_time:2518581ms step_avg:1934.39ms | |
step:1313/1390 train_time:2520586ms step_avg:1934.45ms | |
step:1314/1390 train_time:2522617ms step_avg:1934.52ms | |
step:1315/1390 train_time:2524628ms step_avg:1934.58ms | |
step:1316/1390 train_time:2526631ms step_avg:1934.63ms | |
step:1317/1390 train_time:2528655ms step_avg:1934.70ms | |
step:1318/1390 train_time:2530663ms step_avg:1934.76ms | |
step:1319/1390 train_time:2532681ms step_avg:1934.82ms | |
step:1320/1390 train_time:2534690ms step_avg:1934.88ms | |
step:1321/1390 train_time:2536717ms step_avg:1934.95ms | |
step:1322/1390 train_time:2538726ms step_avg:1935.00ms | |
step:1323/1390 train_time:2540734ms step_avg:1935.06ms | |
step:1324/1390 train_time:2542741ms step_avg:1935.12ms | |
step:1325/1390 train_time:2544780ms step_avg:1935.19ms | |
step:1325/1390 val_loss:3.4695 train_time:2544781ms step_avg:1935.19ms | |
step:1326/1390 train_time:2546847ms step_avg:1935.29ms | |
step:1327/1390 train_time:2548853ms step_avg:1935.35ms | |
step:1328/1390 train_time:2550891ms step_avg:1935.43ms | |
step:1329/1390 train_time:2552904ms step_avg:1935.48ms | |
step:1330/1390 train_time:2554942ms step_avg:1935.56ms | |
step:1331/1390 train_time:2557029ms step_avg:1935.68ms | |
step:1332/1390 train_time:2559057ms step_avg:1935.75ms | |
step:1333/1390 train_time:2561072ms step_avg:1935.81ms | |
step:1334/1390 train_time:2563066ms step_avg:1935.85ms | |
step:1335/1390 train_time:2565088ms step_avg:1935.92ms | |
step:1336/1390 train_time:2567089ms step_avg:1935.96ms | |
step:1337/1390 train_time:2569101ms step_avg:1936.02ms | |
step:1338/1390 train_time:2571105ms step_avg:1936.07ms | |
step:1339/1390 train_time:2573142ms step_avg:1936.15ms | |
step:1340/1390 train_time:2575137ms step_avg:1936.19ms | |
step:1341/1390 train_time:2577151ms step_avg:1936.25ms | |
step:1342/1390 train_time:2579155ms step_avg:1936.30ms | |
step:1343/1390 train_time:2581162ms step_avg:1936.36ms | |
step:1344/1390 train_time:2583180ms step_avg:1936.42ms | |
step:1345/1390 train_time:2585186ms step_avg:1936.47ms | |
step:1346/1390 train_time:2587225ms step_avg:1936.55ms | |
step:1347/1390 train_time:2589232ms step_avg:1936.60ms | |
step:1348/1390 train_time:2591263ms step_avg:1936.67ms | |
step:1349/1390 train_time:2593255ms step_avg:1936.71ms | |
step:1350/1390 train_time:2595279ms step_avg:1936.78ms | |
step:1350/1390 val_loss:3.4668 train_time:2595279ms step_avg:1936.78ms | |
step:1351/1390 train_time:2597357ms step_avg:1936.88ms | |
step:1352/1390 train_time:2599409ms step_avg:1936.97ms | |
step:1353/1390 train_time:2601431ms step_avg:1937.03ms | |
step:1354/1390 train_time:2603447ms step_avg:1937.09ms | |
step:1355/1390 train_time:2605459ms step_avg:1937.14ms | |
step:1356/1390 train_time:2607504ms step_avg:1937.22ms | |
step:1357/1390 train_time:2609536ms step_avg:1937.29ms | |
step:1358/1390 train_time:2611548ms step_avg:1937.35ms | |
step:1359/1390 train_time:2613551ms step_avg:1937.40ms | |
step:1360/1390 train_time:2615579ms step_avg:1937.47ms | |
step:1361/1390 train_time:2617603ms step_avg:1937.53ms | |
step:1362/1390 train_time:2619633ms step_avg:1937.60ms | |
step:1363/1390 train_time:2621671ms step_avg:1937.67ms | |
step:1364/1390 train_time:2623655ms step_avg:1937.71ms | |
step:1365/1390 train_time:2625667ms step_avg:1937.76ms | |
step:1366/1390 train_time:2627683ms step_avg:1937.82ms | |
step:1367/1390 train_time:2629700ms step_avg:1937.88ms | |
step:1368/1390 train_time:2631745ms step_avg:1937.96ms | |
step:1369/1390 train_time:2633780ms step_avg:1938.03ms | |
step:1370/1390 train_time:2635793ms step_avg:1938.08ms | |
step:1371/1390 train_time:2637814ms step_avg:1938.14ms | |
step:1372/1390 train_time:2639828ms step_avg:1938.20ms | |
step:1373/1390 train_time:2641875ms step_avg:1938.28ms | |
step:1374/1390 train_time:2643875ms step_avg:1938.32ms | |
step:1375/1390 train_time:2645872ms step_avg:1938.37ms | |
step:1375/1390 val_loss:3.4655 train_time:2645873ms step_avg:1938.37ms | |
step:1376/1390 train_time:2647949ms step_avg:1938.47ms | |
step:1377/1390 train_time:2649967ms step_avg:1938.53ms | |
step:1378/1390 train_time:2651971ms step_avg:1938.58ms | |
step:1379/1390 train_time:2654017ms step_avg:1938.65ms | |
step:1380/1390 train_time:2656050ms step_avg:1938.72ms | |
step:1381/1390 train_time:2658097ms step_avg:1938.80ms | |
step:1382/1390 train_time:2660098ms step_avg:1938.85ms | |
step:1383/1390 train_time:2662147ms step_avg:1938.93ms | |
step:1384/1390 train_time:2664151ms step_avg:1938.97ms | |
step:1385/1390 train_time:2666151ms step_avg:1939.02ms | |
step:1386/1390 train_time:2668187ms step_avg:1939.09ms | |
step:1387/1390 train_time:2670184ms step_avg:1939.13ms | |
step:1388/1390 train_time:2672184ms step_avg:1939.18ms | |
step:1389/1390 train_time:2674175ms step_avg:1939.21ms | |
step:1390/1390 train_time:2676189ms step_avg:1939.27ms | |
step:1390/1390 val_loss:3.4650 train_time:2676189ms step_avg:1939.27ms | |
peak memory consumption: 56232 MiB |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment