Andrew Shaw (bearpelican)

import torch
import torch.nn as nn


class tofp16(nn.Module):
    """Cast incoming tensors to half precision (FP16)."""

    def __init__(self):
        super(tofp16, self).__init__()

    def forward(self, input):
        # Convert the input tensor to torch.float16 on the way into the model.
        return input.half()
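
For context, a minimal sketch of how a module like tofp16 is typically used: prepend it to a model whose weights have been converted to half precision, so incoming batches are cast to FP16 to match. The wrap_half helper and the tiny example model below are illustrative assumptions, not part of the original gist, and they assume a CUDA device is available.

import torch
import torch.nn as nn

def wrap_half(model):
    # Hypothetical helper: cast the model's weights to FP16 and
    # prepend tofp16 so inputs are cast to the same dtype.
    return nn.Sequential(tofp16(), model.half())

model = wrap_half(nn.Linear(8, 4).cuda())
out = model(torch.randn(2, 8).cuda())  # the wrapper casts the FP32 input to FP16
print(out.dtype)                       # torch.float16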
Namespace(arch='resnet50', batch_sched='512,192,128', data='/home/ubuntu/data/imagenet', dist_backend='nccl', dist_url='file:///home/ubuntu/data/file.sync', distributed=True, epochs=35, evaluate=False, fp16=True, init_bn0=True, local_rank=5, logdir='/efs/runs/one_machine_e35_nobnwd.03', loss_scale=1024.0, lr=1.0, lr_linear_scale=True, lr_sched='0.14,0.47,0.78,0.95', momentum=0.9, no_bn_wd=True, pretrained=False, print_freq=10, prof=False, resize_sched='0.4,0.92', resume='', save_dir='/home/ubuntu/data/training/nv/2018-08-01_22-38-one_machine_e35_nobnwd-w8', start_epoch=0, val_ar=True, weight_decay=0.0001, workers=8, world_size=8)
~~epoch hours top1Accuracy
Distributed: initializing process group
Distributed: success (5/8)
Loading model
Creating data loaders (this could take 6-12 minutes)
Begin training
Dataset changed.
Image size: 128
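
The arguments above (fp16=True, loss_scale=1024.0) point at static loss scaling: the loss is multiplied by a fixed factor before backward so small FP16 gradients do not underflow, and the gradients are divided by the same factor on an FP32 master copy of the weights before the optimizer step. Below is a minimal sketch of that idea, assuming a separate FP32 parameter copy; none of the names come from the training script itself.

import torch

loss_scale = 1024.0

def fp16_train_step(model_fp16, master_params_fp32, optimizer, criterion, x, y):
    # Forward in FP16; compute the loss in FP32 for numerical stability.
    loss = criterion(model_fp16(x.half()).float(), y)
    # Scale the loss so small FP16 gradients do not flush to zero in backward.
    (loss * loss_scale).backward()

    # Copy FP16 grads into the FP32 master params and undo the scaling.
    for p16, p32 in zip(model_fp16.parameters(), master_params_fp32):
        if p16.grad is not None:
            p32.grad = p16.grad.detach().float() / loss_scale

    optimizer.step()       # optimizer was built over master_params_fp32
    optimizer.zero_grad()

    # Push the updated FP32 weights back into the FP16 model.
    with torch.no_grad():
        for p16, p32 in zip(model_fp16.parameters(), master_params_fp32):
            p16.copy_(p32)
    model_fp16.zero_grad()
    return loss.item()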
Namespace(arch='resnet50', batch_size=192, data='/home/ubuntu/data/imagenet', dist_backend='nccl', dist_url='file:///home/ubuntu/data/file.sync', distributed=True, epochs=45, evaluate=False, fp16=True, init_bn0=True, local_rank=7, loss_scale=512.0, lr=0.4, lr_sched='0.14,0.47,0.78,0.95', momentum=0.9, pretrained=False, print_freq=10, prof=False, resize_sched='0.4,0.92', resume='', save_dir='/home/ubuntu/data/training/nv/2018-07-10_21-56-cluster_1_region_b_spot_single_2-lr4.0e45bs192-dawn', start_epoch=0, val_ar=False, weight_decay=0.0001, workers=8, world_size=8)
~~epoch hours top1Accuracy
Distributed: initializing process group
Loaded model
Defined loss and optimizer
Created data loaders
Begin training
Changing LR from None to 0.4
~~0 0.033377203333333334 26.712
~~epoch hours top1Accuracy
Distributed: init_process_group success
Loaded model
Defined loss and optimizer
Created data loaders
Begin training
Begin training loop: 1530911465.107739
Prefetcher first preload complete
Received input: 3.9817962646484375
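
The "Prefetcher first preload complete" line refers to overlapping host-to-device copies with compute. A common way to do this in PyTorch is a small wrapper around the DataLoader that copies the next batch to the GPU on a side CUDA stream while the current batch is being consumed. The sketch below is a generic version of that pattern, not the script's own prefetcher; it works best when the DataLoader returns pinned-memory tensors so non_blocking copies can actually overlap.

import torch

class DataPrefetcher:
    # Wraps an iterable of (input, target) CPU batches and preloads the
    # next batch onto the GPU using a separate CUDA stream.
    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()
        self._preload()

    def _preload(self):
        try:
            self.next_input, self.next_target = next(self.loader)
        except StopIteration:
            self.next_input = self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)

    def __iter__(self):
        return self

    def __next__(self):
        if self.next_input is None:
            raise StopIteration
        # Make the default stream wait until the copy stream has finished.
        torch.cuda.current_stream().wait_stream(self.stream)
        input, target = self.next_input, self.next_target
        self._preload()
        return input, target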
~~epoch hours top1Accuracy
Distributed: init_process_group success
Loaded model
Defined loss and optimizer
Created data loaders
Begin training
Changing LR from None to 1.4
~~0 0.01853289027777778 14.500
~~epoch hours top1 top5
Dataset changed.
Image size: 128
Batch size: 128
Train Directory: /home/ubuntu/data/imagenet-sz/160/train
Validation Directory: /home/ubuntu/data/imagenet-sz/160/validation
Changing LR from None to 1.9220382165605094
Changing LR from 2.2379617834394905 to 2.2399999999999998
~~0 0.01241 4.248 12.422
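
The fractional values in the "Changing LR from ... to ..." lines suggest the learning rate is interpolated linearly between schedule points on a per-batch basis rather than stepped once per epoch (lr_sched='0.14,0.47,0.78,0.95' reads as fractions of the total run). A generic sketch of that interpolation follows; the phase boundaries and rates in the commented example are placeholders, not the values used in these runs.

def interpolated_lr(progress, start_frac, end_frac, start_lr, end_lr):
    # Linearly interpolate the LR for `progress` (fraction of the run completed)
    # between the (start_frac, start_lr) and (end_frac, end_lr) schedule points.
    t = (progress - start_frac) / (end_frac - start_frac)
    return start_lr + t * (end_lr - start_lr)

def set_lr(optimizer, lr):
    # Apply the interpolated rate to every parameter group.
    for group in optimizer.param_groups:
        group['lr'] = lr

# Example (placeholder numbers): warm up from 0.4 to 2.24 over the first
# 14% of the run, evaluated 5% of the way in.
# lr = interpolated_lr(progress=0.05, start_frac=0.0, end_frac=0.14,
#                      start_lr=0.4, end_lr=2.24)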
import argparse
import os
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed
from torch.nn.parallel import DistributedDataParallel
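
These imports belong to a multi-process distributed training script; the log lines above ("Distributed: initializing process group", "Distributed: success (5/8)") correspond to the standard PyTorch setup of one process per GPU. Below is a minimal sketch of that setup using the nccl backend and file:// rendezvous URL from the Namespace dumps; the function and argument names are assumptions, not the script's actual parser, and it assumes a single node so local_rank doubles as the global rank.

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

def setup_distributed(local_rank, world_size,
                      dist_url='file:///home/ubuntu/data/file.sync'):
    # One process per GPU: pin this process to its device, then join the group.
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend='nccl', init_method=dist_url,
                            world_size=world_size, rank=local_rank)

def wrap_model(model, local_rank):
    # Replicate the model across processes; gradients are all-reduced in backward.
    model = model.cuda(local_rank)
    return DistributedDataParallel(model, device_ids=[local_rank])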
import argparse
import os
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.utils.data
import torch.utils.data.distributed
cudnn.benchmark = True
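
cudnn.benchmark = True turns on cuDNN's autotuner, which benchmarks the available convolution algorithms for each input shape it encounters and caches the fastest one. This pays off when batch and image sizes stay fixed within a training phase, and it re-tunes when the shape changes, which presumably happens here each time the logs report "Dataset changed. Image size: ..." during progressive resizing.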