Created
July 30, 2020 23:40
-
-
Save mmgaggle/545aae2daf821ea4b30326a4590201c3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Kyles-Mac-mini:infra-nodes kylebader$ oc logs -f ssd | |
_CUDA_COMPAT_STATUS=CUDA Driver UNAVAILABLE (cuInit(0) returned 100) | |
NVIDIA_PYTORCH_VERSION=19.05 | |
MOFED_VERSION=4.4-1.0.0 | |
COCOAPI_VERSION=2.0+nv0.3.1 | |
CUDNN_VERSION=7.6.0.64 | |
HOSTNAME=ssd | |
DATADIR=/ocs-ml-data/coco | |
NVIDIA_REQUIRE_CUDA=cuda>=5.0 | |
KUBERNETES_PORT_443_TCP_PORT=443 | |
KUBERNETES_PORT=tcp://172.30.0.1:443 | |
TERM=xterm | |
NSIGHT_SYSTEMS_VERSION=2019.3.1 | |
CUBLAS_VERSION=10.2.0.163 | |
LIBRARY_PATH=/usr/local/cuda/lib64/stubs: | |
KUBERNETES_SERVICE_PORT=443 | |
KUBERNETES_SERVICE_HOST=172.30.0.1 | |
NEXP=1 | |
LC_ALL=C.UTF-8 | |
PYTHONIOENCODING=utf-8 | |
LD_LIBRARY_PATH=/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 | |
NVIDIA_VISIBLE_DEVICES=all | |
ENV=/etc/shinit | |
_CUDA_COMPAT_PATH=/usr/local/cuda/compat | |
CUDA_CACHE_DISABLE=1 | |
NVIDIA_DRIVER_CAPABILITIES=compute,utility | |
TRT_VERSION=5.1.5.0 | |
CUDA_DRIVER_VERSION=418.67 | |
NVIDIA_BUILD_ID=6411784 | |
PATH=/opt/conda/bin:/usr/local/mpi/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin | |
PWD=/workspace/single_stage_detector | |
PYTORCH_VERSION=1.1.0a0+828a6a3 | |
PYTORCH_BUILD_VERSION=1.1.0a0+828a6a3 | |
CUDA_VERSION=10.1.163 | |
OMPI_MCA_btl_vader_single_copy_mechanism=none | |
SHLVL=1 | |
HOME=/root | |
DALI_VERSION=0.9.1 | |
KUBERNETES_PORT_443_TCP_PROTO=tcp | |
KUBERNETES_SERVICE_PORT_HTTPS=443 | |
DALI_BUILD=719215 | |
OPENMPI_VERSION=3.1.3 | |
NCCL_VERSION=2.4.6 | |
INSLURM=0 | |
BASH_ENV=/etc/bash.bashrc | |
LOGDIR=/ocs-ml-data/logs | |
NSS_SDB_USE_CACHE=no | |
OPENCV_FOR_THREADS_NUM=1 | |
OMP_NUM_THREADS=1 | |
PYTORCH_BUILD_NUMBER=0 | |
KUBERNETES_PORT_443_TCP_ADDR=172.30.0.1 | |
KUBERNETES_PORT_443_TCP=tcp://172.30.0.1:443 | |
_=/usr/bin/printenv | |
Run vars: id 28869 gpus 4 mparams | |
STARTING TIMING RUN AT 2020-07-30 11:40:05 PM | |
+ NUMEPOCHS=80 | |
running benchmark | |
+ echo 'running benchmark' | |
+ export DATASET_DIR=/data/coco2017 | |
+ DATASET_DIR=/data/coco2017 | |
+ export TORCH_MODEL_ZOO=/data/torchvision | |
+ TORCH_MODEL_ZOO=/data/torchvision | |
+ python3 -m bind_launch --nsockets_per_node 1 --ncores_per_socket 16 --nproc_per_node 4 train.py --use-fp16 --nhwc --pad-input --jit --delay-allreduce --opt-loss --epochs 80 --warmup-factor 0 --no-save --threshold=0.23 --data /data/coco2017 --evaluation 120000 160000 180000 200000 220000 240000 260000 280000 --batch-size 120 --eval-batch-size 160 --warmup 650 --lr 2.92e-3 --wd 1.6e-4 --use-nvjpeg --use-roi-decode | |
:::MLL 1596152406.833 init_start: {"value": null, "metadata": {"file": "train.py", "lineno": 833}} | |
:::MLL 1596152406.835 init_start: {"value": null, "metadata": {"file": "train.py", "lineno": 833}} | |
:::MLL 1596152406.835 init_start: {"value": null, "metadata": {"file": "train.py", "lineno": 833}} | |
BN group: 1 | |
BN group: 1 | |
BN group: 1 | |
:::MLL 1596152406.839 init_start: {"value": null, "metadata": {"file": "train.py", "lineno": 833}} | |
BN group: 1 | |
0 Using seed = 4100562049 | |
1 Using seed = 4100562050 | |
2 Using seed = 4100562051 | |
3 Using seed = 4100562052 | |
:::MLL 1596152411.996 max_samples: {"value": 1, "metadata": {"file": "utils.py", "lineno": 465}} | |
Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /data/torchvision/resnet34-333f7ec4.pth | |
Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /data/torchvision/resnet34-333f7ec4.pth | |
Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /data/torchvision/resnet34-333f7ec4.pth | |
Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /data/torchvision/resnet34-333f7ec4.pth | |
87306240it [00:01, 49622227.05it/s] | |
87306240it [00:01, 49923219.48it/s] | |
/opt/conda/lib/python3.6/site-packages/torch/nn/_reduction.py:46: UserWarning: size_average and reduce args will be deprecated, please use reduction='none' instead. | |
warnings.warn(warning.format(ret)) | |
/opt/conda/lib/python3.6/site-packages/torch/nn/_reduction.py:46: UserWarning: size_average and reduce args will be deprecated, please use reduction='none' instead. | |
warnings.warn(warning.format(ret)) | |
Delaying allreduces to the end of backward() | |
:::MLL 1596152414.605 model_bn_span: {"value": 120, "metadata": {"file": "train.py", "lineno": 480}} | |
:::MLL 1596152414.605 global_batch_size: {"value": 480, "metadata": {"file": "train.py", "lineno": 481}} | |
:::MLL 1596152414.613 opt_base_learning_rate: {"value": 0.045, "metadata": {"file": "train.py", "lineno": 511}} | |
:::MLL 1596152414.614 opt_weight_decay: {"value": 0.00016, "metadata": {"file": "train.py", "lineno": 513}} | |
:::MLL 1596152414.614 opt_learning_rate_warmup_steps: {"value": 650, "metadata": {"file": "train.py", "lineno": 516}} | |
:::MLL 1596152414.615 opt_learning_rate_warmup_factor: {"value": 0, "metadata": {"file": "train.py", "lineno": 518}} | |
87306240it [00:00, 97183120.00it/s] | |
87306240it [00:00, 94301685.86it/s] | |
/opt/conda/lib/python3.6/site-packages/torch/nn/_reduction.py:46: UserWarning: size_average and reduce args will be deprecated, please use reduction='none' instead. | |
warnings.warn(warning.format(ret)) | |
/opt/conda/lib/python3.6/site-packages/torch/nn/_reduction.py:46: UserWarning: size_average and reduce args will be deprecated, please use reduction='none' instead. | |
warnings.warn(warning.format(ret)) | |
epoch nbatch loss | |
:::MLL 1596152423.319 init_stop: {"value": null, "metadata": {"file": "train.py", "lineno": 604}} | |
:::MLL 1596152423.320 run_start: {"value": null, "metadata": {"file": "train.py", "lineno": 610}} | |
loading annotations into memory... | |
loading annotations into memory... | |
loading annotations into memory... | |
loading annotations into memory... | |
Done (t=0.58s) | |
creating index... | |
Done (t=0.62s) | |
creating index... | |
Done (t=0.65s) | |
creating index... | |
Done (t=0.65s) | |
creating index... | |
time_check a: 1596152425.581552744 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment