@davidberard98
Last active September 9, 2022 03:07
/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and will be removed in 0.15, please use 'weights' instead.
warnings.warn(
/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=ResNet50_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet50_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
WARNING:root:Using TorchDynamo with a context manager will be deprecated soon.Please read https://github.com/pytorch/torchdynamo#usage-example to use TorchDynamo using an annotation.
ERROR:root:Error while processing frame
Traceback (most recent call last):
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchdynamo/eval_frame.py", line 225, in catch_errors
log.debug(f"skipping {frame.f_code.co_name} {frame.f_code.co_filename}")
File "/fsx/users/dberard/scratch-local/bench-fast/benchmark/torchbenchmark/util/model.py", line 188, in invoke
self.train()
File "/fsx/users/dberard/scratch-local/bench-fast/benchmark/torchbenchmark/util/framework/vision/model_factory.py", line 65, in train
pred = self.model(data)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0])
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchvision/models/resnet.py", line 284, in forward
def forward(self, x: Tensor) -> Tensor:
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchdynamo/eval_frame.py", line 166, in _fn
return fn(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/functorch/_src/aot_autograd.py", line 889, in forward
return compiled_f(
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/functorch/_src/aot_autograd.py", line 875, in new_func
compiled_fn = create_aot_dispatcher_function(
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchdynamo/utils.py", line 75, in time_wrapper
r = func(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/functorch/_src/aot_autograd.py", line 450, in create_aot_dispatcher_function
aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/functorch/_src/aot_autograd.py", line 335, in aot_dispatch_autograd
compiled_fw_func = aot_config.fw_compiler(fw_module, flat_args)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/functorch/_src/aot_autograd.py", line 237, in f
out_f = compiler(fx_g, inps)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchdynamo/utils.py", line 75, in time_wrapper
r = func(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/compile_fx.py", line 177, in fw_compiler
return compile_fx_inner(
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchdynamo/debug_utils.py", line 329, in debug_wrapper
compiled_fn = compiler(gm, example_inputs, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/debug.py", line 182, in inner
return fn(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/compile_fx.py", line 60, in compile_fx_inner
compiled_fn = wrap(graph.compile_to_fn())
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/graph.py", line 333, in compile_to_fn
return self.compile_to_module().call
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchdynamo/utils.py", line 75, in time_wrapper
r = func(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/graph.py", line 323, in compile_to_module
mod = PyCodeCache.load(code)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/codecache.py", line 162, in load
exec(code, mod.__dict__, mod.__dict__)
File "/tmp/torchinductor_dberard/s7/cs7yddkfqu2hqk2hbxty2b65q4zavo3yxmatm3yx2ie73ghiqkh7.py", line 18, in <module>
kernel0 = TritonCodeCache.load('''
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/codecache.py", line 243, in load
return PyCodeCache.load(source_code)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/codecache.py", line 156, in load
key, path = write(source_code, "py")
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/codecache.py", line 46, in write
os.rename(tmp_path, path)
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/torchinductor_dberard/hb/chbtiqye637eosh5eyoh7t3ve4mviybimx2m2uudsrm6bt22x3nd.py.1571306751' -> '/tmp/torchinductor_dberard/hb/chbtiqye637eosh5eyoh7t3ve4mviybimx2m2uudsrm6bt22x3nd.py'
submitit ERROR (2022-09-09 02:24:05,677) - Submitted job triggered an exception
ERROR:submitit:Submitted job triggered an exception
Traceback (most recent call last):
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/submitit/core/_submit.py", line 11, in <module>
submitit_main()
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/submitit/core/submission.py", line 72, in submitit_main
process_job(args.folder)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/submitit/core/submission.py", line 65, in process_job
raise error
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/submitit/core/submission.py", line 54, in process_job
result = delayed.result()
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/submitit/core/utils.py", line 133, in result
self._result = self.function(*self.args, **self.kwargs)
File "ddp_experiments.py", line 149, in __call__
return trainer_class(self.args, model_class, model_args=self.model_args).measure()
File "/fsx/users/dberard/scratch-local/bench-fast/benchmark/torchbenchmark/util/distributed/core_model/trainer.py", line 79, in measure
self.benchmark.invoke()
File "/fsx/users/dberard/scratch-local/bench-fast/benchmark/torchbenchmark/util/model.py", line 188, in invoke
self.train()
File "/fsx/users/dberard/scratch-local/bench-fast/benchmark/torchbenchmark/util/framework/vision/model_factory.py", line 65, in train
pred = self.model(data)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0])
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchvision/models/resnet.py", line 284, in forward
def forward(self, x: Tensor) -> Tensor:
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchdynamo/eval_frame.py", line 166, in _fn
return fn(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/functorch/_src/aot_autograd.py", line 889, in forward
return compiled_f(
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/functorch/_src/aot_autograd.py", line 875, in new_func
compiled_fn = create_aot_dispatcher_function(
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchdynamo/utils.py", line 75, in time_wrapper
r = func(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/functorch/_src/aot_autograd.py", line 450, in create_aot_dispatcher_function
aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/functorch/_src/aot_autograd.py", line 335, in aot_dispatch_autograd
compiled_fw_func = aot_config.fw_compiler(fw_module, flat_args)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/functorch/_src/aot_autograd.py", line 237, in f
out_f = compiler(fx_g, inps)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchdynamo/utils.py", line 75, in time_wrapper
r = func(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/compile_fx.py", line 177, in fw_compiler
return compile_fx_inner(
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchdynamo/debug_utils.py", line 329, in debug_wrapper
compiled_fn = compiler(gm, example_inputs, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/debug.py", line 182, in inner
return fn(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/compile_fx.py", line 60, in compile_fx_inner
compiled_fn = wrap(graph.compile_to_fn())
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/graph.py", line 333, in compile_to_fn
return self.compile_to_module().call
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchdynamo/utils.py", line 75, in time_wrapper
r = func(*args, **kwargs)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/graph.py", line 323, in compile_to_module
mod = PyCodeCache.load(code)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/codecache.py", line 162, in load
exec(code, mod.__dict__, mod.__dict__)
File "/tmp/torchinductor_dberard/s7/cs7yddkfqu2hqk2hbxty2b65q4zavo3yxmatm3yx2ie73ghiqkh7.py", line 18, in <module>
kernel0 = TritonCodeCache.load('''
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/codecache.py", line 243, in load
return PyCodeCache.load(source_code)
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/codecache.py", line 156, in load
key, path = write(source_code, "py")
File "/data/home/dberard/miniconda/envs/bench-fast/lib/python3.8/site-packages/torchinductor/codecache.py", line 46, in write
os.rename(tmp_path, path)
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/torchinductor_dberard/hb/chbtiqye637eosh5eyoh7t3ve4mviybimx2m2uudsrm6bt22x3nd.py.1571306751' -> '/tmp/torchinductor_dberard/hb/chbtiqye637eosh5eyoh7t3ve4mviybimx2m2uudsrm6bt22x3nd.py'
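For context: the FileNotFoundError above is raised from torchinductor's codecache write(), which (as the traceback shows) writes the generated Triton source to a temporary path (here suffixed ".1571306751") and then os.rename()s it into the shared cache path under /tmp/torchinductor_dberard/. One plausible explanation for the missing temp file is that several DDP ranks on the same host compile the same kernel at the same time and collide on the temp path (for example, if the suffix comes from an RNG that is seeded identically in every rank), so whichever rank renames first pulls the file out from under the others. Below is a minimal sketch of a write pattern that avoids that collision; the function name and structure are illustrative only, not the torchinductor implementation.

import os
import tempfile

def atomic_cache_write(path: str, source_code: str) -> None:
    # Sketch only: write source_code to path so that concurrent writers
    # (e.g. multiple DDP ranks on one host) cannot trip over each other.
    if os.path.exists(path):
        return  # another writer already populated this cache entry
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # mkstemp guarantees a unique temp file per call, independent of any
    # seeded RNG, so two ranks can never share a temp path.
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path), suffix=".tmp")
    try:
        with os.fdopen(fd, "w") as f:
            f.write(source_code)
        os.replace(tmp_path, path)  # atomic on POSIX; overwriting an existing file is fine
    finally:
        # Clean up the temp file if the replace did not happen (e.g. the write failed).
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)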
# This shows which jobs failed:
# the ones whose .err file is larger than 10,000 bytes failed
# (a small helper that applies this rule programmatically follows the listing below).
# A failure example from 62606_13_log.err is shown in ddp_error_cache.txt.
$ ls 62606* -l
-rw-rw-r-- 1 dberard dberard 1126 Sep 9 02:58 62606_0_log.err
-rw-rw-r-- 1 dberard dberard 28388 Sep 9 02:27 62606_0_log.out
-rw-rw-r-- 1 dberard dberard 12549 Sep 9 02:24 62606_10_log.err
-rw-rw-r-- 1 dberard dberard 25944 Sep 9 02:24 62606_10_log.out
-rw-rw-r-- 1 dberard dberard 5631 Sep 9 02:24 62606_10_result.pkl
-rw-rw-r-- 1 dberard dberard 1053 Sep 9 02:57 62606_11_log.err
-rw-rw-r-- 1 dberard dberard 24748 Sep 9 02:27 62606_11_log.out
-rw-rw-r-- 1 dberard dberard 1053 Sep 9 02:57 62606_12_log.err
-rw-rw-r-- 1 dberard dberard 27593 Sep 9 02:27 62606_12_log.out
-rw-rw-r-- 1 dberard dberard 12549 Sep 9 02:24 62606_13_log.err
-rw-rw-r-- 1 dberard dberard 23722 Sep 9 02:24 62606_13_log.out
-rw-rw-r-- 1 dberard dberard 5631 Sep 9 02:24 62606_13_result.pkl
-rw-rw-r-- 1 dberard dberard 1053 Sep 9 02:57 62606_14_log.err
-rw-rw-r-- 1 dberard dberard 26313 Sep 9 02:27 62606_14_log.out
-rw-rw-r-- 1 dberard dberard 1053 Sep 9 02:57 62606_15_log.err
-rw-rw-r-- 1 dberard dberard 24740 Sep 9 02:27 62606_15_log.out
-rw-rw-r-- 1 dberard dberard 1052 Sep 9 02:57 62606_1_log.err
-rw-rw-r-- 1 dberard dberard 23814 Sep 9 02:27 62606_1_log.out
-rw-rw-r-- 1 dberard dberard 1052 Sep 9 02:58 62606_2_log.err
-rw-rw-r-- 1 dberard dberard 26053 Sep 9 02:27 62606_2_log.out
-rw-rw-r-- 1 dberard dberard 1052 Sep 9 02:58 62606_3_log.err
-rw-rw-r-- 1 dberard dberard 25205 Sep 9 02:58 62606_3_log.out
-rw-rw-r-- 1 dberard dberard 1052 Sep 9 02:58 62606_4_log.err
-rw-rw-r-- 1 dberard dberard 27334 Sep 9 02:27 62606_4_log.out
-rw-rw-r-- 1 dberard dberard 1052 Sep 9 02:58 62606_5_log.err
-rw-rw-r-- 1 dberard dberard 23821 Sep 9 02:27 62606_5_log.out
-rw-rw-r-- 1 dberard dberard 1052 Sep 9 02:58 62606_6_log.err
-rw-rw-r-- 1 dberard dberard 26060 Sep 9 02:27 62606_6_log.out
-rw-rw-r-- 1 dberard dberard 1052 Sep 9 02:57 62606_7_log.err
-rw-rw-r-- 1 dberard dberard 24699 Sep 9 02:27 62606_7_log.out
-rw-rw-r-- 1 dberard dberard 1052 Sep 9 02:57 62606_8_log.err
-rw-rw-r-- 1 dberard dberard 27535 Sep 9 02:27 62606_8_log.out
-rw-rw-r-- 1 dberard dberard 1052 Sep 9 02:57 62606_9_log.err
-rw-rw-r-- 1 dberard dberard 23826 Sep 9 02:27 62606_9_log.out
-rw-rw-r-- 1 dberard dberard 821 Sep 9 02:20 62606_submission.sh
-rw-rw-r-- 1 dberard dberard 3477 Sep 9 02:20 62606_submitted.pkl
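# A small helper (hypothetical; it just applies the >10,000-byte rule to the
# submitit log layout shown above) to report which job array indices failed:

import glob
import os

THRESHOLD = 10_000  # bytes; the large .err files contain the full traceback above

for err in sorted(glob.glob("62606_*_log.err")):
    size = os.path.getsize(err)
    status = "FAILED" if size > THRESHOLD else "ok"
    print(f"{err}: {size} bytes -> {status}")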