[2024-12-19 10:59:34 TP5] Scheduler hit an exception: Traceback (most recent call last):
  File "/data/users/jerryzh/sglang/python/sglang/srt/managers/scheduler.py", line 1528, in run_scheduler_process
    scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
  File "/data/users/jerryzh/sglang/python/sglang/srt/managers/scheduler.py", line 192, in __init__
    self.tp_worker = TpWorkerClass(
  File "/data/users/jerryzh/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 62, in __init__
    self.worker = TpModelWorker(server_args, gpu_id, tp_rank, dp_rank, nccl_port)
  File "/data/users/jerryzh/sglang/python/sglang/srt/managers/tp_worker.py", line 62, in __init__
    self.model_runner = ModelRunner(
  File "/data/users/jerryzh/sglang/python/sglang/srt/model_executor/model_runner.py", line 158, in __init__
    self.load_model()
  File "/data/users/jerryzh/sglang/python/sglang/srt/model_executor/model_runner.py", line 258, in load_model
    self.model = get_model(
  File "/data/users/jerryzh/sglang/python/sglang/srt/model_loader/__init__.py", line 22, in get_model
    return loader.load_model(
  File "/data/users/jerryzh/sglang/python/sglang/srt/model_loader/loader.py", line 357, in load_model
    model = _initialize_model(
  File "/data/users/jerryzh/sglang/python/sglang/srt/model_loader/loader.py", line 138, in _initialize_model
    return model_class(
  File "/data/users/jerryzh/sglang/python/sglang/srt/models/torch_native_llama.py", line 394, in __init__
    self.model = LlamaModel(config, quant_config=quant_config)
  File "/data/users/jerryzh/sglang/python/sglang/srt/models/torch_native_llama.py", line 351, in __init__
    [
  File "/data/users/jerryzh/sglang/python/sglang/srt/models/torch_native_llama.py", line 352, in <listcomp>
    LlamaDecoderLayer(
  File "/data/users/jerryzh/sglang/python/sglang/srt/models/torch_native_llama.py", line 299, in __init__
    self.mlp = LlamaMLP(
  File "/data/users/jerryzh/sglang/python/sglang/srt/models/torch_native_llama.py", line 126, in __init__
    self.down_proj = torch.nn.Linear(intermediate_size, hidden_size, bias=False)
  File "/home/jerryzh/.conda/envs/sglang/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 106, in __init__
    torch.empty((out_features, in_features), **factory_kwargs)
  File "/home/jerryzh/.conda/envs/sglang/lib/python3.10/site-packages/torch/utils/_device.py", line 104, in __torch_function__
    return func(*args, **kwargs)
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 448.00 MiB. GPU 5 has a total capacity of 95.00 GiB of which 246.31 MiB is free. Including non-PyTorch memory, this process has 94.76 GiB memory in use. Of the allocated memory 92.29 GiB is allocated by PyTorch, and 13.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
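
The OOM is raised while allocating the down_proj weight during model load, so GPU 5 is already nearly full before serving starts. The error message itself suggests PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True; note that PyTorch reads this variable at the first CUDA allocation, so it must be set before torch touches the GPU. A minimal sketch of applying it (the launch command in the comment is an assumption, not taken from this log):

# Sketch: apply the allocator setting suggested by the OOM message.
# In practice you would export it in the shell that launches the server, e.g.:
#   PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python -m sglang.launch_server ...
# (launch command above is an assumption, not from this log)
import os

# Must be set before the first CUDA allocation, i.e. before importing torch here.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch

# Any allocation from this point on uses expandable segments, which can
# reduce fragmentation-related OOMs (it does not create more total memory).
x = torch.empty((1024, 1024), device="cuda")

Since only 13.72 MiB is reserved-but-unallocated here, fragmentation is likely not the main issue; the process is simply using ~94.76 GiB of a 95 GiB card, so freeing the GPU (stale processes) or reducing the model's per-GPU footprint is the more probable fix.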