--max-num-seqs defaults to 128; if the server OOMs, set --max-num-seqs to a lower value.
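For example (64 is an arbitrary illustration; tune to your workload), append a lower value to the launch command below:
--max-num-seqs 64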
http://base_url:8000/v1/models
python3 -m vllm.entrypoints.openai.api_server \
--model /models/Qwen3-235B-A22B-Instruct-2507-AWQ \
--served-model-name Qwen3/Qwen3-235B-A22B-Instruct-2507-AWQ \
--enable-expert-parallel \
--swap-space 16 \
--max-model-len 16384 \
--gpu-memory-utilization 0.95 \
--tensor-parallel-size 8 \
--trust-remote-code \
--host 0.0.0.0 \
--port 8000 \
--load-format runai_streamer \
--model-loader-extra-config '{"concurrency":16}'
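Quick smoke test once the server is up (assumes the host/port and served model name from the command above):
curl http://base_url:8000/v1/models
curl http://base_url:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen3/Qwen3-235B-A22B-Instruct-2507-AWQ", "messages": [{"role": "user", "content": "Say hello."}], "max_tokens": 32}'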
vllm serve /models/Qwen3-VL-235B-A22B-Instruct \
--served-model-name Qwen/Qwen3-VL-235B-A22B-Instruct \
--tensor-parallel-size 8 \
--limit-mm-per-prompt.video 0 \
--gpu-memory-utilization 0.95 \
--host 0.0.0.0 \
--port 8000 \
--enable-expert-parallel \
--disable-custom-all-reduce \
--max-model-len 128000 \
--max-num-batched-tokens 128 \
--enable-auto-tool-choice \
--tool-call-parser hermes \
--enable-log-requests
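Since --enable-auto-tool-choice and the hermes parser are set above, a minimal tool-call request sketch (get_weather is a made-up example tool):
curl http://base_url:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen/Qwen3-VL-235B-A22B-Instruct", "messages": [{"role": "user", "content": "Weather in Paris?"}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get current weather for a city", "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}}}]}'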
http://base_url:30000/get_server_info
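e.g. pretty-printed:
curl -s http://base_url:30000/get_server_info | python3 -m json.tool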
docker buildx build --output type=image,compression=zstd . -f docker/Dockerfile --build-arg CUDA_VERSION=12.8.1 --build-arg BUILD_TYPE=blackwell -t lmsysorg/sglang:v0.5.4.post2-blackwell --no-cache
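A run sketch for the image built above (flags are typical SGLang container settings, not taken from this build; adjust model path and ports):
docker run --gpus all --ipc=host --shm-size 32g -p 30000:30000 -v /models:/models \
  lmsysorg/sglang:v0.5.4.post2-blackwell \
  python -m sglang.launch_server --model-path /models/Qwen3-32B --host 0.0.0.0 --port 30000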
https://github.com/sgl-project/sglang/actions/runs/17847201227/job/50748735339
python -m sglang.launch_server \
--model-path /mnt/external-quantized-models/models/nvidia__Llama-3.1-405B-Instruct-FP4 \
--mem-fraction-static 0.8 \
--cuda-graph-max-bs 4 \
--tp 8 \
--attention-backend triton \
--quantization modelopt_fp4 \
--model-loader-extra-config '{
  "enable_multithread_load": true,
  "num_threads": 8
}' \
--trust-remote-code
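Smoke test via SGLang's native /generate endpoint (port 30000 assumed, SGLang's default, since no --port is set above):
curl http://base_url:30000/generate \
  -H "Content-Type: application/json" \
  -d '{"text": "Hello, my name is", "sampling_params": {"max_new_tokens": 16, "temperature": 0}}'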
python -m sglang.launch_server --model-path /models/Qwen3-32B --served-model-name Qwen/Qwen3-32B --tensor-parallel-size 4 --mem-fraction-static 0.85 --port 8000 --host 0.0.0.0 --trust-remote-code --context-length 16384 --dtype bfloat16 --disable-custom-all-reduce --chunked-prefill-size 2048
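With the flags above the server listens on port 8000; quick checks (both endpoints are part of SGLang's HTTP API):
curl http://0.0.0.0:8000/health
curl http://0.0.0.0:8000/v1/models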
Image: lmsysorg/sglang:v0.5.2-cu129-b200
Error: File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 2569, in run_scheduler_process: faulthandler.enable()
Workaround: comment out the faulthandler.enable() line.
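One way to apply that workaround in a running container (a sketch: <container> is a placeholder, and the sed pattern assumes faulthandler.enable() sits on its own line):
docker exec <container> sed -i 's/^\( *\)faulthandler\.enable()/\1# faulthandler.enable()/' /sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py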
https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#installation--launch
- curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.3/install.sh | bash
- nvm install 22
- npm install -g @anthropic-ai/claude-code
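- node -v   # sanity check, should print v22.x
- claude --version   # assumes the claude binary landed on PATH from the npm install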
huggingface-cli download moonshotai/Kimi-K2-Instruct --local-dir Kimi-K2-Instruct --exclude ".git"
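Optionally enable hf_transfer for faster downloads (requires pip install hf_transfer first):
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download moonshotai/Kimi-K2-Instruct --local-dir Kimi-K2-Instruct --exclude ".git"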
https://apxml.com/tools/vram-calculator
https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html#f3
jupyter server list