ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS vllm_gpt-oss
ARG CUDA_VERSION
ARG PYTHON_VERSION=3.12

### --- Stuff from default Dockerfile ---- ###
# The PyPA get-pip.py script is a self-contained script+zip file that provides
# both the installer script and the pip base85-encoded zip archive. This allows
# bootstrapping pip in environments where a distribution package does not exist.
#
# By parameterizing the URL of the get-pip.py installation script, we allow
# third parties to use their own copy of the script stored in a private mirror.
# We set the default value to the PyPA-owned get-pip.py script.
#
# Reference: https://pip.pypa.io/en/stable/installation/#get-pip-py
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"

# PIP supports fetching packages from custom indexes, allowing third parties
# to host packages in private mirrors. The PIP_INDEX_URL and
# PIP_EXTRA_INDEX_URL are standard PIP environment variables to override the
# default indexes. By leaving them empty by default, PIP will use its default
# indexes if the build process doesn't override them.
#
# uv uses different variables. We set them by default to the same values as
# PIP, but they can be overridden.
ARG PIP_INDEX_URL
ARG PIP_EXTRA_INDEX_URL
ARG UV_INDEX_URL=${PIP_INDEX_URL}
ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}

# PyTorch provides its own indexes for standard and nightly builds
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly

# PIP supports multiple authentication schemes, including keyring.
# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
# disabled by default, we allow third parties to use keyring authentication for
# their private Python indexes, while not changing the default behavior, which
# is no authentication.
#
# Reference: https://pip.pypa.io/en/stable/topics/authentication/#keyring-support
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

# Flag that enables bundling the built-in KV-connector dependency libs into docker images
ARG INSTALL_KV_CONNECTORS=false

# prepare basic build environment
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
### ----------------------------------------------------- ###

### --- Build instructions for GPT-OSS on Ampere --- ###
### Translated from https://github.com/vllm-project/vllm/issues/22290#issuecomment-3162301278 ###
ARG CCACHE_NOHASHDIR="true"
COPY . /tmp
WORKDIR /tmp
RUN pip install uv
RUN uv pip install --system --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128
RUN uv pip install --system "transformers[torch]"
RUN python3 use_existing_torch.py
RUN uv pip install --system -r requirements/build.txt
RUN uv pip install --system --no-build-isolation -e . -v
RUN uv pip uninstall --system triton pytorch-triton
RUN uv pip install --system triton==3.4.0 openai_harmony mcp
RUN git clone https://github.com/openai/triton.git
RUN uv pip install --system -e triton/python/triton_kernels --no-deps

# Run
ENV VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1
ENTRYPOINT ["vllm"]
I tried to build the Dockerfile with GitHub Actions, but it wasn't successful.
name: Build and Push vLLM GPT-OSS Docker Image

on:
  push:
    branches: [ main ]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      # 1️⃣ Checkout your repo so the Dockerfile exists locally
      - name: Checkout this repository
        uses: actions/checkout@v3

      # 2️⃣ Free up space for large CUDA build
      - name: Free up space
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache
          df -h

      # 3️⃣ Clone the vLLM repo
      - name: Clone vLLM repository
        run: |
          git clone https://github.com/zyongye/vllm.git
          cd vllm
          git checkout 8260948cdc379d13bf4b80d3172a03d21a983e05

      # 4️⃣ Copy your custom Dockerfile into the vllm folder
      - name: Copy custom Dockerfile
        run: cp .github/workflows/Dockerfile_gpt-oss vllm/

      # 5️⃣ Set up Docker Buildx
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # 6️⃣ Log in to Docker Hub
      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      # 7️⃣ Build and Push Docker Image
      - name: Build and Push vLLM GPT-OSS Image
        uses: docker/build-push-action@v5
        with:
          context: vllm
          file: vllm/Dockerfile_gpt-oss
          push: true
          build-args: |
            RUN_WHEEL_CHECK=false
            max_jobs=64
            nvcc_threads=2
          target: vllm_gpt-oss
          tags: nhuytan/vllm-gptoss:0.10.2
The image was built, but when I launch it, I get this error. Can you tell me what the problem is?
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] EngineCore failed to start.
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] Traceback (most recent call last):
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] File "/tmp/vllm/v1/engine/core.py", line 709, in run_engine_core
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] File "/tmp/vllm/v1/engine/core.py", line 510, in init
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] super().init(vllm_config, executor_class, log_stats,
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] File "/tmp/vllm/v1/engine/core.py", line 91, in init
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] self._initialize_kv_caches(vllm_config)
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] File "/tmp/vllm/v1/engine/core.py", line 181, in _initialize_kv_caches
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] self.model_executor.determine_available_memory())
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] File "/tmp/vllm/v1/executor/abstract.py", line 76, in determine_available_memory
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] output = self.collective_rpc("determine_available_memory")
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] File "/tmp/vllm/v1/executor/multiproc_executor.py", line 243, in collective_rpc
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] result = get_response(w, dequeue_timeout)
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] File "/tmp/vllm/v1/executor/multiproc_executor.py", line 230, in get_response
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] raise RuntimeError(
(EngineCore_0 pid=34) ERROR 08-08 15:01:03 [core.py:718] RuntimeError: Worker failed with error ''Mxfp4MoEMethod' object has no attribute 'fused_experts'', please check the stack trace above for the root cause
(EngineCore_0 pid=34) ERROR 08-08 15:01:06 [multiproc_executor.py:146] Worker proc VllmWorker-1 died unexpectedly, shutting down executor.
(EngineCore_0 pid=34) Process EngineCore_0:
(EngineCore_0 pid=34) Traceback (most recent call last):
(EngineCore_0 pid=34) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_0 pid=34) self.run()
(EngineCore_0 pid=34) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore_0 pid=34) self._target(*self._args, **self._kwargs)
(EngineCore_0 pid=34) File "/tmp/vllm/v1/engine/core.py", line 722, in run_engine_core
(EngineCore_0 pid=34) raise e
(EngineCore_0 pid=34) File "/tmp/vllm/v1/engine/core.py", line 709, in run_engine_core
(EngineCore_0 pid=34) engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_0 pid=34) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=34) File "/tmp/vllm/v1/engine/core.py", line 510, in init
(EngineCore_0 pid=34) super().init(vllm_config, executor_class, log_stats,
(EngineCore_0 pid=34) File "/tmp/vllm/v1/engine/core.py", line 91, in init
(EngineCore_0 pid=34) self._initialize_kv_caches(vllm_config)
(EngineCore_0 pid=34) File "/tmp/vllm/v1/engine/core.py", line 181, in _initialize_kv_caches
(EngineCore_0 pid=34) self.model_executor.determine_available_memory())
(EngineCore_0 pid=34) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=34) File "/tmp/vllm/v1/executor/abstract.py", line 76, in determine_available_memory
(EngineCore_0 pid=34) output = self.collective_rpc("determine_available_memory")
(EngineCore_0 pid=34) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=34) File "/tmp/vllm/v1/executor/multiproc_executor.py", line 243, in collective_rpc
(EngineCore_0 pid=34) result = get_response(w, dequeue_timeout)
(EngineCore_0 pid=34) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=34) File "/tmp/vllm/v1/executor/multiproc_executor.py", line 230, in get_response
(EngineCore_0 pid=34) raise RuntimeError(
(EngineCore_0 pid=34) RuntimeError: Worker failed with error ''Mxfp4MoEMethod' object has no attribute 'fused_experts'', please check the stack trace above for the root cause
(APIServer pid=1) Traceback (most recent call last):
(APIServer pid=1) File "", line 198, in _run_module_as_main
(APIServer pid=1) File "", line 88, in _run_code
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 1895, in
(APIServer pid=1) uvloop.run(run_server(args))
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 109, in run
(APIServer pid=1) return __asyncio.run(
(APIServer pid=1) ^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
(APIServer pid=1) return runner.run(main)
(APIServer pid=1) ^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=1) return self._loop.run_until_complete(task)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 61, in wrapper
(APIServer pid=1) return await main
(APIServer pid=1) ^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 1827, in run_server
(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 1847, in run_server_worker
(APIServer pid=1) async with build_async_engine_client(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in aenter
(APIServer pid=1) return await anext(self.gen)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 167, in build_async_engine_client
(APIServer pid=1) async with build_async_engine_client_from_engine_args(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in aenter
(APIServer pid=1) return await anext(self.gen)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 209, in build_async_engine_client_from_engine_args
(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/utils/init.py", line 1520, in inner
(APIServer pid=1) return fn(*args, **kwargs)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/v1/engine/async_llm.py", line 173, in from_vllm_config
(APIServer pid=1) return cls(
(APIServer pid=1) ^^^^
(APIServer pid=1) File "/tmp/vllm/v1/engine/async_llm.py", line 119, in init
(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/v1/engine/core_client.py", line 101, in make_async_mp_client
(APIServer pid=1) return AsyncMPClient(*client_args)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/v1/engine/core_client.py", line 733, in init
(APIServer pid=1) super().init(
(APIServer pid=1) File "/tmp/vllm/v1/engine/core_client.py", line 421, in init
(APIServer pid=1) with launch_core_engines(vllm_config, executor_class,
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in exit
(APIServer pid=1) next(self.gen)
(APIServer pid=1) File "/tmp/vllm/v1/engine/utils.py", line 697, in launch_core_engines
(APIServer pid=1) wait_for_engine_startup(
(APIServer pid=1) File "/tmp/vllm/v1/engine/utils.py", line 750, in wait_for_engine_startup
(APIServer pid=1) raise RuntimeError("Engine core initialization failed. "
(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
The image was built, but when I launch it, I get this error. Can you tell me what the problem is?
...
I think I got something like this when I tried to use a parallelism mode other than tensor parallelism. I think only TP is supported right now.
Please check the command line I posted above and see what you are doing differently.
Yes. The problem was solved after deleting the "--enable-expert-parallel" parameter.
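For reference, a tensor-parallel-only launch (without --enable-expert-parallel) looks roughly like the following sketch; the GPU count, paths, and remaining flags are illustrative and follow the README command further below:

vllm serve /models/gpt-oss-120b \
  --served-model-name GPT-OSS-120B \
  --tensor-parallel-size 4 \
  --gpu-memory-utilization 0.95 \
  --max-model-len 131072 \
  --port 8000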
But I have a new problem, maybe you know how to solve this?
(VllmWorker TP0 pid=44) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(VllmWorker TP1 pid=45) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(VllmWorker TP2 pid=46) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(VllmWorker TP3 pid=47) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(APIServer pid=1) Traceback (most recent call last):
(APIServer pid=1) File "", line 198, in _run_module_as_main
(APIServer pid=1) File "", line 88, in _run_code
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 1895, in
(APIServer pid=1) uvloop.run(run_server(args))
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 109, in run
(APIServer pid=1) return __asyncio.run(
(APIServer pid=1) ^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
(APIServer pid=1) return runner.run(main)
(APIServer pid=1) ^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=1) return self._loop.run_until_complete(task)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/init.py", line 61, in wrapper
(APIServer pid=1) return await main
(APIServer pid=1) ^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 1827, in run_server
(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 1855, in run_server_worker
(APIServer pid=1) await init_app_state(engine_client, vllm_config, app.state, args)
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/api_server.py", line 1657, in init_app_state
(APIServer pid=1) state.openai_serving_responses = OpenAIServingResponses(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/entrypoints/openai/serving_responses.py", line 130, in init
(APIServer pid=1) get_stop_tokens_for_assistant_actions())
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/entrypoints/harmony_utils.py", line 187, in get_stop_tokens_for_assistant_actions
(APIServer pid=1) return get_encoding().stop_tokens_for_assistant_actions()
(APIServer pid=1) ^^^^^^^^^^^^^^
(APIServer pid=1) File "/tmp/vllm/entrypoints/harmony_utils.py", line 37, in get_encoding
(APIServer pid=1) _harmony_encoding = load_harmony_encoding(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/openai_harmony/init.py", line 689, in load_harmony_encoding
(APIServer pid=1) inner: _PyHarmonyEncoding = _load_harmony_encoding(name)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) openai_harmony.HarmonyError: error downloading or loading vocab file: failed to download or load vocab file
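The HarmonyError at the bottom means openai_harmony could not download or load its tiktoken vocab files when the server started, which usually points to missing or blocked network access from inside the container. One possible workaround, as an untested sketch assuming the build machine does have network access, is to pre-fetch the encoding in the Dockerfile so the files are already baked into the image (load_harmony_encoding and HarmonyEncodingName are the public openai_harmony API; the exact cache location depends on the library version):

# Pre-download the harmony/tiktoken vocab files at build time so the server
# does not have to fetch them at startup.
RUN python3 -c "from openai_harmony import HarmonyEncodingName, load_harmony_encoding; load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)"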
vLLM Docker build for GPT-OSS on Ampere
Quickly hacked together from the default vLLM Dockerfile (https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile) and the specific instructions to compile vLLM for GPT-OSS on Ampere: vllm-project/vllm#22290 (comment)
Note: Maximally unoptimized Dockerfile, image will be >30GB.
Note: Special tools (Python/Browser) not included.
How to build
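The Dockerfile expects a vLLM source tree as the build context (it runs use_existing_torch.py and installs requirements/build.txt), so a local build looks roughly like what the CI workflow above does; the repo, commit, build args, and image tag here are examples rather than requirements:

git clone https://github.com/zyongye/vllm.git
cd vllm
git checkout 8260948cdc379d13bf4b80d3172a03d21a983e05
cp /path/to/Dockerfile_gpt-oss .
docker build -f Dockerfile_gpt-oss --target vllm_gpt-oss \
  --build-arg max_jobs=8 --build-arg nvcc_threads=2 \
  -t vllm-0.10.1_gpt-oss .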
How to start on 1x A100
docker run -d -e CUDA_VISIBLE_DEVICES=0 --gpus all --shm-size=16gb -p 127.0.0.1:8000:8000 --ipc=host --mount type=bind,src=/YOUR/MODEL/TOPDIR,dst=/models --name vllm_gpt-oss vllm-0.10.1_gpt-oss serve /models/gpt-oss-120b --served-model-name GPT-OSS-120B --download-dir /models --disable-log-requests --port 8000 --tensor-parallel-size 1 --gpu-memory-utilization 0.95 --max-model-len 131072 --enable-prefix-caching --enable-chunked-prefill --async-scheduling --override-generation-config '{"temperature": 1.0, "top_p": 1.0, "top_k": 0}'

Have fun!
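Once the server is up, you can sanity-check the OpenAI-compatible endpoint with a simple request; the model name matches --served-model-name above, and the request body is just an illustrative example:

curl http://127.0.0.1:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "GPT-OSS-120B", "messages": [{"role": "user", "content": "Say hello in one sentence."}], "max_tokens": 64}'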