diff --git a/Dockerfile b/Dockerfile
index 5d18f018..85c4e77b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -155,3 +155,66 @@ COPY --from=builder /app/model-runner /app/model-runner
 FROM sglang AS final-sglang
 # Copy the built binary from builder-sglang (without vLLM)
 COPY --from=builder-sglang /app/model-runner /app/model-runner
+
+# --- vLLM ROCm: builder stage ---
+# Builds upstream vLLM from source on AMD's pre-built ROCm dev image, which
+# already contains PyTorch ROCm, Triton, flash-attention, and the ROCm SDK
+# (see https://hub.docker.com/r/rocm/vllm-dev). vLLM is checked out at the
+# tagged release matching VLLM_VERSION — no fork, no custom wheels.
+FROM rocm/vllm-dev:base AS vllm-rocm-builder
+
+ARG VLLM_VERSION=0.19.1
+# Target GPU architectures officially supported by vLLM ROCm:
+# gfx90a (MI200), gfx942 (MI300), gfx1100/1101 (RDNA3 7900/7800).
+ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx1100;gfx1101"
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+
+RUN git clone --depth 1 --branch v${VLLM_VERSION} \
+    https://github.com/vllm-project/vllm.git /vllm-src
+
+WORKDIR /vllm-src
+RUN python3 -m pip install --no-cache-dir -r requirements/rocm.txt \
+    && python3 -m pip wheel --no-deps --no-build-isolation --wheel-dir=/wheels .
+
+# --- vLLM ROCm: runtime stage ---
+# Mirrors the /opt/vllm-env layout that pkg/inference/backends/vllm/vllm.go
+# expects (binary at /opt/vllm-env/bin/vllm, version file at
+# /opt/vllm-env/version). Symlinks are used instead of a real venv because
+# rocm/vllm-dev:base installs Python dependencies system-wide and recreating
+# a venv would break the PyTorch ROCm / Triton ROCm wiring.
+#
+# Note: unlike the CUDA vllm stage, this image does NOT include llama.cpp.
+# The base image is incompatible (different ROCm runtime versions), and the
+# rocm vllm image is intended as a vLLM-only artifact.
+FROM rocm/vllm-dev:base AS vllm-rocm
+
+COPY --from=vllm-rocm-builder /wheels/*.whl /tmp/
+RUN python3 -m pip install --no-cache-dir /tmp/*.whl && rm /tmp/*.whl
+
+RUN groupadd --system modelrunner \
+    && useradd --system --gid modelrunner -G video,render \
+       --create-home --home-dir /home/modelrunner modelrunner
+
+RUN mkdir -p /opt/vllm-env/bin \
+    && ln -s "$(command -v vllm)" /opt/vllm-env/bin/vllm \
+    && python3 -c "import vllm; print(vllm.__version__)" > /opt/vllm-env/version \
+    && chown -R modelrunner:modelrunner /opt/vllm-env
+
+RUN mkdir -p /var/run/model-runner /models /app \
+    && chown -R modelrunner:modelrunner /var/run/model-runner /app /models \
+    && chmod -R 755 /models
+
+USER modelrunner
+
+ENV MODEL_RUNNER_SOCK=/var/run/model-runner/model-runner.sock
+ENV MODEL_RUNNER_PORT=12434
+ENV HOME=/home/modelrunner
+ENV MODELS_PATH=/models
+
+LABEL com.docker.desktop.service="model-runner"
+
+ENTRYPOINT ["/app/model-runner"]
+
+FROM vllm-rocm AS final-vllm-rocm
+# Copy the built binary from builder
+COPY --from=builder /app/model-runner /app/model-runner
diff --git a/Makefile b/Makefile
index 7361a224..8b3dddb8 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ LLAMA_UPSTREAM_IMAGE ?= $(shell \
 	"$(LLAMA_SERVER_VERSION)" "$(LLAMA_SERVER_VARIANT)")
 DOCKER_IMAGE := docker/model-runner:latest
 DOCKER_IMAGE_VLLM := docker/model-runner:latest-vllm-cuda
+DOCKER_IMAGE_VLLM_ROCM := docker/model-runner:latest-vllm-rocm
 DOCKER_IMAGE_SGLANG := docker/model-runner:latest-sglang
 DOCKER_IMAGE_MUSA := docker/model-runner:latest-musa
 DOCKER_IMAGE_OPENVINO := docker/model-runner:latest-openvino
@@ -43,7 +44,7 @@ DOCKER_BUILD_COMMON_ARGS = \
 .PHONY: build build-cli build-dmr build-llamacpp install-cli run clean test integration-tests e2e
 .PHONY: validate validate-versions validate-all lint help
 .PHONY: docker-build docker-build-multiplatform docker-run docker-run-impl
-.PHONY: docker-build-vllm docker-run-vllm docker-build-sglang docker-run-sglang
+.PHONY: docker-build-vllm docker-run-vllm docker-build-vllm-rocm docker-run-vllm-rocm docker-build-sglang docker-run-sglang
 .PHONY: docker-build-musa docker-run-musa docker-build-openvino docker-run-openvino
 .PHONY: test-docker-ce-installation
 .PHONY: vllm-metal-build vllm-metal-install vllm-metal-dev vllm-metal-clean
@@ -194,6 +195,23 @@ docker-build-vllm:
 docker-run-vllm: docker-build-vllm
 	@$(MAKE) -s docker-run-impl DOCKER_IMAGE=$(DOCKER_IMAGE_VLLM)
 
+# Build vLLM Docker image with ROCm (AMD GPU) support.
+# Builds upstream vLLM from source on top of rocm/vllm-dev:base — this is a
+# vLLM-only image (no llama.cpp), unlike the CUDA variant. Build is heavy:
+# expect 30-60 min and ~12-15 GB final image size.
+# LLAMA_SERVER_VARIANT is not consumed by the Dockerfile stages here, but
+# setting it to "rocm" restricts DOCKER_BUILD_PLATFORMS to linux/amd64
+# (vLLM ROCm has no aarch64 support).
+docker-build-vllm-rocm:
+	@$(MAKE) docker-build \
+		DOCKER_TARGET=final-vllm-rocm \
+		DOCKER_IMAGE=$(DOCKER_IMAGE_VLLM_ROCM) \
+		LLAMA_SERVER_VARIANT=rocm
+
+# Run vLLM ROCm Docker container with TCP port access and mounted model storage
+docker-run-vllm-rocm: docker-build-vllm-rocm
+	@$(MAKE) -s docker-run-impl DOCKER_IMAGE=$(DOCKER_IMAGE_VLLM_ROCM)
+
 # Build SGLang Docker image
 docker-build-sglang:
 	@$(MAKE) docker-build \
@@ -402,8 +420,10 @@ help:
 	@echo "  docker-build               - Build Docker image for current platform"
 	@echo "  docker-build-multiplatform - Build Docker image for multiple platforms"
 	@echo "  docker-run                 - Run in Docker container with TCP port access and mounted model storage"
-	@echo "  docker-build-vllm          - Build vLLM Docker image"
-	@echo "  docker-run-vllm            - Run vLLM Docker container"
+	@echo "  docker-build-vllm          - Build vLLM Docker image (CUDA)"
+	@echo "  docker-run-vllm            - Run vLLM Docker container (CUDA)"
+	@echo "  docker-build-vllm-rocm     - Build vLLM Docker image (ROCm / AMD GPU, source build)"
+	@echo "  docker-run-vllm-rocm       - Run vLLM Docker container (ROCm / AMD GPU)"
 	@echo "  docker-build-sglang        - Build SGLang Docker image"
 	@echo "  docker-run-sglang          - Run SGLang Docker container"
 	@echo "  docker-build-musa          - Build MUSA Docker image"