From f56ccd4afa81baf53d47406d22bfee4638b08d11 Mon Sep 17 00:00:00 2001 From: Dennis Ramdass Date: Tue, 16 Jun 2026 21:57:10 -0700 Subject: [PATCH 1/6] Make PrefillDecode disaggregate via selective prefix-based PD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit serving.mode: PrefillDecode composes an EndpointPicker whose disaggregation is gated by a decider plugin. The composed config never actually disaggregated: across a full benchmark on a live GKE cluster the vLLM prefill engine handled zero requests, the decode engine served everything, and no KV cache transferred. Three defaults all conspired against it on the EPP image we run (llm-d-inference-scheduler v0.8.0, embedding gateway-api-inference-extension v1.5.0), and fixing only one is not enough: 1. prefix-based-pd-decider was declared with no parameters, so nonCachedTokens took its int zero value, which the decider treats as "disabled" — every request decode-only. 2. The decider reads a PrefixCacheMatchInfo attribute that prefix-cache-scorer no longer produces. GIE v1.5.0 split production into a separate plugin and made prepare-data default-on, so the prepareDataPlugins feature gate the v0.8.0 docs still tell you to set is unregistered and crashloops the EPP. The producer is now an explicit plugin, approx-prefix-cache-producer. 3. That producer defaults to autoTune: true, which leaves its block size 0 and never populates the attribute. Add approx-prefix-cache-producer pinned to autoTune: false, set nonCachedTokens: 16, drop the feature gate, and wire the scorers into both profiles, matching the data path the v1.5.0 binary actually uses. Verified live: long prompts now disaggregate (prefill engine's request_prefill_time counter increments and KV flows prefill->decode over NIXL) while short prompts correctly skip the prefill hop and serve decode-only. Add a regression test pinning the three load-bearing settings. Signed-off-by: Dennis Ramdass --- .../compose-model-replica/function/routing.py | 45 ++++++++++++++++++- .../tests/test_backends.py | 17 +++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/functions/compose-model-replica/function/routing.py b/functions/compose-model-replica/function/routing.py index d2a1be7f4..d3bc23972 100644 --- a/functions/compose-model-replica/function/routing.py +++ b/functions/compose-model-replica/function/routing.py @@ -34,16 +34,49 @@ # EndpointPickerConfig for the disaggregated profile. The apiVersion is the GIE # group the EPP binary registers (inference.networking.x-k8s.io/v1alpha1). +# +# The decider in disagg-profile-handler is what makes a request disaggregate: it +# runs the prefill profile, picks a prefill endpoint, and the handler sets the +# x-prefiller-host-port header the routing sidecar uses to send the prefill phase +# there (KV then flows prefill->decode over NIXL). The selective +# prefix-based-pd-decider disaggregates only when a request's uncached suffix is +# at least nonCachedTokens long, so short or cache-hot prompts skip the prefill +# hop (and its KV-transfer cost) and serve decode-only. +# +# Three things must line up or it silently never disaggregates, and the EPP +# image we run (llm-d-inference-scheduler v0.8.0, embedding +# gateway-api-inference-extension v1.5.0) makes the defaults wrong on every one: +# 1. nonCachedTokens defaults to 0, which the decider treats as "disabled" +# (always decode-only). Set it explicitly. +# 2. The decider reads a PrefixCacheMatchInfo attribute that prefix-cache-scorer +# no longer produces (GIE v1.5.0 split production into a separate plugin and +# made prepare-data default-on, so the old `prepareDataPlugins` feature gate +# the v0.8.0 docs still mention is *unregistered* and crashes the EPP). The +# producer is now an explicit plugin: approx-prefix-cache-producer. +# 3. That producer defaults to autoTune: true, which leaves its block size 0 +# and never populates the attribute. Pin autoTune: false + blockSizeTokens. +# (Verified live: with this config the prefill engine's request_prefill_time +# counter increments for long prompts and stays flat for short ones; with the +# defaults it stayed at zero for everything.) _EPP_CONFIG_YAML = """\ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: +- type: approx-prefix-cache-producer + parameters: + autoTune: false + blockSizeTokens: 16 + maxPrefixBlocksToMatch: 256 + lruCapacityPerServer: 31250 +- type: prefix-cache-scorer +- type: disagg-headers-handler +- type: queue-scorer - type: prefill-filter - type: decode-filter - type: max-score-picker -- type: prefix-cache-scorer -- type: queue-scorer - type: prefix-based-pd-decider + parameters: + nonCachedTokens: 16 - type: disagg-profile-handler parameters: deciders: @@ -53,10 +86,18 @@ plugins: - pluginRef: prefill-filter - pluginRef: max-score-picker + - pluginRef: prefix-cache-scorer + weight: 2 + - pluginRef: queue-scorer + weight: 1 - name: decode plugins: - pluginRef: decode-filter - pluginRef: max-score-picker + - pluginRef: prefix-cache-scorer + weight: 2 + - pluginRef: queue-scorer + weight: 1 """ diff --git a/functions/compose-model-replica/tests/test_backends.py b/functions/compose-model-replica/tests/test_backends.py index f625e367d..64cadc10a 100644 --- a/functions/compose-model-replica/tests/test_backends.py +++ b/functions/compose-model-replica/tests/test_backends.py @@ -711,6 +711,23 @@ def test_replaces_unified_service_with_pool_and_epp(self): self.assertEqual(pool["kind"], "InferencePool") self.assertEqual(pool["spec"]["endpointPickerRef"]["name"], "r-epp") + def test_epp_config_arms_the_pd_decider(self): + """PrefillDecode silently serves decode-only unless the PD decider is armed. + + Selective prefix-based-pd-decider needs all of: nonCachedTokens > 0 (0 = + disabled), the approx-prefix-cache-producer plugin that populates the + attribute it reads, and that producer pinned to autoTune: false (the + true default never populates). And it must NOT carry the prepareDataPlugins + feature gate, which the v0.8.0 EPP image rejects and crashloops on. + """ + cfg = self._apply()["epp-config"].spec.forProvider.manifest["data"]["pd-epp-config.yaml"] + self.assertIn("prefix-based-pd-decider", cfg) + self.assertIn("nonCachedTokens: 16", cfg) + self.assertIn("approx-prefix-cache-producer", cfg) + self.assertIn("autoTune: false", cfg) + self.assertNotIn("nonCachedTokens: 0", cfg) + self.assertNotIn("prepareDataPlugins", cfg) + def test_epp_role_watches_inferenceobjectives(self): """The picker watches InferenceObjectives (GIE x-k8s.io group); the Role must allow it.""" rules = self._apply()["epp-role"].spec.forProvider.manifest["rules"] From 57d2a282c53a886591250c48dfa2c468c010fd5a Mon Sep 17 00:00:00 2001 From: Dennis Ramdass Date: Wed, 17 Jun 2026 10:41:46 -0700 Subject: [PATCH 2/6] Derive the EPP prefix-cache block size from the engine flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The EPP's approx-prefix-cache-producer must chunk prefixes at the same KV block size the engine uses, or prefix-cache routing silently degrades (no error, just worse decisions). The config hardcoded blockSizeTokens: 16, which only works because it matches vLLM's default --block-size; a user who sets --block-size 32 (engine flags are the user's, per #137) would quietly get bad routing. Derive it best-effort from the decode engine's flags — vLLM's --block-size and SGLang's --page-size — falling back to 16 when absent or unparseable, and render it into the EPP config. Marked a HACK: peeking at user-owned engine args is the pragmatic v0.1 unblock; the durable fix is a typed/overridable knob on the serving block (#179). Signed-off-by: Dennis Ramdass --- .../compose-model-replica/function/routing.py | 65 +++++++++++++++++-- .../tests/test_backends.py | 24 +++++++ 2 files changed, 83 insertions(+), 6 deletions(-) diff --git a/functions/compose-model-replica/function/routing.py b/functions/compose-model-replica/function/routing.py index d3bc23972..3e4c70d81 100644 --- a/functions/compose-model-replica/function/routing.py +++ b/functions/compose-model-replica/function/routing.py @@ -58,14 +58,18 @@ # (Verified live: with this config the prefill engine's request_prefill_time # counter increments for long prompts and stays flat for short ones; with the # defaults it stayed at zero for everything.) -_EPP_CONFIG_YAML = """\ +# +# blockSizeTokens MUST match the engine's KV block size or prefix-cache routing +# silently degrades (#179). It's derived best-effort from the engine flags via +# _kv_block_size() (BLOCK_SIZE_TOKENS placeholder), defaulting to vLLM's 16. +_EPP_CONFIG_TEMPLATE = """\ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: - type: approx-prefix-cache-producer parameters: autoTune: false - blockSizeTokens: 16 + blockSizeTokens: BLOCK_SIZE_TOKENS maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 - type: prefix-cache-scorer @@ -100,6 +104,48 @@ weight: 1 """ +_DEFAULT_KV_BLOCK_SIZE = 16 + + +def _epp_config_yaml(block_size: int) -> str: + """Render the EPP config with the engine's KV block size.""" + return _EPP_CONFIG_TEMPLATE.replace("BLOCK_SIZE_TOKENS", str(block_size)) + + +def _kv_block_size(engine_args: list) -> int: + """HACK: best-effort read the engine's KV block size from its flags so the + EPP prefix-cache producer chunks prefixes the same way the engine does. + + Engine flags belong to the user (per #137); we peek for the common ones — + vLLM's --block-size and SGLang's --page-size — and fall back to vLLM's + default of 16. A mismatch silently degrades prefix-cache routing with no + error (#179), so deriving it beats hardcoding. The durable fix is a + typed/overridable knob on the serving block (#179); until then, this peek. + """ + args = engine_args or [] + for i, a in enumerate(args): + for flag in ("--block-size", "--page-size"): + if a == flag and i + 1 < len(args): + try: + return int(args[i + 1]) + except ValueError: + pass + elif a.startswith(flag + "="): + try: + return int(a.split("=", 1)[1]) + except ValueError: + pass + return _DEFAULT_KV_BLOCK_SIZE + + +def _engine_args(obj: k8sobjv1alpha1.Object) -> list: + """The engine container's args from a workload Object (best-effort).""" + for tmpl in _serving_pod_templates(obj.spec.forProvider.manifest): + for c in tmpl["spec"]["containers"]: + if c.get("name") == "engine": + return c.get("args", []) + return [] + def apply( composed: dict[str, k8sobjv1alpha1.Object], @@ -148,9 +194,12 @@ def _disaggregated( _label_role(out[decode_key], role="decode", app=name) _add_sidecar_to_decode(out[decode_key]) + # The EPP's prefix-cache producer must chunk prefixes at the decode engine's + # KV block size; derive it from the decode engine's flags (HACK, #179). + block_size = _kv_block_size(_engine_args(out[decode_key])) out["inference-pool"] = base.wrap_object(provider_config, _inference_pool(name)) out[base.ROUTE_KEY] = base.wrap_object(provider_config, _http_route(replica, name)) - out.update(_epp_objects(name, provider_config)) + out.update(_epp_objects(name, provider_config, block_size)) return out @@ -263,8 +312,12 @@ def _http_route(replica: v1alpha1.ModelReplica, name: str) -> dict: } -def _epp_objects(name: str, provider_config: str) -> dict[str, k8sobjv1alpha1.Object]: - """The hardcoded endpoint picker: ServiceAccount, RBAC, ConfigMap, Deployment, Service.""" +def _epp_objects(name: str, provider_config: str, block_size: int) -> dict[str, k8sobjv1alpha1.Object]: + """The endpoint picker: ServiceAccount, RBAC, ConfigMap, Deployment, Service. + + block_size is the engine's KV block size, rendered into the prefix-cache + producer so its prefix chunking matches the engine. + """ ns = base.REMOTE_NAMESPACE epp = f"{name}-epp" sa = {"apiVersion": "v1", "kind": "ServiceAccount", "metadata": {"name": epp, "namespace": ns}} @@ -299,7 +352,7 @@ def _epp_objects(name: str, provider_config: str) -> dict[str, k8sobjv1alpha1.Ob "apiVersion": "v1", "kind": "ConfigMap", "metadata": {"name": epp, "namespace": ns}, - "data": {"pd-epp-config.yaml": _EPP_CONFIG_YAML}, + "data": {"pd-epp-config.yaml": _epp_config_yaml(block_size)}, } deployment = { "apiVersion": "apps/v1", diff --git a/functions/compose-model-replica/tests/test_backends.py b/functions/compose-model-replica/tests/test_backends.py index 64cadc10a..3494e0ad9 100644 --- a/functions/compose-model-replica/tests/test_backends.py +++ b/functions/compose-model-replica/tests/test_backends.py @@ -821,5 +821,29 @@ def test_adds_service_and_route(self): self.assertNotIn("inference-pool", out) +class TestKvBlockSize(unittest.TestCase): + """The EPP prefix-cache producer's blockSizeTokens is derived best-effort + from the engine flags (#179) so it matches the engine's KV block size.""" + + def test_defaults_to_16_when_absent(self): + self.assertEqual(routing._kv_block_size([]), 16) + self.assertEqual(routing._kv_block_size(["--model=/mnt/models"]), 16) + + def test_reads_vllm_block_size(self): + self.assertEqual(routing._kv_block_size(["--block-size", "32"]), 32) + self.assertEqual(routing._kv_block_size(["--model=/m", "--block-size=8"]), 8) + + def test_reads_sglang_page_size(self): + self.assertEqual(routing._kv_block_size(["--page-size=64"]), 64) + + def test_non_integer_falls_back_to_default(self): + self.assertEqual(routing._kv_block_size(["--block-size", "auto"]), 16) + + def test_rendered_config_uses_block_size(self): + cfg = routing._epp_config_yaml(32) + self.assertIn("blockSizeTokens: 32", cfg) + self.assertNotIn("BLOCK_SIZE_TOKENS", cfg) + + if __name__ == "__main__": unittest.main() From a91541dffbd3026f01d4672f75dc1e43fc20c9e5 Mon Sep 17 00:00:00 2001 From: Dennis Ramdass Date: Wed, 17 Jun 2026 11:18:24 -0700 Subject: [PATCH 3/6] Document the NIXL engine-image prerequisite for PrefillDecode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PrefillDecode silently fails when the engine image lacks the NIXL runtime: vLLM's NixlConnector (and SGLang's PD transfer) import the `nixl` package, which the base vllm/vllm-openai image doesn't include, so disaggregated engines crashloop with "NIXL is not available". Engine images are the user's (#137), so Modelplane can't bundle it — but nothing told the user it was required. Document the prerequisite where it's relevant: the _disaggregated composition docstring, the user-facing ModelDeployment doc, and the unopinionated-deployments design. The fix is to use a kv-connector-enabled image — build vLLM with INSTALL_KV_CONNECTORS=true (nixl + lmcache + mooncake) or a pre-built one such as lmcache/vllm-openai. Signed-off-by: Dennis Ramdass --- design/unopinionated-deployments.md | 9 +++++++++ docs/content/models/model-deployment.md | 10 ++++++++++ functions/compose-model-replica/function/routing.py | 10 ++++++++++ 3 files changed, 29 insertions(+) diff --git a/design/unopinionated-deployments.md b/design/unopinionated-deployments.md index cd3d20962..d0b2135c9 100644 --- a/design/unopinionated-deployments.md +++ b/design/unopinionated-deployments.md @@ -307,6 +307,15 @@ prefill to the chosen worker; the engines themselves transfer the KV cache over their configured connector. Modelplane injects the sidecar, labels the pods as either prefill or decode, and configures the endpoint picker accordingly. +Because the engines transfer the KV cache over their connector (e.g. vLLM's +`NixlConnector`), the engine image must ship that connector's runtime — the NIXL +library. The base `vllm/vllm-openai` image does **not** include it, so a +disaggregated deployment must supply a kv-connector-enabled image (build vLLM +with `INSTALL_KV_CONNECTORS=true`, or use a pre-built one such as +`lmcache/vllm-openai`). Since the engine image and flags are the user's, this is +a deployment prerequisite Modelplane does not provide; failing it surfaces as +engines crashlooping with `NIXL is not available`. + ### Scheduling The fleet scheduler places each ModelReplica on one InferenceCluster. However diff --git a/docs/content/models/model-deployment.md b/docs/content/models/model-deployment.md index 42085ae1d..b88f94487 100644 --- a/docs/content/models/model-deployment.md +++ b/docs/content/models/model-deployment.md @@ -74,6 +74,16 @@ and long context. For small models or low traffic the KV-transfer overhead outweighs the benefit, so aggregated serving (optionally with chunked prefill) is the default. +Disaggregation requires the **engine image to provide the NIXL KV-transfer +runtime**. vLLM's `NixlConnector` (and SGLang's prefill/decode transfer) import +the `nixl` package, which the base `vllm/vllm-openai` image does **not** include +— disaggregated engines crash at startup with `NIXL is not available`. Use a +kv-connector-enabled image: build vLLM with `INSTALL_KV_CONNECTORS=true` (which +installs `nixl`, `lmcache`, and `mooncake` per vLLM's +`requirements/kv_connectors.txt`) or use a pre-built one such as +`lmcache/vllm-openai`. The engine image is yours to choose, so this is a +prerequisite Modelplane does not bundle for you. + ## Examples {{< tabs >}} diff --git a/functions/compose-model-replica/function/routing.py b/functions/compose-model-replica/function/routing.py index 3e4c70d81..4cd014fc2 100644 --- a/functions/compose-model-replica/function/routing.py +++ b/functions/compose-model-replica/function/routing.py @@ -183,6 +183,16 @@ def _disaggregated( pd-sidecar to decode, and adds the InferencePool, endpoint picker, and an HTTPRoute pointing at the pool. The engine workloads are reused as-is apart from the label/sidecar decoration. + + Engine-image prerequisite: PrefillDecode needs the engine image to ship the + NIXL runtime. vLLM's NixlConnector (and SGLang's PD transfer) import the + `nixl` package, which the base vllm/vllm-openai image does NOT include — + engines crashloop at startup with "NIXL is not available". Use a + kv-connector-enabled image: build vLLM with `INSTALL_KV_CONNECTORS=true` + (installs nixl + lmcache + mooncake, per vLLM's requirements/kv_connectors.txt) + or use a pre-built one such as lmcache/vllm-openai. Engine images are the + user's (#137), so Modelplane can't bundle this; it is a deployment + prerequisite, not something the composition provides. """ name = replica.metadata.name prefill = next(e for e in replica.spec.engines if e.phase == "Prefill") From 0975fb49b24663706ab365951bdbdf06603e805d Mon Sep 17 00:00:00 2001 From: Dennis Ramdass Date: Wed, 17 Jun 2026 11:37:44 -0700 Subject: [PATCH 4/6] Inject the NIXL KV-transfer plumbing for disaggregated engines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PrefillDecode engines need two things for cross-pod KV transfer that the ModelDeployment schema can't express: a Memory-backed /dev/shm (the container default 64Mi is too small for NIXL's shared-memory buffers) and VLLM_NIXL_SIDE_CHANNEL_HOST set to the pod IP (via fieldRef) so peer engines can reach this one's NIXL metadata channel. The engine template only allows valueFrom.secretKeyRef/configMapKeyRef (no fieldRef) and no volumes, so a user literally cannot supply them — and without them the decode engine can't fetch the prefill's KV and requests fail with a 500 and no error in the engine logs. Inject both onto every disaggregated engine, the same way the pd-sidecar is injected — infra-level and always-correct for PrefillDecode, no user input. Signed-off-by: Dennis Ramdass --- .../compose-model-replica/function/routing.py | 44 ++++++++++++++++++- .../tests/test_backends.py | 17 +++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/functions/compose-model-replica/function/routing.py b/functions/compose-model-replica/function/routing.py index 4cd014fc2..50ac99a81 100644 --- a/functions/compose-model-replica/function/routing.py +++ b/functions/compose-model-replica/function/routing.py @@ -27,6 +27,10 @@ # The pd-sidecar takes ENGINE_PORT (8000), so the decode engine listens here. _DECODE_ENGINE_PORT = 8001 +# NIXL KV-transfer plumbing injected onto every disaggregated engine. +_NIXL_SHM_VOLUME = "nixl-shm" +_NIXL_SIDE_CHANNEL_PORT = "5557" + # Selector labels shared by both engines' serving pods (the InferencePool # matchLabels) and the per-role label the picker partitions on. _LABEL_ROLE = "llm-d.ai/role" @@ -199,11 +203,17 @@ def _disaggregated( decode = next(e for e in replica.spec.engines if e.phase == "Decode") out = dict(composed) + prefill_key = base.workload_key(prefill) decode_key = base.workload_key(decode) - _label_role(out[base.workload_key(prefill)], role="prefill", app=name) + _label_role(out[prefill_key], role="prefill", app=name) _label_role(out[decode_key], role="decode", app=name) _add_sidecar_to_decode(out[decode_key]) + # Both engines need NIXL KV-transfer plumbing the ModelDeployment schema + # can't express (no fieldRef env, no volumes). Inject it for them. + _inject_nixl_plumbing(out[prefill_key]) + _inject_nixl_plumbing(out[decode_key]) + # The EPP's prefix-cache producer must chunk prefixes at the decode engine's # KV block size; derive it from the decode engine's flags (HACK, #179). block_size = _kv_block_size(_engine_args(out[decode_key])) @@ -284,6 +294,38 @@ def _add_sidecar_to_decode(obj: k8sobjv1alpha1.Object) -> None: ) +def _inject_nixl_plumbing(obj: k8sobjv1alpha1.Object) -> None: + """Add the NIXL KV-transfer plumbing every disaggregated engine needs but + that the ModelDeployment schema can't express (no fieldRef env, no volumes). + + Two pieces, both infra-level and always-correct for PrefillDecode, so we + inject them the same way we inject the sidecar rather than asking the user: + - a Memory-backed /dev/shm: vLLM's NixlConnector uses shared memory, and + the container default (64Mi) is far too small. + - VLLM_NIXL_SIDE_CHANNEL_HOST set to the pod IP (+ a fixed port) so peer + engines can reach this one's NIXL metadata channel. Without it the + engine advertises an unreachable address and cross-pod KV transfer + fails — requests get a 500 with no error in the engine logs. + """ + for tmpl in _serving_pod_templates(obj.spec.forProvider.manifest): + spec = tmpl["spec"] + volumes = spec.setdefault("volumes", []) + if not any(v.get("name") == _NIXL_SHM_VOLUME for v in volumes): + volumes.append({"name": _NIXL_SHM_VOLUME, "emptyDir": {"medium": "Memory"}}) + engine = next(c for c in spec["containers"] if c["name"] == "engine") + mounts = engine.setdefault("volumeMounts", []) + if not any(m.get("mountPath") == "/dev/shm" for m in mounts): + mounts.append({"name": _NIXL_SHM_VOLUME, "mountPath": "/dev/shm"}) + env = engine.setdefault("env", []) + existing = {e.get("name") for e in env} + if "VLLM_NIXL_SIDE_CHANNEL_HOST" not in existing: + env.append( + {"name": "VLLM_NIXL_SIDE_CHANNEL_HOST", "valueFrom": {"fieldRef": {"fieldPath": "status.podIP"}}} + ) + if "VLLM_NIXL_SIDE_CHANNEL_PORT" not in existing: + env.append({"name": "VLLM_NIXL_SIDE_CHANNEL_PORT", "value": _NIXL_SIDE_CHANNEL_PORT}) + + def _inference_pool(name: str) -> dict: return { "apiVersion": "inference.networking.k8s.io/v1", diff --git a/functions/compose-model-replica/tests/test_backends.py b/functions/compose-model-replica/tests/test_backends.py index 3494e0ad9..6f9abea39 100644 --- a/functions/compose-model-replica/tests/test_backends.py +++ b/functions/compose-model-replica/tests/test_backends.py @@ -711,6 +711,23 @@ def test_replaces_unified_service_with_pool_and_epp(self): self.assertEqual(pool["kind"], "InferencePool") self.assertEqual(pool["spec"]["endpointPickerRef"]["name"], "r-epp") + def test_injects_nixl_plumbing(self): + """Both disagg engines get the NIXL plumbing the schema can't express: + a Memory /dev/shm and VLLM_NIXL_SIDE_CHANNEL_HOST = pod IP.""" + out = self._apply() + for role in ("prefill", "decode"): + pod = self._serving_pod(out, role)["spec"] + self.assertTrue( + any(v.get("emptyDir", {}).get("medium") == "Memory" for v in pod["volumes"]), + f"{role} missing Memory /dev/shm volume", + ) + engine = next(c for c in pod["containers"] if c["name"] == "engine") + self.assertIn("/dev/shm", [m["mountPath"] for m in engine["volumeMounts"]]) + host = next((e for e in engine["env"] if e["name"] == "VLLM_NIXL_SIDE_CHANNEL_HOST"), None) + self.assertIsNotNone(host, f"{role} missing VLLM_NIXL_SIDE_CHANNEL_HOST") + self.assertEqual(host["valueFrom"]["fieldRef"]["fieldPath"], "status.podIP") + self.assertIn("VLLM_NIXL_SIDE_CHANNEL_PORT", [e["name"] for e in engine["env"]]) + def test_epp_config_arms_the_pd_decider(self): """PrefillDecode silently serves decode-only unless the PD decider is armed. From cb1902fc10b1940f3efc745007eacd80b6577565 Mon Sep 17 00:00:00 2001 From: Dennis Ramdass Date: Wed, 17 Jun 2026 11:57:58 -0700 Subject: [PATCH 5/6] Document the NIXL runtime prerequisite for disaggregation Disaggregated engines import the NIXL KV-transfer runtime through their connector (vLLM's NixlConnector, SGLang's transfer path). An image without NIXL crashes at startup with "NIXL is not available", which is easy to hit and hard to diagnose. Recent vanilla vllm/vllm-openai images ship NIXL, so the guidance is simply to pin a current tag. Note this prerequisite in the ModelDeployment guide and the design doc, and teach the docs vocabulary the NIXL/NixlConnector terms so Vale stops flagging them. Signed-off-by: Dennis Ramdass --- design/unopinionated-deployments.md | 13 ++++++------- docs/content/models/model-deployment.md | 11 ++++------- .../config/vocabularies/Modelplane/accept.txt | 2 ++ 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/design/unopinionated-deployments.md b/design/unopinionated-deployments.md index d0b2135c9..12c2469ac 100644 --- a/design/unopinionated-deployments.md +++ b/design/unopinionated-deployments.md @@ -308,13 +308,12 @@ their configured connector. Modelplane injects the sidecar, labels the pods as either prefill or decode, and configures the endpoint picker accordingly. Because the engines transfer the KV cache over their connector (e.g. vLLM's -`NixlConnector`), the engine image must ship that connector's runtime — the NIXL -library. The base `vllm/vllm-openai` image does **not** include it, so a -disaggregated deployment must supply a kv-connector-enabled image (build vLLM -with `INSTALL_KV_CONNECTORS=true`, or use a pre-built one such as -`lmcache/vllm-openai`). Since the engine image and flags are the user's, this is -a deployment prerequisite Modelplane does not provide; failing it surfaces as -engines crashlooping with `NIXL is not available`. +`NixlConnector`), the engine image must ship that connector's runtime: the NIXL +library. Recent vanilla `vllm/vllm-openai` images include it, so a disaggregated +deployment pins a current tag rather than an old one. Since the engine image and +flags are the user's, this is a deployment prerequisite Modelplane does not +provide; failing it surfaces as engines crashlooping with `NIXL is not +available`. ### Scheduling diff --git a/docs/content/models/model-deployment.md b/docs/content/models/model-deployment.md index b88f94487..e83489468 100644 --- a/docs/content/models/model-deployment.md +++ b/docs/content/models/model-deployment.md @@ -76,13 +76,10 @@ the default. Disaggregation requires the **engine image to provide the NIXL KV-transfer runtime**. vLLM's `NixlConnector` (and SGLang's prefill/decode transfer) import -the `nixl` package, which the base `vllm/vllm-openai` image does **not** include -— disaggregated engines crash at startup with `NIXL is not available`. Use a -kv-connector-enabled image: build vLLM with `INSTALL_KV_CONNECTORS=true` (which -installs `nixl`, `lmcache`, and `mooncake` per vLLM's -`requirements/kv_connectors.txt`) or use a pre-built one such as -`lmcache/vllm-openai`. The engine image is yours to choose, so this is a -prerequisite Modelplane does not bundle for you. +the `nixl` package, so disaggregated engines crash at startup with `NIXL is not +available` on an image that lacks it. Recent vanilla `vllm/vllm-openai` images +ship NIXL, so pin a current tag rather than an old one. The engine image is +yours to choose, so this is a prerequisite Modelplane does not bundle for you. ## Examples diff --git a/docs/utils/vale/styles/config/vocabularies/Modelplane/accept.txt b/docs/utils/vale/styles/config/vocabularies/Modelplane/accept.txt index 4b3fa4b12..0df50f850 100644 --- a/docs/utils/vale/styles/config/vocabularies/Modelplane/accept.txt +++ b/docs/utils/vale/styles/config/vocabularies/Modelplane/accept.txt @@ -98,6 +98,8 @@ NVIDIA Envoy Traefik MetalLB +NIXL +NixlConnector Prometheus ArgoCD FluxCD From d14dccaa45dcd71aa6f3199c056a17643f9f99cb Mon Sep 17 00:00:00 2001 From: Dennis Ramdass Date: Wed, 17 Jun 2026 12:00:55 -0700 Subject: [PATCH 6/6] Tidy disaggregated routing: share flag parsing, align NIXL docstring _kv_block_size and _decode_port each open-coded the same "--flag value" / "--flag=value" engine-arg scan. Factor it into one _flag_value helper so both read the user's flags the same way. Also bring the _disaggregated NIXL-prerequisite docstring in line with the docs: recent vanilla vllm/vllm-openai images ship the NIXL runtime, so the guidance is to pin a current tag rather than build a kv-connector image. Signed-off-by: Dennis Ramdass --- .../compose-model-replica/function/routing.py | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/functions/compose-model-replica/function/routing.py b/functions/compose-model-replica/function/routing.py index 50ac99a81..f2c29d350 100644 --- a/functions/compose-model-replica/function/routing.py +++ b/functions/compose-model-replica/function/routing.py @@ -116,29 +116,37 @@ def _epp_config_yaml(block_size: int) -> str: return _EPP_CONFIG_TEMPLATE.replace("BLOCK_SIZE_TOKENS", str(block_size)) +def _flag_value(args: list, *flags: str) -> str | None: + """Best-effort value of a `--flag value` or `--flag=value` engine arg. + + Engine flags belong to the user (per #137); callers only peek. Returns the + first match's raw string value, or None if no flag is present. + """ + for i, a in enumerate(args or []): + for flag in flags: + if a == flag and i + 1 < len(args): + return args[i + 1] + if a.startswith(flag + "="): + return a.split("=", 1)[1] + return None + + def _kv_block_size(engine_args: list) -> int: """HACK: best-effort read the engine's KV block size from its flags so the EPP prefix-cache producer chunks prefixes the same way the engine does. - Engine flags belong to the user (per #137); we peek for the common ones — - vLLM's --block-size and SGLang's --page-size — and fall back to vLLM's - default of 16. A mismatch silently degrades prefix-cache routing with no - error (#179), so deriving it beats hardcoding. The durable fix is a - typed/overridable knob on the serving block (#179); until then, this peek. + We peek for the common flags — vLLM's --block-size and SGLang's --page-size + — and fall back to vLLM's default of 16. A mismatch silently degrades + prefix-cache routing with no error (#179), so deriving it beats hardcoding. + The durable fix is a typed/overridable knob on the serving block (#179); + until then, this peek. """ - args = engine_args or [] - for i, a in enumerate(args): - for flag in ("--block-size", "--page-size"): - if a == flag and i + 1 < len(args): - try: - return int(args[i + 1]) - except ValueError: - pass - elif a.startswith(flag + "="): - try: - return int(a.split("=", 1)[1]) - except ValueError: - pass + raw = _flag_value(engine_args, "--block-size", "--page-size") + if raw is not None: + try: + return int(raw) + except ValueError: + pass return _DEFAULT_KV_BLOCK_SIZE @@ -190,13 +198,10 @@ def _disaggregated( Engine-image prerequisite: PrefillDecode needs the engine image to ship the NIXL runtime. vLLM's NixlConnector (and SGLang's PD transfer) import the - `nixl` package, which the base vllm/vllm-openai image does NOT include — - engines crashloop at startup with "NIXL is not available". Use a - kv-connector-enabled image: build vLLM with `INSTALL_KV_CONNECTORS=true` - (installs nixl + lmcache + mooncake, per vLLM's requirements/kv_connectors.txt) - or use a pre-built one such as lmcache/vllm-openai. Engine images are the - user's (#137), so Modelplane can't bundle this; it is a deployment - prerequisite, not something the composition provides. + `nixl` package, so an image without it crashloops at startup with "NIXL is + not available". Recent vanilla vllm/vllm-openai images ship NIXL, so pin a + current tag. Engine images are the user's (#137), so Modelplane can't bundle + this; it is a deployment prerequisite, not something the composition provides. """ name = replica.metadata.name prefill = next(e for e in replica.spec.engines if e.phase == "Prefill") @@ -253,13 +258,8 @@ def _decode_port(engine: dict) -> int: expects _DECODE_ENGINE_PORT. The user owns the engine flags (per #137), so we also best-effort honor an explicit --port override rather than assume one. """ - args = engine.get("args", []) - for i, a in enumerate(args): - if a.startswith("--port="): - return int(a.split("=", 1)[1]) - if a == "--port" and i + 1 < len(args): - return int(args[i + 1]) - return _DECODE_ENGINE_PORT + raw = _flag_value(engine.get("args", []), "--port") + return int(raw) if raw is not None else _DECODE_ENGINE_PORT def _add_sidecar_to_decode(obj: k8sobjv1alpha1.Object) -> None: