From 07232afb4e41b6ce03fc21a0f027dba0de35c5a3 Mon Sep 17 00:00:00 2001
From: Honglin Cao <hocao@nvidia.com>
Date: Thu, 21 May 2026 14:52:33 -0400
Subject: [PATCH 1/7] feat(sdk): support inference_v3 config_file mount

Bump platform-api-python-client to 4.10.0 so ConfigFileMount is
importable, and add a thin helper in centml/sdk/utils/config_file.py
that reads a file off disk and returns a populated ConfigFileMount.
Example create_inference.py updated to show the helper in context.

Server (platform PRs #3656 and #3667) owns all field-level validation
(64 KiB content cap, filename charset, mount_path rules); the helper
deliberately stays validation-free so the SDK does not drift when
server limits change.

Signed-off-by: Honglin Cao <hocao@nvidia.com>
---
 centml/sdk/utils/config_file.py      | 13 +++++++++
 examples/sdk/create_inference.py     |  8 ++++++
 tests/test_sdk_config_file_helper.py | 41 ++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 centml/sdk/utils/config_file.py
 create mode 100644 tests/test_sdk_config_file_helper.py

diff --git a/centml/sdk/utils/config_file.py b/centml/sdk/utils/config_file.py
new file mode 100644
index 00000000..fef42070
--- /dev/null
+++ b/centml/sdk/utils/config_file.py
@@ -0,0 +1,13 @@
+import os
+from typing import Optional
+
+from platform_api_python_client import ConfigFileMount
+
+
+# Load a file off disk into a ConfigFileMount. Field-level validation
+# (size cap, filename charset, mount_path rules) is intentionally left
+# to the API so SDK doesn't drift when server limits change.
+def load_config_file_mount(path: str, mount_path: str, filename: Optional[str] = None) -> ConfigFileMount:
+    with open(path, "r", encoding="utf-8") as f:
+        content = f.read()
+    return ConfigFileMount(filename=filename or os.path.basename(path), mount_path=mount_path, content=content)
diff --git a/examples/sdk/create_inference.py b/examples/sdk/create_inference.py
index 5bbe365d..2e5b4b40 100644
--- a/examples/sdk/create_inference.py
+++ b/examples/sdk/create_inference.py
@@ -1,6 +1,7 @@
 import centml
 from centml.sdk.api import get_centml_client
 from centml.sdk import DeploymentType, CreateInferenceV3DeploymentRequest, UserVaultType
+from centml.sdk.utils.config_file import load_config_file_mount
 
 
 def main():
@@ -22,6 +23,13 @@ def main():
             max_unavailable=0,  # Keep all pods available during updates
             healthcheck="/",
             concurrency=10,
+            # Helper reads ./nginx.conf and wraps it; pass an inline
+            # ConfigFileMount(filename=..., mount_path=..., content=...) if
+            # the content is already in memory.
+            config_file=load_config_file_mount(
+                path="./nginx.conf",
+                mount_path="/etc/nginx/conf.d/default.conf",
+            ),
         )
         response = cclient.create_inference(request)
         print("Create deployment response: ", response)
diff --git a/tests/test_sdk_config_file_helper.py b/tests/test_sdk_config_file_helper.py
new file mode 100644
index 00000000..db9e52c4
--- /dev/null
+++ b/tests/test_sdk_config_file_helper.py
@@ -0,0 +1,41 @@
+"""Tests for centml.sdk.utils.config_file.load_config_file_mount."""
+
+import pytest
+
+from centml.sdk.utils.config_file import load_config_file_mount
+
+
+def test_default_filename_from_basename(tmp_path):
+    src = tmp_path / "nginx.conf"
+    src.write_text("server { listen 80; }\n")
+
+    mount = load_config_file_mount(str(src), "/etc/nginx/conf.d/default.conf")
+
+    assert mount.filename == "nginx.conf"
+    assert mount.mount_path == "/etc/nginx/conf.d/default.conf"
+    assert mount.content == "server { listen 80; }\n"
+
+
+def test_explicit_filename_overrides_basename(tmp_path):
+    src = tmp_path / "local.txt"
+    src.write_text("payload")
+
+    mount = load_config_file_mount(str(src), "/app/etc/remote.conf", filename="remote.conf")
+
+    assert mount.filename == "remote.conf"
+    assert mount.mount_path == "/app/etc/remote.conf"
+    assert mount.content == "payload"
+
+
+def test_utf8_multibyte_content_roundtrips(tmp_path):
+    src = tmp_path / "i18n.conf"
+    src.write_text("配置内容 = 测试\n", encoding="utf-8")
+
+    mount = load_config_file_mount(str(src), "/etc/app/i18n.conf")
+
+    assert mount.content == "配置内容 = 测试\n"
+
+
+def test_missing_file_raises_filenotfound(tmp_path):
+    with pytest.raises(FileNotFoundError):
+        load_config_file_mount(str(tmp_path / "does-not-exist.conf"), "/etc/x")

From 5b90163e3a6fa4afa87d8928a707211793fea75d Mon Sep 17 00:00:00 2001
From: Honglin Cao <hocao@nvidia.com>
Date: Thu, 21 May 2026 15:28:34 -0400
Subject: [PATCH 2/7] fix(sdk): correct mount_path semantics in config_file
 example/helper

mount_path is the parent directory inside the container; the chart
concatenates mount_path + "/" + filename for the final volumeMount target.
The original example passed the full file path as mount_path
("/etc/nginx/conf.d/default.conf"), so the computed target became
"/etc/nginx/conf.d/default.conf/nginx.conf" -- a path nested under a file
rather than a directory -- and crashed nginx with an OCI
"cannot create subdirectories in <path>: not a directory" error.

- Helper comment now states the mount_path/filename relationship.
- Example uses mount_path="/etc/nginx/conf.d", path="./default.conf"
  so the file lands at /etc/nginx/conf.d/default.conf as nginx expects.
- Unit tests use directory mount_paths to match real usage.

Discovered while running pod-level e2e smoke against local minikube.

Signed-off-by: Honglin Cao <hocao@nvidia.com>
---
 centml/sdk/utils/config_file.py      |  7 ++++---
 examples/sdk/create_inference.py     | 12 +++++++-----
 tests/test_sdk_config_file_helper.py | 10 +++++-----
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/centml/sdk/utils/config_file.py b/centml/sdk/utils/config_file.py
index fef42070..acf531b0 100644
--- a/centml/sdk/utils/config_file.py
+++ b/centml/sdk/utils/config_file.py
@@ -4,9 +4,10 @@
 from platform_api_python_client import ConfigFileMount
 
 
-# Load a file off disk into a ConfigFileMount. Field-level validation
-# (size cap, filename charset, mount_path rules) is intentionally left
-# to the API so SDK doesn't drift when server limits change.
+# Load a file off disk into a ConfigFileMount. `mount_path` is the parent
+# directory inside the container; the file lands at `mount_path/filename`.
+# Field-level validation (size cap, filename charset, mount_path rules) is
+# left to the API so SDK doesn't drift when server limits change.
 def load_config_file_mount(path: str, mount_path: str, filename: Optional[str] = None) -> ConfigFileMount:
     with open(path, "r", encoding="utf-8") as f:
         content = f.read()
diff --git a/examples/sdk/create_inference.py b/examples/sdk/create_inference.py
index 2e5b4b40..5863d665 100644
--- a/examples/sdk/create_inference.py
+++ b/examples/sdk/create_inference.py
@@ -23,12 +23,14 @@ def main():
             max_unavailable=0,  # Keep all pods available during updates
             healthcheck="/",
             concurrency=10,
-            # Helper reads ./nginx.conf and wraps it; pass an inline
-            # ConfigFileMount(filename=..., mount_path=..., content=...) if
-            # the content is already in memory.
+            # Mounts ./default.conf at /etc/nginx/conf.d/default.conf. mount_path
+            # is the parent directory; filename defaults to os.path.basename(path)
+            # so the resulting file lands at mount_path/filename. Pass an inline
+            # ConfigFileMount(filename=..., mount_path=..., content=...) if the
+            # content is already in memory.
             config_file=load_config_file_mount(
-                path="./nginx.conf",
-                mount_path="/etc/nginx/conf.d/default.conf",
+                path="./default.conf",
+                mount_path="/etc/nginx/conf.d",
             ),
         )
         response = cclient.create_inference(request)
diff --git a/tests/test_sdk_config_file_helper.py b/tests/test_sdk_config_file_helper.py
index db9e52c4..9470a2d5 100644
--- a/tests/test_sdk_config_file_helper.py
+++ b/tests/test_sdk_config_file_helper.py
@@ -9,10 +9,10 @@ def test_default_filename_from_basename(tmp_path):
     src = tmp_path / "nginx.conf"
     src.write_text("server { listen 80; }\n")
 
-    mount = load_config_file_mount(str(src), "/etc/nginx/conf.d/default.conf")
+    mount = load_config_file_mount(str(src), "/etc/nginx/conf.d")
 
     assert mount.filename == "nginx.conf"
-    assert mount.mount_path == "/etc/nginx/conf.d/default.conf"
+    assert mount.mount_path == "/etc/nginx/conf.d"
     assert mount.content == "server { listen 80; }\n"
 
 
@@ -20,10 +20,10 @@ def test_explicit_filename_overrides_basename(tmp_path):
     src = tmp_path / "local.txt"
     src.write_text("payload")
 
-    mount = load_config_file_mount(str(src), "/app/etc/remote.conf", filename="remote.conf")
+    mount = load_config_file_mount(str(src), "/app/etc", filename="remote.conf")
 
     assert mount.filename == "remote.conf"
-    assert mount.mount_path == "/app/etc/remote.conf"
+    assert mount.mount_path == "/app/etc"
     assert mount.content == "payload"
 
 
@@ -31,7 +31,7 @@ def test_utf8_multibyte_content_roundtrips(tmp_path):
     src = tmp_path / "i18n.conf"
     src.write_text("配置内容 = 测试\n", encoding="utf-8")
 
-    mount = load_config_file_mount(str(src), "/etc/app/i18n.conf")
+    mount = load_config_file_mount(str(src), "/etc/app")
 
     assert mount.content == "配置内容 = 测试\n"
 

From aa47d7ea971b102161f6496dee56e3e7a69e030b Mon Sep 17 00:00:00 2001
From: Honglin Cao <hocao@nvidia.com>
Date: Thu, 21 May 2026 16:57:17 -0400
Subject: [PATCH 3/7] docs(examples): add vLLM inference example with
 chat_template config_file

Per review feedback on #136: add a separate vLLM example showing
config_file used to mount a chat-template Jinja file that vLLM
consumes via --chat-template. Existing nginx example stays as the
minimal config_file demo; this one shows the realistic LLM-serving
shape (vllm/vllm-openai image, command override, GPU instance).

Signed-off-by: Honglin Cao <hocao@nvidia.com>
---
 examples/sdk/create_inference_vllm.py | 50 +++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 examples/sdk/create_inference_vllm.py

diff --git a/examples/sdk/create_inference_vllm.py b/examples/sdk/create_inference_vllm.py
new file mode 100644
index 00000000..ddd277a4
--- /dev/null
+++ b/examples/sdk/create_inference_vllm.py
@@ -0,0 +1,50 @@
+import centml
+from centml.sdk.api import get_centml_client
+from centml.sdk import CreateInferenceV3DeploymentRequest
+from centml.sdk.utils.config_file import load_config_file_mount
+
+
+def main():
+    with get_centml_client() as cclient:
+        # Mounts ./chat_template.jinja at /etc/vllm/chat_template.jinja and
+        # tells vLLM to use it via --chat-template. mount_path is the parent
+        # directory; filename defaults to os.path.basename(path).
+        request = CreateInferenceV3DeploymentRequest(
+            name="vllm-llama",
+            cluster_id=1000,
+            hardware_instance_id=1001,  # GPU instance
+            image_url="vllm/vllm-openai:latest",
+            port=8000,
+            min_replicas=1,
+            max_replicas=1,
+            initial_replicas=1,
+            max_surge=1,
+            max_unavailable=0,
+            healthcheck="/health",
+            concurrency=10,
+            env_vars={"HF_TOKEN": "<your-hf-token>"},
+            command=(
+                "python -m vllm.entrypoints.openai.api_server "
+                "--model meta-llama/Llama-3.2-3B-Instruct "
+                "--port 8000 "
+                "--chat-template /etc/vllm/chat_template.jinja"
+            ),
+            config_file=load_config_file_mount(path="./chat_template.jinja", mount_path="/etc/vllm"),
+        )
+        response = cclient.create_inference(request)
+        print("Create deployment response: ", response)
+
+        deployment = cclient.get_inference(response.id)
+        print("Deployment details: ", deployment)
+
+        '''
+        ### Pause the deployment
+        cclient.pause(deployment.id)
+
+        ### Delete the deployment
+        cclient.delete(deployment.id)
+        '''
+
+
+if __name__ == "__main__":
+    main()

From 608e3c410f46a12de463ab526528b5f3bc863c56 Mon Sep 17 00:00:00 2001
From: Honglin Cao <hocao@nvidia.com>
Date: Thu, 21 May 2026 17:00:49 -0400
Subject: [PATCH 4/7] style(examples): black-format create_inference.py

scripts/format.sh formats with --skip-magic-trailing-comma, and the
load_config_file_mount call fits on one line within the 120-column limit,
so the formatter collapses the multi-line form onto a single line.

Signed-off-by: Honglin Cao <hocao@nvidia.com>
---
 examples/sdk/create_inference.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/examples/sdk/create_inference.py b/examples/sdk/create_inference.py
index 5863d665..f8126e4a 100644
--- a/examples/sdk/create_inference.py
+++ b/examples/sdk/create_inference.py
@@ -28,10 +28,7 @@ def main():
             # so the resulting file lands at mount_path/filename. Pass an inline
             # ConfigFileMount(filename=..., mount_path=..., content=...) if the
             # content is already in memory.
-            config_file=load_config_file_mount(
-                path="./default.conf",
-                mount_path="/etc/nginx/conf.d",
-            ),
+            config_file=load_config_file_mount(path="./default.conf", mount_path="/etc/nginx/conf.d"),
         )
         response = cclient.create_inference(request)
         print("Create deployment response: ", response)

From 2095ab54b3d9824947c5ec11286a6e52f5ff55d3 Mon Sep 17 00:00:00 2001
From: Honglin Cao <hocao@nvidia.com>
Date: Thu, 21 May 2026 17:22:37 -0400
Subject: [PATCH 5/7] docs(examples): drive vllm example via --config <yaml>
 (#136 review)

michaelshin's review: a config_file is most useful when it carries the
full vLLM startup config (model, dtype, gpu-memory-utilization,
speculative-config, tool-call-parser, etc.) consumed via --config,
not just a chat template.

- Switch the example command to `--config /etc/vllm/vllm_config.yaml`.
- Add sibling examples/sdk/vllm_config.yaml with the Llama-3.1-8B +
  EAGLE3 speculative-decoding setup from review feedback so the example
  is self-runnable.

Signed-off-by: Honglin Cao <hocao@nvidia.com>
---
 examples/sdk/create_inference_vllm.py | 19 +++++++++----------
 examples/sdk/vllm_config.yaml         | 23 +++++++++++++++++++++++
 2 files changed, 32 insertions(+), 10 deletions(-)
 create mode 100644 examples/sdk/vllm_config.yaml

diff --git a/examples/sdk/create_inference_vllm.py b/examples/sdk/create_inference_vllm.py
index ddd277a4..3db31ac6 100644
--- a/examples/sdk/create_inference_vllm.py
+++ b/examples/sdk/create_inference_vllm.py
@@ -6,9 +6,13 @@
 
 def main():
     with get_centml_client() as cclient:
-        # Mounts ./chat_template.jinja at /etc/vllm/chat_template.jinja and
-        # tells vLLM to use it via --chat-template. mount_path is the parent
-        # directory; filename defaults to os.path.basename(path).
+        # Mounts ./vllm_config.yaml at /etc/vllm/vllm_config.yaml and lets vLLM
+        # consume the whole config via --config. mount_path is the parent
+        # directory; filename defaults to os.path.basename(path) so the file
+        # lands at mount_path/filename. The sibling vllm_config.yaml in this
+        # directory shows a realistic Llama-3.1-8B + EAGLE3 speculative-decoding
+        # setup; edit it (model, dtype, tensor-parallel-size, speculative-config,
+        # etc.) to match the workload before deploying.
         request = CreateInferenceV3DeploymentRequest(
             name="vllm-llama",
             cluster_id=1000,
@@ -23,13 +27,8 @@ def main():
             healthcheck="/health",
             concurrency=10,
             env_vars={"HF_TOKEN": "<your-hf-token>"},
-            command=(
-                "python -m vllm.entrypoints.openai.api_server "
-                "--model meta-llama/Llama-3.2-3B-Instruct "
-                "--port 8000 "
-                "--chat-template /etc/vllm/chat_template.jinja"
-            ),
-            config_file=load_config_file_mount(path="./chat_template.jinja", mount_path="/etc/vllm"),
+            command="python -m vllm.entrypoints.openai.api_server --port 8000 --config /etc/vllm/vllm_config.yaml",
+            config_file=load_config_file_mount(path="./vllm_config.yaml", mount_path="/etc/vllm"),
         )
         response = cclient.create_inference(request)
         print("Create deployment response: ", response)
diff --git a/examples/sdk/vllm_config.yaml b/examples/sdk/vllm_config.yaml
new file mode 100644
index 00000000..89a05aaa
--- /dev/null
+++ b/examples/sdk/vllm_config.yaml
@@ -0,0 +1,23 @@
+model: meta-llama/Llama-3.1-8B-Instruct
+tokenizer: meta-llama/Llama-3.1-8B-Instruct
+runner: generate
+dtype: auto
+gpu-memory-utilization: 0.9
+max-num-seqs: 2048
+tokenizer-mode: auto
+seed: 0
+tensor-parallel-size: 1
+pipeline-parallel-size: 1
+block-size: 16
+attention-backend: FLASHINFER
+distributed-executor-backend: uni
+enable-prefix-caching: true
+enable-chunked-prefill: true
+max-num-batched-tokens: 1024
+speculative-config:
+  method: eagle3
+  model: centml/EAGLE3-Llama3.1-8B-Instruct
+  num_speculative_tokens: 3
+  draft_tensor_parallel_size: 1
+enable-auto-tool-choice: true
+tool-call-parser: llama3_json

From 82decf1dca57853035d21eccc0a72c65c8b0cb88 Mon Sep 17 00:00:00 2001
From: Honglin Cao <hocao@nvidia.com>
Date: Thu, 21 May 2026 17:51:48 -0400
Subject: [PATCH 6/7] fix(examples): vllm example uses python3 + moves port
 into yaml
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Found during dev B200 verification (Qwen2.5-0.5B variant of this shape):

- vllm/vllm-openai image has python3 only, no `python` symlink — the
  K8s container fails to start with `exec: "python": executable file
  not found in $PATH` because the platform's `command` field overrides
  the image's ENTRYPOINT entirely.
- The platform helm path passes numeric arg tokens to the Rollout spec
  as integers, and the K8s API server rejects them (`args[3] ... must
  be of type string: "integer"`). Moving `port: 8000` into the YAML
  config keeps every CLI token a non-numeric string while still letting
  vLLM pick up the port — and matches the "all config in one file"
  intent of --config.

Verified end-to-end on dev (cluster c-01-c-11-centml-org, hw x1-large-b200):
created deployment via SDK → rollout HEALTHY at t+76s → POST
/v1/chat/completions returned HTTP 200 with a real Qwen completion.

Signed-off-by: Honglin Cao <hocao@nvidia.com>
---
 examples/sdk/create_inference_vllm.py | 2 +-
 examples/sdk/vllm_config.yaml         | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/sdk/create_inference_vllm.py b/examples/sdk/create_inference_vllm.py
index 3db31ac6..33d16523 100644
--- a/examples/sdk/create_inference_vllm.py
+++ b/examples/sdk/create_inference_vllm.py
@@ -27,7 +27,7 @@ def main():
             healthcheck="/health",
             concurrency=10,
             env_vars={"HF_TOKEN": "<your-hf-token>"},
-            command="python -m vllm.entrypoints.openai.api_server --port 8000 --config /etc/vllm/vllm_config.yaml",
+            command="python3 -m vllm.entrypoints.openai.api_server --config /etc/vllm/vllm_config.yaml",
             config_file=load_config_file_mount(path="./vllm_config.yaml", mount_path="/etc/vllm"),
         )
         response = cclient.create_inference(request)
diff --git a/examples/sdk/vllm_config.yaml b/examples/sdk/vllm_config.yaml
index 89a05aaa..cddd94bd 100644
--- a/examples/sdk/vllm_config.yaml
+++ b/examples/sdk/vllm_config.yaml
@@ -1,3 +1,4 @@
+port: 8000
 model: meta-llama/Llama-3.1-8B-Instruct
 tokenizer: meta-llama/Llama-3.1-8B-Instruct
 runner: generate

From 009be9b41be2fd07069a6960977f2082fb669b17 Mon Sep 17 00:00:00 2001
From: Honglin Cao <hocao@nvidia.com>
Date: Thu, 21 May 2026 17:58:27 -0400
Subject: [PATCH 7/7] fix(sdk): address codex review feedback on #136

- helper: open with newline="" so CRLF/CR line endings reach the server
  unchanged instead of being silently normalized to \n. Adds a regression
  test (test_preserves_crlf_line_endings) for a Windows-authored config
  (b"line1\r\nline2\r\n").
- example: ship examples/sdk/default.conf so create_inference.py runs
  as-is without the user having to discover and create an extra file
  (matches how create_inference_vllm.py ships vllm_config.yaml).

Signed-off-by: Honglin Cao <hocao@nvidia.com>
---
 centml/sdk/utils/config_file.py      |  4 +++-
 examples/sdk/default.conf            |  7 +++++++
 tests/test_sdk_config_file_helper.py | 11 +++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 examples/sdk/default.conf

diff --git a/centml/sdk/utils/config_file.py b/centml/sdk/utils/config_file.py
index acf531b0..4cedc9b2 100644
--- a/centml/sdk/utils/config_file.py
+++ b/centml/sdk/utils/config_file.py
@@ -9,6 +9,8 @@
 # Field-level validation (size cap, filename charset, mount_path rules) is
 # left to the API so SDK doesn't drift when server limits change.
 def load_config_file_mount(path: str, mount_path: str, filename: Optional[str] = None) -> ConfigFileMount:
-    with open(path, "r", encoding="utf-8") as f:
+    # newline="" disables universal-newline translation so CRLF/CR line
+    # endings reach the server byte-faithful instead of being normalized to \n.
+    with open(path, "r", encoding="utf-8", newline="") as f:
         content = f.read()
     return ConfigFileMount(filename=filename or os.path.basename(path), mount_path=mount_path, content=content)
diff --git a/examples/sdk/default.conf b/examples/sdk/default.conf
new file mode 100644
index 00000000..aa75ddfb
--- /dev/null
+++ b/examples/sdk/default.conf
@@ -0,0 +1,7 @@
+server {
+    listen 8080;
+    location / {
+        return 200 "hello from config_file\n";
+        add_header Content-Type text/plain;
+    }
+}
diff --git a/tests/test_sdk_config_file_helper.py b/tests/test_sdk_config_file_helper.py
index 9470a2d5..212e94f2 100644
--- a/tests/test_sdk_config_file_helper.py
+++ b/tests/test_sdk_config_file_helper.py
@@ -39,3 +39,14 @@ def test_utf8_multibyte_content_roundtrips(tmp_path):
 def test_missing_file_raises_filenotfound(tmp_path):
     with pytest.raises(FileNotFoundError):
         load_config_file_mount(str(tmp_path / "does-not-exist.conf"), "/etc/x")
+
+
+def test_preserves_crlf_line_endings(tmp_path):
+    # Windows-authored configs use \r\n; the helper must not silently
+    # normalize them to \n when uploading to the server.
+    src = tmp_path / "windows.conf"
+    src.write_bytes(b"line1\r\nline2\r\n")
+
+    mount = load_config_file_mount(str(src), "/etc/app")
+
+    assert mount.content == "line1\r\nline2\r\n"