Merged

37 commits
c93c103
[Feature] Implement update_weights_from_disk for SGLang-D (Diffusion …
dreamyang-liu Feb 5, 2026
7703c69
[diffusion] refactor: extract WeightsUpdater for update_weights_from_…
dreamyang-liu Feb 6, 2026
886a6bc
chore: isort lint
dreamyang-liu Feb 6, 2026
3d19d61
[diffusion] offload-aware weight updates and cleanup
dreamyang-liu Feb 7, 2026
d0e2fec
[diffusion] refactor: move post-training API to dedicated package and…
dreamyang-liu Feb 8, 2026
07cc679
[diffusion] refactor: extract get_updatable_modules and harden Weight…
dreamyang-liu Feb 10, 2026
c442af2
[diffusion] address comments
dreamyang-liu Feb 11, 2026
881d8b3
adds doc string to diffusion rifit test
zhaochenyang20 Feb 11, 2026
dfc93f6
[diffusion] Add /get_weights_checksum endpoint for SHA-256 weight ver…
dreamyang-liu Feb 12, 2026
68902f3
[diffusion] Add corrupted-weight rollback test for update_weights_fro…
dreamyang-liu Feb 12, 2026
719d31f
[diffusion] Parametrize weight-update tests over FLUX and Qwen model …
dreamyang-liu Feb 13, 2026
81d585b
[diffusion] Optimize weight-update tests
dreamyang-liu Feb 13, 2026
85b646c
Merge branch 'feat/diffusion-update-weights-from-disk' of github.com:…
zhaochenyang20 Feb 14, 2026
32a743b
[TODO] model weights is updated only once
zhaochenyang20 Feb 14, 2026
c35eecd
Deduplicated tests; Should clean up
zhaochenyang20 Feb 14, 2026
41148c4
clean up codes with mixin; currently spanning 16mins; too long; shoul…
zhaochenyang20 Feb 14, 2026
14e69ec
Update docstring for GetWeightsChecksumReqInput
zhaochenyang20 Feb 14, 2026
b64b185
Refine docstring for weight checksum verification
zhaochenyang20 Feb 14, 2026
d68da5c
Simplify comments in layerwise offload method
zhaochenyang20 Feb 14, 2026
d3728cb
Improve documentation for iter_materialized_weights
zhaochenyang20 Feb 14, 2026
f831d94
fix lint
zhaochenyang20 Feb 15, 2026
bd23519
Refactor update_weights_from_disk tests
dreamyang-liu Feb 15, 2026
22b32c4
Merge branch 'main' into feat/diffusion-update-weights-from-disk
zhaochenyang20 Feb 16, 2026
a9f6808
Merge branch 'feat/diffusion-update-weights-from-disk' of github.com:…
zhaochenyang20 Feb 16, 2026
a32aed9
Merge branch 'main' into feat/diffusion-update-weights-from-disk
zhaochenyang20 Feb 16, 2026
66e3c17
Merge branch 'feat/diffusion-update-weights-from-disk' of github.com:…
zhaochenyang20 Feb 16, 2026
52ba35b
new docs string
zhaochenyang20 Feb 16, 2026
c3d478e
remove one line function
zhaochenyang20 Feb 16, 2026
cde71fe
consolidate rollback tests
zhaochenyang20 Feb 16, 2026
40bba8d
finalize the test
zhaochenyang20 Feb 16, 2026
7467682
fix CI random choice
zhaochenyang20 Feb 16, 2026
3100b2f
fix paring issue
zhaochenyang20 Feb 16, 2026
fb87570
incline path finding
zhaochenyang20 Feb 17, 2026
fab939e
remove redundant comments
zhaochenyang20 Feb 17, 2026
9e030f0
Merge branch 'main' into feat/diffusion-update-weights-from-disk
zhaochenyang20 Feb 18, 2026
2b44290
Merge branch 'feat/diffusion-update-weights-from-disk' of github.com:…
zhaochenyang20 Feb 18, 2026
f60f638
fix isort
zhaochenyang20 Feb 18, 2026
23 changes: 23 additions & 0 deletions docs/advanced_features/sglang_for_rl.md
@@ -106,6 +106,29 @@ This path trades some I/O overhead for simplicity and flexibility. It integrates

**Python Engine API:** `engine.update_weights_from_disk(model_path, load_format=None)`

**Diffusion engine (SGLang-Diffusion):** The diffusion engine exposes the same `POST /update_weights_from_disk` endpoint with the following behavior:

- **All-or-nothing with rollback:** if any module fails to load, all previously updated modules are rolled back to the original weights by reloading from the original model path. No partial updates are left behind. If rollback itself fails, the exception propagates so the caller knows the model is in an inconsistent state.
- **Offload-aware:** when layerwise offload (`--dit-layerwise-offload`) is enabled, the diffusion offload manager replaces GPU parameters with small `torch.empty((1,))` placeholders while real weights live in consolidated pinned CPU buffers. A naive `param.data.copy_()` would fail with a shape mismatch. Instead, the updater dynamically detects active offload managers and writes new weights directly into their CPU buffers, bypassing the placeholders entirely. For any layer that happens to be prefetched on GPU at update time, the live GPU tensor is also updated so the change takes effect immediately. This requires no extra GPU memory and does not disturb the offload state.
- **DTensor-aware:** parameters distributed via `torch.distributed.tensor` (tensor parallelism) are updated through `distribute_tensor` so that each shard is correctly placed on the right device mesh.
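
The offload-aware dispatch described above can be sketched in plain Python. The `OffloadManagerSketch` interface here (`owns`, `cpu_buffers`, `prefetched`) is hypothetical and not the actual SGLang API; plain lists stand in for tensors so that only the routing logic is shown:

```python
class OffloadManagerSketch:
    """Stand-in for a layerwise offload manager (illustrative only)."""

    def __init__(self):
        self.cpu_buffers = {}   # name -> consolidated pinned-CPU-buffer stand-in
        self.prefetched = {}    # name -> live "GPU" copy, if currently prefetched

    def owns(self, name):
        return name in self.cpu_buffers


def route_weight_update(name, new_weight, gpu_params, manager):
    """Write a new weight where it actually lives (CPU buffer vs. GPU param)."""
    if manager is not None and manager.owns(name):
        # Offloaded layer: the GPU param is only a (1,)-shaped placeholder,
        # so write into the consolidated CPU buffer instead.
        manager.cpu_buffers[name][:] = new_weight
        if name in manager.prefetched:
            # Layer happens to be resident on GPU right now; keep the live
            # copy in sync so the update takes effect immediately.
            manager.prefetched[name][:] = new_weight
    else:
        # Ordinary (non-offloaded) parameter: plain in-place copy.
        gpu_params[name][:] = new_weight
```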

**Request body:**

| Field | Description | Defaults | Options |
| --- | --- | --- | --- |
| `model_path` | The model path with the new weights. | Required | Type: str |
| `flush_cache` | Flush TeaCache state after update. | `True` | Type: bool |
| `target_modules` | List of module names to update (e.g. `["transformer"]`). If omitted, all `nn.Module` components are updated. | `None` | Type: list[str] |

**Response body:**

| Field | Description | Defaults | Options |
| --- | --- | --- | --- |
| `success` | Whether the update succeeded. | - | Type: bool |
| `message` | Status / error message. | - | Type: str |

> **Note:** The diffusion engine (SGLang-Diffusion) does not currently support hot refit (updating weights while inference is in progress). The diffusion scheduler processes one request at a time and completes the entire inference before handling the next request, so weight updates and inference never run concurrently.
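
A minimal client sketch for the endpoint above, using only the standard library. The base URL and model path are placeholders for your deployment; only the request shape comes from the tables above:

```python
import json
import urllib.request


def build_update_request(model_path, flush_cache=True, target_modules=None):
    """Assemble the /update_weights_from_disk request body."""
    body = {"model_path": model_path, "flush_cache": flush_cache}
    if target_modules is not None:
        body["target_modules"] = target_modules
    return body


def post_update(base_url, body):
    """POST the body and return the parsed {"success", "message"} response."""
    req = urllib.request.Request(
        f"{base_url}/update_weights_from_disk",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)


if __name__ == "__main__":
    # Placeholder paths/URL; adjust for your server.
    body = build_update_request("/path/to/new_ckpt", target_modules=["transformer"])
    print(post_update("http://localhost:30000", body))
```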

### Update Weights from Tensor

**When to use:**
@@ -17,6 +17,7 @@
VertexGenerateReqInput,
)
from sglang.multimodal_gen.runtime.entrypoints.openai.utils import build_sampling_params
from sglang.multimodal_gen.runtime.entrypoints.post_training import weights_api
from sglang.multimodal_gen.runtime.entrypoints.utils import (
prepare_request,
save_outputs,
@@ -214,6 +215,7 @@ def create_app(server_args: ServerArgs):
app.include_router(common_api.router)
app.include_router(image_api.router)
app.include_router(video_api.router)
app.include_router(weights_api.router)

app.state.server_args = server_args
return app
@@ -0,0 +1,19 @@
"""Request/response data structures for post-training APIs."""

from dataclasses import dataclass


@dataclass
class UpdateWeightFromDiskReqInput:
"""Request to update model weights from disk for diffusion models."""

model_path: str
flush_cache: bool = True
target_modules: list[str] | None = None


@dataclass
class GetWeightsChecksumReqInput:
"""Compute SHA-256 checksum of loaded module weights for verification."""

module_names: list[str] | None = None
@@ -0,0 +1,62 @@
"""Weight update API for the diffusion engine."""

from fastapi import APIRouter, Request
from fastapi.responses import ORJSONResponse

from sglang.multimodal_gen.runtime.entrypoints.post_training.io_struct import (
GetWeightsChecksumReqInput,
UpdateWeightFromDiskReqInput,
)
from sglang.multimodal_gen.runtime.scheduler_client import async_scheduler_client

router = APIRouter()


@router.post("/update_weights_from_disk")
async def update_weights_from_disk(request: Request):
"""Update model weights from disk inplace without restarting the server."""
body = await request.json()
model_path = body.get("model_path")
if not model_path:
return ORJSONResponse(
{"success": False, "message": "model_path is required"},
status_code=400,
)

req = UpdateWeightFromDiskReqInput(
model_path=model_path,
flush_cache=body.get("flush_cache", True),
target_modules=body.get("target_modules"),
)

try:
response = await async_scheduler_client.forward(req)
except Exception as e:
return ORJSONResponse(
{"success": False, "message": str(e)},
status_code=500,
)

result = response.output
success = result.get("success", False)
message = result.get("message", "Unknown status")
return ORJSONResponse(
{"success": success, "message": message},
status_code=200 if success else 400,
)


@router.post("/get_weights_checksum")
async def get_weights_checksum(request: Request):
"""Return SHA-256 checksum of each requested module's weights."""
body = await request.json()
req = GetWeightsChecksumReqInput(
module_names=body.get("module_names"),
)

try:
response = await async_scheduler_client.forward(req)
except Exception as e:
return ORJSONResponse({"error": str(e)}, status_code=500)

return ORJSONResponse(response.output, status_code=200)
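
A hypothetical verification helper built on `/get_weights_checksum`: compare the server-reported per-module checksums against expected values computed offline from the new checkpoint. The `{module_name: hexdigest}` response shape is assumed from the handler above:

```python
def find_checksum_mismatches(expected, reported):
    """Return module names whose reported checksum differs or is missing.

    expected: {module_name: hexdigest} computed from the new checkpoint.
    reported: {module_name: hexdigest} from /get_weights_checksum.
    """
    return sorted(
        name
        for name, digest in expected.items()
        if reported.get(name) != digest
    )
```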
25 changes: 23 additions & 2 deletions python/sglang/multimodal_gen/runtime/loader/weight_utils.py
@@ -2,19 +2,20 @@

# SPDX-License-Identifier: Apache-2.0
# Adapted from vllm: https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/model_executor/model_loader/weight_utils.py
"""Utilities for downloading and initializing model weights."""
"""Utilities for downloading, loading, initializing and verifying model weights."""

import hashlib
import json
import os
import tempfile
from collections.abc import Generator
from collections.abc import Generator, Iterable
from pathlib import Path

import filelock
import huggingface_hub.constants
import torch
from safetensors.torch import safe_open
from torch.distributed.tensor import DTensor
from tqdm.auto import tqdm

try:
@@ -336,3 +337,23 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:

# If there were no matches, return the untouched param name
return name


def compute_weights_checksum(
named_params: Iterable[tuple[str, torch.Tensor]],
) -> str:
"""Compute a SHA-256 checksum for a set of (name, tensor) pairs.

Used to verify the correctness of weight refitting. After a refit,
compare the checksum of the in-GPU model weights against the checksum
of the on-disk tensors or the tensors in the training engine.
"""
hasher = hashlib.sha256()
for name, tensor in sorted(named_params, key=lambda x: x[0]):
hasher.update(name.encode())
t = tensor.detach()
# DTensor doesn't support .numpy(); extract the local tensor.
if isinstance(t, DTensor):
t = t._local_tensor
**Review comment:** Since some of the DTensors may be sharded across devices and `_local_tensor` is only the shard on the current device, do we need an all-gather or some hash-merging logic here?

**Author reply:** Good question, let me check.

hasher.update(t.cpu().contiguous().reshape(-1).view(torch.uint8).numpy().data)
return hasher.hexdigest()
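
The hashing scheme above can be mirrored with plain bytes instead of tensors: entries are sorted by name, then each name and its raw buffer are fed to a single SHA-256 hasher. This is an illustration of the order-insensitivity property, not the library function itself:

```python
import hashlib


def checksum_named_buffers(named_buffers):
    """SHA-256 over sorted (name, bytes) pairs; insensitive to input order."""
    hasher = hashlib.sha256()
    for name, buf in sorted(named_buffers, key=lambda x: x[0]):
        hasher.update(name.encode())  # bind each buffer to its name
        hasher.update(buf)            # raw byte content
    return hasher.hexdigest()
```

Because the pairs are sorted before hashing, two engines can iterate their parameters in different orders and still produce the same digest, as long as names and contents match.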