Skip to content

Commit 666fb8b

Browse files
committed
[diffusion] address comments
1 parent c465ec0 commit 666fb8b

File tree

8 files changed

+57
-70
lines changed

8 files changed

+57
-70
lines changed

docs/advanced_features/sglang_for_rl.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ This path trades some I/O overhead for simplicity and flexibility. It integrates
127127
| `success` | Whether the update succeeded. | - | Type: bool |
128128
| `message` | Status / error message. | - | Type: str |
129129

130+
> **Note:** The diffusion engine (SGLang-Diffusion) does not currently support hot refit (updating weights while inference is in progress). The diffusion scheduler processes one request at a time and completes the entire inference before handling the next request, so weight updates and inference never run concurrently.
131+
130132
### Update Weights from Tensor
131133

132134
**When to use:**

python/sglang/multimodal_gen/runtime/entrypoints/post_training/utils.py renamed to python/sglang/multimodal_gen/runtime/entrypoints/post_training/io_struct.py

File renamed without changes.

python/sglang/multimodal_gen/runtime/entrypoints/post_training/weights_api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
"""Post-training APIs: weight updates and related operations."""
1+
"""Weight update API for the diffusion engine."""
22

33
from fastapi import APIRouter, Request
44
from fastapi.responses import ORJSONResponse
55

6-
from sglang.multimodal_gen.runtime.entrypoints.post_training.utils import (
6+
from sglang.multimodal_gen.runtime.entrypoints.post_training.io_struct import (
77
UpdateWeightFromDiskReqInput,
88
)
99
from sglang.multimodal_gen.runtime.scheduler_client import async_scheduler_client

python/sglang/multimodal_gen/runtime/loader/utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,18 @@ def _list_safetensors_files(model_path: str) -> list[str]:
145145
return sorted(glob.glob(os.path.join(str(model_path), "*.safetensors")))
146146

147147

148+
def find_weights_dir(local_path: str, module_name: str) -> str | None:
149+
"""Locate the safetensors directory for module_name under local_path.
150+
151+
Diffusion models store weights in per-module subdirectories (e.g.
152+
transformer/, vae/, text_encoder/).
153+
"""
154+
dir_path = os.path.join(local_path, module_name)
155+
if os.path.exists(dir_path):
156+
return dir_path
157+
return None
158+
159+
148160
def get_memory_usage_of_component(module) -> float | None:
149161
"""
150162
returned value is in GB, rounded to 2 decimal digits

python/sglang/multimodal_gen/runtime/loader/weights_updater.py

Lines changed: 26 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,23 @@
55
without restarting the server. It is the diffusion-engine counterpart of the
66
LLM engine's ModelRunner.update_weights_from_disk.
77
8-
Typical usage (from GPUWorker):
8+
Typical usage (from GPUWorker.update_weights_from_disk):
99
1010
updater = WeightsUpdater(self.pipeline)
1111
success, message = updater.update_weights_from_disk(
1212
model_path,
13-
original_model_path=self.server_args.model_path,
13+
flush_cache=flush_cache,
14+
target_modules=target_modules,
1415
)
16+
if success:
17+
self.server_args.model_path = model_path
18+
return success, message
1519
1620
Key design decisions:
1721
1822
- All-or-nothing: if any module fails to load, all previously updated
1923
modules are rolled back to the original weights by reloading from
20-
original_model_path. No partial updates are left behind.
24+
pipeline.model_path. No partial updates are left behind.
2125
2226
- Rollback failures propagate: if rollback itself fails, the exception is
2327
not caught so the caller knows the model is in an inconsistent state.
@@ -41,13 +45,15 @@
4145
from __future__ import annotations
4246

4347
import gc
44-
import os
45-
import time
4648

4749
import torch
50+
from torch.distributed.tensor import DTensor, distribute_tensor
4851

4952
from sglang.multimodal_gen.runtime.cache.teacache import TeaCacheMixin
50-
from sglang.multimodal_gen.runtime.loader.utils import _list_safetensors_files
53+
from sglang.multimodal_gen.runtime.loader.utils import (
54+
_list_safetensors_files,
55+
find_weights_dir,
56+
)
5157
from sglang.multimodal_gen.runtime.loader.weight_utils import (
5258
safetensors_weights_iterator,
5359
)
@@ -56,12 +62,6 @@
5662
from sglang.multimodal_gen.runtime.utils.layerwise_offload import OffloadableDiTMixin
5763
from sglang.multimodal_gen.runtime.utils.logging_utils import init_logger
5864

59-
try:
60-
from torch.distributed.tensor import DTensor, distribute_tensor
61-
except ImportError:
62-
DTensor = None
63-
distribute_tensor = None
64-
6565
logger = init_logger(__name__)
6666

6767

@@ -87,7 +87,8 @@ class WeightsUpdater:
8787
8888
Args:
8989
pipeline: A ComposedPipelineBase (or DiffusersPipeline) instance
90-
whose modules will be updated.
90+
whose modules will be updated. The pipeline's model_path
91+
attribute is used for rollback on failure.
9192
"""
9293

9394
def __init__(self, pipeline):
@@ -96,26 +97,21 @@ def __init__(self, pipeline):
9697
def update_weights_from_disk(
9798
self,
9899
model_path: str,
99-
original_model_path: str,
100100
flush_cache: bool = True,
101101
target_modules: list[str] | None = None,
102102
) -> tuple[bool, str]:
103103
"""Update model weights from disk without restarting the server.
104104
105105
Args:
106106
model_path: HF repo id or local path to the new weights.
107-
original_model_path: Path to the currently loaded weights (used
108-
for rollback on failure).
109107
flush_cache: If True, reset TeaCache state after a successful
110108
update so that stale cached residuals are not reused.
111109
target_modules: Explicit list of module names to update. None
112-
or ["all"] updates every nn.Module in the pipeline.
110+
updates every nn.Module in the pipeline.
113111
114112
Returns:
115-
(success, message) tuple.
113+
(success, message) tuple where success is True on success.
116114
"""
117-
tic = time.perf_counter()
118-
self._original_model_path = original_model_path
119115
logger.info(f"Updating weights from disk: {model_path}")
120116

121117
try:
@@ -161,10 +157,9 @@ def update_weights_from_disk(
161157

162158
if success and flush_cache:
163159
for _, module in modules_to_update:
164-
_reset_cache_state(module)
160+
if isinstance(module, TeaCacheMixin):
161+
module.reset_teacache_state()
165162

166-
elapsed = time.perf_counter() - tic
167-
message = f"{message} elapsed={elapsed:.2f}s"
168163
logger.info(message)
169164
return success, message
170165

@@ -182,7 +177,7 @@ def _collect_modules(
182177
"""
183178
components = get_updatable_modules(self.pipeline)
184179

185-
if target_modules is None or target_modules == ["all"]:
180+
if target_modules is None:
186181
names = list(components.keys())
187182
else:
188183
unknown = [n for n in target_modules if n not in components]
@@ -232,7 +227,7 @@ def _rollback(self, updated_modules: list[str]) -> None:
232227
"""
233228
if not updated_modules:
234229
return
235-
original_path = maybe_download_model(self._original_model_path)
230+
original_path = maybe_download_model(self.pipeline.model_path)
236231
for name in updated_modules:
237232
module = self.pipeline.get_module(name)
238233
if module is None:
@@ -249,23 +244,6 @@ def _rollback(self, updated_modules: list[str]) -> None:
249244
# ---------------------------------------------------------------------------
250245

251246

252-
def find_weights_dir(local_path: str, module_name: str) -> str | None:
253-
"""Locate the safetensors directory for module_name under local_path.
254-
255-
Diffusion models store weights in per-module subdirectories (e.g.
256-
transformer/, vae/, text_encoder/). This function tries
257-
<local_path>/<module_name>/ first, then falls back to local_path
258-
itself if it directly contains safetensors files (common for RL
259-
checkpoints that save weights in a flat directory).
260-
"""
261-
dir_path = os.path.join(local_path, module_name)
262-
if os.path.exists(dir_path):
263-
return dir_path
264-
if _list_safetensors_files(local_path):
265-
return local_path
266-
return None
267-
268-
269247
def _get_weights_iter(weights_dir: str):
270248
"""Return a (name, tensor) iterator over safetensors in weights_dir."""
271249
safetensors_files = _list_safetensors_files(weights_dir)
@@ -295,25 +273,21 @@ def _validate_weight_files(
295273
return weights_map, missing
296274

297275

298-
def _get_offload_managers(module: torch.nn.Module) -> list:
299-
"""Return active offload managers for the given module, if any."""
300-
if isinstance(module, OffloadableDiTMixin) and module.layerwise_offload_managers:
301-
return [m for m in module.layerwise_offload_managers if m.enabled]
302-
return []
303-
304-
305276
def _load_weights_into_module(module: torch.nn.Module, weights_iter) -> None:
306277
"""Load weights into a module, handling offload-managed parameters.
307278
308279
For offloaded modules, updates CPU buffers directly via
309280
update_cpu_weights(); non-offloaded parameters use in-place copy.
310281
"""
311-
offload_managers = _get_offload_managers(module)
282+
offload_managers: list = []
283+
if isinstance(module, OffloadableDiTMixin) and module.layerwise_offload_managers:
284+
offload_managers = [m for m in module.layerwise_offload_managers if m.enabled]
285+
312286
if offload_managers:
313287
weight_dict = dict(weights_iter)
314288
offloaded_names: set[str] = set()
315289
for manager in offload_managers:
316-
offloaded_names |= manager.update_cpu_weights(weight_dict)
290+
offloaded_names.update(manager.update_cpu_weights(weight_dict))
317291
remaining = ((n, w) for n, w in weight_dict.items() if n not in offloaded_names)
318292
load_weights_into_model(remaining, dict(module.named_parameters()))
319293
else:
@@ -330,7 +304,7 @@ def load_weights_into_model(weights_iter, model_params: dict) -> None:
330304
raise ValueError(
331305
f"Shape mismatch for {name}: model={param.shape}, loaded={loaded_weight.shape}"
332306
)
333-
if DTensor is not None and isinstance(param, DTensor):
307+
if isinstance(param, DTensor):
334308
distributed_weight = distribute_tensor(
335309
loaded_weight.to(param.dtype),
336310
param.device_mesh,
@@ -339,13 +313,3 @@ def load_weights_into_model(weights_iter, model_params: dict) -> None:
339313
param._local_tensor.copy_(distributed_weight._local_tensor)
340314
else:
341315
param.data.copy_(loaded_weight.to(param.dtype))
342-
343-
344-
def _reset_cache_state(module: torch.nn.Module) -> None:
345-
"""Reset Cache state after weight updates.
346-
347-
After weights change, any cached residuals from previous denoising steps
348-
are invalid and must be cleared.
349-
"""
350-
if isinstance(module, TeaCacheMixin):
351-
module.reset_teacache_state()

python/sglang/multimodal_gen/runtime/managers/gpu_worker.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,6 @@ def update_weights_from_disk(
355355
updater = WeightsUpdater(self.pipeline)
356356
success, message = updater.update_weights_from_disk(
357357
model_path,
358-
original_model_path=self.server_args.model_path,
359358
flush_cache=flush_cache,
360359
target_modules=target_modules,
361360
)

python/sglang/multimodal_gen/runtime/managers/scheduler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
_parse_size,
2121
save_image_to_path,
2222
)
23-
from sglang.multimodal_gen.runtime.entrypoints.post_training.utils import (
23+
from sglang.multimodal_gen.runtime.entrypoints.post_training.io_struct import (
2424
UpdateWeightFromDiskReqInput,
2525
)
2626
from sglang.multimodal_gen.runtime.managers.gpu_worker import GPUWorker

python/sglang/multimodal_gen/runtime/utils/layerwise_offload.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -277,11 +277,21 @@ def sync_all_layers_to_cpu(self) -> None:
277277
self.sync_layer_to_cpu(layer_idx)
278278

279279
@torch.compiler.disable
280-
def update_cpu_weights(self, weight_dict: Dict[str, torch.Tensor]) -> Set[str]:
280+
def update_cpu_weights(
281+
self, weight_dict: Dict[str, torch.Tensor]
282+
) -> Set[str] | None:
281283
"""Update consolidated CPU buffers with new weights.
282284
283-
For layers currently on GPU, the live GPU parameter is also updated
284-
so the change takes effect immediately.
285+
When layerwise offload (--dit-layerwise-offload) is enabled, the
286+
offload manager replaces GPU parameters with small torch.empty((1,))
287+
placeholders while real weights live in consolidated pinned CPU
288+
buffers. A naive param.data.copy_() would fail with a shape
289+
mismatch. Instead, this method writes new weights directly into
290+
the CPU buffers, bypassing the placeholders entirely. For any
291+
layer that happens to be resident on GPU at update time, the live
292+
GPU tensor is also updated so the change takes effect immediately.
293+
This requires no extra GPU memory and does not disturb the offload
294+
state.
285295
286296
Args:
287297
weight_dict: Mapping of parameter name to new weight tensor.
@@ -294,7 +304,7 @@ def update_cpu_weights(self, weight_dict: Dict[str, torch.Tensor]) -> Set[str]:
294304
metadata (i.e. the real shape, not the placeholder shape).
295305
"""
296306
if not self.enabled:
297-
return set()
307+
return None
298308

299309
updated_names: Set[str] = set()
300310
for name, loaded_weight in weight_dict.items():

0 commit comments

Comments (0)