diff --git a/trinity/trainer/verl/fsdp_workers.py b/trinity/trainer/verl/fsdp_workers.py index 5787bf2e6e..3d2f0b40a1 100644 --- a/trinity/trainer/verl/fsdp_workers.py +++ b/trinity/trainer/verl/fsdp_workers.py @@ -958,6 +958,12 @@ def update_actor(self, data: DataProto): "After offload actor optimizer during update_actor", logger=self.logger ) + # Release reserved GPU memory held by PyTorch's caching allocator after + # backward passes. Without this, memory_reserved grows monotonically and + # eventually starves vLLM during weight sync in colocate mode. + # Matches the pattern in megatron_workers.py update_actor(). + torch.cuda.empty_cache() + return output @register(dispatch_mode=make_nd_compute_dataproto_dispatch_fn(mesh_name="actor"))