From 9f7ba30bff40e7881afdc1d8500dcfa0d4e05452 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 5 Nov 2025 16:18:30 -0800 Subject: [PATCH 1/6] leaf modules: explain better add Masahiro's explanation to why that code is there. --- deepspeed/runtime/zero/stage3.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index f8c146c81d2f..dc3974e1680e 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1264,6 +1264,7 @@ def make_hook(params): def reduce_leaf_module_grads(module, grad_input, grad_output): for param in params: + # this takes care of grads for MoE experts that didn't participate in the current iteration/layer if param.grad is None: param.grad = torch.zeros_like(param) self.reduce_ready_partitions_and_remove_grads(param) From bd004a387e8132e1a5336059652d0c562ef7897d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 6 Nov 2025 00:42:15 +0000 Subject: [PATCH 2/6] install uv Signed-off-by: Stas Bekman --- ci/accelerate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/accelerate.py b/ci/accelerate.py index f9fc09d75f19..c51f8307ca59 100644 --- a/ci/accelerate.py +++ b/ci/accelerate.py @@ -14,6 +14,7 @@ .from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10") .run_commands("apt update && apt install -y libaio-dev") .apt_install("git") + .run_commands("pip install uv") .run_commands("uv pip install --system --compile-bytecode datasets==3.6.0") .run_commands( "git clone https://github.com/huggingface/accelerate && \ From d123fa51ad25d9517fb6deca1384d7162d424f51 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 6 Nov 2025 00:54:08 +0000 Subject: [PATCH 3/6] force Signed-off-by: Stas Bekman --- ci/accelerate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/accelerate.py b/ci/accelerate.py index c51f8307ca59..ec6fc520434a 100644 --- a/ci/accelerate.py +++ b/ci/accelerate.py @@ -14,7 +14,7 @@ .from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10") .run_commands("apt update && apt install -y libaio-dev") .apt_install("git") - .run_commands("pip install uv") + .run_commands("pip install uv", force_build=True) .run_commands("uv pip install --system --compile-bytecode datasets==3.6.0") .run_commands( "git clone https://github.com/huggingface/accelerate && \ From c948a6d6716993afcbe16575350d64d07fbcb17d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 6 Nov 2025 02:39:20 +0000 Subject: [PATCH 4/6] force Signed-off-by: Stas Bekman --- ci/accelerate.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/accelerate.py b/ci/accelerate.py index ec6fc520434a..0defc50d29da 100644 --- a/ci/accelerate.py +++ b/ci/accelerate.py @@ -9,6 +9,10 @@ ROOT_PATH = Path(__file__).parents[1] +import os +# forces rebuild if needed +os.environ['MODAL_FORCE_BUILD'] = "1" + # yapf: disable image = (modal.Image .from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10") From ade1d67c3aab3e9efc69fc45aaefba9d32c8d015 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 6 Nov 2025 03:05:56 +0000 Subject: [PATCH 5/6] debug Signed-off-by: Stas Bekman --- .github/workflows/modal-accelerate.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/modal-accelerate.yml b/.github/workflows/modal-accelerate.yml index 342fcd4707f4..2d7ddf2dd529 100644 --- a/.github/workflows/modal-accelerate.yml +++ b/.github/workflows/modal-accelerate.yml @@ -96,4 +96,5 @@ jobs: - name: Run tests run: | + cat ci/accelerate.py modal run -m ci.accelerate From 440e0a8066000d479f0c88a35f66b843173c6564 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 6 Nov 2025 12:06:38 -0800 Subject: [PATCH 6/6] Update modal-accelerate.yml --- .github/workflows/modal-accelerate.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/modal-accelerate.yml b/.github/workflows/modal-accelerate.yml index d7996d5c8709..f188e5f3e0e2 100644 --- a/.github/workflows/modal-accelerate.yml +++ b/.github/workflows/modal-accelerate.yml @@ -101,5 +101,4 @@ jobs: - name: Run tests run: | - cat ci/accelerate.py modal run -m ci.accelerate