|
| 1 | +// Copyright (c) Microsoft Corporation. |
| 2 | +// SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +// DeepSpeed Team |
| 5 | + |
| 6 | +// NOTE: |
| 7 | +// This shared-memory implementation targets AArch64 CPUs. |
| 8 | +// Minimum supported architecture is ARMv8-A with NEON (Advanced SIMD) support. |
| 9 | +// Systems without NEON are not supported. |
| 10 | + |
| 11 | +#include <arm_neon.h> |
| 12 | +#include <stddef.h> |
| 13 | +#include <stdint.h> |
| 14 | +#include <cmath> |
| 15 | + |
// 128 bits = 16 bytes -> fits 8 fp16/bf16 or 4 fp32 elements.
// constexpr (was a mutable `static int`): this is a fixed architectural
// constant of NEON's 128-bit Q registers and must never change at runtime;
// also keeps it consistent with the sibling constant below.
static constexpr int vector_length_in_bytes = 16;
// When widening fp16/bf16 -> fp32, 4 elements fit in one 128-bit register.
// Using 8 would require two 128-bit registers, so limit to 4.
static constexpr int full_precision_elements_in_fixed_vector = 4;
| 21 | + |
// Widen 4 bf16 values (given as raw uint16 bit patterns) to 4 fp32 values.
// bf16 shares fp32's sign and exponent layout; the 16 dropped low-order
// mantissa bits are restored as zeros, so this conversion is exact.
static inline float32x4_t cvt_bf16_to_fp32(const uint16x4_t input)
{
    // VSHLL #16: zero-extend each 16-bit lane to 32 bits while shifting it
    // into the high half -- exactly where fp32 expects the bf16 bits.
    return vreinterpretq_f32_u32(vshll_n_u16(input, 16));
}
| 29 | + |
// Widen 4 FP16 lanes to 4 FP32 lanes (exact: every fp16 value is
// representable in fp32, so no rounding occurs).
static inline float32x4_t cvt_fp16_to_fp32(float16x4_t input)
{
    const float32x4_t widened = vcvt_f32_f16(input);
    return widened;
}
| 35 | + |
// Converts 4 float32 -> 4 bfloat16 with round-to-nearest-even (RNE) and NaN
// handling. Before truncating the low 16 mantissa bits, each value is rounded
// to the nearest representable bf16, ties to even.
static inline uint16x4_t cvt_fp32_to_bf16(float32x4_t src)
{
    // Reinterpret float32 bits as uint32 so rounding can be done with
    // integer arithmetic on the raw encoding.
    uint32x4_t u32 = vreinterpretq_u32_f32(src);

    const uint32x4_t ones = vdupq_n_u32(0x1);
    const uint32x4_t vec_bias =
        vdupq_n_u32(0x7FFF);  // one less than half of the dropped bits range
    const uint16x4_t nan_bf16 = vdup_n_u16(0xFFFF);

    // RNE: lsb = (input >> 16) & 1 -- the bit that becomes the bf16 LSB.
    uint32x4_t lsb = vandq_u32(vshrq_n_u32(u32, 16), ones);

    // rounding_bias = 0x7FFF + lsb; adding it rounds halfway cases toward
    // the value whose LSB is 0 (ties-to-even).
    uint32x4_t bias = vaddq_u32(vec_bias, lsb);

    // input += rounding_bias. A carry into the exponent is the correct
    // rounding result, but for NaN inputs it can corrupt the payload
    // (potentially producing Inf), hence the NaN select below.
    u32 = vaddq_u32(u32, bias);

    // Narrowing shift right by 16 keeps the upper 16 bits of each lane:
    // vshrq_n_u32 keeps 32-bit width, vshrn_n_u32 narrows to 16-bit lanes.
    uint16x4_t bf16 = vshrn_n_u32(u32, 16);

    // NaN mask: ~(src == src) is all-ones exactly in lanes holding NaN
    // (vmvnq_u32 is bitwise NOT; a normal number compares equal to itself).
    uint32x4_t isnan = vmvnq_u32(vceqq_f32(src, src));

    // BUG FIX: narrow each 32-bit mask lane to a 16-bit mask lane so mask
    // lane i corresponds to bf16 lane i. The previous
    // vreinterpret_u16_u32(vget_low_u32(isnan)) only looked at mask lanes
    // 0-1 and split their bits across four 16-bit lanes: a NaN in lane 0
    // clobbered bf16 lanes 0 AND 1, and NaNs in lanes 2-3 were missed.
    uint16x4_t mask = vmovn_u32(isnan);
    return vbsl_u16(mask, nan_bf16, bf16);
}
| 71 | + |
// fp32 and fp16 are both IEEE formats. Narrowing fp32 -> fp16 is handled by
// vcvt_f16_f32 itself, which rounds to nearest rather than arbitrarily
// truncating the low-order bits.
static inline float16x4_t cvt_fp32_to_fp16(float32x4_t input)
{
    // Narrow 4 FP32 lanes to 4 FP16 lanes with rounding.
    const float16x4_t narrowed = vcvt_f16_f32(input);
    return narrowed;
}
| 80 | + |
// Reduce functions down below use vectorized algorithm, the number of bytes processed each
// iteration depends on vector length. 128bit vector ==> 16 bytes. sticking to NEON 128 bit

// Per-dtype reduction over the half-open element range
// [start_elements, start_elements + num_elements), reading from the rank
// buffers in `buffers` and writing the result into `to_buffer`. Buffers are
// untyped char* and cast internally per dtype. NOTE(review): presumably an
// element-wise sum across ranks (all-reduce) -- confirm against the
// definitions, which are not visible in this file.
void reduce_bf16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers);
void reduce_fp16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers);
void reduce_fp32_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers);

// Copies n_bytes from `from` to `to`. NOTE(review): the name suggests the
// copy is parallelized in the implementation (definition not visible here);
// verify thread-safety expectations against the defining translation unit.
void parallel_memcpy(void* to, void* from, size_t n_bytes);
| 89 | + |
// --- Load macros: raw-pointer wrappers around NEON vld1 variants ---
// U8 loads a full 128-bit Q register (16 bytes); U16/F16 load 64-bit D
// registers (4 lanes); F32 loads a 128-bit Q register (4 lanes).
#define VLOAD_U8(X) vld1q_u8((uint8_t*)(X))
#define VLOAD_U16(X) vld1_u16((uint16_t*)(X))
#define VLOAD_F16(X) vld1_f16((float16_t*)(X))
#define VLOAD_F32(X) vld1q_f32((float32_t*)(X))

// --- Store macros: raw-pointer wrappers around NEON vst1 variants ---
#define VSTORE_U8(A, B) vst1q_u8((uint8_t*)(A), B)
#define VSTORE_U16(A, B) vst1_u16((uint16_t*)(A), B)
#define VSTORE_F16(A, B) vst1_f16((float16_t*)(A), B)  // fp16 supported from armv8.2-a+fp16
#define VSTORE_F32(A, B) vst1q_f32((float32_t*)(A), B)

// --- Arithmetic ---
// Both map to the same 128-bit add on NEON; the _2VL name exists so callers
// written against a wider-vector (two-vector-length) scheme keep compiling.
#define VADD_F32(A, B) vaddq_f32(A, B)
#define VADD_F32_2VL(A, B) vaddq_f32(A, B)

// --- Conversions: macro aliases over the inline helpers defined above ---
#define CVT_BF16_TO_FP32(X) cvt_bf16_to_fp32(X)
#define CVT_FP16_TO_FP32(X) cvt_fp16_to_fp32(X)
#define CVT_FP32_TO_BF16(X) cvt_fp32_to_bf16(X)
#define CVT_FP32_TO_FP16(X) cvt_fp32_to_fp16(X)