Skip to content

Commit 4584a65

Browse files
authored
Merge branch 'master' into gma/xpu_use_stock_pytorch
2 parents 9b30894 + ae21699 commit 4584a65

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+1467
-329
lines changed

.github/workflows/aws-torch-latest-full.yml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,62 @@
77
# This workflow runs:
88
# - Parallel tests with pytest-xdist (-n 8)
99
# - Sequential tests marked with @pytest.mark.sequential
10+
#
11+
# Nightly schedule: skips if no new commits since last successful run.
1012
################################################################################
1113

1214
name: aws-torch-latest-full
1315

1416
on:
17+
schedule:
18+
- cron: '0 8 * * *' # Daily at 08:00 UTC (midnight PST)
1519
workflow_dispatch:
1620

1721
concurrency:
1822
group: ${{ github.workflow }}-${{ github.ref }}
1923
cancel-in-progress: true
2024

2125
jobs:
26+
check-changes:
27+
name: Check for new commits
28+
runs-on: ubuntu-latest
29+
# Only check on schedule; workflow_dispatch always runs
30+
if: github.event_name == 'schedule'
31+
outputs:
32+
has_changes: ${{ steps.check.outputs.has_changes }}
33+
steps:
34+
- name: Check for commits since last successful run
35+
id: check
36+
env:
37+
GH_TOKEN: ${{ github.token }}
38+
run: |
39+
default_branch="${{ github.event.repository.default_branch }}"
40+
41+
# Get the HEAD SHA of the last successful run of this workflow
42+
last_sha=$(gh api \
43+
"repos/${{ github.repository }}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&branch=${default_branch}&per_page=1" \
44+
--jq '.workflow_runs[0].head_sha // empty')
45+
46+
current_sha="${{ github.sha }}"
47+
48+
if [ -z "$last_sha" ]; then
49+
echo "No previous successful run found — running tests"
50+
echo "has_changes=true" >> "$GITHUB_OUTPUT"
51+
elif [ "$last_sha" = "$current_sha" ]; then
52+
echo "No new commits since last successful run ($last_sha) — skipping"
53+
echo "has_changes=false" >> "$GITHUB_OUTPUT"
54+
else
55+
echo "New commits detected: $last_sha -> $current_sha — running tests"
56+
echo "has_changes=true" >> "$GITHUB_OUTPUT"
57+
fi
58+
2259
unit-tests:
2360
name: Unit Tests (Full)
61+
needs: [check-changes]
62+
# Run if: (a) workflow_dispatch, or (b) schedule with new commits
63+
if: |
64+
always() &&
65+
(github.event_name == 'workflow_dispatch' || needs.check-changes.outputs.has_changes == 'true')
2466
runs-on: [self-hosted, gpu-ci, gpu-l40s, l40s-4gpu, aws]
2567
timeout-minutes: 180
2668

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818

1919
* [2025/12] [DeepSpeed Core API updates: PyTorch-style backward and low-precision master states](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/core_api_update/README.md)
2020

21+
* [2025/11] [DeepSpeed ZeRO++ powers large-scale distillation training of LLMs for Recommendation Systems at LinkedIn](https://aclanthology.org/2025.emnlp-industry.119/)
22+
2123
* [2025/10] We hosted the [Ray x DeepSpeed Meetup](https://luma.com/3wctqteh) at Anyscale. We shared our most recent work on SuperOffload, ZenFlow, Muon Optimizer Support, Arctic Long Sequence Training and DeepCompile. Please find the meetup slides [here](https://docs.google.com/presentation/d/1eM3mY6oW9GYkRy1Xz0iOnbbEr5T1t0JJXOM5BKtR-Ks/edit?slide=id.g38615d6b4c2_0_87#slide=id.g38615d6b4c2_0_87).
2224

2325
* [2025/10] [SuperOffload: Unleashing the Power of Large-Scale LLM Training on Superchips](https://pytorch.org/blog/superoffload-unleashing-the-power-of-large-scale-llm-training-on-superchips/)

accelerator/cpu_accelerator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ def is_fp16_supported(self):
232232
try:
233233
if torch.ops.mkldnn._is_mkldnn_fp16_supported():
234234
return True
235-
except:
235+
except Exception:
236236
return False
237237

238238
def supported_dtypes(self):

csrc/aio/py_test/parse_aio_stats.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def extract_value(key, file):
5050
return int(v[0]) * 1024 * 1024
5151
else:
5252
return int(key[2:])
53-
except:
53+
except Exception:
5454
print(f"{file}: extract_value fails on {key}")
5555
return None
5656

csrc/cpu/comm/arm64/shm.h

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// DeepSpeed Team
5+
6+
// NOTE:
7+
// This shared-memory implementation targets AArch64 CPUs.
8+
// Minimum supported architecture is ARMv8-A with NEON (Advanced SIMD) support.
9+
// Systems without NEON are not supported.
10+
11+
#include <arm_neon.h>
12+
#include <stddef.h>
13+
#include <stdint.h>
14+
#include <cmath>
15+
16+
// 128 bits = 16 bytes -> fits 8 fp16/bf16 or 4 fp32 elements.
17+
static int vector_length_in_bytes = 16;
18+
// When widening fp16/bf16 -> fp32, 4 elements fit in one 128-bit register.
19+
// Using 8 would require two 128-bit registers, so limit to 4.
20+
static constexpr int full_precision_elements_in_fixed_vector = 4;
21+
22+
static inline float32x4_t cvt_bf16_to_fp32(const uint16x4_t input)
23+
{
24+
// Zero-extend 16-bit to 32-bit and shift left by 16 bits
25+
// BF16 has the same exponent/sign bits as FP32, just missing lower mantissa bits
26+
uint32x4_t result_32 = vshll_n_u16(input, 16);
27+
return vreinterpretq_f32_u32(result_32);
28+
}
29+
30+
static inline float32x4_t cvt_fp16_to_fp32(float16x4_t input)
31+
{
32+
// Converts 4 FP16 values to 4 FP32 values
33+
return vcvt_f32_f16(input);
34+
}
35+
36+
// While converting fp32 to fp16, before truncating lsb, it should be rounded to nearest even and
37+
// Converts 4 float32 -> 4 bfloat16 with round-to-nearest-even (RNE) and NaN handling
38+
static inline uint16x4_t cvt_fp32_to_bf16(float32x4_t src)
39+
{
40+
// Reinterpret float32 bits as uint32
41+
uint32x4_t u32 = vreinterpretq_u32_f32(src);
42+
43+
const uint32x4_t ones = vdupq_n_u32(0x1);
44+
const uint32x4_t vec_bias =
45+
vdupq_n_u32(0x7FFF); // one less than half of the dropped bits range
46+
const uint16x4_t nan_bf16 = vdup_n_u16(0xFFFF);
47+
48+
// RNE: lsb = (input >> 16) & 1
49+
uint32x4_t lsb = vandq_u32(vshrq_n_u32(u32, 16), ones);
50+
51+
// rounding_bias = 0x7FFF + lsb, lsb can be 0 or 1.
52+
uint32x4_t bias = vaddq_u32(vec_bias, lsb);
53+
54+
// input += rounding_bias
55+
u32 = vaddq_u32(u32, bias);
56+
57+
// >> 16 to get bfloat16
58+
// vshrq_n_u32 - keeps 32 bit width after shift
59+
// vshrn_n_u32 - keeps 16 bits width after shift
60+
uint16x4_t bf16 = vshrn_n_u32(u32, 16);
61+
62+
// vmvnq_u32 is bitwise NOT
63+
// NaN mask: ~(src == src) -> 1 if NaN
64+
// for normal num, ~(src == src) -> 0
65+
uint32x4_t isnan = vmvnq_u32(vceqq_f32(src, src));
66+
67+
// Select nan_bf16 if isnan (use 16-bit mask)
68+
uint16x4_t mask = vreinterpret_u16_u32(vget_low_u32(isnan));
69+
return vbsl_u16(mask, nan_bf16, bf16);
70+
}
71+
72+
// fp32 and fp16 are IEEE formats.
73+
// converting fp32 to fp16 is handled by vcvt_f16_f32 internally without arbitrarily truncating the
74+
// lsb but rounds to nearest.
75+
static inline float16x4_t cvt_fp32_to_fp16(float32x4_t input)
76+
{
77+
// Converts 4 FP32 values to 4 FP16 values with rounding
78+
return vcvt_f16_f32(input);
79+
}
80+
81+
// Reduce functions down below use vectorized algorithm, the number of bytes processed each
82+
// iteration depends on vector length. 128bit vector ==> 16 bytes. sticking to NEON 128 bit
83+
84+
void reduce_bf16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers);
85+
void reduce_fp16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers);
86+
void reduce_fp32_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers);
87+
88+
void parallel_memcpy(void* to, void* from, size_t n_bytes);
89+
90+
#define VLOAD_U8(X) vld1q_u8((uint8_t*)(X))
91+
#define VLOAD_U16(X) vld1_u16((uint16_t*)(X))
92+
#define VLOAD_F16(X) vld1_f16((float16_t*)(X))
93+
#define VLOAD_F32(X) vld1q_f32((float32_t*)(X))
94+
95+
#define VSTORE_U8(A, B) vst1q_u8((uint8_t*)(A), B)
96+
#define VSTORE_U16(A, B) vst1_u16((uint16_t*)(A), B)
97+
#define VSTORE_F16(A, B) vst1_f16((float16_t*)(A), B) // fp16 supported from armv8.2-a+fp16
98+
#define VSTORE_F32(A, B) vst1q_f32((float32_t*)(A), B)
99+
100+
#define VADD_F32(A, B) vaddq_f32(A, B)
101+
#define VADD_F32_2VL(A, B) vaddq_f32(A, B)
102+
103+
#define CVT_BF16_TO_FP32(X) cvt_bf16_to_fp32(X)
104+
#define CVT_FP16_TO_FP32(X) cvt_fp16_to_fp32(X)
105+
#define CVT_FP32_TO_BF16(X) cvt_fp32_to_bf16(X)
106+
#define CVT_FP32_TO_FP16(X) cvt_fp32_to_fp16(X)

csrc/cpu/comm/shm.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
#if defined(__riscv)
1515
#define TARGET_RISCV 1
1616
#include "riscv64/shm.h"
17+
#elif defined(__aarch64__)
18+
#define TARGET_ARM 1
19+
#include "arm64/shm.h"
1720
#else
1821
#include "x86_64/shm.h"
1922
#endif
@@ -154,7 +157,10 @@ void reduce_bf16_buffers(int start_elements, int num_elements, char* to_buffer,
154157
#if TARGET_RISCV
155158
size_t vl = __riscv_vsetvl_e16m1(num_elements);
156159
vector_length_in_bytes = vl * element_size;
157-
#else
160+
#elif TARGET_ARM
161+
const int vl = full_precision_elements_in_fixed_vector;
162+
vector_length_in_bytes = vl * element_size;
163+
#else // x86_64
158164
const int vl = vector_length_in_bytes / element_size;
159165
#endif
160166
int main_elements = num_elements - (num_elements % vl);
@@ -214,7 +220,10 @@ void reduce_fp16_buffers(int start_elements, int num_elements, char* to_buffer,
214220
#if TARGET_RISCV
215221
size_t vl = __riscv_vsetvl_e16m1(num_elements);
216222
vector_length_in_bytes = vl * element_size;
217-
#else
223+
#elif TARGET_ARM
224+
const int vl = full_precision_elements_in_fixed_vector;
225+
vector_length_in_bytes = vl * element_size;
226+
#else // x86_64
218227
const int vl = vector_length_in_bytes / element_size;
219228
#endif
220229
int main_elements = num_elements - (num_elements % vl);
@@ -274,7 +283,10 @@ void reduce_fp32_buffers(int start_elements, int num_elements, char* to_buffer,
274283
#if TARGET_RISCV
275284
size_t vl = __riscv_vsetvl_e32m1(num_elements);
276285
vector_length_in_bytes = vl * element_size;
277-
#else
286+
#elif TARGET_ARM
287+
const int vl = full_precision_elements_in_fixed_vector;
288+
vector_length_in_bytes = vl * element_size;
289+
#else // x86_64
278290
const int vl = vector_length_in_bytes / element_size;
279291
#endif
280292
int main_elements = num_elements - (num_elements % vl);

deepspeed/autotuning/autotuner.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def __init__(self, args, active_resources):
6969
try:
7070
os.makedirs(self.exps_dir, exist_ok=True)
7171
logger.info(f"Created autotuning experiments directory: {self.exps_dir}")
72-
except:
72+
except Exception:
7373
logger.error(
7474
f"Failed to create {self.exps_dir}, please check exps_dir in the autotuning config file is accessible by all the nodes in the job."
7575
)
@@ -82,7 +82,7 @@ def __init__(self, args, active_resources):
8282
try:
8383
os.makedirs(self.results_dir, exist_ok=True)
8484
logger.info(f"Created autotuning results directory: {self.results_dir}")
85-
except:
85+
except Exception:
8686
logger.error(
8787
f"Failed to create {self.results_dir}, please check results_dir in the autotuning config file is accessible by all the nodes in the job."
8888
)

deepspeed/autotuning/tuner/base_tuner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,6 @@ def tune(self, sample_size=1, n_trials=1000, early_stopping=None):
6767
)
6868
break
6969
return i
70-
except:
70+
except Exception:
7171
logger.info("Tuner Error:", sys.exc_info()[0])
7272
return i

deepspeed/comm/torch.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,11 @@ def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, asyn
167167
return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op)
168168

169169
def inference_all_reduce(self, tensor, op, group=None):
170-
if not hasattr(torch.ops, 'deepspeed') or not hasattr(torch.ops.deepspeed, 'inference_all_reduce_'):
170+
use_ds_op = hasattr(torch.ops, 'deepspeed') and hasattr(torch.ops.deepspeed, 'inference_all_reduce_')
171+
world_size = torch.distributed.get_world_size(group=group)
172+
if world_size <= 1:
173+
return tensor
174+
if not use_ds_op:
171175
op = self._reduce_op(op)
172176
return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=False)
173177
else:

deepspeed/compat.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""Compatibility functions to support wider version ranges for python and dependencies."""
2+
3+
# Copyright (c) Microsoft Corporation.
4+
# SPDX-License-Identifier: Apache-2.0
5+
6+
# DeepSpeed Team
7+
8+
from typing import Mapping, Any, Dict
9+
from inspect import ismodule
10+
try:
11+
import annotationlib # python >= 3.14
12+
except ImportError:
13+
annotationlib = None
14+
15+
# Deal with annotations in python versions >=3.14. See:
16+
# - Python 3.14 release notes: https://docs.python.org/3/whatsnew/3.14.html
17+
# Porting annotations: https://docs.python.org/3/whatsnew/3.14.html#whatsnew314-porting-annotations
18+
# - PEP649: https://peps.python.org/pep-0649/
19+
# - PEP749: https://peps.python.org/pep-0749/
20+
# Backwards compatible, applies best practices (use annotationlib) from python 3.14 onwards.
21+
22+
23+
def get_annotations_from_namespace(namespace: Mapping[str, object]) -> Dict[str, Any]:
24+
if annotationlib:
25+
annotate_func = annotationlib.get_annotate_from_class_namespace(namespace)
26+
if annotate_func is not None:
27+
return annotationlib.call_annotate_function(annotate_func, annotationlib.Format.VALUE)
28+
return namespace.get("__annotations__", {})
29+
30+
31+
def get_annotations(obj: Any) -> Dict[str, Any]:
32+
"""
33+
Retrieves annotations from a Python object.
34+
35+
In python >=3.14 this is a thin wrapper around the `annotationlib.get_annotations` function
36+
with the added convenience to automatically infer the type for non module, class, function
37+
or customly annotated objects.
38+
"""
39+
if annotationlib:
40+
has_annotations = hasattr(obj, "__annotations__") or hasattr(obj, "__annotate__")
41+
if not isinstance(obj, type) and not ismodule(obj) and not callable(obj) and not has_annotations:
42+
obj = type(obj)
43+
return annotationlib.get_annotations(obj)
44+
try:
45+
return obj.__annotations__
46+
except AttributeError:
47+
return {}

0 commit comments

Comments
 (0)