Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
set -e

BUILD_DEEPEP_MODULE="ON"
BUILD_DEEPEP_OPS="ON"
BUILD_KERNELS_MODULE="ON"
BUILD_MEMORY_SAVER_MODULE="ON"

Expand All @@ -20,6 +21,11 @@ while getopts ":a:hd" opt; do
case "$OPTARG" in
deepep )
BUILD_DEEPEP_MODULE="ON"
BUILD_DEEPEP_OPS="ON"
;;
deepep2 )
BUILD_DEEPEP_MODULE="ON"
BUILD_DEEPEP_OPS="OFF"
;;
kernels )
BUILD_KERNELS_MODULE="ON"
Expand Down Expand Up @@ -120,7 +126,11 @@ function build_deepep_kernels()
if [[ "$ONLY_BUILD_DEEPEP_ADAPTER_MODULE" == "ON" ]]; then return 0; fi
if [[ "$BUILD_DEEPEP_MODULE" != "ON" ]]; then return 0; fi

KERNEL_DIR="csrc/deepep/ops"
if [[ "$BUILD_DEEPEP_OPS" == "ON" ]]; then
KERNEL_DIR="csrc/deepep/ops"
else
KERNEL_DIR="csrc/deepep/ops2"
fi
CUSTOM_OPP_DIR="${CURRENT_DIR}/python/deep_ep/deep_ep"

cd "$KERNEL_DIR" || exit
Expand All @@ -137,6 +147,7 @@ function build_deepep_kernels()
echo "find run package: $custom_opp_file"
chmod +x "$custom_opp_file"
fi
rm -rf "$CUSTOM_OPP_DIR"/vendors
./build_out/custom_opp_*.run --install-path=$CUSTOM_OPP_DIR
cd -
}
Expand Down
10 changes: 10 additions & 0 deletions csrc/deepep/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
# this is the cmakelist file for deepep build
# deepep will be built as separated wheel package

if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set(DEEPEP_ARCH "x86_64")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
set(DEEPEP_ARCH "aarch64")
else()
message(FATAL_ERROR "Unsupported host processor: ${CMAKE_SYSTEM_PROCESSOR}")
endif()

set(PROJECT_BUILD_PATH ${PROJECT_BINARY_DIR})
set(TARGET_INSTALL_DIR ${CMAKE_INSTALL_PREFIX})
set(ASCEND_HOME_PATH ${ASCEND_HOME_PATH})
Expand All @@ -27,6 +35,7 @@ target_include_directories( deep_ep_cpp PRIVATE
${TORCH_NPU_DIR}/include/third_party/acl/inc/acl
${TORCH_NPU_DIR}/include/third_party/acl/inc
${ASCEND_HOME_PATH}/include
${ASCEND_HOME_PATH}/${DEEPEP_ARCH}-linux/include/experiment/platform
)
target_link_directories(deep_ep_cpp PRIVATE
${TORCH_DIR}/lib
Expand All @@ -38,6 +47,7 @@ target_link_libraries(deep_ep_cpp PRIVATE
ascendcl
hccl
torch_npu
opapi
)

message(STATUS "TARGET_INSTALL_DIR = ${TARGET_INSTALL_DIR}")
Expand Down
79 changes: 56 additions & 23 deletions csrc/deepep/deep_ep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ Buffer::Buffer(int64_t rank, int64_t num_ranks, int64_t num_nvl_bytes, int64_t n
}

this->shared_expert_rank_num = get_value_from_env("MOE_SHARED_EXPERT_RANK_NUM", 0);

soc_version = op::GetCurrentPlatformInfo().GetSocVersion();
}

Buffer::~Buffer() noexcept(false) {}
Expand Down Expand Up @@ -440,9 +442,10 @@ Buffer::intranode_combine(const torch::Tensor &x, const torch::Tensor &topk_idx,
return {combined_x, recv_topk_weights, event};
}

std::tuple<at::Tensor, std::optional<at::Tensor>, at::Tensor, at::Tensor, at::Tensor, std::optional<EventHandle>,
std::optional<std::function<void()>>>
std::tuple<at::Tensor, std::optional<at::Tensor>, at::Tensor, at::Tensor, at::Tensor, at::Tensor,
std::optional<EventHandle>, std::optional<std::function<void()>>>
Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx,
const std::optional<torch::Tensor> &topk_weights,
const std::optional<at::Tensor> &cumulative_local_expert_recv_stats,
int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts, bool use_fp8,
bool round_scale, bool use_ue8m0, bool async, bool return_recv_hook)
Expand Down Expand Up @@ -493,25 +496,54 @@ Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx,
at::empty({num_max_tokens, hidden}, new_x.options().dtype(use_fp8 ? at::kChar : at::kBFloat16));
auto packed_recv_x_scales = at::empty({num_max_tokens}, at::dtype(at::kFloat).device(device));
auto expandIdx = at::empty({max_size}, at::dtype(at::kInt).device(device));
auto ep_recv_count = at::empty({num_local_experts * num_ranks}, at::dtype(at::kInt).device(device));
// NOTE(review): original comment was mojibake (mis-encoded Chinese); it appears
// to say the buffer sizes below differ per communication layout — confirm.
int32_t server_num = num_ranks / LOCAL_RANK_SIZE;
at::Tensor expand_scales = at::empty({1}, at::dtype(at::kFloat).device(device));
; // only needed for the A2 layered path (reallocated in that branch below)
at::Tensor ep_recv_count =
at::empty({num_local_experts * num_ranks}, at::dtype(at::kInt).device(device)); // A2 non-layered / A3
at::Tensor expert_scales =
at::ones({num_tokens, num_topk}, at::dtype(at::kFloat).device(device)); // A2 non-layered / A3
auto tp_recv_count = at::empty({1}, at::dtype(at::kInt).device(device));
auto packed_recv_count = at::empty({num_local_experts}, at::dtype(at::kLong).device(device));
auto expandScales = at::empty({1}, at::dtype(at::kFloat).device(device));
at::Tensor scales;
at::Tensor activateMask;
auto expert_scales = at::empty({1}, at::dtype(at::kFloat).device(device));
at::Tensor activate_mask;

if (soc_version == op::SocVersion::ASCEND910B) {
const char *hcclIntraPcieEnable = getenv("HCCL_INTRA_PCIE_ENABLE");
const char *hcclIntraRoceEnable = getenv("HCCL_INTRA_ROCE_ENABLE");
if (hcclIntraPcieEnable != nullptr && hcclIntraRoceEnable != nullptr && strcmp(hcclIntraPcieEnable, "1") == 0 &&
strcmp(hcclIntraRoceEnable, "0") == 0) { // A2 layered
if (topk_weights.has_value()) {
if (!this->is_padding) {
expert_scales = topk_weights.value();
} else {
std::vector<at::Tensor> weight_blocks;
if (topk_weights->size(0) != 0) {
weight_blocks.emplace_back(topk_weights.value());
}
for (int i = 0; i < this->padding_cnt; i++) {
at::Tensor tmp_weight =
torch::arange(0, num_topk, topk_weights->options()).reshape({1, num_topk});
weight_blocks.emplace_back(tmp_weight);
}
expert_scales = torch::cat(weight_blocks, 0);
}
}
int64_t recv_count_tensor_size = num_experts + 2 * global_bs * num_topk * server_num;
ep_recv_count = at::empty({recv_count_tensor_size}, at::dtype(at::kInt).device(device));
expand_scales = at::empty({num_max_tokens}, at::dtype(at::kFloat).device(device));
activate_mask = (new_topk_idx >= 0).to(torch::kBool);
}
}

int64_t quant_mode = use_fp8 ? 2 : 0;
int64_t tp_size = 1;
int64_t tp_rank = 0;
int64_t expert_shard_type = 0;
int outType = get_value_from_env("MOE_EXPERT_TOKEN_NUMS_TYPE", 1);
int64_t expert_token_nums_type = outType;

std::string comm_log = "0";
std::vector<char> comm_log_buf(comm_log.begin(), comm_log.end());
comm_log_buf.push_back('\0');
char *comm_log_ptr = comm_log_buf.data();

// get ep & tp name
char hcom_ep_name[HCOMM_NAME_LEN];
if (!moe_all_to_all_group_name.empty()) {
Expand All @@ -520,10 +552,11 @@ Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx,
HCCL_CHECK(HcclGetCommName(ep_comm, hcom_ep_name));
}
char hcom_tp_name[HCOMM_NAME_LEN] = {0};
char comm_alg[] = "fullmesh";

EXEC_NPU_CMD(aclnnMoeDistributeDispatchV2, new_x, new_topk_idx,
scales, // smooth scales,
activateMask, // activateMask
activate_mask, // activateMask
expert_scales, // expert_scales
hcom_ep_name, // ep
num_ranks, // rankSize
Expand All @@ -538,17 +571,17 @@ Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx,
quant_mode,
global_bs, // global_bs
expert_token_nums_type, // expert_token_nums_type
comm_log_ptr, packed_recv_x,
comm_alg, packed_recv_x,
packed_recv_x_scales, // dynamicScalesOut
expandIdx,
packed_recv_count, // expertTokenNumsOut
ep_recv_count, tp_recv_count, expandScales);
ep_recv_count, tp_recv_count, expand_scales);

// Wait streams
std::optional<EventHandle> event;

// Return values
return {packed_recv_x, packed_recv_x_scales, packed_recv_count, expandIdx, ep_recv_count,
return {packed_recv_x, packed_recv_x_scales, packed_recv_count, expandIdx, ep_recv_count, expand_scales,
event, std::function<void()>([] {})};
}

Expand All @@ -561,7 +594,7 @@ std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<v
const at::Tensor &x, const at::Tensor &topk_idx, const at::Tensor &topk_weights, const at::Tensor &src_info,
const at::Tensor &layout_range, int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts,
const at::Tensor &packed_recv_count, bool zero_copy, bool async, bool return_recv_hook,
const std::optional<at::Tensor> &out)
const std::optional<at::Tensor> &out, const at::Tensor &expand_scales)
{
at::Tensor new_idx = topk_idx;
at::Tensor new_scales = topk_weights;
Expand Down Expand Up @@ -598,8 +631,11 @@ std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<v
at::Tensor ep_send_counts = layout_range;
at::Tensor expert_scales = new_scales;
at::Tensor tp_send_counts = at::empty({1}, at::dtype(at::kInt).device(device));
at::Tensor x_active_mask, activation_scale, weight_scale, group_list, expand_scales;

at::Tensor activation_scale, weight_scale, group_list;
at::Tensor x_active_mask;
if (soc_version == op::SocVersion::ASCEND910B) {
x_active_mask = (new_topk_idx >= 0).to(torch::kBool);
}
int64_t tp_world_size = 1;
int64_t tp_rankId = 0;
int64_t expert_shared_type = 0;
Expand All @@ -613,16 +649,13 @@ std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<v
at::Tensor shared_expert_x{nullptr};
at::Tensor combined_x = at::empty({num_combined_tokens, hidden}, x.options());
std::optional<EventHandle> event;
std::string comm_log = "0";
std::vector<char> comm_log_buf(comm_log.begin(), comm_log.end());
comm_log_buf.push_back('\0');
char *comm_log_ptr = comm_log_buf.data();
char comm_alg[] = "fullmesh";

EXEC_NPU_CMD(aclnnMoeDistributeCombineV2, expand_x, expert_ids, expand_idx, ep_send_counts, expert_scales,
tp_send_counts, x_active_mask, activation_scale, weight_scale, group_list, expand_scales,
shared_expert_x, hcom_ep_name, num_ranks, rank, num_experts, hcom_tp_name, tp_world_size, tp_rankId,
expert_shared_type, shared_expert_num, shared_expert_rank_num, global_bs, out_dtype, comm_quant_mode,
group_list_type, comm_log_ptr, combined_x);
group_list_type, comm_alg, combined_x);
if (this->is_padding) {
if (this->padding_cnt == PADDING_SIZE) {
combined_x = this->ori_x;
Expand Down
9 changes: 6 additions & 3 deletions csrc/deepep/deep_ep.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <optional>
#include "hccl/hccl.h"
#include "hccl/hccl_types.h"
#include "aclnn/opdev/platform.h"

#include "config.hpp"
#include "event.hpp"
Expand All @@ -16,6 +17,7 @@ namespace deep_ep {
struct Buffer {
int64_t rank, rdma_rank;
int64_t num_ranks;
op::SocVersion soc_version;

int64_t num_nvl_bytes;
int64_t num_rdma_bytes;
Expand Down Expand Up @@ -73,9 +75,10 @@ struct Buffer {
const std::optional<torch::Tensor> &topk_weights, const torch::Tensor &src_idx,
const torch::Tensor &send_head, const std::optional<at::Tensor> &combine_send_cost_stats);

std::tuple<at::Tensor, std::optional<at::Tensor>, at::Tensor, at::Tensor, at::Tensor, std::optional<EventHandle>,
std::optional<std::function<void()>>>
std::tuple<at::Tensor, std::optional<at::Tensor>, at::Tensor, at::Tensor, at::Tensor, at::Tensor,
std::optional<EventHandle>, std::optional<std::function<void()>>>
low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx,
const std::optional<torch::Tensor> &topk_weights,
const std::optional<at::Tensor> &cumulative_local_expert_recv_stats,
int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts, bool use_fp8, bool round_scale,
bool use_ue8m0, bool async, bool return_recv_hook);
Expand All @@ -86,7 +89,7 @@ struct Buffer {
const at::Tensor &x, const at::Tensor &topk_idx, const at::Tensor &topk_weights, const at::Tensor &src_info,
const at::Tensor &layout_range, int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts,
const at::Tensor &packed_recv_count, bool zero_copy, bool async, bool return_recv_hook,
const std::optional<at::Tensor> &out);
const std::optional<at::Tensor> &out, const at::Tensor &expand_scales);

std::vector<at::Tensor> fused_deep_moe(const at::Tensor &x, const at::Tensor &expertIds,
const at::Tensor &gmm1PermutedWeight,
Expand Down
67 changes: 67 additions & 0 deletions csrc/deepep/ops2/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Standalone CMake project that builds the DeepEP "ops2" custom-operator
# package (op_host / op_kernel / framework) and wraps the install tree into a
# self-extracting custom_opp_*.run archive via CPack's External generator.
# Invoked by build.sh (build_deepep_kernels) when BUILD_DEEPEP_OPS is OFF.
cmake_minimum_required(VERSION 3.16.0)
# NOTE(review): project() declares no VERSION, so ${CMAKE_PROJECT_VERSION}
# used for CPACK_PACKAGE_VERSION below expands to empty — confirm intended.
project(opp)

# Restrict builds to the two host architectures the CANN toolkit ships for.
# NOTE(review): CANN_HOST_ARCH is not referenced later in this file —
# presumably consumed by the included cmake modules below; confirm.
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set(CANN_HOST_ARCH "x86_64")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
set(CANN_HOST_ARCH "aarch64")
else()
message(FATAL_ERROR "Unsupported host processor: ${CMAKE_SYSTEM_PROCESSOR}")
endif()

# Project-local helper modules; these define vendor_name, get_system_info(),
# and the op-compilation machinery used by the subdirectories.
include(cmake/config.cmake)
include(cmake/func.cmake)
include(cmake/intf.cmake)

# Alias of the C++ compiler path for the op build scripts.
set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER})

# Each component directory is optional; build whatever is present.
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework)
add_subdirectory(framework)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host)
add_subdirectory(op_host)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel)
add_subdirectory(op_kernel)
endif()
# Testcases only when explicitly enabled (ENABLE_TEST comes from the preset).
if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases)
add_subdirectory(testcases)
endif()

# Copy the installer scripts into the build tree and stamp the configured
# vendor name into install.sh/upgrade.sh (they ship with vendor_name=customize).
add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/scripts
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/scripts/ ${CMAKE_BINARY_DIR}/scripts/
COMMAND sed -i "s/vendor_name=customize/vendor_name=${vendor_name}/g" ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh
VERBATIM
)
add_custom_target(modify_vendor ALL DEPENDS ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh)

# Fills SYSTEM_INFO (e.g. OS/arch string) for the .run file name below;
# presumably defined in cmake/func.cmake — confirm.
get_system_info(SYSTEM_INFO)

# gen version.info
# Always-run target: records the CANN package version into version.info.
add_custom_target(gen_version_info ALL
COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh ${ASCEND_CANN_PACKAGE_PATH} ${CMAKE_CURRENT_BINARY_DIR}
)

# Packaging path: stage installer scripts + metadata, then let CPack's
# External generator call makeself.cmake to produce the .run archive.
# Skipped entirely when building the ops as a shared library instead.
if(NOT ASCEND_PACK_SHARED_LIBRARY)
install(DIRECTORY ${CMAKE_BINARY_DIR}/scripts/ DESTINATION . FILE_PERMISSIONS OWNER_EXECUTE OWNER_READ GROUP_READ)

# custom.proto is OPTIONAL: only installed if the vendor provides one.
install(FILES ${CMAKE_SOURCE_DIR}/custom.proto DESTINATION packages OPTIONAL)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.info
DESTINATION packages/vendors/${vendor_name}/)

# CPack config
set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME})
set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION})
set(CPACK_PACKAGE_DESCRIPTION "CPack opp project")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project")
set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX})

# build.sh globs for build_out/custom_opp_*.run — this name must stay in sync.
set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}_${CMAKE_SYSTEM_PROCESSOR}.run")
# External generator: staging is enabled so makeself.cmake sees a complete
# install tree, then emits the self-extracting archive as the "package".
set(CPACK_GENERATOR External)
set(CPACK_CMAKE_GENERATOR "Unix Makefiles")
set(CPACK_EXTERNAL_ENABLE_STAGING TRUE)
set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake)
set(CPACK_EXTERNAL_BUILT_PACKAGES ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME})
include(CPack)
endif()
59 changes: 59 additions & 0 deletions csrc/deepep/ops2/CMakePresets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{
"version": 1,
"cmakeMinimumRequired": {
"major": 3,
"minor": 19,
"patch": 0
},
"configurePresets": [
{
"name": "default",
"displayName": "Default Config",
"description": "Default build using Unix Makefiles generator for native compilation",
"generator": "Unix Makefiles",
"binaryDir": "${sourceDir}/build_out",
"cacheVariables": {
"CMAKE_BUILD_TYPE": {
"type": "STRING",
"value": "Release"
},
"ENABLE_SOURCE_PACKAGE": {
"type": "BOOL",
"value": "True"
},
"ENABLE_BINARY_PACKAGE": {
"type": "BOOL",
"value": "True"
},
"ASCEND_COMPUTE_UNIT": {
"type": "STRING",
"value": "ascend910b"
},
"ENABLE_TEST": {
"type": "BOOL",
"value": "True"
},
"vendor_name": {
"type": "STRING",
"value": "hwcomputing"
},
"ASCEND_CANN_PACKAGE_PATH": {
"type": "PATH",
"value": "/usr/local/Ascend/ascend-toolkit/latest"
},
"ASCEND_PYTHON_EXECUTABLE": {
"type": "STRING",
"value": "python3"
},
"CMAKE_INSTALL_PREFIX": {
"type": "PATH",
"value": "${sourceDir}/build_out"
},
"ASCEND_PACK_SHARED_LIBRARY": {
"type": "BOOL",
"value": "False"
}
}
}
]
}
Loading