diff --git a/build.sh b/build.sh index 97bb7c0b7..9b0ec7599 100755 --- a/build.sh +++ b/build.sh @@ -2,6 +2,7 @@ set -e BUILD_DEEPEP_MODULE="ON" +BUILD_DEEPEP_OPS="ON" BUILD_KERNELS_MODULE="ON" BUILD_MEMORY_SAVER_MODULE="ON" @@ -20,6 +21,11 @@ while getopts ":a:hd" opt; do case "$OPTARG" in deepep ) BUILD_DEEPEP_MODULE="ON" + BUILD_DEEPEP_OPS="ON" + ;; + deepep2 ) + BUILD_DEEPEP_MODULE="ON" + BUILD_DEEPEP_OPS="OFF" ;; kernels ) BUILD_KERNELS_MODULE="ON" @@ -120,7 +126,11 @@ function build_deepep_kernels() if [[ "$ONLY_BUILD_DEEPEP_ADAPTER_MODULE" == "ON" ]]; then return 0; fi if [[ "$BUILD_DEEPEP_MODULE" != "ON" ]]; then return 0; fi - KERNEL_DIR="csrc/deepep/ops" + if [[ "$BUILD_DEEPEP_OPS" == "ON" ]]; then + KERNEL_DIR="csrc/deepep/ops" + else + KERNEL_DIR="csrc/deepep/ops2" + fi CUSTOM_OPP_DIR="${CURRENT_DIR}/python/deep_ep/deep_ep" cd "$KERNEL_DIR" || exit @@ -137,6 +147,7 @@ function build_deepep_kernels() echo "find run package: $custom_opp_file" chmod +x "$custom_opp_file" fi + rm -rf "$CUSTOM_OPP_DIR"/vendors ./build_out/custom_opp_*.run --install-path=$CUSTOM_OPP_DIR cd - } diff --git a/csrc/deepep/CMakeLists.txt b/csrc/deepep/CMakeLists.txt index 656284117..b51475732 100644 --- a/csrc/deepep/CMakeLists.txt +++ b/csrc/deepep/CMakeLists.txt @@ -1,6 +1,14 @@ # this is the cmakelist file for deepep build # deepep will be built as separated wheel package +if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") + set(DEEPEP_ARCH "x86_64") +elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") + set(DEEPEP_ARCH "aarch64") +else() + message(FATAL_ERROR "Unsupported host processor: ${CMAKE_SYSTEM_PROCESSOR}") +endif() + set(PROJECT_BUILD_PATH ${PROJECT_BINARY_DIR}) set(TARGET_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}) set(ASCEND_HOME_PATH ${ASCEND_HOME_PATH}) @@ -27,6 +35,7 @@ target_include_directories( deep_ep_cpp PRIVATE ${TORCH_NPU_DIR}/include/third_party/acl/inc/acl ${TORCH_NPU_DIR}/include/third_party/acl/inc ${ASCEND_HOME_PATH}/include + 
${ASCEND_HOME_PATH}/${DEEPEP_ARCH}-linux/include/experiment/platform ) target_link_directories(deep_ep_cpp PRIVATE ${TORCH_DIR}/lib @@ -38,6 +47,7 @@ target_link_libraries(deep_ep_cpp PRIVATE ascendcl hccl torch_npu + opapi ) message(STATUS "TARGET_INSTALL_DIR = ${TARGET_INSTALL_DIR}") diff --git a/csrc/deepep/deep_ep.cpp b/csrc/deepep/deep_ep.cpp index efa088a6c..901cd3d8d 100644 --- a/csrc/deepep/deep_ep.cpp +++ b/csrc/deepep/deep_ep.cpp @@ -40,6 +40,8 @@ Buffer::Buffer(int64_t rank, int64_t num_ranks, int64_t num_nvl_bytes, int64_t n } this->shared_expert_rank_num = get_value_from_env("MOE_SHARED_EXPERT_RANK_NUM", 0); + + soc_version = op::GetCurrentPlatformInfo().GetSocVersion(); } Buffer::~Buffer() noexcept(false) {} @@ -440,9 +442,10 @@ Buffer::intranode_combine(const torch::Tensor &x, const torch::Tensor &topk_idx, return {combined_x, recv_topk_weights, event}; } -std::tuple, at::Tensor, at::Tensor, at::Tensor, std::optional, - std::optional>> +std::tuple, at::Tensor, at::Tensor, at::Tensor, at::Tensor, + std::optional, std::optional>> Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx, + const std::optional &topk_weights, const std::optional &cumulative_local_expert_recv_stats, int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts, bool use_fp8, bool round_scale, bool use_ue8m0, bool async, bool return_recv_hook) @@ -493,13 +496,47 @@ Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx, at::empty({num_max_tokens, hidden}, new_x.options().dtype(use_fp8 ? 
at::kChar : at::kBFloat16)); auto packed_recv_x_scales = at::empty({num_max_tokens}, at::dtype(at::kFloat).device(device)); auto expandIdx = at::empty({max_size}, at::dtype(at::kInt).device(device)); - auto ep_recv_count = at::empty({num_local_experts * num_ranks}, at::dtype(at::kInt).device(device)); + // output tensor sizes depend on the communication layout + int32_t server_num = num_ranks / LOCAL_RANK_SIZE; + at::Tensor expand_scales = at::empty({1}, at::dtype(at::kFloat).device(device)); + // only needed for the A2 layered path + at::Tensor ep_recv_count = + at::empty({num_local_experts * num_ranks}, at::dtype(at::kInt).device(device)); // A2 non-layered / A3 + at::Tensor expert_scales = + at::ones({num_tokens, num_topk}, at::dtype(at::kFloat).device(device)); // A2 non-layered / A3 auto tp_recv_count = at::empty({1}, at::dtype(at::kInt).device(device)); auto packed_recv_count = at::empty({num_local_experts}, at::dtype(at::kLong).device(device)); - auto expandScales = at::empty({1}, at::dtype(at::kFloat).device(device)); at::Tensor scales; - at::Tensor activateMask; - auto expert_scales = at::empty({1}, at::dtype(at::kFloat).device(device)); + at::Tensor activate_mask; + + if (soc_version == op::SocVersion::ASCEND910B) { + const char *hcclIntraPcieEnable = getenv("HCCL_INTRA_PCIE_ENABLE"); + const char *hcclIntraRoceEnable = getenv("HCCL_INTRA_ROCE_ENABLE"); + if (hcclIntraPcieEnable != nullptr && hcclIntraRoceEnable != nullptr && strcmp(hcclIntraPcieEnable, "1") == 0 && + strcmp(hcclIntraRoceEnable, "0") == 0) { // A2 layered + if (topk_weights.has_value()) { + if (!this->is_padding) { + expert_scales = topk_weights.value(); + } else { + std::vector weight_blocks; + if (topk_weights->size(0) != 0) { + weight_blocks.emplace_back(topk_weights.value()); + } + for (int i = 0; i < this->padding_cnt; i++) { + at::Tensor tmp_weight = + torch::arange(0, num_topk, topk_weights->options()).reshape({1, num_topk}); + weight_blocks.emplace_back(tmp_weight); + } + expert_scales = torch::cat(weight_blocks, 0); + } + 
} + int64_t recv_count_tensor_size = num_experts + 2 * global_bs * num_topk * server_num; + ep_recv_count = at::empty({recv_count_tensor_size}, at::dtype(at::kInt).device(device)); + expand_scales = at::empty({num_max_tokens}, at::dtype(at::kFloat).device(device)); + activate_mask = (new_topk_idx >= 0).to(torch::kBool); + } + } + int64_t quant_mode = use_fp8 ? 2 : 0; int64_t tp_size = 1; int64_t tp_rank = 0; @@ -507,11 +544,6 @@ Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx, int outType = get_value_from_env("MOE_EXPERT_TOKEN_NUMS_TYPE", 1); int64_t expert_token_nums_type = outType; - std::string comm_log = "0"; - std::vector comm_log_buf(comm_log.begin(), comm_log.end()); - comm_log_buf.push_back('\0'); - char *comm_log_ptr = comm_log_buf.data(); - // get ep & tp name char hcom_ep_name[HCOMM_NAME_LEN]; if (!moe_all_to_all_group_name.empty()) { @@ -520,10 +552,11 @@ Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx, HCCL_CHECK(HcclGetCommName(ep_comm, hcom_ep_name)); } char hcom_tp_name[HCOMM_NAME_LEN] = {0}; + char comm_alg[] = "fullmesh"; EXEC_NPU_CMD(aclnnMoeDistributeDispatchV2, new_x, new_topk_idx, scales, // smooth scales, - activateMask, // activateMask + activate_mask, // activateMask expert_scales, // expert_scales hcom_ep_name, // ep num_ranks, // rankSize @@ -538,17 +571,17 @@ Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx, quant_mode, global_bs, // global_bs expert_token_nums_type, // expert_token_nums_type - comm_log_ptr, packed_recv_x, + comm_alg, packed_recv_x, packed_recv_x_scales, // dynamicScalesOut expandIdx, packed_recv_count, // expertTokenNumsOut - ep_recv_count, tp_recv_count, expandScales); + ep_recv_count, tp_recv_count, expand_scales); // Wait streams std::optional event; // Return values - return {packed_recv_x, packed_recv_x_scales, packed_recv_count, expandIdx, ep_recv_count, + return {packed_recv_x, packed_recv_x_scales, packed_recv_count, expandIdx, 
ep_recv_count, expand_scales, event, std::function([] {})}; } @@ -561,7 +594,7 @@ std::tuple, std::optional &out) + const std::optional &out, const at::Tensor &expand_scales) { at::Tensor new_idx = topk_idx; at::Tensor new_scales = topk_weights; @@ -598,8 +631,11 @@ std::tuple, std::optional= 0).to(torch::kBool); + } int64_t tp_world_size = 1; int64_t tp_rankId = 0; int64_t expert_shared_type = 0; @@ -613,16 +649,13 @@ std::tuple, std::optional event; - std::string comm_log = "0"; - std::vector comm_log_buf(comm_log.begin(), comm_log.end()); - comm_log_buf.push_back('\0'); - char *comm_log_ptr = comm_log_buf.data(); + char comm_alg[] = "fullmesh"; EXEC_NPU_CMD(aclnnMoeDistributeCombineV2, expand_x, expert_ids, expand_idx, ep_send_counts, expert_scales, tp_send_counts, x_active_mask, activation_scale, weight_scale, group_list, expand_scales, shared_expert_x, hcom_ep_name, num_ranks, rank, num_experts, hcom_tp_name, tp_world_size, tp_rankId, expert_shared_type, shared_expert_num, shared_expert_rank_num, global_bs, out_dtype, comm_quant_mode, - group_list_type, comm_log_ptr, combined_x); + group_list_type, comm_alg, combined_x); if (this->is_padding) { if (this->padding_cnt == PADDING_SIZE) { combined_x = this->ori_x; diff --git a/csrc/deepep/deep_ep.hpp b/csrc/deepep/deep_ep.hpp index b01375df4..1df6de3f8 100644 --- a/csrc/deepep/deep_ep.hpp +++ b/csrc/deepep/deep_ep.hpp @@ -7,6 +7,7 @@ #include #include "hccl/hccl.h" #include "hccl/hccl_types.h" +#include "aclnn/opdev/platform.h" #include "config.hpp" #include "event.hpp" @@ -16,6 +17,7 @@ namespace deep_ep { struct Buffer { int64_t rank, rdma_rank; int64_t num_ranks; + op::SocVersion soc_version; int64_t num_nvl_bytes; int64_t num_rdma_bytes; @@ -73,9 +75,10 @@ struct Buffer { const std::optional &topk_weights, const torch::Tensor &src_idx, const torch::Tensor &send_head, const std::optional &combine_send_cost_stats); - std::tuple, at::Tensor, at::Tensor, at::Tensor, std::optional, - std::optional>> + std::tuple, 
at::Tensor, at::Tensor, at::Tensor, at::Tensor, + std::optional, std::optional>> low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx, + const std::optional &topk_weights, const std::optional &cumulative_local_expert_recv_stats, int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts, bool use_fp8, bool round_scale, bool use_ue8m0, bool async, bool return_recv_hook); @@ -86,7 +89,7 @@ struct Buffer { const at::Tensor &x, const at::Tensor &topk_idx, const at::Tensor &topk_weights, const at::Tensor &src_info, const at::Tensor &layout_range, int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts, const at::Tensor &packed_recv_count, bool zero_copy, bool async, bool return_recv_hook, - const std::optional &out); + const std::optional &out, const at::Tensor &expand_scales); std::vector fused_deep_moe(const at::Tensor &x, const at::Tensor &expertIds, const at::Tensor &gmm1PermutedWeight, diff --git a/csrc/deepep/ops2/CMakeLists.txt b/csrc/deepep/ops2/CMakeLists.txt new file mode 100644 index 000000000..359beca29 --- /dev/null +++ b/csrc/deepep/ops2/CMakeLists.txt @@ -0,0 +1,67 @@ +cmake_minimum_required(VERSION 3.16.0) +project(opp) + +if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") + set(CANN_HOST_ARCH "x86_64") +elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") + set(CANN_HOST_ARCH "aarch64") +else() + message(FATAL_ERROR "Unsupported host processor: ${CMAKE_SYSTEM_PROCESSOR}") +endif() + +include(cmake/config.cmake) +include(cmake/func.cmake) +include(cmake/intf.cmake) + +set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) + add_subdirectory(framework) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) + add_subdirectory(op_host) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) + add_subdirectory(op_kernel) +endif() +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +add_custom_command(OUTPUT 
${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/scripts + COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/scripts/ ${CMAKE_BINARY_DIR}/scripts/ + COMMAND sed -i "s/vendor_name=customize/vendor_name=${vendor_name}/g" ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh + VERBATIM +) +add_custom_target(modify_vendor ALL DEPENDS ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh) + +get_system_info(SYSTEM_INFO) + +# gen version.info +add_custom_target(gen_version_info ALL + COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh ${ASCEND_CANN_PACKAGE_PATH} ${CMAKE_CURRENT_BINARY_DIR} +) + +if(NOT ASCEND_PACK_SHARED_LIBRARY) + install(DIRECTORY ${CMAKE_BINARY_DIR}/scripts/ DESTINATION . FILE_PERMISSIONS OWNER_EXECUTE OWNER_READ GROUP_READ) + + install(FILES ${CMAKE_SOURCE_DIR}/custom.proto DESTINATION packages OPTIONAL) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.info + DESTINATION packages/vendors/${vendor_name}/) + + # CPack config + set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME}) + set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION}) + set(CPACK_PACKAGE_DESCRIPTION "CPack opp project") + set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project") + set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX}) + + set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}_${CMAKE_SYSTEM_PROCESSOR}.run") + set(CPACK_GENERATOR External) + set(CPACK_CMAKE_GENERATOR "Unix Makefiles") + set(CPACK_EXTERNAL_ENABLE_STAGING TRUE) + set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake) + set(CPACK_EXTERNAL_BUILT_PACKAGES ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME}) + include(CPack) +endif() diff --git a/csrc/deepep/ops2/CMakePresets.json b/csrc/deepep/ops2/CMakePresets.json new file mode 100644 index 000000000..f87df921f 
--- /dev/null +++ b/csrc/deepep/ops2/CMakePresets.json @@ -0,0 +1,59 @@ +{ + "version": 1, + "cmakeMinimumRequired": { + "major": 3, + "minor": 19, + "patch": 0 + }, + "configurePresets": [ + { + "name": "default", + "displayName": "Default Config", + "description": "Default build using Unix Makefiles generator for native compilation", + "generator": "Unix Makefiles", + "binaryDir": "${sourceDir}/build_out", + "cacheVariables": { + "CMAKE_BUILD_TYPE": { + "type": "STRING", + "value": "Release" + }, + "ENABLE_SOURCE_PACKAGE": { + "type": "BOOL", + "value": "True" + }, + "ENABLE_BINARY_PACKAGE": { + "type": "BOOL", + "value": "True" + }, + "ASCEND_COMPUTE_UNIT": { + "type": "STRING", + "value": "ascend910b" + }, + "ENABLE_TEST": { + "type": "BOOL", + "value": "True" + }, + "vendor_name": { + "type": "STRING", + "value": "hwcomputing" + }, + "ASCEND_CANN_PACKAGE_PATH": { + "type": "PATH", + "value": "/usr/local/Ascend/ascend-toolkit/latest" + }, + "ASCEND_PYTHON_EXECUTABLE": { + "type": "STRING", + "value": "python3" + }, + "CMAKE_INSTALL_PREFIX": { + "type": "PATH", + "value": "${sourceDir}/build_out" + }, + "ASCEND_PACK_SHARED_LIBRARY": { + "type": "BOOL", + "value": "False" + } + } + } + ] +} diff --git a/csrc/deepep/ops2/README.md b/csrc/deepep/ops2/README.md new file mode 100644 index 000000000..2856efe87 --- /dev/null +++ b/csrc/deepep/ops2/README.md @@ -0,0 +1,10 @@ +# moe_dispatch_combine + +# compile +bash build.sh + +# install +./build_out/custom_opp_ubuntu_aarch64.run --install-path=/usr/local/Ascend/ascend-toolkit/latest/opp/ + +# require import env parameters before running +source /usr/local/Ascend/ascend-toolkit/latest/opp/vendors/hwcomputing/bin/set_env.bash diff --git a/csrc/deepep/ops2/build.sh b/csrc/deepep/ops2/build.sh new file mode 100755 index 000000000..7692e4e56 --- /dev/null +++ b/csrc/deepep/ops2/build.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +export OPS_PROJECT_NAME=aclnnInner + +SCRIPTS_DIR=$(cd "$(dirname "$0")" && pwd) + +if [ -n 
"$BASE_LIBS_PATH" ]; then + export ASCEND_HOME_PATH="$BASE_LIBS_PATH" +elif [ -z "$ASCEND_HOME_PATH" ]; then + if [ -n "$ASCEND_AICPU_PATH" ]; then + export ASCEND_HOME_PATH="$ASCEND_AICPU_PATH" + else + echo "please set env." >&2 + exit 1 + fi +fi +echo "using ASCEND_HOME_PATH: $ASCEND_HOME_PATH" +script_path=$(realpath $(dirname $0)) + +BUILD_DIR="build_out" +HOST_NATIVE_DIR="host_native_tiling" + +chmod +x cmake/util/gen_ops_filter.sh +mkdir -p build_out +rm -rf build_out/* + +opts=$(python3 $script_path/cmake/util/preset_parse.py $script_path/CMakePresets.json) +ENABLE_CROSS="-DENABLE_CROSS_COMPILE=True" +ENABLE_BINARY="-DENABLE_BINARY_PACKAGE=True" +ENABLE_LIBRARY="-DASCEND_PACK_SHARED_LIBRARY=True" +cmake_version=$(cmake --version | grep "cmake version" | awk '{print $3}') + +target=package +if [ -n "$1" ]; then target="$1"; fi +if [[ $opts =~ $ENABLE_LIBRARY ]]; then target=install; fi + +if [[ $opts =~ $ENABLE_CROSS ]] && [[ $opts =~ $ENABLE_BINARY ]] +then + if [ "$cmake_version" \< "3.19.0" ] ; then + cmake -S . -B "$BUILD_DIR" $opts -DENABLE_CROSS_COMPILE=0 + else + cmake -S . -B "$BUILD_DIR" --preset=default -DENABLE_CROSS_COMPILE=0 + fi + cmake --build "$BUILD_DIR" --target cust_optiling + mkdir $BUILD_DIR/$HOST_NATIVE_DIR + lib_path=$(find "$BUILD_DIR" -name "libcust_opmaster_rt2.0.so") + if [ -z "$lib_path" ] || [ $(echo "$lib_path" | wc -l) -ne 1 ]; then + echo "Error: Expected to find exactly one libcust_opmaster_rt2.0.so, but found none or multiple." >&2 + exit 1 + fi + mv "$lib_path" "$BUILD_DIR/$HOST_NATIVE_DIR/" + find "$BUILD_DIR" -mindepth 1 -maxdepth 1 ! -name "$HOST_NATIVE_DIR" -exec rm -rf {} + + host_native_tiling_lib=$(realpath $(find $BUILD_DIR -type f -name "libcust_opmaster_rt2.0.so")) + if [ "$cmake_version" \< "3.19.0" ] ; then + cmake -S . -B "$BUILD_DIR" $opts -DHOST_NATIVE_TILING_LIB=$host_native_tiling_lib + else + cmake -S . 
-B "$BUILD_DIR" --preset=default -DHOST_NATIVE_TILING_LIB=$host_native_tiling_lib + fi + cmake --build "$BUILD_DIR" --target binary -j$(nproc) + cmake --build "$BUILD_DIR" --target $target -j$(nproc) +else + if [ "$cmake_version" \< "3.19.0" ] ; then + cmake -S . -B "$BUILD_DIR" $opts + else + cmake -S . -B "$BUILD_DIR" --preset=default + fi + cmake --build "$BUILD_DIR" --target binary -j$(nproc) + cmake --build "$BUILD_DIR" --target $target -j$(nproc) +fi diff --git a/csrc/deepep/ops2/cmake/config.cmake b/csrc/deepep/ops2/cmake/config.cmake new file mode 100644 index 000000000..d990cd312 --- /dev/null +++ b/csrc/deepep/ops2/cmake/config.cmake @@ -0,0 +1,42 @@ + +set(CMAKE_CXX_FLAGS_DEBUG "") +set(CMAKE_CXX_FLAGS_RELEASE "") + +if (NOT DEFINED CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "") +endif() +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/build_out" CACHE PATH "" FORCE) +endif() +if (NOT DEFINED ASCEND_CANN_PACKAGE_PATH) + set(ASCEND_CANN_PACKAGE_PATH /usr/local/Ascend/latest CACHE PATH "") +endif() +if (NOT DEFINED ASCEND_PYTHON_EXECUTABLE) + set(ASCEND_PYTHON_EXECUTABLE python3 CACHE STRING "") +endif() +if (NOT DEFINED ASCEND_COMPUTE_UNIT) + set(ASCEND_COMPUTE_UNIT ascend910_93 CACHE STRING "") +endif() +if (NOT DEFINED ENABLE_TEST) + set(ENABLE_TEST FALSE CACHE BOOL "") +endif() +if (NOT DEFINED ENABLE_CROSS_COMPILE) + set(ENABLE_CROSS_COMPILE FALSE CACHE BOOL "") +endif() +if (NOT DEFINED CMAKE_CROSS_PLATFORM_COMPILER) + set(CMAKE_CROSS_PLATFORM_COMPILER "/your/cross/compiler/path" CACHE PATH "") +endif() +if (NOT DEFINED CMAKE_CROSS_LIBRARY_PATH) + set(CMAKE_CROSS_LIBRARY_PATH "" CACHE PATH "") +endif() +if (NOT DEFINED ASCEND_PACK_SHARED_LIBRARY) + set(ASCEND_PACK_SHARED_LIBRARY False CACHE BOOL "") +endif() +set(ASCEND_TENSOR_COMPILER_PATH ${ASCEND_CANN_PACKAGE_PATH}/compiler) +set(ASCEND_CCEC_COMPILER_PATH ${ASCEND_TENSOR_COMPILER_PATH}/ccec_compiler/bin) +set(ASCEND_AUTOGEN_PATH 
${CMAKE_BINARY_DIR}/autogen) +file(MAKE_DIRECTORY ${ASCEND_AUTOGEN_PATH}) +set(CUSTOM_COMPILE_OPTIONS "custom_compile_options.ini") +set(CUSTOM_OPC_OPTIONS "custom_opc_options.ini") +file(WRITE ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS} "") +file(WRITE ${ASCEND_AUTOGEN_PATH}/${CUSTOM_OPC_OPTIONS} "") diff --git a/csrc/deepep/ops2/cmake/device_task.cmake b/csrc/deepep/ops2/cmake/device_task.cmake new file mode 100644 index 000000000..3b7c0a136 --- /dev/null +++ b/csrc/deepep/ops2/cmake/device_task.cmake @@ -0,0 +1,48 @@ +message(STATUS "TILING SINK TASK BEGIN") +message(STATUS "TARGET: ${TARGET}") +message(STATUS "OPTION: ${OPTION}") +message(STATUS "SRC: ${SRC}") +message(STATUS "VENDOR: ${VENDOR_NAME}") + +set(CMAKE_CXX_COMPILER ${ASCEND_CANN_PACKAGE_PATH}/toolkit/toolchain/hcc/bin/aarch64-target-linux-gnu-g++) +set(CMAKE_C_COMPILER ${ASCEND_CANN_PACKAGE_PATH}/toolkit/toolchain/hcc/bin/aarch64-target-linux-gnu-gcc) + +string(REPLACE " " ";" SRC "${SRC}") +add_library(${TARGET} ${OPTION} + ${SRC} +) +target_compile_definitions(${TARGET} PRIVATE + DEVICE_OP_TILING_LIB + _FORTIFY_SOURCE=2 + google=ascend_private +) +target_include_directories(${TARGET} PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/include +) +target_compile_options(${TARGET} PRIVATE + -fPIC + -fstack-protector-strong + -fstack-protector-all + -O2 + -std=c++11 + -fvisibility-inlines-hidden + -fvisibility=hidden +) +target_link_libraries(${TARGET} PRIVATE + -Wl,--whole-archive + device_register + c_sec + mmpa + tiling_api + platform_static + ascend_protobuf + exe_meta_device + -Wl,--no-whole-archive +) +target_link_directories(${TARGET} PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/lib64/device/lib64 + ${ASCEND_CANN_PACKAGE_PATH}/compiler/lib64 +) +set_target_properties(${TARGET} PROPERTIES + OUTPUT_NAME cust_opmaster +) diff --git a/csrc/deepep/ops2/cmake/func.cmake b/csrc/deepep/ops2/cmake/func.cmake new file mode 100644 index 000000000..aff395819 --- /dev/null +++ b/csrc/deepep/ops2/cmake/func.cmake @@ -0,0 
+1,368 @@ +include(ExternalProject) + +function(get_system_info SYSTEM_INFO) + if (UNIX) + execute_process(COMMAND grep -i ^id= /etc/os-release OUTPUT_VARIABLE TEMP) + string(REGEX REPLACE "\n|id=|ID=|\"" "" SYSTEM_NAME ${TEMP}) + set(${SYSTEM_INFO} ${SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR} PARENT_SCOPE) + elseif (WIN32) + message(STATUS "System is Windows. Only for pre-build.") + else () + message(FATAL_ERROR "${CMAKE_SYSTEM_NAME} not support.") + endif () +endfunction() + +function(opbuild) + message(STATUS "Opbuild generating sources") + cmake_parse_arguments(OPBUILD "" "OUT_DIR;PROJECT_NAME;ACCESS_PREFIX;ENABLE_SOURCE" "OPS_SRC" ${ARGN}) + execute_process(COMMAND ${CMAKE_COMPILE} -g -fPIC -shared -std=c++11 ${OPBUILD_OPS_SRC} -D_GLIBCXX_USE_CXX11_ABI=0 + -I ${ASCEND_CANN_PACKAGE_PATH}/include -I ${CMAKE_CURRENT_SOURCE_DIR}/../op_kernel + -L ${ASCEND_CANN_PACKAGE_PATH}/lib64 -lexe_graph -lregister -ltiling_api + -o ${OPBUILD_OUT_DIR}/libascend_all_ops.so + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message("build ops lib info: ${EXEC_INFO}") + message("build ops lib error: ${EXEC_ERROR}") + message(FATAL_ERROR "opbuild run failed!") + endif() + set(proj_env "") + set(prefix_env "") + if (NOT "${OPBUILD_PROJECT_NAME}x" STREQUAL "x") + set(proj_env "OPS_PROJECT_NAME=${OPBUILD_PROJECT_NAME}") + endif() + if (NOT "${OPBUILD_ACCESS_PREFIX}x" STREQUAL "x") + set(prefix_env "OPS_DIRECT_ACCESS_PREFIX=${OPBUILD_ACCESS_PREFIX}") + endif() + + set(ENV{ENABLE_SOURCE_PACKAGE} ${OPBUILD_ENABLE_SOURCE}) + if(${ASCEND_PACK_SHARED_LIBRARY}) + if (NOT vendor_name) + message(FATAL_ERROR "ERROR: vendor_name is invalid!") + return() + endif() + set(ENV{ASCEND_VENDOR_NAME} ${vendor_name}) + set(ENV{OPS_PRODUCT_NAME} ${ASCEND_COMPUTE_UNIT}) + set(ENV{SYSTEM_PROCESSOR} ${CMAKE_SYSTEM_PROCESSOR}) + endif() + execute_process(COMMAND ${proj_env} ${prefix_env} ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build + 
${OPBUILD_OUT_DIR}/libascend_all_ops.so ${OPBUILD_OUT_DIR} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + unset(ENV{ENABLE_SOURCE_PACKAGE}) + if(${ASCEND_PACK_SHARED_LIBRARY}) + unset(ENV{ASCEND_VENDOR_NAME}) + unset(ENV{OPS_PRODUCT_NAME}) + unset(ENV{SYSTEM_PROCESSOR}) + endif() + if (${EXEC_RESULT}) + message("opbuild ops info: ${EXEC_INFO}") + message("opbuild ops error: ${EXEC_ERROR}") + endif() + message(STATUS "Opbuild generating sources - done") +endfunction() + +function(add_ops_info_target) + cmake_parse_arguments(OPINFO "" "TARGET;OPS_INFO;OUTPUT;INSTALL_DIR" "" ${ARGN}) + get_filename_component(opinfo_file_path "${OPINFO_OUTPUT}" DIRECTORY) + add_custom_command(OUTPUT ${OPINFO_OUTPUT} + COMMAND mkdir -p ${opinfo_file_path} + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/parse_ini_to_json.py + ${OPINFO_OPS_INFO} ${OPINFO_OUTPUT} + ) + add_custom_target(${OPINFO_TARGET} ALL + DEPENDS ${OPINFO_OUTPUT} + ) + if(NOT ${ASCEND_PACK_SHARED_LIBRARY}) + install(FILES ${OPINFO_OUTPUT} + DESTINATION ${OPINFO_INSTALL_DIR} + ) + endif() +endfunction() + +function(add_ops_compile_options OP_TYPE) + cmake_parse_arguments(OP_COMPILE "" "OP_TYPE" "COMPUTE_UNIT;OPTIONS" ${ARGN}) + execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_gen_options.py + ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS} ${OP_TYPE} ${OP_COMPILE_COMPUTE_UNIT} + ${OP_COMPILE_OPTIONS} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR) + if (${EXEC_RESULT}) + message("add ops compile options info: ${EXEC_INFO}") + message("add ops compile options error: ${EXEC_ERROR}") + message(FATAL_ERROR "add ops compile options failed!") + endif() +endfunction() + +function(add_npu_support_target) + cmake_parse_arguments(NPUSUP "" "TARGET;OPS_INFO_DIR;OUT_DIR;INSTALL_DIR" "" ${ARGN}) + get_filename_component(npu_sup_file_path "${NPUSUP_OUT_DIR}" DIRECTORY) + 
add_custom_command(OUTPUT ${NPUSUP_OUT_DIR}/npu_supported_ops.json + COMMAND mkdir -p ${NPUSUP_OUT_DIR} + COMMAND ${CMAKE_SOURCE_DIR}/cmake/util/gen_ops_filter.sh + ${NPUSUP_OPS_INFO_DIR} + ${NPUSUP_OUT_DIR} + ) + add_custom_target(npu_supported_ops ALL + DEPENDS ${NPUSUP_OUT_DIR}/npu_supported_ops.json + ) + if(NOT ${ASCEND_PACK_SHARED_LIBRARY}) + install(FILES ${NPUSUP_OUT_DIR}/npu_supported_ops.json + DESTINATION ${NPUSUP_INSTALL_DIR} + ) + endif() +endfunction() + +function(add_simple_kernel_compile) + set(options "") + set(single_value_args "OPS_INFO;OUT_DIR;TILING_LIB;OP_TYPE;SRC;COMPUTE_UNIT;JSON_FILE;DYNAMIC_PATH") + set(multi_value_args "OPTIONS;CONFIGS") + cmake_parse_arguments(BINCMP "${options}" "${single_value_args}" "${multi_value_args}" ${ARGN}) + if (NOT DEFINED BINCMP_OUT_DIR) + set(BINCMP_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary) + endif() + if (NOT DEFINED BINCMP_TILING_LIB) + set(BINCMP_TILING_LIB $) + endif() + if (${ASCEND_PACK_SHARED_LIBRARY}) + if (NOT TARGET op_kernel_pack) + add_custom_target(op_kernel_pack + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_pack_kernel.py + --input-path=${BINCMP_OUT_DIR} + --output-path=${BINCMP_OUT_DIR}/library + --enable-library=${ASCEND_PACK_SHARED_LIBRARY} + --platform=${CMAKE_SYSTEM_PROCESSOR}) + add_library(ascend_kernels INTERFACE) + target_link_libraries(ascend_kernels INTERFACE kernels) + target_link_directories(ascend_kernels INTERFACE ${BINCMP_OUT_DIR}/library) + target_include_directories(ascend_kernels INTERFACE ${BINCMP_OUT_DIR}/library) + add_dependencies(ascend_kernels op_kernel_pack) + add_dependencies(op_kernel_pack ${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT}) + endif() + endif() + # add Environment Variable Configurations of ccache + set(_ASCENDC_ENV_VAR) + if(${CMAKE_CXX_COMPILER_LAUNCHER} MATCHES "ccache$") + list(APPEND _ASCENDC_ENV_VAR export ASCENDC_CCACHE_EXECUTABLE=${CMAKE_CXX_COMPILER_LAUNCHER} &&) + endif() + + if (NOT DEFINED BINCMP_OPS_INFO) + 
set(BINCMP_OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${BINCMP_COMPUTE_UNIT}-ops-info.ini) + endif() + if (NOT ${ENABLE_CROSS_COMPILE}) + add_custom_target(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} + COMMAND ${_ASCENDC_ENV_VAR} ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_compile_kernel.py + --op-name=${BINCMP_OP_TYPE} + --src-file=${BINCMP_SRC} + --compute-unit=${BINCMP_COMPUTE_UNIT} + --compile-options=\"${BINCMP_OPTIONS}\" + --debug-config=\"${BINCMP_CONFIGS}\" + --config-ini=${BINCMP_OPS_INFO} + --tiling-lib=${BINCMP_TILING_LIB} + --output-path=${BINCMP_OUT_DIR} + --dynamic-dir=${BINCMP_DYNAMIC_PATH} + --enable-binary=\"${ENABLE_BINARY_PACKAGE}\" + --json-file=${BINCMP_JSON_FILE} + --build-tool=$(MAKE)) + add_dependencies(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} cust_optiling) + else() + if (${ENABLE_BINARY_PACKAGE} AND NOT DEFINED HOST_NATIVE_TILING_LIB) + message(FATAL_ERROR "Native host libs was not set for cross compile!") + endif() + add_custom_target(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} + COMMAND ${_ASCENDC_ENV_VAR} ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_compile_kernel.py + --op-name=${BINCMP_OP_TYPE} + --src-file=${BINCMP_SRC} + --compute-unit=${BINCMP_COMPUTE_UNIT} + --compile-options=\"${BINCMP_OPTIONS}\" + --debug-config=\"${BINCMP_CONFIGS}\" + --config-ini=${BINCMP_OPS_INFO} + --tiling-lib=${HOST_NATIVE_TILING_LIB} + --output-path=${BINCMP_OUT_DIR} + --dynamic-dir=${BINCMP_DYNAMIC_PATH} + --enable-binary=\"${ENABLE_BINARY_PACKAGE}\" + --json-file=${BINCMP_JSON_FILE} + --build-tool=$(MAKE)) + endif() + add_dependencies(ascendc_bin_${BINCMP_COMPUTE_UNIT}_gen_ops_config ${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT}) + add_dependencies(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} ops_info_gen_${BINCMP_COMPUTE_UNIT}) +endfunction() + +function(ascendc_device_library) + message(STATUS "Ascendc device library generating") + cmake_parse_arguments(DEVICE "" "TARGET;OPTION" "SRC" ${ARGN}) + execute_process( + COMMAND 
${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink/CMakeLists.txt + ) + execute_process( + COMMAND ${CMAKE_COMMAND} -E echo "cmake_minimum_required(VERSION 3.16.0)\nproject(cust_tiling_sink)\ninclude(${CMAKE_SOURCE_DIR}/cmake/device_task.cmake)\n" + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink/CMakeLists.txt + RESULT_VARIABLE result + ) + string(REPLACE ";" " " DEVICE_SRC "${DEVICE_SRC}") + ExternalProject_Add(tiling_sink_task + SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink + CONFIGURE_COMMAND ${CMAKE_COMMAND} + -DASCEND_CANN_PACKAGE_PATH=${ASCEND_CANN_PACKAGE_PATH} + -DTARGET=${DEVICE_TARGET} + -DOPTION=${DEVICE_OPTION} + -DSRC=${DEVICE_SRC} + -DVENDOR_NAME=${vendor_name} + + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} + INSTALL_COMMAND "" + BUILD_ALWAYS TRUE + ) + ExternalProject_Get_Property(tiling_sink_task BINARY_DIR) + set(TILINGSINK_LIB_PATH "") + if ("${DEVICE_OPTION}" STREQUAL "SHARED") + set(TILINGSINK_LIB_PATH "${BINARY_DIR}/libcust_opmaster.so") + else() + set(TILINGSINK_LIB_PATH "${BINARY_DIR}/libcust_opmaster.a") + endif() + install(FILES ${TILINGSINK_LIB_PATH} + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_master_device/lib + ) +endfunction() +function(add_opregistry_target) + string(REPLACE ";" "-" COMPUTE_UNIT "${ASCEND_COMPUTE_UNIT}") + add_custom_target(op_registry_pack + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_pack_opregistry.py + --input-path=${CMAKE_SOURCE_DIR}/build_out/ + --copy-path=${CMAKE_SOURCE_DIR}/build_out/tmp/vendors/${vendor_name}/ + --output-path=${CMAKE_SOURCE_DIR}/build_out/library/ + --vendor-name=${vendor_name} + --compute-unit=${COMPUTE_UNIT} + --framework-type=${ASCEND_FRAMEWORK_TYPE} + --platform=${CMAKE_SYSTEM_PROCESSOR}) + add_library(ascend_opregistry INTERFACE) + target_link_libraries(ascend_opregistry INTERFACE opregistry) + 
target_link_directories(ascend_opregistry INTERFACE ${CMAKE_SOURCE_DIR}/build_out/library) + target_include_directories(ascend_opregistry INTERFACE ${CMAKE_SOURCE_DIR}/build_out/library) + add_dependencies(ascend_opregistry op_registry_pack) + if(EXISTS "${CMAKE_SOURCE_DIR}/framework/caffe_plugin") + add_dependencies(op_registry_pack cust_caffe_parsers) + elseif(EXISTS "${CMAKE_SOURCE_DIR}/framework/tf_plugin") + add_dependencies(op_registry_pack cust_tf_parsers) + elseif(EXISTS "${CMAKE_SOURCE_DIR}/framework/onnx_plugin") + add_dependencies(op_registry_pack cust_onnx_parsers) + endif() +endfunction() + +function(add_kernels_install) + # install kernel file + if (${ENABLE_SOURCE_PACKAGE}) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/dynamic/ + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic/ + ) + endif() + + # install *.o files and *.json files + if (${ENABLE_BINARY_PACKAGE}) + set(INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/) + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}/ + DESTINATION ${INSTALL_DIR}/kernel/${compute_unit}/ + ) + endforeach() + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/config/ + DESTINATION ${INSTALL_DIR}/kernel/config/ + ) + endif() +endfunction() + +function(add_kernels_compile) + set(DYNAMIC_PATH "") + if (${ENABLE_SOURCE_PACKAGE}) + set(DYNAMIC_PATH ${CMAKE_CURRENT_BINARY_DIR}/binary/dynamic) + file(MAKE_DIRECTORY ${DYNAMIC_PATH}) + file(GLOB KERNEL_FILES "${CMAKE_SOURCE_DIR}/op_kernel/*") + file(COPY ${KERNEL_FILES} DESTINATION ${DYNAMIC_PATH}) + file(REMOVE "${DYNAMIC_PATH}/CMakeLists.txt") + endif() + + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target(TARGET ops_info_gen_${compute_unit} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO 
${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # define a target:binary to prevent kernel file from being rebuilt during the preinstall process + if (NOT TARGET binary) + add_custom_target(binary) + endif() + + if (${ENABLE_BINARY_PACKAGE} OR ${ENABLE_SOURCE_PACKAGE}) + if (${ENABLE_BINARY_PACKAGE}) + # gen binary_info_config.json and .json + add_custom_target(ascendc_bin_${compute_unit}_gen_ops_config + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/insert_simplified_keys.py + -p ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_ops_config.py + -p ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + -s ${compute_unit} + COMMAND ${CMAKE_COMMAND} -E make_directory + ${CMAKE_CURRENT_BINARY_DIR}/binary/config/${compute_unit} + COMMAND mv ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}/*.json + ${CMAKE_CURRENT_BINARY_DIR}/binary/config/${compute_unit} + ) + else() + if (NOT TARGET ascendc_bin_${compute_unit}_gen_ops_config) + add_custom_target(ascendc_bin_${compute_unit}_gen_ops_config) + endif() + endif() + add_dependencies(binary ascendc_bin_${compute_unit}_gen_ops_config) + + # get op_type-op_name from aic-${compute_unit}-ops-info.ini + execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_get_op_name.py + --ini-file=${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + OUTPUT_VARIABLE OP_TYPE_NAME + RESULT_VARIABLE EXEC_RESULT + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message(FATAL_ERROR, "get op name failed, gen error: ${EXEC_ERROR}") + endif() + + # compile op one by one with ascendc_compile_kernel.py + string(REPLACE "\n" ";" TYPE_NAME_LIST "${OP_TYPE_NAME}") + foreach(TYPE_NAME IN LISTS TYPE_NAME_LIST) + if (NOT "${TYPE_NAME}" STREQUAL "") + string(REPLACE "-" ";" bin_sep ${TYPE_NAME}) + list(GET bin_sep 
0 op_type) + list(GET bin_sep 1 op_file) + add_simple_kernel_compile(OP_TYPE ${op_type} + SRC ${CMAKE_SOURCE_DIR}/op_kernel/${op_file}.cpp + COMPUTE_UNIT ${compute_unit} + JSON_FILE ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + DYNAMIC_PATH ${DYNAMIC_PATH}) + endif() + endforeach() + endif() + endforeach() + + # generate npu_supported_ops.json + add_npu_support_target(TARGET npu_supported_ops + OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH} + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core + INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE} + ) + + if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) + endif() + + if(NOT ASCEND_PACK_SHARED_LIBRARY) + add_kernels_install() + else() + add_opregistry_target() + endif() +endfunction() diff --git a/csrc/deepep/ops2/cmake/intf.cmake b/csrc/deepep/ops2/cmake/intf.cmake new file mode 100644 index 000000000..d2643bbc7 --- /dev/null +++ b/csrc/deepep/ops2/cmake/intf.cmake @@ -0,0 +1,28 @@ + +add_library(intf_pub INTERFACE) +target_compile_options(intf_pub INTERFACE + -fPIC + -fvisibility=hidden + -fvisibility-inlines-hidden + $<$:-O2> + $<$:-O0 -g> + $<$:-std=c++11> + $<$,$>:-ftrapv -fstack-check> + $<$:-pthread -Wfloat-equal -Wshadow -Wformat=2 -Wno-deprecated -Wextra> + $,-fstack-protector-strong,-fstack-protector-all> +) +target_compile_definitions(intf_pub INTERFACE + _GLIBCXX_USE_CXX11_ABI=0 + $<$:_FORTIFY_SOURCE=2> +) +target_include_directories(intf_pub INTERFACE ${ASCEND_CANN_PACKAGE_PATH}/include + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel +) +target_link_options(intf_pub INTERFACE + $<$,EXECUTABLE>:-pie> + $<$:-s> + -Wl,-z,relro + -Wl,-z,now + -Wl,-z,noexecstack +) +target_link_directories(intf_pub INTERFACE ${ASCEND_CANN_PACKAGE_PATH}/lib64) diff --git a/csrc/deepep/ops2/cmake/makeself.cmake b/csrc/deepep/ops2/cmake/makeself.cmake new file mode 100644 index 000000000..82c8da7eb --- 
/dev/null +++ b/csrc/deepep/ops2/cmake/makeself.cmake @@ -0,0 +1,31 @@ +execute_process(COMMAND bash ${CMAKE_CURRENT_LIST_DIR}/util/makeself/makeself.sh + --header ${CMAKE_CURRENT_LIST_DIR}/util/makeself/makeself-header.sh + --help-header ./help.info + --gzip --complevel 4 --nomd5 --sha256 + ./ ${CPACK_PACKAGE_FILE_NAME} "version:1.0" ./install.sh + WORKING_DIRECTORY ${CPACK_TEMPORARY_DIRECTORY} + RESULT_VARIABLE EXEC_RESULT + ERROR_VARIABLE EXEC_ERROR +) + +if (NOT "${EXEC_RESULT}x" STREQUAL "0x") + message(FATAL_ERROR "CPack Command error: ${EXEC_RESULT}\n${EXEC_ERROR}") +endif() + +execute_process(COMMAND cp ${CPACK_EXTERNAL_BUILT_PACKAGES} ${CPACK_PACKAGE_DIRECTORY}/ + COMMAND echo "Copy ${CPACK_EXTERNAL_BUILT_PACKAGES} to ${CPACK_PACKAGE_DIRECTORY}/" + WORKING_DIRECTORY ${CPACK_TEMPORARY_DIRECTORY} + ) + +if (NOT "${CPACK_PACKAGE_DIRECTORY}x" STREQUAL "${CPACK_INSTALL_PREFIX}x") + execute_process( + COMMAND ${CMAKE_COMMAND} -E make_directory ${CPACK_INSTALL_PREFIX} + WORKING_DIRECTORY ${CPACK_TEMPORARY_DIRECTORY} + ) + + execute_process( + COMMAND cp ${CPACK_EXTERNAL_BUILT_PACKAGES} ${CPACK_INSTALL_PREFIX}/ + COMMAND echo "Copy ${CPACK_EXTERNAL_BUILT_PACKAGES} to ${CPACK_INSTALL_PREFIX}/" + WORKING_DIRECTORY ${CPACK_TEMPORARY_DIRECTORY} + ) +endif() diff --git a/csrc/deepep/ops2/cmake/util/__init__.py b/csrc/deepep/ops2/cmake/util/__init__.py new file mode 100755 index 000000000..364083fab --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/__init__.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import sys + +PYF_PATH = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(PYF_PATH) diff --git a/csrc/deepep/ops2/cmake/util/ascendc_bin_param_build.py b/csrc/deepep/ops2/cmake/util/ascendc_bin_param_build.py new file mode 100755 index 000000000..388575cde --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/ascendc_bin_param_build.py @@ -0,0 +1,531 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import argparse +import copy 
+import hashlib +import json +import os +import re +import sys +from collections import defaultdict +from typing import Dict, List, NamedTuple, Set, Tuple + +import const_var +import opdesc_parser + +PYF_PATH = os.path.dirname(os.path.realpath(__file__)) + + +class ParamInfo(NamedTuple): + dtype_list: list + format_list: list + dtype_for_bin_list: dict + format_for_bin_list: dict + + +class BinParamBuilder(opdesc_parser.OpDesc): + def __init__(self: any, op_type: str): + super().__init__(op_type) + self.soc = "" + self.out_path = "" + self.tiling_keys = set() + self.op_debug_config = "" + + def set_soc_version(self: any, soc: str): + self.soc = soc + + def set_out_path(self: any, out_path: str): + self.out_path = out_path + + def set_tiling_key(self: any, tiling_key_info: Set): + if tiling_key_info: + self.tiling_keys.update(tiling_key_info) + + def set_op_debug_config(self: any, op_debug_config: str): + if op_debug_config: + self.op_debug_config = op_debug_config + + def get_full_list(self: any): + dtype_list = [] + for dtype_in in self.input_dtype: + dtype_list.append(dtype_in.split(",")) + for dtype_out in self.output_dtype: + dtype_list.append(dtype_out.split(",")) + + format_list = [] + for fmt_in in self.input_fmt: + format_list.append(fmt_in.split(",")) + for fmt_out in self.output_fmt: + format_list.append(fmt_out.split(",")) + + dtype_for_bin_list = [ + [] for _ in range(len(self.input_dtype) + len(self.output_dtype)) + ] + format_for_bin_list = copy.deepcopy(dtype_for_bin_list) + + for key, value in self.input_dtype_for_bin.items(): + dtype_for_bin_list[key] = value.split(",") + for key, value in self.output_dtype_for_bin.items(): + dtype_for_bin_list[key + len(self.input_dtype)] = value.split(",") + for key, value in self.input_fmt_for_bin.items(): + format_for_bin_list[key] = value.split(",") + for key, value in self.output_fmt_for_bin.items(): + format_for_bin_list[key + len(self.input_dtype)] = value.split(",") + + return ParamInfo( + dtype_list, 
format_list, dtype_for_bin_list, format_for_bin_list + ) + + def gen_bin_cprs_list(self: any, param_info: ParamInfo): + combine_dict = {} + origin_combine_dict = {} + for cob_idx in range(0, len(self.input_dtype[0].split(","))): + origin_combine = "" + combine = "" + for param_idx in range(0, len(self.input_dtype) + len(self.output_dtype)): + if param_info.dtype_for_bin_list[param_idx]: + combine += param_info.dtype_for_bin_list[param_idx][cob_idx] + else: + combine += param_info.dtype_list[param_idx][cob_idx] + origin_combine += param_info.dtype_list[param_idx][cob_idx] + if param_info.format_for_bin_list[param_idx]: + combine += param_info.format_for_bin_list[param_idx][cob_idx] + else: + combine += param_info.format_list[param_idx][cob_idx] + origin_combine += param_info.format_list[param_idx][cob_idx] + if combine not in combine_dict: + combine_dict[combine] = [] + combine_dict[combine].append(cob_idx) + origin_combine_dict[origin_combine] = cob_idx + for key, value in combine_dict.items(): + if key not in origin_combine_dict: + print(f"WARNING: ForBinQuery {key} not in origin combine") + self.bin_save_list += value + continue + if len(value) == 1 and value[0] == origin_combine_dict[key]: + self.bin_save_list += value + continue + self.bin_cprs_head.append(origin_combine_dict[key]) + self.bin_cprs_list.append(value) + for index, sub_list in enumerate(self.bin_cprs_list): + if self.bin_cprs_head[index] not in self.bin_save_list: + continue + sub_list.append(self.bin_cprs_head[index]) + self.bin_save_list += self.bin_cprs_head + + def gen_for_bin_list(self: any, param_info: ParamInfo): + combine_size = len(self.input_dtype[0].split(",")) + input_size = len(self.input_dtype) + output_size = len(self.output_dtype) + + self.input_dtype_for_bin_list = [[] for _ in range(input_size)] + self.output_dtype_for_bin_list = [[] for _ in range(output_size)] + for i in range(0, input_size): + self.input_dtype_for_bin_list[i] = [[] for _ in range(combine_size)] + for i in 
range(0, output_size): + self.output_dtype_for_bin_list[i] = [[] for _ in range(combine_size)] + self.input_fmt_for_bin_list = copy.deepcopy(self.input_dtype_for_bin_list) + self.output_fmt_for_bin_list = copy.deepcopy(self.output_dtype_for_bin_list) + + for index, sub_list in enumerate(self.bin_cprs_list): + head_idx = self.bin_cprs_head[index] + for cmb_idx in sub_list: + for i in range(0, input_size): + self.input_dtype_for_bin_list[i][head_idx].append( + param_info.dtype_list[i][cmb_idx] + ) + self.input_fmt_for_bin_list[i][head_idx].append( + param_info.format_list[i][cmb_idx] + ) + for i in range(0, output_size): + self.output_dtype_for_bin_list[i][head_idx].append( + param_info.dtype_list[i + input_size][cmb_idx] + ) + self.output_fmt_for_bin_list[i][head_idx].append( + param_info.format_list[i + input_size][cmb_idx] + ) + + def rm_cprs_cmb(self: any, dtype_list, format_list, input_size, output_size): + for i in range(0, input_size): + self.input_dtype_for_bin_list[i] = [ + element + for index, element in enumerate(self.input_dtype_for_bin_list[i]) + if index in self.bin_save_list + ] + self.input_fmt_for_bin_list[i] = [ + element + for index, element in enumerate(self.input_fmt_for_bin_list[i]) + if index in self.bin_save_list + ] + new_dtype_list = [ + element + for index, element in enumerate(dtype_list[i]) + if index in self.bin_save_list + ] + new_dtype_str = "" + for dtype in new_dtype_list: + new_dtype_str += f"{dtype}," + self.input_dtype[i] = new_dtype_str[:-1] + new_format_list = [ + element + for index, element in enumerate(format_list[i]) + if index in self.bin_save_list + ] + new_format_str = "" + for fmt in new_format_list: + new_format_str += f"{fmt}," + self.input_fmt[i] = new_format_str[:-1] + for i in range(0, output_size): + self.output_dtype_for_bin_list[i] = [ + element + for index, element in enumerate(self.output_dtype_for_bin_list[i]) + if index in self.bin_save_list + ] + self.output_fmt_for_bin_list[i] = [ + element + for index, 
element in enumerate(self.output_fmt_for_bin_list[i]) + if index in self.bin_save_list + ] + new_dtype_list = [ + element + for index, element in enumerate(dtype_list[i + input_size]) + if index in self.bin_save_list + ] + new_dtype_str = "" + for dtype in new_dtype_list: + new_dtype_str += f"{dtype}," + self.output_dtype[i] = new_dtype_str[:-1] + new_format_list = [ + element + for index, element in enumerate(format_list[i + input_size]) + if index in self.bin_save_list + ] + new_format_str = "" + for fmt in new_format_list: + new_format_str += f"{fmt}," + self.output_fmt[i] = new_format_str[:-1] + + def is_set_for_bin_query(self: any): + return any( + [ + self.input_dtype_for_bin, + self.output_dtype_for_bin, + self.input_fmt_for_bin, + self.output_fmt_for_bin, + ] + ) + + def for_bin_list_match(self: any): + if not self.is_set_for_bin_query(): + return + input_size = len(self.input_dtype) + output_size = len(self.output_dtype) + param_info = self.get_full_list() + self.gen_bin_cprs_list(param_info) + self.gen_for_bin_list(param_info) + if len(self.bin_save_list) == len(self.input_dtype[0].split(",")): + print( + f"WARNING: ForBinQuery can not compress number of bin file with this set, please check!!." 
+ ) + return + self.rm_cprs_cmb( + param_info.dtype_list, param_info.format_list, input_size, output_size + ) + + def gen_input_json(self: any, auto_gen_path: str): + key_map = {} + self.for_bin_list_match() + count = len(self.input_dtype[0].split(",")) + required_parameters = set() + index_value = -1 + + for i in range(0, count): + inputs = [] + outputs = [] + attrs = [] + required_parameter = [] + op_node = {} + + for idx in range(0, len(self.input_name)): + idtypes = self.input_dtype[idx].split(",") + ifmts = self.input_fmt[idx].split(",") + itype = self.input_type[idx] + para = {} + para["name"] = self.input_name[idx][:-5] + para["index"] = idx + para["dtype"] = idtypes[i] + if ( + self.is_set_for_bin_query() + and self.input_dtype_for_bin_list[idx][i] + ): + para["dtypeForBinQuery"] = self.input_dtype_for_bin_list[idx][i] + para["format"] = ifmts[i] + if self.is_set_for_bin_query() and self.input_fmt_for_bin_list[idx][i]: + para["formatForBinQuery"] = self.input_fmt_for_bin_list[idx][i] + para["paramType"] = itype + para["shape"] = [-2] + para["format_match_mode"] = "FormatAgnostic" + + input_parameter_key = (idtypes[i], ifmts[i]) + if itype == "dynamic": + inputs.append([para]) + required_parameter.append(input_parameter_key) + elif itype == "required": + inputs.append(para) + required_parameter.append(input_parameter_key) + else: + inputs.append(para) + + for idx in range(0, len(self.output_name)): + odtypes = self.output_dtype[idx].split(",") + ofmts = self.output_fmt[idx].split(",") + otype = self.output_type[idx] + para = {} + para["name"] = self.output_name[idx][:-5] + para["index"] = idx + para["dtype"] = odtypes[i] + if ( + self.is_set_for_bin_query() + and self.output_dtype_for_bin_list[idx][i] + ): + para["dtypeForBinQuery"] = self.output_dtype_for_bin_list[idx][i] + para["format"] = ofmts[i] + if self.is_set_for_bin_query() and self.output_fmt_for_bin_list[idx][i]: + para["formatForBinQuery"] = self.output_fmt_for_bin_list[idx][i] + 
para["paramType"] = otype + para["shape"] = [-2] + para["format_match_mode"] = "FormatAgnostic" + output_parameter_key = (odtypes[i], ofmts[i]) + if otype == "dynamic": + outputs.append([para]) + required_parameter.append(output_parameter_key) + elif otype == "required": + outputs.append(para) + required_parameter.append(output_parameter_key) + else: + outputs.append(para) + + for attr in self.attr_list: + att = {} + att["name"] = attr + atype = self.attr_val.get(attr).get("type").lower() + att["dtype"] = atype + att["value"] = const_var.ATTR_DEF_VAL.get(atype) + attrs.append(att) + + required_parameter_tuple = tuple(required_parameter) + if required_parameter_tuple in required_parameters: + continue + else: + required_parameters.add(required_parameter_tuple) + index_value += 1 + + op_node["bin_filename"] = "" + op_node["inputs"] = inputs + op_node["outputs"] = outputs + if len(attrs) > 0: + op_node["attrs"] = attrs + + param = {} + param["op_type"] = self.op_type + param["op_list"] = [op_node] + objstr = json.dumps(param, indent=" ") + md5sum = hashlib.md5(objstr.encode("utf-8")).hexdigest() + while key_map.get(md5sum) is not None: + objstr += "1" + md5sum = hashlib.md5(objstr.encode("utf-8")).hexdigest() + key_map[md5sum] = md5sum + bin_file = self.op_type + "_" + md5sum + op_node["bin_filename"] = bin_file + param_file = os.path.join(self.out_path, bin_file + "_param.json") + param_file = os.path.realpath(param_file) + with os.fdopen( + os.open(param_file, const_var.WFLAGS, const_var.WMODES), "w" + ) as fd: + json.dump(param, fd, indent=" ") + self._write_build_cmd(param_file, bin_file, index_value, auto_gen_path) + + def _write_build_cmd( + self: any, param_file: str, bin_file: str, index: int, auto_gen_path: str + ): + hard_soc = const_var.conv_soc_ver(self.soc) + if not hard_soc: + hard_soc = self.soc.capitalize() + name_com = [self.op_type, self.op_file, str(index)] + compile_file = os.path.join(self.out_path, "-".join(name_com) + ".sh") + compile_file = 
os.path.realpath(compile_file) + + bin_cmd_str = "res=$(opc $1 --main_func={fun} --input_param={param} --soc_version={soc} \ + --output=$2 --impl_mode={impl} --simplified_key_mode=0 --op_mode=dynamic " + + build_cmd_var = "#!/bin/bash\n" + build_cmd_var += f'echo "[{self.soc}] Generating {bin_file} ..."\n' + plog_level = os.environ.get("ASCEND_GLOBAL_LOG_LEVEL") + plog_stdout = os.environ.get("ASCEND_SLOG_PRINT_TO_STDOUT") + if plog_level is None: + build_cmd_var += const_var.SET_PLOG_LEVEL_ERROR + if plog_stdout is None: + build_cmd_var += const_var.SET_PLOG_STDOUT + build_cmd_var += const_var.SRC_ENV + if hard_soc == "Ascend610Lite": + build_cmd_var += f"export ASCEND_CUSTOM_OPP_PATH={auto_gen_path}:$ASCEND_CUSTOM_OPP_PATH \n" + build_cmd_var += bin_cmd_str.format( + fun=self.op_intf, + soc=hard_soc, + param=param_file, + impl="high_performance,optional", + ) + enable_tiling_keys = False + if self.tiling_keys: + tiling_keys_list = sorted(list(self.tiling_keys)) + tiling_key_str = ",".join([str(_key) for _key in tiling_keys_list]) + build_cmd_var += f' --tiling_key="{tiling_key_str}"' + enable_tiling_keys = True + + if self.op_debug_config: + op_debug_str = ",".join([str(_key) for _key in list(self.op_debug_config)]) + build_cmd_var += f" --op_debug_config={op_debug_str}" + + build_cmd_var += ")\n" + build_cmd_var += "\n" + if enable_tiling_keys is False: + build_cmd_var += 'echo "${res}"\n' + build_cmd_var += const_var.CHK_CMD.format(res_file=bin_file + ".json") + build_cmd_var += const_var.CHK_CMD.format(res_file=bin_file + ".o") + else: + build_cmd_var += "if [ $? 
-eq 1 ]; then\n" + build_cmd_var += ' if echo "${res}" | \ +grep -q "None of the given tiling keys are in the supported list"; then\n' + build_cmd_var += ' echo "${res}"\n' + build_cmd_var += " else\n" + build_cmd_var += ' echo "${res}"\n' + build_cmd_var += " exit 1\n" + build_cmd_var += " fi\n" + build_cmd_var += "else\n" + build_cmd_var += 'echo "${res}"\n' + build_cmd_var += const_var.CHK_CMD.format(res_file=bin_file + ".json") + build_cmd_var += const_var.CHK_CMD.format(res_file=bin_file + ".o") + build_cmd_var += "fi\n" + build_cmd_var += f'echo "[{self.soc}] Generating {bin_file} Done"\n' + + with os.fdopen( + os.open(compile_file, const_var.WFLAGS, const_var.WMODES), "w" + ) as fd: + fd.write(build_cmd_var) + + +def get_tiling_keys(tiling_keys: str) -> Set: + all_tiling_keys = set() + if not tiling_keys: + return all_tiling_keys + + tiling_key_list = tiling_keys.split(";") + for tiling_key_value in tiling_key_list: + pattern = r"(? int(end): + continue + for i in range(int(start), int(end) + 1): + all_tiling_keys.add(i) + elif tiling_key_value.isdigit(): + all_tiling_keys.add(int(tiling_key_value)) + return all_tiling_keys + + +def trans_soc_verion(soc_ver: str): + low_soc_ver = soc_ver.lower() + if low_soc_ver not in opdesc_parser.SOC_TO_SHORT_SOC_MAP: + return low_soc_ver + return opdesc_parser.SOC_TO_SHORT_SOC_MAP[low_soc_ver] + + +def parse_op_debug_confg(opc_config_file: str, soc: str) -> Dict: + tiling_key_info = defaultdict(set) + op_debug_config = defaultdict(set) + if not opc_config_file: + return tiling_key_info, op_debug_config + + if not os.path.exists(opc_config_file): + return tiling_key_info, op_debug_config + + with open(opc_config_file, "r") as file: + contents = file.readlines() + + for _content in contents: + content = _content.strip() + opc_configs = content.split("@") + if len(opc_configs) < 3: + continue + + op_type = opc_configs[0] + if not op_type: + continue + + compute_unit = opc_configs[1] + if compute_unit: + compute_unit_list = 
compute_unit.split(";") + soc_lists = [] + for soc_ver in compute_unit_list: + short_soc_ver = trans_soc_verion(soc_ver) + soc_lists.append(short_soc_ver) + if soc not in soc_lists: + continue + + for options in opc_configs[2:]: + if "--tiling_key" in options: + format_tiling_keys = get_tiling_keys(options.split("=")[1]) + if format_tiling_keys: + tiling_key_info[op_type].update(format_tiling_keys) + if "--op_debug_config" in options: + format_debug_config = set(options.split("=")[1].split(";")) + if format_debug_config: + op_debug_config[op_type].update(format_debug_config) + + return tiling_key_info, op_debug_config + + +def gen_bin_param_file( + cfgfile: str, out_dir: str, soc: str, opc_config_file: str = "", ops: list = None +): + if not os.path.exists(cfgfile): + print( + f"INFO: {cfgfile} does not exists in this project, skip generating compile commands." + ) + return + + op_descs = opdesc_parser.get_op_desc(cfgfile, [], [], BinParamBuilder, ops) + tiling_key_info, op_debug_config = parse_op_debug_confg(opc_config_file, soc) + auto_gen_path_dir = os.path.dirname(cfgfile) + all_soc_key = "ALL" + for op_desc in op_descs: + op_desc.set_soc_version(soc) + op_desc.set_out_path(out_dir) + if op_desc.op_type in op_debug_config: + op_desc.set_op_debug_config(op_debug_config[op_desc.op_type]) + if all_soc_key in op_debug_config: + op_desc.set_op_debug_config(op_debug_config[all_soc_key]) + if op_desc.op_type in tiling_key_info: + op_desc.set_tiling_key(tiling_key_info[op_desc.op_type]) + if all_soc_key in tiling_key_info: + op_desc.set_tiling_key(tiling_key_info[all_soc_key]) + op_desc.gen_input_json(auto_gen_path_dir) + + +def parse_args(argv): + """Command line parameter parsing""" + parser = argparse.ArgumentParser() + parser.add_argument("argv", nargs="+") + parser.add_argument("--opc-config-file", nargs="?", const="", default="") + return parser.parse_args(argv) + + +if __name__ == "__main__": + args = parse_args(sys.argv) + if len(args.argv) <= 3: + raise 
RuntimeError("arguments must greater than 3") + gen_bin_param_file( + args.argv[1], args.argv[2], args.argv[3], opc_config_file=args.opc_config_file + ) diff --git a/csrc/deepep/ops2/cmake/util/ascendc_compile_kernel.py b/csrc/deepep/ops2/cmake/util/ascendc_compile_kernel.py new file mode 100755 index 000000000..c4c815765 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/ascendc_compile_kernel.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import argparse +import glob +import os +import shutil +import subprocess +import sys +import time + +import ascendc_bin_param_build +import ascendc_impl_build +import ascendc_op_info +import const_var + + +class CompileKernel: + def __init__(self: any, args: any): + self.op_type = args.op_name + self.op_cpp_file = os.path.realpath(args.src_file) + self.op_soc_ver = args.compute_unit + self.compile_options = args.compile_options + self.op_debug_config = args.debug_config + self.op_cfg_ini = os.path.realpath(args.config_ini) + self.op_tiling = os.path.realpath(args.tiling_lib) + self.op_output = os.path.realpath(args.output_path) + self.op_impl_py = None + self.compile_sh = [] + self.working_dir = os.path.join( + os.getcwd(), + self.op_type + "_" + self.op_soc_ver, + ) + self.build_opp_path = os.path.join(self.working_dir, "customize") + os.makedirs(self.working_dir) + os.makedirs(self.op_output, exist_ok=True) + if args.dynamic_dir is not None and args.dynamic_dir != "": + self.dynamic_dir = os.path.realpath(args.dynamic_dir) + else: + self.dynamic_dir = None + if args.json_file is not None and args.json_file != "": + self.json_file = args.json_file + else: + self.json_file = None + + def clean(self: any): + if "dump_cce" not in self.op_debug_config: + shutil.rmtree(self.working_dir) + return + + def ascendc_gen_impl(self: any): + rep_cfg = {} + rep_cfg[const_var.REPLAY_BATCH] = "" + rep_cfg[const_var.REPLAY_ITERATE] = "" + cfg_dir = {} + cfg_dir[const_var.CFG_IMPL_DIR] = os.path.dirname(self.op_cpp_file) + 
cfg_dir[const_var.CFG_OUT_DIR] = os.path.join(self.working_dir, "dynamic") + os.makedirs(os.path.join(self.working_dir, "dynamic"), exist_ok=True) + cfg_dir[const_var.AUTO_GEN_DIR] = os.path.dirname(self.op_cfg_ini) + ascendc_impl_build.write_scripts( + self.op_cfg_ini, rep_cfg, cfg_dir, [self.op_type], self.compile_options + ) + py_files = glob.glob(os.path.join(self.working_dir, "dynamic", "*.py")) + if py_files is None or len(py_files) != 1: + self.clean() + raise RuntimeError("compile py file {} generated error!".format(py_files)) + self.op_impl_py = os.path.join( + self.working_dir, "dynamic", self.op_type + ".py" + ) + if self.dynamic_dir is not None: + shutil.copy(py_files[0], self.dynamic_dir) + os.rename(py_files[0], self.op_impl_py) + if not os.path.exists(self.op_impl_py): + self.clean() + raise RuntimeError( + "compile py file {} not generated!".format(self.op_impl_py) + ) + + def ascendc_gen_param(self: any): + bin_param_path = os.path.join(self.working_dir, "bin_param") + os.makedirs(bin_param_path) + base_dir = os.path.dirname(self.op_cfg_ini) + opc_config_file = os.path.join(base_dir, "custom_opc_options.ini") + ascendc_bin_param_build.gen_bin_param_file( + self.op_cfg_ini, + bin_param_path, + self.op_soc_ver, + opc_config_file, + [self.op_type], + ) + tiling_key_info, op_debug_config = ascendc_bin_param_build.parse_op_debug_confg( + opc_config_file, self.op_type + ) + if self.op_type in op_debug_config: + self.op_debug_config = op_debug_config[self.op_type] + if "ALL" in op_debug_config: + self.op_debug_config = op_debug_config["ALL"] + bin_param_files = glob.glob(os.path.join(bin_param_path, "*.json")) + if bin_param_files is None or len(bin_param_files) <= 0: + self.clean() + raise RuntimeError("compile binary param json file not generated!") + self.compile_sh = glob.glob(os.path.join(bin_param_path, "*.sh")) + if self.compile_sh is None or len(self.compile_sh) != len(bin_param_files): + self.clean() + raise RuntimeError("compile binary shell 
file not generated!") + + def ascendc_put_tiling(self: any): + tiling_path = os.path.join( + self.build_opp_path, "op_impl", "ai_core", "tbe", "op_tiling" + ) + os.makedirs(tiling_path) + tiling_so = os.path.join(tiling_path, "liboptiling.so") + os.symlink(self.op_tiling, tiling_so) + if not os.path.exists(tiling_so): + self.clean() + raise RuntimeError("prepare tiling lib {} link failed!".format(tiling_so)) + + def ascendc_put_json(self: any): + if self.json_file is not None: + json_file_dir = os.path.join( + self.build_opp_path, + "op_impl", + "ai_core", + "tbe", + "config", + self.op_soc_ver, + ) + os.makedirs(json_file_dir) + shutil.copy(self.json_file, json_file_dir) + build_json_file = os.path.join( + json_file_dir, "aic-{}-ops-info.json".format(self.op_soc_ver) + ) + if not os.path.exists(build_json_file): + self.clean() + raise RuntimeError( + "prepare json file aic-{}-ops-info.json failed!".format( + self.op_soc_ver + ) + ) + + def ascendc_build(self: any): + op_info = ascendc_op_info.OpInfo(self.op_type, self.op_cfg_ini) + op_file = op_info.get_op_file() + op_bin_dir = os.path.join(self.op_output, self.op_soc_ver, op_file) + os.makedirs(op_bin_dir, exist_ok=True) + all_tar = [] + sub_cmd = [] + index = 0 + for sh in self.compile_sh: + tar = op_file + str(index) + build_path = os.path.join(self.working_dir, "kernel_" + str(index)) + os.makedirs(build_path) + all_tar.append(tar) + sub_cmd.append(tar + ":") + sub_cmd.append( + "\tcd {} && bash {} --kernel-src=$(CPP) $(PY) $(OUT) $(MAKE)".format( + build_path, sh + ) + ) + index += 1 + mkfile = os.path.join(self.working_dir, op_file + ".make") + with os.fdopen(os.open(mkfile, const_var.WFLAGS, const_var.WMODES), "w") as fd: + sub_cmd.insert(0, "all: " + " ".join(all_tar)) + fd.write("\n".join(sub_cmd)) + + if os.getenv("TILINGKEY_PAR_COMPILE") is None: + cmd_str = ( + "export HI_PYTHON=python3 && export ASCEND_CUSTOM_OPP_PATH={} && export TILINGKEY_PAR_COMPILE=1" + "&& make -f {} PY={} OUT={} CPP={}" + ) + 
else: + cmd_str = "export HI_PYTHON=python3 && export ASCEND_CUSTOM_OPP_PATH={} && make -f {} PY={} OUT={} CPP={}" + + if ( + os.system( + cmd_str.format( + self.build_opp_path, + mkfile, + self.op_impl_py, + op_bin_dir, + self.op_cpp_file, + ) + ) + != 0 + ): + raise RuntimeError( + "Kernel Compilation Error: OpType {} Kernel File {}!".format( + self.op_type, self.op_cpp_file + ) + ) + + +def args_parse(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-n", "--op-name", nargs="?", help="Op name(Camel string) to compile." + ) + parser.add_argument("-s", "--src-file", nargs="?", help="Op kernel source file.") + + parser.add_argument("-u", "--compute-unit", nargs="?", help="Compute unit.") + parser.add_argument( + "-c", "--compile-options", nargs="?", help="Compile options of compiler." + ) + parser.add_argument( + "-d", + "--debug-config", + nargs="?", + help="Debug config of op, ref opc op-debug-config.", + ) + parser.add_argument("-i", "--config-ini", nargs="?", help="Op config ini file.") + parser.add_argument( + "-t", "--tiling-lib", nargs="?", help="Tiling shared library file." + ) + + parser.add_argument( + "-o", "--output-path", nargs="?", help="Output path of compile result." + ) + parser.add_argument( + "-dy", + "--dynamic-dir", + nargs="?", + default=None, + help="dynamic path of source compile.", + ) + parser.add_argument( + "-eb", + "--enable-binary", + nargs="?", + default=None, + help="whether binary compile is enabled.", + ) + parser.add_argument( + "-j", + "--json-file", + nargs="?", + default=None, + help="aic--ops-info.json file path.", + ) + # $(MAKE) is necessary for parallel compiling + parser.add_argument( + "-b", "--build-tool", nargs="?", default=None, help="build tool must be make." 
+ ) + return parser.parse_args() + + +if __name__ == "__main__": + args = args_parse() + kernel_builder = CompileKernel(args) + kernel_builder.clean() + if args.enable_binary == "False": + kernel_builder.ascendc_gen_impl() + kernel_builder.clean() + else: + kernel_builder.ascendc_gen_impl() + kernel_builder.ascendc_gen_param() + kernel_builder.ascendc_put_json() + kernel_builder.ascendc_put_tiling() + kernel_builder.ascendc_build() + kernel_builder.clean() diff --git a/csrc/deepep/ops2/cmake/util/ascendc_gen_options.py b/csrc/deepep/ops2/cmake/util/ascendc_gen_options.py new file mode 100755 index 000000000..637ae9d79 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/ascendc_gen_options.py @@ -0,0 +1,83 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import json +import os +import re +import stat +import sys + +import const_var + + +def write_options_to_file( + file_name: str, options_str: str, op_type: str, compute_unit: str, split_char: str +): + flags = os.O_WRONLY | os.O_CREAT + modes = stat.S_IWUSR | stat.S_IRUSR + try: + with os.fdopen(os.open(file_name, flags, modes), "a") as fd: + fd.write( + op_type + split_char + compute_unit + split_char + options_str + "\n" + ) + except Exception as err: + print("write compile options config file failed") + raise (err) + + +def gen_compile_options( + compile_options_file: str, op_type: str, compute_unit: str, compile_options: list +): + base_dir = os.path.dirname(compile_options_file) + opc_config_file = os.path.join(base_dir, "custom_opc_options.ini") + compile_opt = [] + opc_debug_config = [] + opc_tiling_keys = "" + for opts in compile_options: + if "oom" in opts: + if opts == "--oom": + opc_debug_config.append("oom") + else: + raise RuntimeError(f"Unknown oom option format {opts}") + elif "--save-temp-files" in opts: + opc_debug_config.append("dump_cce") + elif "--tiling_key" in opts: + keys = opts.strip().split("=")[1].split(",") + keys_str = ";".join([key for key in keys]) + opc_tiling_keys = keys_str + else: + 
compile_opt.append(opts) + if len(compile_opt) > 0: + options_str = ";".join([opt for opt in compile_opt]) + write_options_to_file( + compile_options_file, options_str, op_type, compute_unit, "," + ) + opc_config_str = "" + if opc_debug_config: + opc_config_str = "--op_debug_config=" + ";".join( + [opt for opt in opc_debug_config] + ) + if len(opc_tiling_keys) > 0: + if opc_config_str != "": + opc_config_str += "@" + opc_config_str += "--tiling_key=" + opc_tiling_keys + + if opc_config_str != "": + write_options_to_file( + opc_config_file, opc_config_str, op_type, compute_unit, "@" + ) + + +if __name__ == "__main__": + if len(sys.argv) < 4: + raise RuntimeError("arguments must greater than 4") + compute_soc = "" + comp_options = [] + for i in range(len(sys.argv) - 3): + if sys.argv[i + 3].upper().startswith("ASCEND"): + compute_soc += sys.argv[i + 3] + ";" + else: + comp_options.append(sys.argv[i + 3]) + if compute_soc != "": + compute_soc = compute_soc[0:-1] + gen_compile_options(sys.argv[1], sys.argv[2], compute_soc, comp_options) diff --git a/csrc/deepep/ops2/cmake/util/ascendc_get_op_name.py b/csrc/deepep/ops2/cmake/util/ascendc_get_op_name.py new file mode 100755 index 000000000..5da592b3b --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/ascendc_get_op_name.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import argparse +import configparser + + +def args_parse(): + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--ini-file", help="op info ini.") + return parser.parse_args() + + +if __name__ == "__main__": + args = args_parse() + op_config = configparser.ConfigParser() + op_config.read(args.ini_file) + for section in op_config.sections(): + print(section, end="-") + print(op_config.get(section, "opFile.value"), end="\n") diff --git a/csrc/deepep/ops2/cmake/util/ascendc_impl_build.py b/csrc/deepep/ops2/cmake/util/ascendc_impl_build.py new file mode 100755 index 000000000..7486ad138 --- /dev/null +++ 
b/csrc/deepep/ops2/cmake/util/ascendc_impl_build.py @@ -0,0 +1,748 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import argparse +import datetime +import glob +import os +import re +import sys +from typing import List + +import const_var +import opdesc_parser + +PYF_PATH = os.path.dirname(os.path.realpath(__file__)) + +IMPL_HEAD = '''#!/usr/bin/env python +# -*- coding: UTF-8 -*- +""" +Copyright (c) Huawei Technologies Co., Ltd. {}-{}. All rights reserved. +""" + +import os, sys +import ctypes +import json +import shutil +from tbe.common.platform import get_soc_spec +from tbe.common.utils import para_check +from tbe.tikcpp import compile_op, replay_op, check_op_cap, generalize_op_params, get_code_channel, OpInfo +from tbe.tikcpp.compile_op import CommonUtility, AscendCLogLevel +from tbe.common.buildcfg import get_default_build_config +from impl.util.platform_adapter import tbe_register +from tbe.common.buildcfg import get_current_build_config +PYF_PATH = os.path.dirname(os.path.realpath(__file__)) + +DTYPE_MAP = {{"float32": ["DT_FLOAT", "float"], + "float16": ["DT_FLOAT16", "half"], + "int8": ["DT_INT8", "int8_t"], + "int16": ["DT_INT16", "int16_t"], + "int32": ["DT_INT32", "int32_t"], + "int64": ["DT_INT64", "int64_t"], + "uint1": ["DT_UINT1", "uint8_t"], + "uint8": ["DT_UINT8", "uint8_t"], + "uint16": ["DT_UINT16", "uint16_t"], + "uint32": ["DT_UINT32", "uint32_t"], + "uint64": ["DT_UINT64", "uint64_t"], + "bool": ["DT_BOOL", "bool"], + "double": ["DT_DOUBLE", "double"], + "dual": ["DT_DUAL", "unknown"], + "dual_sub_int8": ["DT_DUAL_SUB_INT8", "unknown"], + "dual_sub_uint8": ["DT_DUAL_SUB_UINT8", "unknown"], + "string": ["DT_STRING", "unknown"], + "complex32": ["DT_COMPLEX32", "unknown"], + "complex64": ["DT_COMPLEX64", "unknown"], + "complex128": ["DT_COMPLEX128", "unknown"], + "qint8": ["DT_QINT8", "unknown"], + "qint16": ["DT_QINT16", "unknown"], + "qint32": ["DT_QINT32", "unknown"], + "quint8": ["DT_QUINT8", "unknown"], + "quint16": ["DT_QUINT16", 
"unknown"], + "resource": ["DT_RESOURCE", "unknown"], + "string_ref": ["DT_STRING_REF", "unknown"], + "int4": ["DT_INT4", "int4b_t"], + "bfloat16": ["DT_BF16", "bfloat16_t"]}} + +def add_dtype_fmt_option_single(x, x_n, is_ref: bool = False): + options = [] + x_fmt = x.get("format") + x_dtype = x.get("dtype") + x_n_in_kernel = x_n + '_REF' if is_ref else x_n + options.append("-DDTYPE_{{n}}={{t}}".format(n=x_n_in_kernel, t=DTYPE_MAP.get(x_dtype)[1])) + options.append("-DORIG_DTYPE_{{n}}={{ot}}".format(n=x_n_in_kernel, ot=DTYPE_MAP.get(x_dtype)[0])) + options.append("-DFORMAT_{{n}}=FORMAT_{{f}}".format(n=x_n_in_kernel, f=x_fmt)) + return options + +def get_dtype_fmt_options(__inputs__, __outputs__): + options = [] + input_names = {} + output_names = {} + unique_param_name_set = set() + for idx, x in enumerate(__inputs__): + if x is None: + continue + x_n = input_names[idx].upper() + unique_param_name_set.add(x_n) + options += add_dtype_fmt_option_single(x, x_n) + + for idx, x in enumerate(__outputs__): + if x is None: + continue + x_n = output_names[idx].upper() + if x_n in unique_param_name_set: + options += add_dtype_fmt_option_single(x, x_n, True) + else: + options += add_dtype_fmt_option_single(x, x_n) + return options + +def load_dso(so_path): + try: + ctypes.CDLL(so_path) + except OSError as error : + CommonUtility.print_compile_log("", error, AscendCLogLevel.LOG_ERROR) + raise RuntimeError("cannot open %s" %(so_path)) + else: + msg = "load so succ " + so_path + CommonUtility.print_compile_log("", msg, AscendCLogLevel.LOG_INFO) + +def get_shortsoc_compile_option(compile_option_list: list, shortsoc:str): + compile_options = [] + if shortsoc in compile_option_list: + compile_options.extend(compile_option_list[shortsoc]) + if '__ALLSOC__' in compile_option_list: + compile_options.extend(compile_option_list['__ALLSOC__']) + return compile_options + +def get_kernel_source(src_file, dir_snake, dir_ex): + src_ex = os.path.join(PYF_PATH, "..", "ascendc", dir_ex, 
src_file) + if os.path.exists(src_ex): + return src_ex + src = os.environ.get('BUILD_KERNEL_SRC') + if src and os.path.exists(src): + return src + src = os.path.join(PYF_PATH, "..", "ascendc", dir_snake, src_file) + if os.path.exists(src): + return src + src = os.path.join(PYF_PATH, src_file) + if os.path.exists(src): + return src + src = os.path.join(PYF_PATH, "..", "ascendc", dir_snake, dir_snake + ".cpp") + if os.path.exists(src): + return src + src = os.path.join(PYF_PATH, "..", "ascendc", dir_ex, dir_ex + ".cpp") + if os.path.exists(src): + return src + src = os.path.join(PYF_PATH, "..", "ascendc", os.path.splitext(src_file)[0], src_file) + if os.path.exists(src): + return src + return src_ex + +''' + +IMPL_API = """ +@tbe_register.register_operator("{}", trans_bool_to_s8=False) +@para_check.check_op_params({}) +def {}({}, kernel_name="{}", impl_mode=""): +{} + if get_current_build_config("enable_op_prebuild"): + return + __inputs__, __outputs__, __attrs__ = _build_args({}) + options = get_dtype_fmt_options(__inputs__, __outputs__) + options += ["-x", "cce"] + bisheng = os.environ.get('BISHENG_REAL_PATH') + if bisheng is None: + bisheng = shutil.which("bisheng") + if bisheng != None: + bisheng_path = os.path.dirname(bisheng) + tikcpp_path = os.path.realpath(os.path.join(bisheng_path, "..", "..", "tikcpp")) + else: + tikcpp_path = os.path.realpath("/usr/local/Ascend/latest/compiler/tikcpp") + options.append("-I" + tikcpp_path) + options.append("-I" + os.path.join(tikcpp_path, "..", "..", "include")) + options.append("-I" + os.path.join(tikcpp_path, "tikcfw")) + options.append("-I" + os.path.join(tikcpp_path, "tikcfw", "impl")) + options.append("-I" + os.path.join(tikcpp_path, "tikcfw", "interface")) + options.append("-I" + os.path.join(PYF_PATH, "..", "ascendc", "common")) + if impl_mode == "high_performance": + options.append("-DHIGH_PERFORMANCE=1") + elif impl_mode == "high_precision": + options.append("-DHIGH_PRECISION=1") + if 
get_current_build_config("enable_deterministic_mode") == 1: + options.append("-DDETERMINISTIC_MODE=1") + else: + options.append("-DDETERMINISTIC_MODE=0") + + custom_compile_options = {}, + custom_all_compile_options = {}, + soc_version = get_soc_spec("SOC_VERSION") + soc_short = get_soc_spec("SHORT_SOC_VERSION").lower() + custom_compile_options_soc = get_shortsoc_compile_option(custom_compile_options[0], soc_short) + custom_all_compile_options_soc = get_shortsoc_compile_option(custom_all_compile_options[0], soc_short) + options += custom_all_compile_options_soc + options += custom_compile_options_soc + + origin_func_name = "{}" + ascendc_src_dir_ex = "{}" + ascendc_src_dir = "{}" + ascendc_src_file = "{}" + src = get_kernel_source(ascendc_src_file, ascendc_src_dir, ascendc_src_dir_ex) +""" + +REPLAY_OP_API = """ + msg = "start replay Ascend C Operator {}, kernel name is {}" + CommonUtility.print_compile_log("", msg, AscendCLogLevel.LOG_INFO) + tikreplay_codegen_path = tikcpp_path + "/tikreplaylib/lib" + tikreplay_stub_path = tikcpp_path + "/tikreplaylib/lib/" + soc_version + msg = "start load libtikreplaylib_codegen.so and libtikreplaylib_stub.so" + CommonUtility.print_compile_log("", msg, AscendCLogLevel.LOG_INFO) + codegen_so_path = tikreplay_codegen_path + "/libtikreplaylib_codegen.so" + replaystub_so_path = tikreplay_stub_path + "/libtikreplaylib_stub.so" + if PYF_PATH.endswith("dynamic"): + op_replay_path = os.path.join(PYF_PATH, "..", "..", "op_replay") + else: + op_replay_path = os.path.join(PYF_PATH, "..", "op_replay") + replayapi_so_path = os.path.join(op_replay_path, "libreplay_{}_" + soc_short + ".so") + load_dso(codegen_so_path) + load_dso(replaystub_so_path) + load_dso(replayapi_so_path) + op_type = "{}" + entry_obj = os.path.join(op_replay_path, "{}_entry_" + soc_short + ".o") + code_channel = get_code_channel(src, kernel_name, op_type, options) + op_info = OpInfo(kernel_name = kernel_name, op_type = op_type, inputs = __inputs__, outputs = 
__outputs__,\\ + attrs = __attrs__, impl_mode = impl_mode, param_type_dynamic = {}) + res, msg = replay_op(op_info, entry_obj, code_channel, src, options) + if not res: + print("call replay op failed for %s and get into call compile op" %(msg)) + compile_op(src, origin_func_name, op_info, options, code_channel, '{}') +""" + +COMPILE_OP_API = """ + msg = "start compile Ascend C Operator {}, kernel name is " + kernel_name + CommonUtility.print_compile_log("", msg, AscendCLogLevel.LOG_INFO) + op_type = "{}" + code_channel = get_code_channel(src, kernel_name, op_type, options) + op_info = OpInfo(kernel_name = kernel_name, op_type = op_type, inputs = __inputs__, outputs = __outputs__,\\ + attrs = __attrs__, impl_mode = impl_mode, origin_inputs=[{}], origin_outputs = [{}],\\ + param_type_dynamic = {}, mc2_ctx = {}, param_type_list = {}, init_value_list = {},\\ + output_shape_depend_on_compute = {}) + compile_op(src, origin_func_name, op_info, options, code_channel, '{}') +""" + +COMPILE_OP_API_BUILT_IN = """ + msg = "start compile Ascend C Operator {}, kernel name is " + kernel_name + CommonUtility.print_compile_log("", msg, AscendCLogLevel.LOG_INFO) + op_type = "{}" + code_channel = get_code_channel(src, kernel_name, op_type, options) + op_info = OpInfo(kernel_name = kernel_name, op_type = op_type, inputs = __inputs__, outputs = __outputs__,\\ + attrs = __attrs__, impl_mode = impl_mode, origin_inputs=[{}], origin_outputs = [{}],\\ + param_type_dynamic = {}, mc2_ctx = {}, param_type_list = {}, init_value_list = {},\\ + output_shape_depend_on_compute = {}) + + op_compile_option = '{}' + opp_path = os.environ.get('ASCEND_OPP_PATH') + dat_path = os.path.realpath(os.path.join(opp_path, "built-in", "op_impl", "ai_core", "tbe", "ascendc_impl.dat")) + if opp_path and os.path.exists(dat_path): + # dat file exists: built in hidden src file online compiling process. 
append vfs compile option in compile_op + abs_rel_kernel_src_path = "{}" + extend_options = {{}} + extend_options['opp_kernel_hidden_dat_path'] = dat_path + compile_op(abs_rel_kernel_src_path, origin_func_name, op_info, options, code_channel, op_compile_option,\\ + extend_options) + else: + raise RuntimeError("built-in opp compile, ascendc_impl.dat file path does not exist: %s" %(dat_path)) +""" + +SUP_API = """ +def {}({}, impl_mode=""): + __inputs__, __outputs__, __attrs__ = _build_args({}) + ret_str = check_op_cap("{}", "{}", __inputs__, __outputs__, __attrs__) + ret_dict = json.loads(ret_str) + err_code = ret_dict.get("ret_code") + sup = "Unknown" + reason = "Unknown reason" + if err_code is not None: + if err_code == 0: + sup = "True" + reason = "" + elif err_code == 1: + sup = "False" + reason = ret_dict.get("reason") + else: + sup = "Unknown" + reason = ret_dict.get("reason") + return sup, reason +""" +CAP_API = """ +def {}({}, impl_mode=""): + __inputs__, __outputs__, __attrs__ = _build_args({}) + result = check_op_cap("{}", "{}", __inputs__, __outputs__, __attrs__) + return result.decode("utf-8") +""" +GLZ_API = """ +@tbe_register.register_param_generalization("{}") +def {}_generalization({}, generalize_config=None): + __inputs__, __outputs__, __attrs__ = _build_args({}) + ret_str = generalize_op_params("{}", __inputs__, __outputs__, __attrs__, generalize_config) + return [json.loads(ret_str)] +""" + +ATTR_DEFAULT = { + "bool": "False", + "int": "0", + "float": "0.0", + "list_int": "[]", + "list_float": "[]", + "list_bool": "[]", + "list_list_int": "[[]]", + "str": "", +} + + +def optype_snake(origin_str): + temp_str = origin_str[0].lower() + origin_str[1:] + new_str = re.sub(r"([A-Z])", r"_\1", temp_str).lower() + return new_str + + +def optype_snake_ex(s): + snake_case = "" + for i, c in enumerate(s): + if i == 0: + snake_case += c.lower() + elif c.isupper(): + if s[i - 1] != "_": + if not s[i - 1].isupper(): + snake_case += "_" + elif s[i - 1].isupper() 
and (i + 1) < len(s) and s[i + 1].islower(): + snake_case += "_" + snake_case += c.lower() + else: + snake_case += c + return snake_case + + +class AdpBuilder(opdesc_parser.OpDesc): + def __init__(self: any, op_type: str): + self.argsdefv = [] + self.op_compile_option: str = "{}" + super().__init__(op_type) + + def write_adapt( + self: any, impl_path, path: str, op_compile_option_all: list = None + ): + self._build_paradefault() + if os.environ.get("BUILD_BUILTIN_OPP") != "1" and impl_path != "": + src_file = os.path.join(impl_path, self.op_file + ".cpp") + if not os.path.exists(src_file): + print( + f"[ERROR]: operator: {self.op_file} source file: {src_file} does not found, please check." + ) + return + out_path = os.path.abspath(path) + if self.dynamic_shape and not out_path.endswith("dynamic"): + out_path = os.path.join(path, "dynamic") + os.makedirs(out_path, exist_ok=True) + adpfile = os.path.join(out_path, self.op_file + ".py") + self._gen_op_compile_option(op_compile_option_all) + with os.fdopen(os.open(adpfile, const_var.WFLAGS, const_var.WMODES), "w") as fd: + self._write_head(fd) + self._write_argparse(fd) + self._write_impl(fd, impl_path) + if self.op_chk_support: + self._write_cap("check_supported", fd) + self._write_cap("get_op_support_info", fd) + if self.op_fmt_sel: + self._write_cap("op_select_format", fd) + self._write_cap("get_op_specific_info", fd) + if self.op_range_limit == "limited" or self.op_range_limit == "dynamic": + self._write_glz(fd) + + def _gen_op_compile_option(self: any, op_compile_option_all: list = None): + if op_compile_option_all is not None: + if self.op_type in op_compile_option_all: + self.op_compile_option = op_compile_option_all[self.op_type] + elif "__all__" in op_compile_option_all: + self.op_compile_option = op_compile_option_all["__all__"] + + def _ip_argpack(self: any, default: bool = True) -> list: + args = [] + for i in range(len(self.input_name)): + arg = self.input_name[i] + if default and self.argsdefv[i] is not 
None: + arg += "=" + self.argsdefv[i] + args.append(arg) + return args + + def _op_argpack(self: any, default: bool = True) -> list: + args = [] + argidx = len(self.input_name) + for i in range(len(self.output_name)): + arg = self.output_name[i] + if default and self.argsdefv[i + argidx] is not None: + arg += "=" + self.argsdefv[i + argidx] + args.append(arg) + return args + + def _attr_argpack(self: any, default: bool = True) -> list: + args = [] + argidx = len(self.input_name) + len(self.output_name) + for i in range(len(self.attr_list)): + att = self.attr_list[i] + arg = att + if default and self.argsdefv[i + argidx] is not None: + if self.attr_val.get(att).get("type") == "str": + arg += '="' + self.argsdefv[i + argidx] + '"' + elif self.attr_val.get(att).get("type") == "bool": + arg += "=" + self.argsdefv[i + argidx].capitalize() + else: + arg += "=" + self.argsdefv[i + argidx] + args.append(arg) + return args + + def _build_paralist(self: any, default: bool = True) -> str: + args = [] + args.extend(self._ip_argpack(default)) + args.extend(self._op_argpack(default)) + args.extend(self._attr_argpack(default)) + return ", ".join(args) + + def _io_parachk(self: any, types: list, type_name: str) -> list: + chk = [] + for iot in types: + if iot == "optional": + ptype = "OPTION" + else: + ptype = iot.upper() + chk.append("para_check.{}_{}".format(ptype, type_name)) + return chk + + def _attr_parachk(self: any) -> list: + chk = [] + for att in self.attr_list: + att_type = self.attr_val.get(att).get("type").upper() + chk.append("para_check.{}_ATTR_{}".format("OPTION", att_type)) + return chk + + def _build_parachk(self: any) -> str: + chk = [] + chk.extend(self._io_parachk(self.input_type, "INPUT")) + chk.extend(self._io_parachk(self.output_type, "OUTPUT")) + chk.extend(self._attr_parachk()) + chk.append("para_check.KERNEL_NAME") + return ", ".join(chk) + + def _build_virtual(self: any) -> str: + virt_exp = [] + for index in range(len(self.input_name)): + if 
self.input_virt.get(index) is None: + continue + val = [] + val.append('"param_name":"{}"'.format(self.input_name[index])) + val.append('"index":{}'.format(index)) + val.append('"dtype":"{}"'.format(self.input_dtype[index].split(",")[0])) + val.append('"format":"{}"'.format(self.input_fmt[index].split(",")[0])) + val.append('"ori_format":"{}"'.format(self.input_fmt[index].split(",")[0])) + val.append('"paramType":"optional"') + val.append('"shape":[1]') + val.append('"ori_shape":[1]') + virt_exp.append( + " " + self.input_name[index] + " = {" + ",".join(val) + "}" + ) + if len(virt_exp) > 0: + return "\n".join(virt_exp) + else: + return " # do ascendc build step" + + def _build_mc2_ctx(self: any): + if len(self.mc2_ctx) != 0: + return '["' + '", "'.join(self.mc2_ctx) + '"]' + return "[]" + + def _build_paradefault(self: any): + optional = False + argtypes = [] + argtypes.extend(self.input_type) + argtypes.extend(self.output_type) + in_idx = 0 + for atype in argtypes: + if atype == "optional": + optional = True + if optional: + self.argsdefv.append("None") + else: + self.argsdefv.append(None) + in_idx += 1 + for attr in self.attr_list: + atype = self.attr_val.get(attr).get("paramType") + if atype == "optional": + optional = True + attrval = self.attr_val.get(attr).get("defaultValue") + if attrval is not None: + optional = True + if type == "bool": + attrval = attrval.capitalize() + elif type == "str": + attrval = '"' + attrval + '"' + self.argsdefv.append(attrval) + continue + if optional: + self.argsdefv.append( + ATTR_DEFAULT.get(self.attr_val.get(attr).get("type")) + ) + else: + self.argsdefv.append(None) + + def _write_head(self: any, fd: object): + now = datetime.datetime.now() + curr_year = now.year + former_year = curr_year - 1 + fd.write( + IMPL_HEAD.format( + former_year, curr_year, self.input_ori_name, self.output_ori_name + ) + ) + + def _write_argparse(self: any, fd: object): + args = self._build_paralist(False) + fd.write("def 
_build_args({}):\n".format(args)) + fd.write(" __inputs__ = []\n") + fd.write(" for arg in [{}]:\n".format(", ".join(self.input_name))) + fd.write(" if arg != None:\n") + fd.write(" if isinstance(arg, (list, tuple)):\n") + fd.write(" if len(arg) == 0:\n") + fd.write(" continue\n") + fd.write(" __inputs__.append(arg[0])\n") + fd.write(" else:\n") + fd.write(" __inputs__.append(arg)\n") + fd.write(" else:\n") + fd.write(" __inputs__.append(arg)\n") + fd.write(" __outputs__ = []\n") + fd.write(" for arg in [{}]:\n".format(", ".join(self.output_name))) + fd.write(" if arg != None:\n") + fd.write(" if isinstance(arg, (list, tuple)):\n") + fd.write(" if len(arg) == 0:\n") + fd.write(" continue\n") + fd.write(" __outputs__.append(arg[0])\n") + fd.write(" else:\n") + fd.write(" __outputs__.append(arg)\n") + fd.write(" else:\n") + fd.write(" __outputs__.append(arg)\n") + fd.write(" __attrs__ = []\n") + for attr in self.attr_list: + fd.write(" if {} != None:\n".format(attr)) + fd.write(" attr = {}\n") + fd.write(' attr["name"] = "{}"\n'.format(attr)) + fd.write( + ' attr["dtype"] = "{}"\n'.format( + self.attr_val.get(attr).get("type") + ) + ) + fd.write(' attr["value"] = {}\n'.format(attr)) + fd.write(" __attrs__.append(attr)\n") + fd.write(" return __inputs__, __outputs__, __attrs__\n") + + def _get_kernel_source(self: any, kernel_src_dir, src_file, dir_snake, dir_ex): + src_ex = os.path.join(kernel_src_dir, dir_ex, src_file) + if os.path.exists(src_ex): + return src_ex + src = os.environ.get("BUILD_KERNEL_SRC") + if src and os.path.exists(src): + return src + src = os.path.join(kernel_src_dir, dir_snake, src_file) + if os.path.exists(src): + return src + src = os.path.join(kernel_src_dir, src_file) + if os.path.exists(src): + return src + src = os.path.join(kernel_src_dir, dir_snake, dir_snake + ".cpp") + if os.path.exists(src): + return src + src = os.path.join(kernel_src_dir, dir_ex, dir_ex + ".cpp") + if os.path.exists(src): + return src + src = 
os.path.join(kernel_src_dir, os.path.splitext(src_file)[0], src_file) + if os.path.exists(src): + return src + return src_ex + + def _write_impl(self: any, fd: object, impl_path: str = ""): + argsdef = self._build_paralist() + argsval = self._build_paralist(False) + pchk = self._build_parachk() + if len(self.kern_name) > 0: + kern_name = self.kern_name + else: + kern_name = self.op_intf + src = self.op_file + ".cpp" + virt_exprs = self._build_virtual() + fd.write( + IMPL_API.format( + self.op_type, + pchk, + self.op_intf, + argsdef, + kern_name, + virt_exprs, + argsval, + self.custom_compile_options, + self.custom_all_compile_options, + self.op_intf, + optype_snake_ex(self.op_type), + optype_snake(self.op_type), + src, + ) + ) + if self.op_replay_flag: + fd.write( + REPLAY_OP_API.format( + self.op_type, + kern_name, + self.op_file, + self.op_type, + self.op_file, + self.param_type_dynamic, + self.op_compile_option, + ) + ) + else: + if os.environ.get("BUILD_BUILTIN_OPP") == "1": + relative_kernel_src_path = os.path.realpath( + self._get_kernel_source( + impl_path, + src, + optype_snake(self.op_type), + optype_snake_ex(self.op_type), + ) + ) + # to match src path in .dat file system, turn relative path into absolute path + abs_rel_kernel_src_path = os.path.join( + "/", os.path.relpath(relative_kernel_src_path, impl_path) + ) + + # compiling hidden src file requires src path before packaging .dat file, + # hard code such src path to .py + fd.write( + COMPILE_OP_API_BUILT_IN.format( + self.op_type, + self.op_type, + ", ".join(self.input_name), + ", ".join(self.output_name), + self.param_type_dynamic, + self._build_mc2_ctx(), + self.input_type + self.output_type, + self.output_init_value, + self.output_shape_depend_on_compute, + self.op_compile_option, + abs_rel_kernel_src_path, + ) + ) + else: + fd.write( + COMPILE_OP_API.format( + self.op_type, + self.op_type, + ", ".join(self.input_name), + ", ".join(self.output_name), + self.param_type_dynamic, + 
self._build_mc2_ctx(), + self.input_type + self.output_type, + self.output_init_value, + self.output_shape_depend_on_compute, + self.op_compile_option, + ) + ) + + def _write_cap(self: any, cap_name: str, fd: object): + argsdef = self._build_paralist() + argsval = self._build_paralist(False) + if cap_name == "check_supported": + fd.write(SUP_API.format(cap_name, argsdef, argsval, cap_name, self.op_type)) + else: + fd.write(CAP_API.format(cap_name, argsdef, argsval, cap_name, self.op_type)) + + def _write_glz(self: any, fd: object): + argsdef = self._build_paralist() + argsval = self._build_paralist(False) + fd.write( + GLZ_API.format(self.op_type, self.op_intf, argsdef, argsval, self.op_type) + ) + + +def write_scripts( + cfgfile: str, + cfgs: dict, + dirs: dict, + ops: list = None, + op_compile_option: list = None, +): + batch_lists = cfgs.get(const_var.REPLAY_BATCH).split(";") + iterator_lists = cfgs.get(const_var.REPLAY_ITERATE).split(";") + file_map = {} + op_descs = opdesc_parser.get_op_desc( + cfgfile, + batch_lists, + iterator_lists, + AdpBuilder, + ops, + dirs.get(const_var.AUTO_GEN_DIR), + ) + for op_desc in op_descs: + op_desc.write_adapt( + dirs.get(const_var.CFG_IMPL_DIR), + dirs.get(const_var.CFG_OUT_DIR), + op_compile_option, + ) + file_map[op_desc.op_type] = op_desc.op_file + return file_map + + +class OpFileNotExistsError(Exception): + """File does not exist error.""" + + def __str__(self) -> str: + return ( + f"File aic-*-ops-info.ini does not exist in directory {super().__str__()}" + ) + + +def get_ops_info_files(opsinfo_dir: List[str]) -> List[str]: + """Get all ops info files.""" + ops_info_files = [] + for _dir in opsinfo_dir: + ops_info_files.extend(glob.glob(f"{_dir}/aic-*-ops-info.ini")) + return sorted(ops_info_files) + + +def parse_args(argv): + """Command line parameter parsing""" + parser = argparse.ArgumentParser() + parser.add_argument("argv", nargs="+") + parser.add_argument("--opsinfo-dir", nargs="*", default=None) + return 
parser.parse_args(argv) + + +if __name__ == "__main__": + args = parse_args(sys.argv) + + if len(args.argv) <= 6: + raise RuntimeError("arguments must greater equal than 6") + + rep_cfg = {} + rep_cfg[const_var.REPLAY_BATCH] = args.argv[2] + rep_cfg[const_var.REPLAY_ITERATE] = args.argv[3] + + cfg_dir = {} + cfg_dir[const_var.CFG_IMPL_DIR] = args.argv[4] + cfg_dir[const_var.CFG_OUT_DIR] = args.argv[5] + cfg_dir[const_var.AUTO_GEN_DIR] = args.argv[6] + + ops_infos = [] + if args.opsinfo_dir: + ops_infos.extend(get_ops_info_files(args.opsinfo_dir)) + if not ops_infos: + raise OpFileNotExistsError(args.opsinfo_dir) + else: + ops_infos.append(args.argv[1]) + + for ops_info in ops_infos: + write_scripts(cfgfile=ops_info, cfgs=rep_cfg, dirs=cfg_dir) diff --git a/csrc/deepep/ops2/cmake/util/ascendc_op_info.py b/csrc/deepep/ops2/cmake/util/ascendc_op_info.py new file mode 100755 index 000000000..a75404285 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/ascendc_op_info.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import os +import sys + +import opdesc_parser + +PYF_PATH = os.path.dirname(os.path.realpath(__file__)) + + +class OpInfo: + def __init__(self: any, op_type: str, cfg_file: str): + op_descs = opdesc_parser.get_op_desc( + cfg_file, [], [], opdesc_parser.OpDesc, [op_type] + ) + if op_descs is None or len(op_descs) != 1: + raise RuntimeError("cannot get op info of {}".format(op_type)) + self.op_desc = op_descs[0] + + def get_op_file(self: any): + return self.op_desc.op_file + + def get_op_intf(self: any): + return self.op_desc.op_intf + + def get_inputs_name(self: any): + return self.op_desc.input_ori_name + + def get_outputs_name(self: any): + return self.op_desc.output_ori_name + + +if __name__ == "__main__": + if len(sys.argv) <= 2: + raise RuntimeError("arguments must greater than 2") + op_info = OpInfo(sys.argv[1], sys.argv[2]) + print(op_info.get_op_file()) + print(op_info.get_op_intf()) + print(op_info.get_inputs_name()) + 
print(op_info.get_outputs_name()) diff --git a/csrc/deepep/ops2/cmake/util/ascendc_ops_config.py b/csrc/deepep/ops2/cmake/util/ascendc_ops_config.py new file mode 100755 index 000000000..8c5dd2763 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/ascendc_ops_config.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import argparse +import glob +import json +import os +import sys + +import const_var + +BINARY_INFO_CONFIG_JSON = "binary_info_config.json" + + +def load_json(json_file: str): + with open(json_file, encoding="utf-8") as file: + json_content = json.load(file) + return json_content + + +def get_specified_suffix_file(root_dir, suffix): + specified_suffix = os.path.join(root_dir, "**/*.{}".format(suffix)) + all_suffix_files = glob.glob(specified_suffix, recursive=True) + return sorted(all_suffix_files) + + +def add_dict_key(dict_to_add, key, value): + if value is None: + return + dict_to_add[key] = value + + +def correct_format_mode(format_mode): + if format_mode == "FormatDefault": + return "nd_agnostic" + if format_mode == "FormatAgnostic": + return "static_nd_agnostic" + if format_mode == "FormatFixed": + return "normal" + return format_mode + + +def get_input_or_output_config(in_or_out): + param_dict = {} + name = in_or_out.get("name") + index = in_or_out.get("index") + param_type = in_or_out.get("paramType") + + format_match_mode = in_or_out.get("format_match_mode") + format_mode = correct_format_mode(format_match_mode) + + dtype_mode = in_or_out.get("dtype_match_mode") + if dtype_mode == "DtypeByte": + dtype_mode = "bit" + + add_dict_key(param_dict, "name", name) + add_dict_key(param_dict, "index", index) + add_dict_key(param_dict, "paramType", param_type) + add_dict_key(param_dict, "dtypeMode", dtype_mode) + add_dict_key(param_dict, "formatMode", format_mode) + return param_dict + + +def get_inputs_or_outputs_config(inputs_or_outputs): + if inputs_or_outputs is None: + return None + inputs_or_outputs_list = [] + + for in_or_out in 
inputs_or_outputs: + if isinstance(in_or_out, dict): + dict_param_config = get_input_or_output_config(in_or_out) + inputs_or_outputs_list.append(dict_param_config) + elif isinstance(in_or_out, list): + param_info = in_or_out[0] + list_param_config = get_input_or_output_config(param_info) + tmp_list = [list_param_config] + inputs_or_outputs_list.append(tmp_list) + return inputs_or_outputs_list + + +def gen_attrs_config(attrs): + attrs_list = [] + for attr in attrs: + attrs_dict = {} + name = attr.get("name") + mode = attr.get("mode") + add_dict_key(attrs_dict, "name", name) + add_dict_key(attrs_dict, "mode", mode) + attrs_list.append(attrs_dict) + return attrs_list + + +def get_params_config(support_info): + params_dict = {} + + inputs = support_info.get("inputs") + inputs_list = get_inputs_or_outputs_config(inputs) + params_dict["inputs"] = inputs_list + + outputs = support_info.get("outputs") + outputs_list = get_inputs_or_outputs_config(outputs) + params_dict["outputs"] = outputs_list + + attrs = support_info.get("attrs") + if attrs is not None: + attrs_list = gen_attrs_config(attrs) + params_dict["attrs"] = attrs_list + + return params_dict + + +def add_simplified_config( + op_type, support_info, core_type, task_ration, objfile, config +): + simplified_key = support_info.get("simplifiedKey") + + json_path = objfile.split(".")[0] + ".json" + + simple_cfg = config.get(BINARY_INFO_CONFIG_JSON) + op_cfg = simple_cfg.get(op_type) + if not op_cfg: + op_cfg = {"dynamicRankSupport": True} + + simplified_key_mode = support_info.get("simplifiedKeyMode") + add_dict_key(op_cfg, "simplifiedKeyMode", simplified_key_mode) + + optional_input_mode = support_info.get("optionalInputMode") + optional_output_mode = support_info.get("optionalOutputMode") + add_dict_key(op_cfg, "optionalInputMode", optional_input_mode) + if optional_output_mode is not None: + add_dict_key(op_cfg, "optionalOutputMode", optional_output_mode) + + params_info = get_params_config(support_info) + 
op_cfg["params"] = params_info + op_cfg["binaryList"] = [] + simple_cfg[op_type] = op_cfg + + bin_list = op_cfg.get("binaryList") + if core_type == 0 and task_ration == "tilingKey": + bin_list.append( + { + "coreType": core_type, + "simplifiedKey": simplified_key, + "multiKernelType": 1, + "binPath": objfile, + "jsonPath": json_path, + } + ) + else: + bin_list.append( + { + "coreType": core_type, + "simplifiedKey": simplified_key, + "binPath": objfile, + "jsonPath": json_path, + } + ) + + +def add_op_config(op_file, bin_info, config): + op_cfg = config.get(op_file) + if not op_cfg: + op_cfg = {"binList": []} + config[op_file] = op_cfg + op_cfg.get("binList").append(bin_info) + + +def gen_ops_config(json_file, soc, config): + core_type_map = { + "MIX": 0, + "AiCore": 1, + "VectorCore": 2, + "MIX_AICORE": 3, + "MIX_VECTOR_CORE": 4, + "MIX_AIV": 4, + } + contents = load_json(json_file) + if ("binFileName" not in contents) or ("supportInfo" not in contents): + return + json_base_name = os.path.basename(json_file) + op_dir = os.path.basename(os.path.dirname(json_file)) + + support_info = contents.get("supportInfo") + bin_name = contents.get("binFileName") + bin_suffix = contents.get("binFileSuffix") + core_type = contents.get("coreType") + task_ration = contents.get("taskRation") + core_type = core_type_map.get(core_type, -1) + if core_type == -1 and soc != "ascend310b": + raise Exception("[ERROR]: must set coreType in json when soc version is {soc}.") + + bin_file_name = bin_name + bin_suffix + op_type = bin_name.split("_")[0] + op_file = op_dir + ".json" + bin_info = {} + + add_dict_key(bin_info, "implMode", support_info.get("implMode")) + add_dict_key(bin_info, "int64Mode", support_info.get("int64Mode")) + add_dict_key(bin_info, "simplifiedKeyMode", support_info.get("simplifiedKeyMode")) + + simplified_key = support_info.get("simplifiedKey") + if simplified_key is not None: + bin_info["simplifiedKey"] = simplified_key + obj_file = os.path.join(soc, op_dir, 
bin_file_name) + add_simplified_config( + op_type, support_info, core_type, task_ration, obj_file, config + ) + + add_dict_key(bin_info, "dynamicParamMode", support_info.get("dynamicParamMode")) + bin_info["staticKey"] = support_info.get("staticKey") + bin_info["inputs"] = support_info.get("inputs") + bin_info["outputs"] = support_info.get("outputs") + if support_info.get("attrs"): + bin_info["attrs"] = support_info.get("attrs") + + add_dict_key(bin_info, "opMode", support_info.get("opMode")) + add_dict_key(bin_info, "optionalInputMode", support_info.get("optionalInputMode")) + add_dict_key(bin_info, "deterministic", support_info.get("deterministic")) + if support_info.get("optionalOutputMode") is not None: + add_dict_key( + bin_info, "optionalOutputMode", support_info.get("optionalOutputMode") + ) + + bin_info["binInfo"] = {"jsonFilePath": os.path.join(soc, op_dir, json_base_name)} + add_op_config(op_file, bin_info, config) + + +def check_single_op_is_void(root_dir): + for root, dirs, _ in os.walk(root_dir): + for sub_dir in dirs: + dir_path = os.path.join(root, sub_dir) + if len(os.listdir(dir_path)) == 0: + print(f"[ERROR] op {sub_dir}: not any obj compile success") + sys.exit(1) + + +def gen_all_config(root_dir, soc, out_dir, skip_binary_info_config): + suffix = "json" + config = {BINARY_INFO_CONFIG_JSON: {}} + check_single_op_is_void(root_dir) + all_json_files = get_specified_suffix_file(root_dir, suffix) + + for _json in all_json_files: + gen_ops_config(_json, soc, config) + file_path = soc + _json.split(soc)[1] + with open(_json, "r+") as f: + data = json.load(f) + data["filePath"] = file_path + f.seek(0) + json.dump(data, f, indent=" ") + f.truncate() + + for cfg_key in config.keys(): + if skip_binary_info_config and cfg_key == BINARY_INFO_CONFIG_JSON: + continue + cfg_file = os.path.join(out_dir, cfg_key) + with os.fdopen( + os.open(cfg_file, const_var.WFLAGS, const_var.WMODES), "w" + ) as fd: + json.dump(config.get(cfg_key), fd, indent=" ") + + +# Parse 
multiple soc_versions ops in single path. +def gen_all_soc_config(all_path): + soc_roots = glob.glob(os.path.join(all_path, "ascend*")) + + for soc_root in soc_roots: + soc = os.path.basename(soc_root) + gen_all_config(soc_root, soc, soc_root, True) + cfg_files = glob.glob(os.path.join(soc_root, "*.json")) + cfg_path = os.path.join(all_path, "config", soc) + os.makedirs(cfg_path, exist_ok=True) + for cfg_file in cfg_files: + new_file = os.path.join(cfg_path, os.path.basename(cfg_file)) + os.rename(cfg_file, new_file) + + +def args_prase(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-p", + "--path", + nargs="?", + required=True, + help="Parse the path of the json file.", + ) + + parser.add_argument( + "-s", "--soc", nargs="?", required=True, help="Parse the soc_version of ops." + ) + + parser.add_argument("-o", "--out", nargs="?", help="Output directory.") + + parser.add_argument( + "--skip-binary-info-config", + action="store_true", + help="binary_info_config.json file is not parsed.", + ) + + return parser.parse_args() + + +def main(): + args = args_prase() + if args.out is None: + out_dir = args.path + else: + out_dir = args.out + + gen_all_config(args.path, args.soc, out_dir, args.skip_binary_info_config) + + +if __name__ == "__main__": + main() diff --git a/csrc/deepep/ops2/cmake/util/ascendc_pack_kernel.py b/csrc/deepep/ops2/cmake/util/ascendc_pack_kernel.py new file mode 100755 index 000000000..430c1ba8e --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/ascendc_pack_kernel.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import argparse +import glob +import json +import math +import os +import subprocess +import sys + +import ascendc_ops_config +import const_var +from tbe.tikcpp.log_utils import AscendCLogLevel, LogUtil + + +class PackKernel: + def __init__(self: any, args: any): + self.in_path = os.path.realpath(args.input_path) + self.out_path = os.path.realpath(args.output_path) + self.is_lib = args.enable_library + 
self.platform = args.platform + self.op_info = {} + self.file_info = {} + try: + os.makedirs(self.out_path, exist_ok=True) + except Exception as e: + LogUtil.print_compile_log( + "", + f"make {self.out_path} error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + + def load_json(self: any, json_file: str): + with open(json_file, encoding="utf-8") as file: + json_content = json.load(file) + return json_content + + def get_symbol(self: any, name: str): + name = name.replace("/", "_") + return name.replace(".", "_") + + def ascendc_gen_object(self: any, in_file: str, soc: str): + sym = self.get_symbol("_binary_" + in_file) + out_file = os.path.join(self.out_path, sym + ".o") + # ascend610lite only support aarch64 + if soc == "ascend610lite": + try: + subprocess.run( + [ + "llvm-objcopy", + "--input-target", + "binary", + "--output-target", + "elf64-littleaarch64", + "--binary-architecture", + "aarch64", + in_file, + out_file, + ] + ) + except Exception as e: + LogUtil.print_compile_log( + "", + " ascend610lite execute objcopy fail!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + return None + return [sym + "_start", sym + "_end"] + uname = os.popen("uname -m").read().strip() + if self.platform is not None: + target_platform = self.platform + else: + target_platform = uname + try: + if target_platform == "x86_64": + subprocess.run( + [ + "llvm-objcopy", + "--input-target", + "binary", + "--output-target", + "elf64-x86-64", + "--binary-architecture", + "i386", + in_file, + out_file, + ] + ) + elif target_platform == "aarch64": + subprocess.run( + [ + "llvm-objcopy", + "--input-target", + "binary", + "--output-target", + "elf64-littleaarch64", + "--binary-architecture", + "aarch64", + in_file, + out_file, + ] + ) + else: + subprocess.run(["echo", "unsported environment!"]) + except Exception as e: + LogUtil.print_compile_log( + "", + f"{target_platform} execute objcopy error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, 
+ ) + return None + return [sym + "_start", sym + "_end"] + + def ascendc_get_config(self: any): + os.chdir(self.in_path) + soc_vers = os.listdir("config") + for soc in soc_vers: + bin_infos = glob.glob(os.path.join("config", soc, "*.json")) + cfgs = {} + for bin_info in bin_infos: + if bin_info.find("binary_info_config.json") > 0: + continue + jobj = self.load_json(bin_info) + for bin_cfg in jobj.get("binList"): + js_cfg = bin_cfg.get("binInfo").get("jsonFilePath") + op_type = os.path.basename(js_cfg).split("_")[0] + if cfgs.get(op_type) is None: + op_obj = {} + op_obj["obj"] = [] + op_obj["cfg"] = bin_info + cfgs[op_type] = op_obj + op_obj = cfgs.get(op_type) + op_obj.get("obj").append(js_cfg[:-5]) + self.file_info[soc] = cfgs + + def ascendc_pack_kernel(self: any): + for soc in self.file_info.keys(): + os.chdir(self.in_path) + op_cfgs = self.file_info.get(soc) + for op_type in op_cfgs.keys(): + op_obj = op_cfgs.get(op_type) + if self.op_info.get(op_type) is None: + op_info = {} + op_info["op_fun"] = ["nullptr", "nullptr"] + op_info["op_bin"] = {} + op_info["op_rkb"] = [] + self.op_info[op_type] = op_info + op_info = self.op_info.get(op_type) + op_bin = op_info.get("op_bin") + if op_bin.get(soc) is None: + op_bin[soc] = [] + op_bin[soc].append(self.ascendc_gen_object(op_obj["cfg"], soc)) + op_soc = op_bin.get(soc) + for objs in op_obj["obj"]: + op_soc.append(self.ascendc_gen_object(objs + ".json", soc)) + op_soc.append(self.ascendc_gen_object(objs + ".o", soc)) + + def ascendc_gen_header(self: any): + for op_type in self.op_info.keys(): + op_obj = self.op_info.get(op_type) + macro_op = ( + "#define {}_OP_RESOURCES std::make_tuple, \\\n" + " std::map>>, \\\n" + " std::vector>>({{{}}}, \\\n".format( + op_type, ", ".join(op_obj.get("op_fun")) + ) + ) + op_bin = op_obj.get("op_bin") + socs_res = [] + op_syms = [] + for soc in op_bin.keys(): + soc_res = '{{ "{}", {{'.format(soc) + soc_syms = op_bin.get(soc) + soc_pairs = [] + for pair_addr in soc_syms: + pair_addr1 = 
["&" + s for s in pair_addr] + op_syms += pair_addr + soc_pairs.append( + " {{ {} }} ".format(", \\\n ".join(pair_addr1)) + ) + soc_res += ", \\\n ".join(soc_pairs) + soc_res += " } }" + socs_res.append(soc_res) + macro_op += " {{ {} }}, \\\n".format(", \\\n ".join(socs_res)) + macro_op += " {{ {} }})\n\n".format(", ".join(op_obj.get("op_rkb"))) + macro_str = '#define {}_RESOURCES {{{{"{}", {}}}}}'.format( + op_type, op_type, "{}_OP_RESOURCES".format(op_type) + ) + var_str = ( + "extern gert::OpImplRegisterV2 op_impl_register_optiling_{};\n".format( + op_type + ) + ) + if len(op_syms) > 0: + var_str += ( + "extern uint8_t " + ";\nextern uint8_t ".join(op_syms) + ";\n" + ) + head_file = os.path.join(self.out_path, "{}_op_resource.h".format(op_type)) + try: + with os.fdopen( + os.open(head_file, const_var.WFLAGS, const_var.WMODES), "w" + ) as fd: + fd.write("#include \n") + fd.write("#include \n") + fd.write("#include \n") + fd.write("#include \n") + fd.write('#include "graph/ascend_string.h"\n') + fd.write('#include "register/op_impl_registry.h"\n\n') + fd.write(var_str) + fd.write("\n") + fd.write(macro_op) + fd.write(macro_str) + except Exception as e: + LogUtil.print_compile_log( + "", + f"{op_type}_op_resource.h create error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + + def ascendc_gen_lib(self: any): + out_lib = os.path.join(self.out_path, "libkernels.a") + if os.path.exists(out_lib): + os.remove(out_lib) + objs = glob.glob(os.path.join(self.out_path, "*.o")) + start = 0 + batch_size = 100 + for _ in range(math.ceil(len(objs) / batch_size)): + sub_objs = objs[start : start + batch_size] + start += batch_size + try: + subprocess.run(["ar", "qc", out_lib] + sub_objs) + subprocess.run(["ranlib", out_lib]) + except Exception as e: + LogUtil.print_compile_log( + "", + f"execute ar/ranlib command error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + + def ascendc_gen_opsinfo(self: any): + 
ascendc_ops_config.gen_all_soc_config(self.in_path) + + +def args_parse(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", "--input-path", nargs="?", help="Input path of compile result." + ) + parser.add_argument( + "-o", "--output-path", nargs="?", help="Output path of compile result." + ) + parser.add_argument( + "-l", + "--enable-library", + nargs="?", + default=None, + help="Whether library is enabled.", + ) + parser.add_argument( + "-p", + "--platform", + nargs="?", + default=None, + help="target platform is x86_64 or aarch64.", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = args_parse() + kernel_packer = PackKernel(args) + if kernel_packer.is_lib is None: + kernel_packer.ascendc_gen_opsinfo() + kernel_packer.ascendc_get_config() + kernel_packer.ascendc_pack_kernel() + kernel_packer.ascendc_gen_header() + kernel_packer.ascendc_gen_lib() diff --git a/csrc/deepep/ops2/cmake/util/ascendc_pack_opregistry.py b/csrc/deepep/ops2/cmake/util/ascendc_pack_opregistry.py new file mode 100755 index 000000000..1acb7e45e --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/ascendc_pack_opregistry.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import argparse +import glob +import math +import os +import shutil +import subprocess +import sys + +import const_var +from tbe.tikcpp.log_utils import AscendCLogLevel, LogUtil + + +class PackKernel: + def __init__(self: any, args: any): + self.in_path = os.path.realpath(args.input_path) + self.copy_path = os.path.realpath(args.copy_path) + self.out_path = os.path.realpath(args.output_path) + self.op_soc_ver = args.compute_unit.split("-") + self.vendor_name = args.vendor_name + self.framework_type = args.framework_type + self.platform = args.platform + self.op_info = {} + self.file_info = {} + if os.path.exists(self.copy_path): + try: + shutil.rmtree(self.copy_path) + except OSError as e: + LogUtil.print_compile_log( + "", + f"remove {self.copy_path} error!", + 
AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + if os.path.exists(self.out_path): + try: + shutil.rmtree(self.out_path) + except OSError as e: + LogUtil.print_compile_log( + "", + f"remove {self.out_path} error!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + try: + os.makedirs(self.copy_path, exist_ok=True) + except Exception as e: + LogUtil.print_compile_log( + "", + f"make {self.copy_path} error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + try: + os.makedirs(self.out_path, exist_ok=True) + except Exception as e: + LogUtil.print_compile_log( + "", + f"make {self.out_path} error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + + def get_symbol(self: any, name: str): + name = name.replace("/", "_") + name = name.replace("-", "_") + return name.replace(".", "_") + + def ascendc_gen_object(self: any, in_file: str, path: str): + sym = self.get_symbol("_binary_" + in_file) + out_file = os.path.join(self.out_path, sym + ".o") + # ascend610lite only support aarch64 + if path.find("ascend610lite") != -1: + try: + subprocess.run( + [ + "llvm-objcopy", + "--input-target", + "binary", + "--output-target", + "elf64-littleaarch64", + "--binary-architecture", + "aarch64", + in_file, + out_file, + ] + ) + except Exception as e: + LogUtil.print_compile_log( + "", + " ascend610lite execute objcopy fail!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + return None + return [sym + "_start", sym + "_end"] + + uname = os.popen("uname -m").read().strip() + if self.platform is not None: + target_platform = self.platform + else: + target_platform = uname + try: + if target_platform == "x86_64": + subprocess.run( + [ + "llvm-objcopy", + "--input-target", + "binary", + "--output-target", + "elf64-x86-64", + "--binary-architecture", + "i386", + in_file, + out_file, + ] + ) + elif target_platform == "aarch64": + subprocess.run( + [ + "llvm-objcopy", + "--input-target", + "binary", + "--output-target", + 
"elf64-littleaarch64", + "--binary-architecture", + "aarch64", + in_file, + out_file, + ] + ) + else: + subprocess.run(["echo", "unsupported environment!"]) + except Exception as e: + LogUtil.print_compile_log( + "", + f"{target_platform} execute objcopy error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + return None + return [sym + "_start", sym + "_end"] + + def ascendc_get_config(self: any): + os.chdir(self.copy_path) + current_directory = os.getcwd() + catalog_file = os.listdir(current_directory) + for catalog in catalog_file: + if catalog == "op_impl" or catalog == "framework": + files_dict = {} + for root, _, files in os.walk(catalog): + for file in files: + if ( + file.endswith(".json") + or file.endswith(".so") + or file.endswith(".cpp") + or file.endswith(".py") + or file.endswith(".o") + ): + file_path = os.path.join(root, file) + file_name = os.path.basename(file_path) + files_dict[file_name] = file_path + self.file_info[catalog] = files_dict + + def ascendc_pack_kernel(self: any): + op_info = {} + for files in self.file_info.keys(): + os.chdir(self.copy_path) + op_cfgs = self.file_info.get(files) + for file_name in op_cfgs.keys(): + op_info[file_name] = [] + path, filename = os.path.split(op_cfgs[file_name]) + op_info[file_name].append(os.path.join(self.vendor_name, path)) + op_info[file_name].append( + self.ascendc_gen_object(op_cfgs[file_name], path) + ) + self.op_info = op_info + + def ascendc_gen_header(self: any): + socs_res = [] + var_str = "" + macro_op = ( + "std::vector> __ascendc_op_info = \n" + ) + for file_name in self.op_info.keys(): + file_addr = self.op_info.get(file_name) + soc_pairs = [] + op_syms = [] + soc_res = ' {{ "{}", '.format(file_name) + soc_res += '"{}", '.format(file_addr[0]) + for pair_addr in file_addr[1]: + op_syms.append(pair_addr) + pair_addr1 = "&" + pair_addr + soc_pairs.append(pair_addr1) + soc_res += "{}, {}".format(soc_pairs[0], soc_pairs[1]) + soc_res += "}, \n" + socs_res.append(soc_res) + 
if len(op_syms) > 0: + var_str += "".join( + ["extern uint8_t {};\n".format(sym) for sym in op_syms] + ) + macro_op += "{{\n{}}}; \n".format("".join(socs_res)) + head_file = os.path.join(self.out_path, "ge_table_op_resource.h") + try: + with os.fdopen( + os.open(head_file, const_var.WFLAGS, const_var.WMODES), "w" + ) as fd: + fd.write("#include \n") + fd.write("#include \n") + fd.write("#include \n") + fd.write("#include \n") + fd.write('#include "graph/ascend_string.h"\n') + fd.write('#include "register/op_impl_registry.h"\n\n') + fd.write(var_str) + fd.write("\n") + fd.write("namespace AscendC {\n") + fd.write(macro_op) + fd.write("}\n") + except Exception as e: + LogUtil.print_compile_log( + "", + f"ge_table_op_resource.h create error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + + def ascendc_gen_lib(self: any): + out_lib = os.path.join(self.out_path, "libopregistry.a") + if os.path.exists(out_lib): + os.remove(out_lib) + objs = glob.glob(os.path.join(self.out_path, "*.o")) + start = 0 + batch_size = 100 + for _ in range(math.ceil(len(objs) / batch_size)): + sub_objs = objs[start : start + batch_size] + start += batch_size + try: + subprocess.run(["ar", "qc", out_lib] + sub_objs) + subprocess.run(["ranlib", out_lib]) + except Exception as e: + LogUtil.print_compile_log( + "", + f"execute ar/ranlib command error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + + def ascendc_copy_dir(self: any, src_dir: str, target_dir: str): + file_list = os.listdir(src_dir) + for file_name in file_list: + source_file = os.path.join(src_dir, file_name) + target_file = os.path.join(target_dir, file_name) + if os.path.isdir(source_file): + try: + shutil.copytree(source_file, target_file) + except Exception as e: + LogUtil.print_compile_log( + "", + f"copy {source_file} error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + + def ascendc_copy_file(self: any, src_dir: str, target_dir: str): + file_list = 
os.listdir(src_dir) + for file_name in file_list: + source_file = os.path.join(src_dir, file_name) + if os.path.isfile(source_file): + try: + os.makedirs(target_dir, exist_ok=True) + except Exception as e: + LogUtil.print_compile_log( + "", + f"make {target_dir} error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + try: + shutil.copy(source_file, target_dir) + except Exception as e: + LogUtil.print_compile_log( + "", + f"copy {source_file} error: {e}!", + AscendCLogLevel.LOG_ERROR, + LogUtil.Option.NON_SOC, + ) + + def ascendc_copy_func(self: any): + os.chdir(self.in_path) + framework_catalog = os.listdir("framework") + for catalog_file in framework_catalog: + if ( + catalog_file == "tf_plugin" + or catalog_file == "caffe_plugin" + or catalog_file == "onnx_plugin" + ): + source_dir = "op_kernel/tbe/op_info_cfg/ai_core" + dst_dir = os.path.join(self.copy_path, "framework", self.framework_type) + self.ascendc_copy_file(source_dir, dst_dir) + source_dir = os.path.join("framework", catalog_file) + dst_dir = os.path.join(self.copy_path, "framework", self.framework_type) + self.ascendc_copy_file(source_dir, dst_dir) + source_dir = "op_kernel/tbe/op_info_cfg/ai_core" + dst_dir = os.path.join(self.copy_path, "op_impl/ai_core/tbe/config") + self.ascendc_copy_dir(source_dir, dst_dir) + source_dir = "op_kernel/binary/dynamic" + dst_dir = os.path.join( + self.copy_path, "op_impl/ai_core/tbe", self.vendor_name + "_impl", "dynamic" + ) + self.ascendc_copy_file(source_dir, dst_dir) + for compute_unit in self.op_soc_ver: + source_dir = os.path.join("op_kernel/binary", compute_unit) + dst_dir = os.path.join( + self.copy_path, "op_impl/ai_core/tbe/kernel", compute_unit + ) + self.ascendc_copy_dir(source_dir, dst_dir) + source_dir = "op_kernel/binary/config" + dst_dir = os.path.join(self.copy_path, "op_impl/ai_core/tbe/kernel/config") + self.ascendc_copy_dir(source_dir, dst_dir) + so_file = "op_impl/ai_core/tbe/op_master_device/lib/libcust_opmaster.so" + if 
os.path.exists(so_file): + dst_dir = os.path.join( + self.copy_path, "op_impl/ai_core/tbe/op_master_device/lib" + ) + os.makedirs(dst_dir, exist_ok=True) + shutil.copy(so_file, dst_dir) + + +def args_parse(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", "--input-path", nargs="?", help="Input path of compile result." + ) + parser.add_argument( + "-c", "--copy-path", nargs="?", help="Copy path of compile result." + ) + parser.add_argument( + "-o", "--output-path", nargs="?", help="Output path of compile result." + ) + parser.add_argument("-n", "--vendor-name", nargs="?", help="Vendor name.") + parser.add_argument("-u", "--compute-unit", nargs="?", help="Compute unit.") + parser.add_argument( + "-t", "--framework-type", nargs="?", help="Framework type, eg:tensorflow." + ) + parser.add_argument( + "-p", + "--platform", + nargs="?", + default=None, + help="target platform is x86_64 or aarch64.", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = args_parse() + kernel_packer = PackKernel(args) + kernel_packer.ascendc_copy_func() + kernel_packer.ascendc_get_config() + kernel_packer.ascendc_pack_kernel() + kernel_packer.ascendc_gen_header() + kernel_packer.ascendc_gen_lib() diff --git a/csrc/deepep/ops2/cmake/util/ascendc_replay_build.py b/csrc/deepep/ops2/cmake/util/ascendc_replay_build.py new file mode 100755 index 000000000..e07545f5c --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/ascendc_replay_build.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import os +import sys + +import const_var +import opdesc_parser +import replay_codegen +from replay_codegen import ReplayCodeGenParams + +PYF_PATH = os.path.dirname(os.path.realpath(__file__)) + + +class ReplayBuilder(opdesc_parser.OpDesc): + def __init__(self: any, op_type: str): + super().__init__(op_type) + + def gen_replay_source(self: any, impl_path: str, out_path: str, ops_product: str): + if not self.op_replay_flag: + print("{} replay not 
enabled".format(self.op_type)) + return + argn = len(self.input_name) + len(self.output_name) + 1 + if self.op_replay_batch: + print("{} replay in batch mode".format(self.op_type)) + else: + print("{} replay in normal mode".format(self.op_type)) + if impl_path.endswith("op_kernel"): + implf = os.path.join(impl_path, self.op_file + ".cpp") + tiling_file = os.path.join( + impl_path, "../op_host", self.op_file + "_tiling.h" + ) + else: + if self.dynamic_shape: + dyn_path = "dynamic" + else: + dyn_path = "" + implf = os.path.join(impl_path, dyn_path, self.op_file + ".cpp") + tiling_file = os.path.join( + impl_path, "../../op_tiling", self.op_file + "_tiling.h" + ) + rep_conf = replay_codegen.ReplayCodeGen( + ReplayCodeGenParams( + self.op_type, + implf, + tiling_file, + self.op_file, + self.op_intf, + argn, + self.op_replay_batch, + self.max_block_dim, + self.max_shape_size, + ) + ) + rep_conf.set_batch(self.op_replay_batch) + rep_conf.set_outdir(out_path) + rep_conf.gen_replay(ops_product) + + +def gen_replay( + cfgfile: str, cfgs: dict, dirs: dict, ops_product: str, ops: list = None +): + batch_lists = cfgs.get(const_var.REPLAY_BATCH).split(";") + iterator_lists = cfgs.get(const_var.REPLAY_ITERATE).split(";") + op_descs = opdesc_parser.get_op_desc( + cfgfile, batch_lists, iterator_lists, ReplayBuilder, ops + ) + for op_desc in op_descs: + op_desc.gen_replay_source( + dirs.get(const_var.CFG_IMPL_DIR), + dirs.get(const_var.CFG_OUT_DIR), + ops_product, + ) + + +if __name__ == "__main__": + if len(sys.argv) <= 6: + raise RuntimeError("arguments must greater than 6") + rep_cfg = {} + rep_cfg[const_var.REPLAY_BATCH] = sys.argv[2] + rep_cfg[const_var.REPLAY_ITERATE] = sys.argv[3] + rep_dir = {} + rep_dir[const_var.CFG_IMPL_DIR] = sys.argv[4] + rep_dir[const_var.CFG_OUT_DIR] = sys.argv[5] + gen_replay(sys.argv[1], rep_cfg, rep_dir, sys.argv[6]) diff --git a/csrc/deepep/ops2/cmake/util/batch_replay_impl.temp b/csrc/deepep/ops2/cmake/util/batch_replay_impl.temp new file mode 
100644 index 000000000..0e8834664 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/batch_replay_impl.temp @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include +#include +#include +#include "replay_def.h" +#include "code_gen.h" +#include "replay_fun.h" +#include "register/op_check.h" +#define __ASCENDC_REPLAY_CODE__ +#include + +using namespace std; +using namespace optiling; +using namespace AscendCReplay; + +extern "C" void __KERNEL_FUN__ (__ARGS_DEF__, const char *); +extern "C" int elf_batch_append(char *elf, uint32_t elfSize, char *jit, int kernum, char *atext[], int alen[], + int atlen, const char* kernelname[]); + +#define KERNEL_N 1 +#define ARG_N (__ARG_NUM__) +#define MAX_L (1024 * 1024 * 100) +#define MAX_E (1024 * 1024) + +int __KERNEL_FUN___replay___OPS_PRODUCT__(ReplayFuncParam& param, const int core_type) +{ + // gen type 1 : direct call codes 0: load .o file + if (param.gentype < 0 || param.gentype > 1) { + printf("Error: call replay gen type is %d, should only be 1 or 0\n", param.gentype); + return 0; + } else if (param.gentype == 1 && param.objptr == nullptr) { + printf("Error: call replay with direct call mode, but code obj addr is null\n"); + return 0; + } else if (param.gentype == 0 && param.output_kernel_file == nullptr) { + printf("Error: call replay with object file mode, but object file path is null\n"); + return 0; + } + // core_type 0:MIX 1:CUBE 2:VEC + if (core_type < 0 || core_type > 2) { + printf("Error: call replay core type is %d !\n", core_type); + return 0; + } + g_coreType = __CORE_TYPE__; + g_taskRation = param.task_ration; + g_tilingKey = param.tiling_key; + + unsigned char *buf, *jit; + char *kernel[KERNEL_N]; + int len[KERNEL_N]; + block_idx = 0; + block_num = param.block_dim; + g_ubBase = block_num; + uint8_t *code = (uint8_t *)malloc(MAX_L); + uint8_t *pos = code; + struct timespec tp1, tp2; + + clock_gettime(CLOCK_MONOTONIC, &tp1); + if (block_num > 32) { + printf("Error: block_num > 32\n"); + return 0; + } + 
//__OP_FOPEN__ + for (int i = 0; i < KERNEL_N; i++) { + //__OP_SET_KERNEL__ + for (int j = 0; j < ARG_N; j++) + AddArg(j, ARG_STEP * (j + 1)); +#ifdef FP_CEILING + SetCtrlFloatEnable(); +#else + SetCtrlFloatDisable(); +#endif + CodeInit(pos, true); + __KERNEL_FUN__(__KERNEL_ARGS__, param.tiling_data); + CodeEnd(); + kernel[i] = (char *)pos; + len[i] = CodeLen(); + pos += len[i]; + } + //__OP_FCLOSE__ + clock_gettime(CLOCK_MONOTONIC, &tp2); + buf = (unsigned char *)malloc(MAX_E); + int fd = open(param.entry_file, O_RDONLY); + if (fd < 0) { + printf("[error]: cannot find entry.o : %s\n", param.entry_file); + return 0; + } + uint32_t bufSize = read(fd, buf, MAX_E); + if (bufSize <= 0) { + printf("[error]: entry.o : %s is too small ! \n", param.entry_file); + } + close(fd); + jit = (unsigned char *)malloc(MAX_L); + printf("total code generated %ld\n", pos - code); + int sz = elf_batch_append((char *)buf, bufSize, (char *)jit, KERNEL_N, kernel, len, pos - code, ¶m.kernel_name); + if (tp1.tv_sec != tp2.tv_sec) { + printf("%ld NS\n", tp2.tv_nsec + 1000000000 - tp1.tv_nsec); + } else { + printf("%ld NS\n", tp2.tv_nsec - tp1.tv_nsec); + } + printf("new elf size %d\n", sz); + if (param.gentype == 0) { + fd = open(param.output_kernel_file, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + (void)write(fd, jit, sz); + close(fd); + free(jit); + } else if (param.gentype == 1) { + *param.objptr = (char*)jit; + } + free(buf); + free(code); + return sz; +} + +REG_REPLAY_FUNC(__OPTYPE__, __OPS_PRODUCT__, __KERNEL_FUN___replay___OPS_PRODUCT__); diff --git a/csrc/deepep/ops2/cmake/util/code_channel_infer.py b/csrc/deepep/ops2/cmake/util/code_channel_infer.py new file mode 100755 index 000000000..c9042d4d3 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/code_channel_infer.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import collections +import copy +import ctypes +import os +import shutil +import stat +import subprocess + +"""CODE_* is used to cube/vector api is 
called in operator code +CODE_MIX means both cube and vector api is called +CODE_CUBE means only cube api is called +CODE_VEC means only vector api is called +""" +CODE_MIX = 0 +CODE_CUBE = 1 +CODE_VEC = 2 + + +def _is_v220(op_product: str): + """return if current soc version is V220 + + Returns: + res: True means V220 + """ + if op_product == "ascend910_93" or op_product == "ascend910b": + return True + return False + + +InfoCodeChanelParams = collections.namedtuple( + "InfoCodeChanelParams", + [ + "src_file", + "tiling_header", + "kernel_name", + "outdir", + "op_product", + "compile_options", + ], +) + + +def infer_code_channel(params: InfoCodeChanelParams): + """get code channel for v220, return CODE_MIX if soc version is not V220 + + Args: + src_file (str): AscendC operator code file + src_file (str): AscendC operator tiling header file + kernel_name (str): kernel function name + optype (str): operator type + compile_options (list): compile options for bisheng cmd + + Raises: + Exception: if not exist L1/L0/UB if code, it's not a aicore code + + Returns: + res (int): CODE_MIX/CODE_CUBE/CODE_VEC + """ + if not _is_v220(params.op_product): + return CODE_MIX + return CODE_VEC diff --git a/csrc/deepep/ops2/cmake/util/const_var.py b/csrc/deepep/ops2/cmake/util/const_var.py new file mode 100755 index 000000000..bc8f33bf9 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/const_var.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import os +import stat + +REPLAY_BATCH = "batch" +REPLAY_ITERATE = "iterate" +CFG_IMPL_DIR = "impl_dir" +CFG_OUT_DIR = "out_dir" +AUTO_GEN_DIR = "auto_gen_dir" +WFLAGS = os.O_WRONLY | os.O_CREAT | os.O_TRUNC +WMODES = stat.S_IWUSR | stat.S_IRUSR +SOC_MAP_EXT = { + "ascend310p": "Ascend310P3", + "ascend310b": "Ascend310B1", + "ascend910": "Ascend910A", + "ascend910b": "Ascend910B1", + "ascend910_93": "Ascend910_9391", + "ascend610lite": "Ascend610Lite", +} +BIN_CMD = "opc $1 --main_func={fun} --input_param={param} 
--soc_version={soc} \ +--output=$2 --impl_mode={impl} --simplified_key_mode=0 --op_mode=dynamic\n" +SET_PLOG_LEVEL_ERROR = "export ASCEND_GLOBAL_LOG_LEVEL=3\n" +SET_PLOG_STDOUT = "export ASCEND_SLOG_PRINT_TO_STDOUT=1\n" +SRC_ENV = """ +while true; do + case "$1" in + --kernel-src=*) + export BUILD_KERNEL_SRC=$(echo "$1" | cut -d"=" -f2-) + shift + ;; + -*) + shift + ;; + *) + break + ;; + esac +done +""" +CHK_CMD = """ +if ! test -f $2/{res_file} ; then + echo "$2/{res_file} not generated!" + exit 1 +fi +""" +ATTR_DEF_VAL = { + "str": "", + "int": 0, + "float": 0.0, + "bool": False, + "list_bool": [], + "list_int": [], + "list_float": [], + "list_list_int": [[]], +} + + +def conv_soc_ver(ver: str): + return SOC_MAP_EXT.get(ver) diff --git a/csrc/deepep/ops2/cmake/util/gen_impl_and_mrege_json.sh b/csrc/deepep/ops2/cmake/util/gen_impl_and_mrege_json.sh new file mode 100755 index 000000000..93d7ec845 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/gen_impl_and_mrege_json.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +project_path=$1 +build_path=$2 +vendor_name=customize +if [[ ! -d "$project_path" ]]; then + echo "[ERROR] No project path is provided" + exit 1 +fi + +if [[ ! 
-d "$build_path" ]]; then + echo "[ERROR] No build path is provided" + exit 1 +fi + +# copy aicpu kernel so operators +if [[ -d "${project_path}/cpukernel/aicpu_kernel_lib" ]]; then + cp -f ${project_path}/cpukernel/aicpu_kernel_lib/* ${build_path}/makepkg/packages/vendors/$vendor_name/op_impl/cpu/aicpu_kernel/impl + rm -rf ${project_path}/cpukernel/aicpu_kernel_lib +fi diff --git a/csrc/deepep/ops2/cmake/util/gen_ops_filter.sh b/csrc/deepep/ops2/cmake/util/gen_ops_filter.sh new file mode 100755 index 000000000..b06a4e9fc --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/gen_ops_filter.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +if [[ -z "$1" ]]; then + echo -e "[ERROR] No source dir provided" + exit 1 +fi + +if [[ -z "$2" ]]; then + echo -e "[ERROR] No destination dir provided" + exit 1 +fi + +src=$1 +dest_file=$2/npu_supported_ops.json + +if [ -f "$dest_file" ];then + chmod u+w $dest_file +fi + +echo $* + +add_ops() { + name=$1 + isHeavy=$2 + file=$3 + grep -w "\"$name\"" ${file} >/dev/null + if [ $? == 0 ];then + return + fi + echo " \"${name}\": {" >> ${file} + echo " \"isGray\": false," >> ${file} + echo " \"isHeavy\": ${isHeavy}" >> ${file} + echo " }," >> ${file} +} + +echo "{" > ${dest_file} +ini_files=$(find ${src} -name "*.ini") +for file in ${ini_files} ; do + name=$(grep '^\[' ${file} | sed 's/\[//g' | sed 's/]//g' | sed 's/\r//g') + grep 'heavyOp.flag' ${file} >/dev/null + if [ $? 
== 0 ];then + isHeavy=$(grep 'heavyOp.flag' ${file} | awk -F= '{print $2}') + else + isHeavy="false" + fi + for op in ${name} ; do + add_ops ${op} "false" ${dest_file} + done +done +echo "}" >> ${dest_file} +file_count=$(cat ${dest_file} | wc -l) +line=$(($file_count-1)) +sed -i "${line}{s/,//g}" ${dest_file} + +chmod 640 "${dest_file}" +echo -e "[INFO] Succeed generated ${dest_file}" + +exit 0 diff --git a/csrc/deepep/ops2/cmake/util/gen_version_info.sh b/csrc/deepep/ops2/cmake/util/gen_version_info.sh new file mode 100755 index 000000000..8468f949a --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/gen_version_info.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +ascend_install_dir=$1 +gen_file_dir=$2 + +# create version.info +compiler_version=$(grep "Version" -w ${ascend_install_dir}/compiler/version.info | awk -F = '{print $2}') +echo "custom_opp_compiler_version=${compiler_version}" > ${gen_file_dir}/version.info diff --git a/csrc/deepep/ops2/cmake/util/insert_op_info.py b/csrc/deepep/ops2/cmake/util/insert_op_info.py new file mode 100644 index 000000000..ca7562e7a --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/insert_op_info.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- + +import json +import os +import stat +import sys + +import const_var + +if __name__ == "__main__": + if len(sys.argv) != 3: + print(sys.argv) + print("argv error, inert_op_info.py your_op_file lib_op_file") + sys.exit(2) + + with open(sys.argv[1], "r") as load_f: + insert_operator = json.load(load_f) + + all_operators = {} + if os.path.exists(sys.argv[2]): + if os.path.getsize(sys.argv[2]) != 0: + with open(sys.argv[2], "r") as load_f: + all_operators = json.load(load_f) + + for k in insert_operator.keys(): + if k in all_operators.keys(): + print("replace op:[", k, "] success") + else: + print("insert op:[", k, "] success") + all_operators[k] = insert_operator[k] + + with os.fdopen( + os.open(sys.argv[2], const_var.WFLAGS, const_var.WMODES), "w" + ) as json_file: + json_file.write(json.dumps(all_operators, 
indent=4)) diff --git a/csrc/deepep/ops2/cmake/util/insert_simplified_keys.py b/csrc/deepep/ops2/cmake/util/insert_simplified_keys.py new file mode 100755 index 000000000..599ebe974 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/insert_simplified_keys.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import argparse +import glob +import json +import os +import re +import sys + +import const_var + +DATA_TPYE_DICT = { + "float32": 0, + "float16": 1, + "int8": 2, + "int16": 6, + "uint16": 7, + "uint8": 4, + "int32": 3, + "int64": 9, + "uint32": 8, + "uint64": 10, + "bool": 12, + "double": 11, + "string": 13, + "dual": 14, + "dual": 15, + "complex64": 16, + "complex128": 17, + "qint8": 18, + "qint16": 19, + "qint32": 20, + "quint8": 21, + "quint16": 22, + "resource": 23, + "string": 24, + "dual": 25, + "variant": 26, + "bf16": 27, + "bfloat16": 27, + "undefined": 28, + "int4": 29, + "uint1": 30, + "int2": 31, +} + +FORMAT_DICT = { + "NCHW": 0, + "NHWC": 1, + "ND": 2, + "NC1HWC0": 3, + "FRACTAL_Z": 4, + "NC1C0HWPAD": 5, + "NHWC1C0": 6, + "FSR_NCHW": 7, + "FRACTAL_DECONV": 8, + "C1HWNC0": 9, + "FRACTAL_DECONV_TRANSPOSE": 10, + "FRACTAL_DECONV_SP_STRIDE_TRANS": 11, + "NC1HWC0_C04": 12, + "FRACTAL_Z_C04": 13, + "CHWN": 14, + "FRACTAL_DECONV_SP_STRIDE8_TRANS": 15, + "HWCN": 16, + "NC1KHKWHWC0": 17, + "BN_WEIGHT": 18, + "FILTER_HWCK": 19, + "HASHTABLE_LOOKUP_LOOKUPS": 20, + "HASHTABLE_LOOKUP_KEYS": 21, + "HASHTABLE_LOOKUP_VALUE": 22, + "HASHTABLE_LOOKUP_OUTPUT": 23, + "HASHTABLE_LOOKUP_HITS": 24, + "C1HWNCoC0": 25, + "MD": 26, + "NDHWC": 27, + "FRACTAL_ZZ": 28, + "FRACTAL_NZ": 29, + "NCDHW": 30, + "DHWCN": 31, + "NDC1HWC0": 32, + "FRACTAL_Z_3D": 33, + "CN": 34, + "NC": 35, + "DHWNC": 36, + "FRACTAL_Z_3D_TRANSPOSE": 37, + "FRACTAL_ZN_LSTM": 38, + "FRACTAL_Z_G": 39, + "RESERVED": 40, + "ALL": 41, + "NULL": 42, + "ND_RNN_BIAS": 43, + "FRACTAL_ZN_RNN": 44, + "NYUV": 45, + "NYUV_A": 46, +} + + +def load_json(json_file: str): + with open(json_file, 
encoding="utf-8") as file: + json_content = json.load(file) + return json_content + + +def get_specified_suffix_file(root_dir, suffix): + specified_suffix = os.path.join(root_dir, "**/*.{}".format(suffix)) + all_suffix_files = glob.glob(specified_suffix, recursive=True) + return all_suffix_files + + +def get_deterministic_value(support_info): + deterministic_key = "deterministic" + if deterministic_key not in support_info: + return 0 + deterministic_value = support_info.get(deterministic_key) + if deterministic_value == "true": + return 1 + else: + return 0 + + +def get_precision_value(support_info): + precision_key = "implMode" + precision_value = support_info.get(precision_key) + if precision_value == "high_performance": + _value = 1 + elif precision_value == "high_precision": + _value = 2 + else: + _value = 0 + return _value + + +def get_overflow_value(support_info): + return 0 + + +def get_parameters(info): + if info: + if "dtype" in info: + data_type = info["dtype"] + data_type_value = DATA_TPYE_DICT.get(data_type) + else: + data_type_value = 0 + if "format" in info: + _format = info["format"] + _format_value = FORMAT_DICT.get(_format) + else: + _format_value = 0 + else: + data_type_value = 0 + _format_value = 0 + return str(data_type_value), str(_format_value) + + +def get_dynamic_parameters(info): + # 动态输入时只需获取第一个参数 + return get_parameters(info[0]) + + +def get_all_parameters(support_info, _type): + result_list = list() + info_lists = support_info.get(_type) + if info_lists: + for _info in info_lists: + # 输入为列表时是动态输入 + if isinstance(_info, (list, tuple)): + data_type_value, _format_value = get_dynamic_parameters(_info) + else: + data_type_value, _format_value = get_parameters(_info) + result_list.append("{},{}".format(data_type_value, _format_value)) + return result_list + + +def get_all_input_parameters(support_info): + result = get_all_parameters(support_info, "inputs") + return "/".join(result) + + +def insert_content_into_file(input_file, content): + 
with open(input_file, "r+") as file: + lines = file.readlines() + for index, line in enumerate(lines): + match_result = re.search(r'"staticKey":', line) + if match_result: + count = len(line) - len(line.lstrip()) + new_content = "{}{}".format(" " * count, content) + # 插入到前一行,防止插入最后时还需要考虑是否添加逗号 + lines.insert(index, new_content) + break + file.seek(0) + file.write("".join(lines)) + + +def insert_simplified_keys(json_file): + contents = load_json(json_file) + # 不存在'binFileName'或者'supportInfo'字段时,非需要替换的解析json文件 + if ("binFileName" not in contents) or ("supportInfo" not in contents): + return + support_info = contents.get("supportInfo") + bin_file_name = contents.get("binFileName") + # 'simplifiedKey'字段已经存在时,直接返回,不重复生成 + if "simplifiedKey" in support_info: + return + op_type = bin_file_name.split("_")[0] + deterministic = str(get_deterministic_value(support_info)) + precision = str(get_precision_value(support_info)) + overflow = str(get_overflow_value(support_info)) + input_parameters = get_all_input_parameters(support_info) + key = "{}/d={},p={},o={}/{}/".format( + op_type, deterministic, precision, overflow, input_parameters + ) + result = '"simplifiedKey": "' + key + '",\n' + insert_content_into_file(json_file, result) + + +def insert_all_simplified_keys(root_dir): + suffix = "json" + all_json_files = get_specified_suffix_file(root_dir, suffix) + for _json in all_json_files: + insert_simplified_keys(_json) + + +def args_prase(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-p", + "--path", + nargs="?", + required=True, + help="Parse the path of the json file.", + ) + return parser.parse_args() + + +def main(): + args = args_prase() + insert_all_simplified_keys(args.path) + + +if __name__ == "__main__": + main() diff --git a/csrc/deepep/ops2/cmake/util/kernel_entry.py b/csrc/deepep/ops2/cmake/util/kernel_entry.py new file mode 100755 index 000000000..255266fd9 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/kernel_entry.py @@ -0,0 +1,113 @@ 
+#!/usr/bin/env python +# -*- coding: UTF-8 -*- + + +def gen_fun_def(title, kernel, argn, arg_type, arg_name): + entry = [] + entry.append(title) + entry.append(kernel) + entry.append("(") + args = [] + for i in range(0, argn): + args.append(arg_type + " " + arg_name + str(i)) + entry.append(", ".join(args)) + entry.append(")") + return " ".join(entry) + + +def gen_batch_kernel_body(fname, argn, arg_name): + body = [] + body.append("{") + fun = [] + fun.append(fname) + fun.append("(") + args = [] + for i in range(0, argn): + args.append(arg_name + str(i)) + fun.append(", ".join(args)) + fun.append(");") + body.append(" ".join(fun)) + body.append("}") + return "\n".join(body) + + +def gen_mc_kernel_body(kn, argn, arg_name, blknum): + body = [] + body.append("{") + body.append(" switch(block_idx) {") + for blk in range(0, blknum): + fun = [] + fun.append("{}_blk{:02d}".format(kn, blk)) + fun.append("(") + args = [] + for i in range(0, argn): + args.append(arg_name + str(i)) + fun.append(", ".join(args)) + fun.append(")") + body.append(" case {}: {}; break;".format(blk, " ".join(fun))) + body.append(" default: break;") + body.append(" }") + body.append("}") + return "\n".join(body) + + +def gen_proc_body(argn, arg_name): + body = [] + body.append("{") + args = [] + for i in range(0, argn): + args.append(arg_name + str(i)) + body.append("uint64_t __x = (uint64_t)" + " + (uint64_t)".join(args) + ";") + body.append('__asm__ ("NOP");') + body.append('__asm__ ("NOP");') + body.append('__asm__ ("NOP");') + body.append("}") + return "\n".join(body) + + +def batch_code_gen(kn, argn, argt): + codes = [] + kernel_name = kn + proc_name = kernel_name + "_percore" + arg_num = int(argn) + data_type = argt + arg_type = "__gm__ " + data_type + "* __restrict__" + arg_name = "arg" + kernel_title = 'extern "C" __global__ __aicore__ void' + proc_title = 'extern "C" __attribute__((noinline)) __aicore__ void' + codes.append("#ifndef __aicore__") + codes.append("#define __aicore__ 
[aicore]") + codes.append("#endif") + codes.append(gen_fun_def(proc_title, proc_name, arg_num, arg_type, arg_name) + ";") + codes.append(gen_fun_def(kernel_title, kernel_name, arg_num, arg_type, arg_name)) + codes.append(gen_batch_kernel_body(proc_name, arg_num, arg_name)) + codes.append(gen_fun_def(proc_title, proc_name, arg_num, arg_type, arg_name)) + codes.append(gen_proc_body(arg_num, arg_name)) + return "\n".join(codes) + "\n" + + +def mc_code_gen(kn, argn, argt, blknum): + codes = [] + kernel_name = kn + core_num = int(blknum) + arg_num = int(argn) + data_type = argt + arg_type = "__gm__ " + data_type + "* __restrict__" + arg_name = "arg" + kernel_title = 'extern "C" __global__ __aicore__ void' + proc_title = 'extern "C" __attribute__((noinline)) __aicore__ void' + codes.append("#ifndef __aicore__") + codes.append("#define __aicore__ [aicore]") + codes.append("#endif") + for i in range(0, core_num): + proc_name = "{}_blk{:02d}".format(kernel_name, i) + codes.append( + gen_fun_def(proc_title, proc_name, arg_num, arg_type, arg_name) + ";" + ) + codes.append(gen_fun_def(kernel_title, kernel_name, arg_num, arg_type, arg_name)) + codes.append(gen_mc_kernel_body(kernel_name, arg_num, arg_name, core_num)) + for i in range(0, core_num): + proc_name = "{}_blk{:02d}".format(kernel_name, i) + codes.append(gen_fun_def(proc_title, proc_name, arg_num, arg_type, arg_name)) + codes.append(gen_proc_body(arg_num, arg_name)) + return "\n".join(codes) + "\n" diff --git a/csrc/deepep/ops2/cmake/util/kernel_impl.temp b/csrc/deepep/ops2/cmake/util/kernel_impl.temp new file mode 100644 index 000000000..5079a1043 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/kernel_impl.temp @@ -0,0 +1,10 @@ +#include +#include +#include +#include +#include +#include "replay_def.h" +#include "code_gen.h" +#include "replay_fun.h" +#define __ASCENDC_REPLAY_CODE__ +#include "__CCE_FILE__" diff --git a/csrc/deepep/ops2/cmake/util/makeself/COPYING b/csrc/deepep/ops2/cmake/util/makeself/COPYING new file 
mode 100644 index 000000000..d159169d1 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/makeself/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/csrc/deepep/ops2/cmake/util/makeself/README.md b/csrc/deepep/ops2/cmake/util/makeself/README.md new file mode 100644 index 000000000..9d3d4b86f --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/makeself/README.md @@ -0,0 +1,246 @@ +[![License: GPL v2](https://img.shields.io/badge/License-GPL%20v2-blue.svg)](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) +![Build Status](https://github.com/megastep/makeself/workflows/CI/badge.svg) + +# makeself - Make self-extractable archives on Unix + +[makeself.sh][1] is a small shell script that generates a self-extractable +compressed tar archive from a directory. The resulting file appears as a shell script +(many of those have a **.run** suffix), and can be launched as is. The archive +will then uncompress itself to a temporary directory and an optional arbitrary +command will be executed (for example an installation script). This is pretty +similar to archives generated with WinZip Self-Extractor in the Windows world. +Makeself archives also include checksums for integrity self-validation (CRC +and/or MD5/SHA256 checksums). + +The makeself.sh script itself is used only to create the archives from a +directory of files. The resultant archive is actually a compressed (using +gzip, bzip2, or compress) TAR archive, with a small shell script stub at the +beginning. This small stub performs all the steps of extracting the files, +running the embedded command, and removing the temporary files when done. +All the user has to do to install the software contained in such an +archive is to "run" the archive, i.e **sh nice-software.run**. I recommend +using the ".run" (which was introduced by some Makeself archives released by +Loki Software) or ".sh" suffix for such archives not to confuse the users, +so that they will know they are actually shell scripts (with quite a lot of binary data +attached to them though!). 
+ +I am trying to keep the code of this script as portable as possible, i.e it is +not relying on any bash-specific features and only calls commands that are +installed on any functioning UNIX-compatible system. This script as well as +the archives it generates should run on any Unix flavor, with any compatible +Bourne shell, provided of course that the compression programs are available. + +As of version 2.1, Makeself has been rewritten and tested on the following +platforms : + + * Linux (all distributions) + * Sun Solaris (8 and above) + * HP-UX (tested on 11.0 and 11i on HPPA RISC) + * SCO OpenUnix and OpenServer + * IBM AIX 5.1L + * macOS (Darwin) + * SGI IRIX 6.5 + * FreeBSD + * UnicOS / Cray + * Cygwin (Windows) + +If you successfully run Makeself and/or archives created with it on another +system, then please [let me know][2]! + +Examples of publicly available archives made using makeself are : + + * Game patches and installers for [Id Software][3] games like Quake 3 for Linux or Return To Castle Wolfenstein ; + * All game patches released by [Loki Software][4] for the Linux version of popular games ; + * The [nVidia drivers][5] for Linux + * The installer for the Linux version of [Google Earth][6] + * The [VirtualBox][7] installers for Linux + * The [Makeself][1] distribution itself ;-) + * and countless others... + +**Important note for Apache users:** By default, most Web servers will think that Makeself archives are regular text files and thus they may show up as text in a Web browser. The correct way to prevent this is to add a MIME type for this file format, like so (in httpd.conf) : + +`AddType application/x-makeself .run` + +**Important note for certain GNU/Linux distributions:** Archives created with Makeself prior to v2.1.2 were using an old syntax for the _head_ and _tail_ Unix commands that is being progressively obsoleted in their GNU forms. Therefore you may have problems uncompressing some of these archives. 
A workaround for this is to set the environment variable $_POSIX2_VERSION to enable the old syntax, i.e. : + +`export _POSIX2_VERSION=199209` + +## Usage + +The syntax of makeself is the following: + +``` +makeself.sh [args] archive_dir file_name label startup_script [script_args] +``` + + * _args_ are optional options for Makeself. The available ones are : + + * **`--version`** : Prints the version number on stdout, then exits immediately + * **`--gzip`** : Use gzip for compression (the default on platforms on which gzip is commonly available, like Linux) + * **`--bzip2`** : Use bzip2 instead of gzip for better compression. The bzip2 command must be available in the command path. It is recommended that the archive prefix be set to something like '.bz2.run', so that potential users know that they'll need bzip2 to extract it. + * **`--pbzip2`** : Use pbzip2 instead of gzip for better and faster compression on machines having multiple CPUs. The pbzip2 command must be available in the command path. It is recommended that the archive prefix be set to something like '.bz2.run', so that potential users know that they'll need bzip2 to extract it. + * **`--xz`** : Use xz instead of gzip for better compression. The xz command must be available in the command path. It is recommended that the archive prefix be set to something like '.xz.run' for the archive, so that potential users know that they'll need xz to extract it. + * **`--lzo`** : Use lzop instead of gzip for better compression. The lzop command must be available in the command path. It is recommended that the archive prefix be set to something like `.lzo.run` for the archive, so that potential users know that they'll need lzop to extract it. + * **`--lz4`** : Use lz4 instead of gzip for better compression. The lz4 command must be available in the command path. 
It is recommended that the archive prefix be set to something like '.lz4.run' for the archive, so that potential users know that they'll need lz4 to extract it. + * **`--zstd`** : Use zstd instead of gzip for better compression. The zstd command must be available in the command path. It is recommended that the archive prefix be set to something like '.zstd.run' for the archive, so that potential users know that they'll need zstd to extract it. + * **`--pigz`** : Use pigz for compression. + * **`--base64`** : Encode the archive to ASCII in Base64 format instead of compressing (base64 command required). + * **`--gpg-encrypt`** : Encrypt the archive using `gpg -ac -z $COMPRESS_LEVEL`. This will prompt for a password to encrypt with. Assumes that potential users have `gpg` installed. + * **`--ssl-encrypt`** : Encrypt the archive using `openssl aes-256-cbc -a -salt`. This will prompt for a password to encrypt with. Assumes that the potential users have the OpenSSL tools installed. + * **`--compress`** : Use the UNIX `compress` command to compress the data. This should be the default on all platforms that don't have gzip available. + * **`--nocomp`** : Do not use any compression for the archive, which will then be an uncompressed TAR. + * **`--complevel`** : Specify the compression level for gzip, bzip2, pbzip2, zstd, xz, lzo or lz4. (defaults to 9) + * **`--threads`** : Specify the number of threads to be used by compressors that support parallelization. Omit to use compressor's default. Most useful (and required) for opting into xz's threading, usually with `--threads=0` for all available cores. pbzip2 and pigz are parallel by default, and setting this value allows limiting the number of threads they use. + * **`--notemp`** : The generated archive will not extract the files to a temporary directory, but in a new directory created in the current directory. This is better to distribute software packages that may extract and compile by themselves (i.e. 
launch the compilation through the embedded script). + * **`--current`** : Files will be extracted to the current directory, instead of in a subdirectory. This option implies `--notemp` above. + * **`--follow`** : Follow the symbolic links inside of the archive directory, i.e. store the files that are being pointed to instead of the links themselves. + * **`--append`** _(new in 2.1.x)_: Append data to an existing archive, instead of creating a new one. In this mode, the settings from the original archive are reused (compression type, label, embedded script), and thus don't need to be specified again on the command line. + * **`--header`** : Makeself uses a separate file to store the header stub, called `makeself-header.sh`. By default, it is assumed that it is stored in the same location as makeself.sh. This option can be used to specify its actual location if it is stored someplace else. + * **`--cleanup`** : Specify a script that is run when execution is interrupted or finishes successfully. The script is executed with the same environment and initial `script_args` as `startup_script`. + * **`--copy`** : Upon extraction, the archive will first extract itself to a temporary directory. The main application of this is to allow self-contained installers stored in a Makeself archive on a CD, when the installer program will later need to unmount the CD and allow a new one to be inserted. This prevents "Filesystem busy" errors for installers that span multiple CDs. + * **`--nox11`** : Disable the automatic spawning of a new terminal in X11. + * **`--nowait`** : When executed from a new X11 terminal, disable the user prompt at the end of the script execution. + * **`--nomd5`** and **`--nocrc`** : Disable the creation of a MD5 / CRC checksum for the archive. This speeds up the extraction process if integrity checking is not necessary. + * **`--sha256`** : Adds a SHA256 checksum for the archive. This is in addition to the MD5 / CRC checksums unless `--nomd5` is also used. 
+ * **`--lsm` _file_** : Provide an LSM file to makeself, that will be embedded in the generated archive. LSM files describe a software package in a way that is easily parseable. The LSM entry can then be later retrieved using the `--lsm` argument to the archive. An example of an LSM file is provided with Makeself. + * **`--tar-format opt`** : Specify the tar archive format (default is ustar); you may use any value accepted by your tar command (such as posix, v7, etc). + * **`--tar-extra opt`** : Append more options to the tar command line. + + For instance, in order to exclude the `.git` directory from the packaged archive directory using the GNU `tar`, one can use `makeself.sh --tar-extra "--exclude=.git" ...` + + * **`--keep-umask`** : Keep the umask set to shell default, rather than overriding when executing self-extracting archive. + * **`--packaging-date date`** : Use provided string as the packaging date instead of the current date. + * **`--license`** : Append a license file. + * **`--nooverwrite`** : Do not extract the archive if the specified target directory already exists. + * **`--help-header file`** : Add a header to the archive's `--help` output. + * `archive_dir` is the name of the directory that contains the files to be archived + * `file_name` is the name of the archive to be created + * `label` is an arbitrary text string describing the package. It will be displayed while extracting the files. + * `startup_script` is the command to be executed _from within_ the directory of extracted files. Thus, if you wish to execute a program contained in this directory, you must prefix your command with `./`. For example, `./program` will be fine. The `script_args` are additional arguments for this command. 
+ +Here is an example, assuming the user has a package image stored in a **/home/joe/mysoft**, and he wants to generate a self-extracting package named +**mysoft.sh**, which will launch the "setup" script initially stored in /home/joe/mysoft : + +`makeself.sh /home/joe/mysoft mysoft.sh "Joe's Nice Software Package" ./setup +` + +Here is also how I created the [makeself.run][9] archive which contains the Makeself distribution : + +`makeself.sh --notemp makeself makeself.run "Makeself by Stephane Peter" echo "Makeself has extracted itself" ` + +Archives generated with Makeself can be passed the following arguments: + + * **`--keep`** : Prevent the files to be extracted in a temporary directory that will be removed after the embedded script's execution. The files will then be extracted in the current working directory and will stay here until you remove them. + * **`--verbose`** : Will prompt the user before executing the embedded command + * **`--target dir`** : Allows to extract the archive in an arbitrary place. + * **`--nox11`** : Do not spawn a X11 terminal. + * **`--confirm`** : Prompt the user for confirmation before running the embedded command. + * **`--info`** : Print out general information about the archive (does not extract). + * **`--lsm`** : Print out the LSM entry, if it is present. + * **`--list`** : List the files in the archive. + * **`--check`** : Check the archive for integrity using the embedded checksums. Does not extract the archive. + * **`--nochown`** : By default, a `chown -R` command is run on the target directory after extraction, so that all files belong to the current user. This is mostly needed if you are running as root, as tar will then try to recreate the initial user ownerships. You may disable this behavior with this flag. + * **`--tar`** : Run the tar command on the contents of the archive, using the following arguments as parameter for the command. + * **`--noexec`** : Do not run the embedded script after extraction. 
+ * **`--noexec-cleanup`** : Do not run the embedded cleanup script. + * **`--nodiskspace`** : Do not check for available disk space before attempting to extract. + * **`--cleanup-args`** : Specify arguments to be passed to the cleanup script. Wrap value in quotes to specify multiple arguments. + +Any subsequent arguments to the archive will be passed as additional arguments to the embedded command. You must explicitly use the `--` special command-line construct before any such options to make sure that Makeself will not try to interpret them. + +## Startup Script + +The startup script must be a regular Shell script. + +Within the startup script, you can use the `$USER_PWD` variable to get the path of the folder from which the self-extracting script is executed. This is especially useful to access files that are located in the same folder as the script, as shown in the example below. + +`my-self-extracting-script.sh --fooBarFileParameter foo.bar` + +## Building and Testing + +Clone the git repo and execute `git submodule update --init --recursive` to obtain all submodules. + +* To make a release: `make` +* To run all tests: `make test` + +## Maven Usage + +Makeself is now supported by the following maven plugin [makeself-maven-plugin](https://github.com/hazendaz/makeself-maven-plugin). Please refer to project for usage and report any bugs in regards to maven plugin on that project. + +## License + +Makeself itself is covered by the [GNU General Public License][8] (GPL) version 2 and above. Archives generated by Makeself don't have to be placed under this license (although I encourage it ;-)), since the archive itself is merely data for Makeself. + +## Contributing + +I will gladly consider merging your pull requests on the [GitHub][10] repository. However, please keep the following in mind: + + * One of the main purposes of Makeself is portability. Do not submit patches that will break supported platforms. The more platform-agnostic, the better. 
+ * Please explain clearly what the purpose of the patch is, and how you achieved it. + +## Download + +Get the latest official distribution [here][9] (version 2.4.2). + +The latest development version can be grabbed from [GitHub][10]. Feel free to submit any patches there through the fork and pull request process. + +## Version history + + * **v1.0:** Initial public release + * **v1.1:** The archive can be passed parameters that will be passed on to the embedded script, thanks to John C. Quillan + * **v1.2:** Cosmetic updates, support for bzip2 compression and non-temporary archives. Many ideas thanks to Francois Petitjean. + * **v1.3:** More patches from Bjarni R. Einarsson and Francois Petitjean: Support for no compression (`--nocomp`), script is no longer mandatory, automatic launch in an xterm, optional verbose output, and -target archive option to indicate where to extract the files. + * **v1.4:** Many patches from Francois Petitjean: improved UNIX compatibility, automatic integrity checking, support of LSM files to get info on the package at run time.. + * **v1.5.x:** A lot of bugfixes, and many other patches, including automatic verification through the usage of checksums. Version 1.5.5 was the stable release for a long time, even though the Web page didn't get updated ;-). Makeself was also officially made a part of the [Loki Setup installer][11], and its source is being maintained as part of this package. + * **v2.0:** Complete internal rewrite of Makeself. The command-line parsing was vastly improved, the overall maintenance of the package was greatly improved by separating the stub from makeself.sh. Also Makeself was ported and tested to a variety of Unix platforms. + * **v2.0.1:** First public release of the new 2.0 branch. Prior versions are officially obsoleted. This release introduced the `--copy` argument that was introduced in response to a need for the [UT2K3][12] Linux installer. 
+ * **v2.1.0:** Big change : Makeself can now support multiple embedded tarballs, each stored separately with their own checksums. An existing archive can be updated with the `--append` flag. Checksums are also better managed, and the `--nochown` option for archives appeared. + * **v2.1.1:** Fixes related to the Unix compression (compress command). Some Linux distributions made the insane choice to make it unavailable, even though gzip is capable of uncompressing these files, plus some more bugfixes in the extraction and checksum code. + * **v2.1.2:** Some bug fixes. Use head -n to avoid problems with POSIX conformance. + * **v2.1.3:** Bug fixes with the command line when spawning terminals. Added `--tar`, `--noexec` for archives. Added `--nomd5` and `--nocrc` to avoid creating checksums in archives. The embedded script is now run through "eval". The `--info` output now includes the command used to create the archive. A man page was contributed by Bartosz Fenski. + * **v2.1.4:** Fixed `--info` output. Generate random directory name when extracting files to . to avoid problems. Better handling of errors with wrong permissions for the directory containing the files. Avoid some race conditions, Unset the $CDPATH variable to avoid problems if it is set. Better handling of dot files in the archive directory. + * **v2.1.5:** Made the md5sum detection consistent with the header code. Check for the presence of the archive directory. Added `--encrypt` for symmetric encryption through gpg (Eric Windisch). Added support for the digest command on Solaris 10 for MD5 checksums. Check for available disk space before extracting to the target directory (Andreas Schweitzer). Allow extraction to run asynchronously (patch by Peter Hatch). Use file descriptors internally to avoid error messages (patch by Kay Tiong Khoo). + * **v2.1.6:** Replaced one dot per file progress with a realtime progress percentage and a spinning cursor. 
Added `--noprogress` to prevent showing the progress during the decompression. Added `--target` dir to allow extracting directly to a target directory. (Guy Baconniere) + * **v2.2.0:** First major new release in years! Includes many bugfixes and user contributions. Please look at the [project page on Github][10] for all the details. + * **v2.3.0:** Support for archive encryption via GPG or OpenSSL. Added LZO and LZ4 compression support. Options to set the packaging date and stop the umask from being overridden. Optionally ignore check for available disk space when extracting. New option to check for root permissions before extracting. + * **v2.3.1:** Various compatibility updates. Added unit tests for Travis CI in the GitHub repo. New `--tar-extra`, `--untar-extra`, `--gpg-extra`, `--gpg-asymmetric-encrypt-sign` options. + * **v2.4.0:** Added optional support for SHA256 archive integrity checksums. + * **v2.4.2:** New --cleanup and --cleanup-args arguments for cleanup scripts. Added threading support for supported compressors. Now supports zstd compression. + * **v2.4.3:** Make explicit POSIX tar archives for increased compatibility. + * **v2.4.4:** Fixed various compatibility issues (no longer use POSIX tar archives), Github Actions to check on Solaris and FreeBSD. + * **v2.4.5:** Added `--tar-format` option to set the tar archive format (default is ustar) + +## Links + + * Check out the ["Loki Setup"][11] installer, used to install many Linux games and other applications, and of which I am the co-author. Since the demise of Loki, I am now the official maintainer of the project, and it is now being hosted here on GitHub. + * Bjarni R. Einarsson also wrote the **setup.sh** installer script, inspired by Makeself. [Check it out !][14] + +## Contact + +This script was written by [Stéphane Peter][15] (megastep at megastep.org). Any enhancements and suggestions are welcome. + +Contributions were included from John C. Quillan, Bjarni R. 
Einarsson, +Francois Petitjean, Ryan C. Gordon, and many contributors on GitHub. If you think I forgot +your name, don't hesitate to contact me. + +This project is now hosted on GitHub. Feel free to submit patches and bug reports on the [project page][10]. + +* * * + +[Stephane Peter][2] + + [1]: http://makeself.io/ + [2]: mailto:megastep@megastep.org + [3]: http://www.idsoftware.com/ + [4]: http://www.lokigames.com/products/myth2/updates.php3 + [5]: http://www.nvidia.com/ + [6]: http://earth.google.com/ + [7]: http://www.virtualbox.org/ + [8]: http://www.gnu.org/copyleft/gpl.html + [9]: https://github.com/megastep/makeself/releases/download/release-2.4.5/makeself-2.4.5.run + [10]: https://github.com/megastep/makeself + [11]: https://github.com/megastep/loki_setup/ + [12]: http://www.unrealtournament2003.com/ + [13]: http://www.icculus.org/ + [14]: http://bre.klaki.net/programs/setup.sh/ + [15]: https://stephanepeter.com/ diff --git a/csrc/deepep/ops2/cmake/util/makeself/VERSION b/csrc/deepep/ops2/cmake/util/makeself/VERSION new file mode 100644 index 000000000..59aa62c1f --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/makeself/VERSION @@ -0,0 +1 @@ +2.4.5 diff --git a/csrc/deepep/ops2/cmake/util/makeself/make-release.sh b/csrc/deepep/ops2/cmake/util/makeself/make-release.sh new file mode 100755 index 000000000..65d698f25 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/makeself/make-release.sh @@ -0,0 +1,8 @@ +#!/bin/sh +# +# Create a distributable archive of the current version of Makeself + +VER=`cat VERSION` +mkdir -p /tmp/makeself-$VER release +cp -pPR makeself* test README.md COPYING VERSION .gitmodules /tmp/makeself-$VER/ +./makeself.sh --notemp /tmp/makeself-$VER release/makeself-$VER.run "Makeself v$VER" echo "Makeself has extracted itself" diff --git a/csrc/deepep/ops2/cmake/util/makeself/makeself-header.sh b/csrc/deepep/ops2/cmake/util/makeself/makeself-header.sh new file mode 100644 index 000000000..23ffc483e --- /dev/null +++ 
b/csrc/deepep/ops2/cmake/util/makeself/makeself-header.sh @@ -0,0 +1,660 @@ +cat << EOF > "$archname" +#!/bin/bash +# This script was generated using Makeself $MS_VERSION +# The license covering this archive and its contents, if any, is wholly independent of the Makeself license (GPL) +# 2022.3.19-Modified the MS_Help function and some options +# Huawei Technologies Co., Ltd. + +ORIG_UMASK=\`umask\` + +CRCsum="$CRCsum" +MD5="$MD5sum" +SHA="$SHAsum" +SIGNATURE="$Signature" +TMPROOT=\${TMPDIR:="\$HOME"} +if ! test -d "\$TMPROOT"; then + TMPROOT="\$PWD" +fi +export TMPDIR="\$TMPROOT" +USER_PWD="\$PWD" +if ! test -d "\$USER_PWD"; then + exit 1 +fi +export USER_PWD +ARCHIVE_DIR=\`dirname "\$0"\` +export ARCHIVE_DIR + +name_of_file="\$0 " +pwd_of_file="\$PWD" +label="$LABEL" +script="$SCRIPT" +scriptargs="$SCRIPTARGS" +cleanup_script="${CLEANUP_SCRIPT}" +licensetxt="$LICENSE" +helpheader='$HELPHEADER' +targetdir="$archdirname" +filesizes="$filesizes" +totalsize="$totalsize" +keep="$KEEP" +nooverwrite="$NOOVERWRITE" +quiet="n" +accept="n" +nodiskspace="n" +export_conf="$EXPORT_CONF" +decrypt_cmd="$DECRYPT_CMD" +skip="$SKIP" + +print_cmd_arg="" +if type printf > /dev/null; then + print_cmd="printf" +elif test -x /usr/ucb/echo; then + print_cmd="/usr/ucb/echo" +else + print_cmd="echo" +fi + +if test -d /usr/xpg4/bin; then + PATH=/usr/xpg4/bin:\$PATH + export PATH +fi + +if test -d /usr/sfw/bin; then + PATH=\$PATH:/usr/sfw/bin + export PATH +fi + +unset CDPATH + +MS_Printf() +{ + \$print_cmd \$print_cmd_arg "\$1" +} + +MS_PrintLicense() +{ + PAGER=\${PAGER:=more} + if test x"\$licensetxt" != x; then + PAGER_PATH=\`exec <&- 2>&-; which \$PAGER || command -v \$PAGER || type \$PAGER\` + if test -x "\$PAGER_PATH"; then + echo "\$licensetxt" | \$PAGER + else + echo "\$licensetxt" + fi + if test x"\$accept" != xy; then + while true + do + MS_Printf "Please type y to accept, n otherwise: " + read yn + if test x"\$yn" = xn; then + keep=n + eval \$finish; exit 1 + break; + elif test 
x"\$yn" = xy; then + break; + fi + done + fi + fi +} + +MS_diskspace() +{ + ( + df -kP "\$1" | tail -1 | awk '{ if (\$4 ~ /%/) {print \$3} else {print \$4} }' + ) +} + +MS_dd() +{ + blocks=\`expr \$3 / 1024\` + bytes=\`expr \$3 % 1024\` + # Test for ibs, obs and conv feature + if dd if=/dev/zero of=/dev/null count=1 ibs=512 obs=512 conv=sync 2> /dev/null; then + dd if="\$1" ibs=\$2 skip=1 obs=1024 conv=sync 2> /dev/null | \\ + { test \$blocks -gt 0 && dd ibs=1024 obs=1024 count=\$blocks ; \\ + test \$bytes -gt 0 && dd ibs=1 obs=1024 count=\$bytes ; } 2> /dev/null + else + dd if="\$1" bs=\$2 skip=1 2> /dev/null + fi +} + +MS_dd_Progress() +{ + if test x"\$noprogress" = xy; then + MS_dd "\$@" + return \$? + fi + file="\$1" + offset=\$2 + length=\$3 + pos=0 + bsize=4194304 + while test \$bsize -gt \$length; do + bsize=\`expr \$bsize / 4\` + done + blocks=\`expr \$length / \$bsize\` + bytes=\`expr \$length % \$bsize\` + ( + dd ibs=\$offset skip=1 2>/dev/null + pos=\`expr \$pos \+ \$bsize\` + MS_Printf " 0%% " 1>&2 + if test \$blocks -gt 0; then + while test \$pos -le \$length; do + dd bs=\$bsize count=1 2>/dev/null + pcent=\`expr \$length / 100\` + pcent=\`expr \$pos / \$pcent\` + if test \$pcent -lt 100; then + MS_Printf "\b\b\b\b\b\b\b" 1>&2 + if test \$pcent -lt 10; then + MS_Printf " \$pcent%% " 1>&2 + else + MS_Printf " \$pcent%% " 1>&2 + fi + fi + pos=\`expr \$pos \+ \$bsize\` + done + fi + if test \$bytes -gt 0; then + dd bs=\$bytes count=1 2>/dev/null + fi + MS_Printf "\b\b\b\b\b\b\b" 1>&2 + MS_Printf " 100%% " 1>&2 + ) < "\$file" +} + +MS_Help() +{ + cat << EOH >&2 +Usage: \$0 [options] +Options: + --help | -h Print this message + --info Print embedded info : title, default target directory, embedded script ... 
+ --list Print the list of files in the archive + --check Checks integrity and version dependency of the archive + --quiet Quiet install mode, skip human-computer interactions + --nox11 Do not spawn an xterm + --noexec Do not run embedded script + --extract= Extract directly to a target directory (absolute or relative) + Usually used with --noexec to just extract files without running + --tar arg1 [arg2 ...] Access the contents of the archive through the tar command +\${helpheader} +EOH +} + +MS_Verify_Sig() +{ + GPG_PATH=\`exec <&- 2>&-; which gpg || command -v gpg || type gpg\` + MKTEMP_PATH=\`exec <&- 2>&-; which mktemp || command -v mktemp || type mktemp\` + test -x "\$GPG_PATH" || GPG_PATH=\`exec <&- 2>&-; which gpg || command -v gpg || type gpg\` + test -x "\$MKTEMP_PATH" || MKTEMP_PATH=\`exec <&- 2>&-; which mktemp || command -v mktemp || type mktemp\` + offset=\`head -n "\$skip" "\$1" | wc -c | tr -d " "\` + temp_sig=\`mktemp -t XXXXX\` + echo \$SIGNATURE | base64 --decode > "\$temp_sig" + gpg_output=\`MS_dd "\$1" \$offset \$totalsize | LC_ALL=C "\$GPG_PATH" --verify "\$temp_sig" - 2>&1\` + gpg_res=\$? 
+ rm -f "\$temp_sig" + if test \$gpg_res -eq 0 && test \`echo \$gpg_output | grep -c Good\` -eq 1; then + if test \`echo \$gpg_output | grep -c \$sig_key\` -eq 1; then + test x"\$quiet" = xn && echo "GPG signature is good" >&2 + else + echo "GPG Signature key does not match" >&2 + exit 2 + fi + else + test x"\$quiet" = xn && echo "GPG signature failed to verify" >&2 + exit 2 + fi +} + +MS_Check() +{ + OLD_PATH="\$PATH" + PATH=\${GUESS_MD5_PATH:-"\$OLD_PATH:/bin:/usr/bin:/sbin:/usr/local/ssl/bin:/usr/local/bin:/opt/openssl/bin"} + MD5_ARG="" + MD5_PATH=\`exec <&- 2>&-; which md5sum || command -v md5sum || type md5sum\` + test -x "\$MD5_PATH" || MD5_PATH=\`exec <&- 2>&-; which md5 || command -v md5 || type md5\` + test -x "\$MD5_PATH" || MD5_PATH=\`exec <&- 2>&-; which digest || command -v digest || type digest\` + PATH="\$OLD_PATH" + + SHA_PATH=\`exec <&- 2>&-; which shasum || command -v shasum || type shasum\` + test -x "\$SHA_PATH" || SHA_PATH=\`exec <&- 2>&-; which sha256sum || command -v sha256sum || type sha256sum\` + + if test x"\$quiet" = xn; then + MS_Printf "Verifying archive integrity..." + fi + offset=\`head -n "\$skip" "\$1" | wc -c | tr -d " "\` + fsize=\`cat "\$1" | wc -c | tr -d " "\` + if test \$totalsize -ne \`expr \$fsize - \$offset\`; then + echo " Unexpected archive size." >&2 + exit 2 + fi + verb=\$2 + i=1 + for s in \$filesizes + do + crc=\`echo \$CRCsum | cut -d" " -f\$i\` + if test -x "\$SHA_PATH"; then + if test x"\`basename \$SHA_PATH\`" = xshasum; then + SHA_ARG="-a 256" + fi + sha=\`echo \$SHA | cut -d" " -f\$i\` + if test x"\$sha" = x0000000000000000000000000000000000000000000000000000000000000000; then + test x"\$verb" = xy && echo " \$1 does not contain an embedded SHA256 checksum." 
>&2 + else + shasum=\`MS_dd_Progress "\$1" \$offset \$s | eval "\$SHA_PATH \$SHA_ARG" | cut -b-64\`; + if test x"\$shasum" != x"\$sha"; then + echo "Error in SHA256 checksums: \$shasum is different from \$sha" >&2 + exit 2 + elif test x"\$quiet" = xn; then + MS_Printf " SHA256 checksums are OK." >&2 + fi + crc="0000000000"; + fi + fi + if test -x "\$MD5_PATH"; then + if test x"\`basename \$MD5_PATH\`" = xdigest; then + MD5_ARG="-a md5" + fi + md5=\`echo \$MD5 | cut -d" " -f\$i\` + if test x"\$md5" = x00000000000000000000000000000000; then + test x"\$verb" = xy && echo " \$1 does not contain an embedded MD5 checksum." >&2 + else + md5sum=\`MS_dd_Progress "\$1" \$offset \$s | eval "\$MD5_PATH \$MD5_ARG" | cut -b-32\`; + if test x"\$md5sum" != x"\$md5"; then + echo "Error in MD5 checksums: \$md5sum is different from \$md5" >&2 + exit 2 + elif test x"\$quiet" = xn; then + MS_Printf " MD5 checksums are OK." >&2 + fi + crc="0000000000"; verb=n + fi + fi + if test x"\$crc" = x0000000000; then + test x"\$verb" = xy && echo " \$1 does not contain a CRC checksum." >&2 + else + sum1=\`MS_dd_Progress "\$1" \$offset \$s | CMD_ENV=xpg4 cksum | awk '{print \$1}'\` + if test x"\$sum1" != x"\$crc"; then + echo "Error in checksums: \$sum1 is different from \$crc" >&2 + exit 2 + elif test x"\$quiet" = xn; then + MS_Printf " CRC checksums are OK." >&2 + fi + fi + i=\`expr \$i + 1\` + offset=\`expr \$offset + \$s\` + done + if test x"\$quiet" = xn; then + echo " All good." + fi +} + +MS_Decompress() +{ + if test x"\$decrypt_cmd" != x""; then + { eval "\$decrypt_cmd" || echo " ... Decryption failed." >&2; } | eval "$GUNZIP_CMD" + else + eval "$GUNZIP_CMD" + fi + + if test \$? -ne 0; then + echo " ... Decompression failed." >&2 + fi +} + +UnTAR() +{ + if test x"\$quiet" = xn; then + tar \$1vf - $UNTAR_EXTRA 2>&1 || { echo " ... Extraction failed." >&2; kill -15 \$$; } + else + tar \$1f - $UNTAR_EXTRA 2>&1 || { echo Extraction failed. 
>&2; kill -15 \$$; } + fi +} + +MS_exec_cleanup() { + if test x"\$cleanup" = xy && test x"\$cleanup_script" != x""; then + cleanup=n + cd "\$tmpdir" + eval "\"\$cleanup_script\" \$scriptargs \$cleanupargs" + fi +} + +MS_cleanup() +{ + echo 'Signal caught, cleaning up' >&2 + MS_exec_cleanup + cd "\$TMPROOT" + rm -rf "\$tmpdir" + eval \$finish; exit 15 +} + +Script_Args_Check() +{ + script_supported_args=\$(echo \${helpheader} | grep -o -E "\-\-[^ ]+" | awk -F"=" {'print \$1'}) + arg_to_test=\$(echo \$1|awk -F"=" {'print \$1'}) + + for arg in \${script_supported_args}; + do + if test x"\$arg_to_test" = x"\$arg" ;then + return + fi + done + + MS_Help + exit 1 +} + +finish=true +xterm_loop= +noprogress=$NOPROGRESS +nox11=$NOX11 +copy=$COPY +ownership=$OWNERSHIP +verbose=n +cleanup=y +cleanupargs= +sig_key= + +initargs="\$@" + +while [ -n "\$*" ] +do + case "\$1" in + -h | --help) + MS_Help + exit 0 + ;; + -q | --quiet) + quiet=y + noprogress=y + shift + ;; + --info) + echo Identification: "\$label" + echo Target directory: "\$targetdir" + echo Uncompressed size: $USIZE KB + echo Compression: $COMPRESS + if test x"$ENCRYPT" != x""; then + echo Encryption: $ENCRYPT + fi + echo Date of packaging: $DATE + echo Built with Makeself version $MS_VERSION + echo Build command was: "$MS_COMMAND" + if test x"\$script" != x; then + echo Script run after extraction: + echo " " \$script \$scriptargs + fi + if test x"$copy" = xcopy; then + echo "Archive will copy itself to a temporary location" + fi + if test x"$NEED_ROOT" = xy; then + echo "Root permissions required for extraction" + fi + if test x"$KEEP" = xy; then + echo "directory \$targetdir is permanent" + else + echo "\$targetdir will be removed after extraction" + fi + exit 0 + ;; + --list) + echo Target directory: \$targetdir + offset=\`head -n "\$skip" "\$0" | wc -c | tr -d " "\` + for s in \$filesizes + do + MS_dd "\$0" \$offset \$s | MS_Decompress | UnTAR t + offset=\`expr \$offset + \$s\` + done + exit 0 + ;; + --tar) + 
offset=\`head -n "\$skip" "\$0" | wc -c | tr -d " "\` + arg1="\$2" + shift 2 || { MS_Help; exit 1; } + for s in \$filesizes + do + MS_dd "\$0" \$offset \$s | MS_Decompress | tar "\$arg1" - "\$@" + offset=\`expr \$offset + \$s\` + done + exit 0 + ;; + --check) + MS_Check "\$0" y + scriptargs="\$scriptargs \$1" + shift + ;; + --noexec) + script="" + cleanup_script="" + shift + ;; + --extract=*) + keep=y + targetdir=\`echo \$1 | cut -d"=" -f2 \` + if ! shift; then MS_Help; exit 1; fi + ;; + --nox11) + nox11=y + shift + ;; + --xwin) + if test "$NOWAIT" = n; then + finish="echo Press Return to close this window...; read junk" + fi + xterm_loop=1 + shift + ;; + --phase2) + copy=phase2 + shift + ;; + --repack | --repack-path=*) + Script_Args_Check \$1 + scriptargs="\$scriptargs '\$1'" + shift + if [[ ! "\$1" =~ ^-.* ]]; then + scriptargs="\$scriptargs '\$1'" + shift + fi + ;; + *) + Script_Args_Check \$1 + scriptargs="\$scriptargs '\$1'" + shift + ;; + esac +done + +quiet_para="" +if test x"\$quiet" = xy; then + quiet_para="--quiet " +fi +scriptargs="--\$name_of_file""--\"\$pwd_of_file\""" \$quiet_para""\$scriptargs" + +if test x"\$quiet" = xy -a x"\$verbose" = xy; then + echo Cannot be verbose and quiet at the same time. >&2 + exit 1 +fi + +if test x"$NEED_ROOT" = xy -a \`id -u\` -ne 0; then + echo "Administrative privileges required for this archive (use su or sudo)" >&2 + exit 1 +fi + +if test x"\$copy" \!= xphase2; then + MS_PrintLicense +fi + +case "\$copy" in +copy) + tmpdir="\$TMPROOT"/makeself.\$RANDOM.\`date +"%y%m%d%H%M%S"\`.\$\$ + mkdir "\$tmpdir" || { + echo "Could not create temporary directory \$tmpdir" >&2 + exit 1 + } + SCRIPT_COPY="\$tmpdir/makeself" + echo "Copying to a temporary location..." >&2 + cp "\$0" "\$SCRIPT_COPY" + chmod +x "\$SCRIPT_COPY" + cd "\$TMPROOT" + exec "\$SCRIPT_COPY" --phase2 -- \$initargs + ;; +phase2) + finish="\$finish ; rm -rf \`dirname \$0\`" + ;; +esac + +if test x"\$nox11" = xn; then + if tty -s; then # Do we have a terminal? 
+ : + else + if test x"\$DISPLAY" != x -a x"\$xterm_loop" = x; then # No, but do we have X? + if xset q > /dev/null 2>&1; then # Check for valid DISPLAY variable + GUESS_XTERMS="xterm gnome-terminal rxvt dtterm eterm Eterm xfce4-terminal lxterminal kvt konsole aterm terminology" + for a in \$GUESS_XTERMS; do + if type \$a >/dev/null 2>&1; then + XTERM=\$a + break + fi + done + chmod a+x \$0 || echo Please add execution rights on \$0 + if test \`echo "\$0" | cut -c1\` = "/"; then # Spawn a terminal! + exec \$XTERM -e "\$0 --xwin \$initargs" + else + exec \$XTERM -e "./\$0 --xwin \$initargs" + fi + fi + fi + fi +fi + +if test x"\$targetdir" = x.; then + tmpdir="." +else + if test x"\$keep" = xy; then + if test x"\$nooverwrite" = xy && test -d "\$targetdir"; then + echo "Target directory \$targetdir already exists, aborting." >&2 + exit 1 + fi + if test x"\$quiet" = xn; then + echo "Creating directory \$targetdir" >&2 + fi + tmpdir="\$targetdir" + dashp="-p" + else + tmpdir="\$TMPROOT/selfgz\$\$\$RANDOM" + dashp="" + fi + mkdir \$dashp "\$tmpdir" || { + echo 'Cannot create target directory' \$tmpdir >&2 + echo 'You should try option --extract=' >&2 + eval \$finish + exit 1 + } +fi + +location="\`pwd\`" +if test x"\$SETUP_NOCHECK" != x1; then + MS_Check "\$0" +fi +offset=\`head -n "\$skip" "\$0" | wc -c | tr -d " "\` + +if test x"\$verbose" = xy; then + MS_Printf "About to extract $USIZE KB in \$tmpdir ... Proceed ? [Y/n] " + read yn + if test x"\$yn" = xn; then + eval \$finish; exit 1 + fi +fi + +if test x"\$quiet" = xn; then + # Decrypting with openssl will ask for password, + # the prompt needs to start on new line + if test x"$ENCRYPT" = x"openssl"; then + echo "Decrypting and uncompressing \$label..." 
+ else + MS_Printf "Uncompressing \$label" + fi +fi +res=3 +if test x"\$keep" = xn; then + trap MS_cleanup 1 2 3 15 +fi + +if test x"\$nodiskspace" = xn; then + leftspace=\`MS_diskspace "\$tmpdir"\` + if test -n "\$leftspace"; then + if test "\$leftspace" -lt $USIZE; then + echo + echo "Not enough space left in "\`dirname \$tmpdir\`" (\$leftspace KB) to decompress \$0 ($USIZE KB)" >&2 + if test x"\$keep" = xn; then + echo "Consider setting TMPDIR to a directory with more free space." + fi + eval \$finish; exit 1 + fi + fi +fi + +for s in \$filesizes +do + if MS_dd_Progress "\$0" \$offset \$s | MS_Decompress | ( cd "\$tmpdir"; umask \$ORIG_UMASK ; UnTAR xp ) 1>/dev/null; then + if test x"\$ownership" = xy; then + (cd "\$tmpdir"; chown -R \`id -u\` .; chgrp -R \`id -g\` .) + fi + else + echo >&2 + echo "Unable to decompress \$0" >&2 + eval \$finish; exit 1 + fi + offset=\`expr \$offset + \$s\` +done +if test x"\$quiet" = xn; then + echo +fi + +cd "\$tmpdir" +res=0 +if test x"\$script" != x; then + if test x"\$export_conf" = x"y"; then + MS_BUNDLE="\$0" + MS_LABEL="\$label" + MS_SCRIPT="\$script" + MS_SCRIPTARGS="\$scriptargs" + MS_ARCHDIRNAME="\$archdirname" + MS_KEEP="\$KEEP" + MS_NOOVERWRITE="\$NOOVERWRITE" + MS_COMPRESS="\$COMPRESS" + MS_CLEANUP="\$cleanup" + export MS_BUNDLE MS_LABEL MS_SCRIPT MS_SCRIPTARGS + export MS_ARCHDIRNAME MS_KEEP MS_NOOVERWRITE MS_COMPRESS + fi + + if test x"\$verbose" = x"y"; then + yn="x" + while test x"\$yn" != x -a x"\$yn" != xy -a x"\$yn" != xY -a x"\$yn" != xn -a x"\$yn" != xN + do + MS_Printf "OK to execute: \$script \$scriptargs \$* ? [Y/n] " + read yn + if test x"\$yn" = x -o x"\$yn" = xy -o x"\$yn" = xY; then + eval "\"\$script\" \$scriptargs \"\\\$@\""; res=\$?; + elif test x"\$yn" = xn -o x"\$yn" = xN; then + echo "Unable to decompress \$script ,because of aborting! ";res=\$? + else + echo "Input value is unacceptable,please try again." + fi + done + else + eval "\"\$script\" \$scriptargs \"\\\$@\""; res=\$? 
+ fi + if test "\$res" -ne 0; then + test x"\$verbose" = xy && echo "The program '\$script' returned an error code (\$res)" >&2 + fi +fi + +MS_exec_cleanup + +if test x"\$keep" = xn; then + cd "\$TMPROOT" + rm -rf "\$tmpdir" +fi +eval \$finish; exit \$res +EOF diff --git a/csrc/deepep/ops2/cmake/util/makeself/makeself.1 b/csrc/deepep/ops2/cmake/util/makeself/makeself.1 new file mode 100644 index 000000000..81bf6e4ff --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/makeself/makeself.1 @@ -0,0 +1,110 @@ +.TH "MAKESELF" "1" "2.4.5" +.SH "NAME" +makeself \- A utility to generate self-extractable archives. +.SH "SYNTAX" +.B makeself [\fIoptions\fP] archive_dir file_name label +.B [\fIstartup_script\fP] [\fIargs\fP] +.SH "DESCRIPTION" +This program is a free (GPL) utility designed to create self-extractable +archives from a directory. +.SH "OPTIONS" +The following options are supported. +.TP 15 +.B -v, --version +Prints out the makeself version number and exits. +.TP +.B -h, --help +Print out help information. +.TP +.B --tar-quietly +Suppress verbose output from the tar command +.TP +.B --quiet +Do not print any messages other than errors +.TP +.B --gzip +Compress using gzip (default if detected). +.TP +.B --bzip2 +Compress using bzip2. +.TP +.B --pbzip2 +Compress using pbzip2. +.TP +.B --xz +Compress using xz. +.TP +.B --lzo +Compress using lzop. +.TP +.B --lz4 +Compress using lz4. +.TP +.B --compress +Compress using the UNIX 'compress' command. +.TP +.B --nocomp +Do not compress the data. +.TP +.B --complevel lvl +Specify the compression level for gzip,bzip2,pbzip2,xz,lzo or lz4 +.TP +.B --notemp +The archive will create archive_dir in the current directory and +uncompress in ./archive_dir. +.TP +.B --copy +Upon extraction, the archive will first copy itself to a temporary directory. +.TP +.B --append +Append more files to an existing makeself archive. The label and startup scripts will then be ignored. 
+.TP +.B --current +Files will be extracted to the current directory. Both --current and --target dir imply --notemp. +.TP +.B --target dir +Extract directly to a target directory. Directory path can be either absolute or relative. +.TP +.B --header file +Specify location of the header script. +.TP +.B --cleanup file +Specify a cleanup script that executes on interrupt and when finished successfully. +.TP +.B --follow +Follow the symlinks in the archive. +.TP +.B --noprogress +Do not show the progress during the decompression. +.TP +.B --nox11 +Disable automatic spawn of an xterm if running in X11. +.TP +.B --nowait +Do not wait for user input after executing embedded program from an xterm. +.TP +.B --nomd5 +Do not create a MD5 checksum for the archive. +.TP +.B --nocrc +Do not create a CRC32 checksum for the archive. +.TP +.B --lsm file +LSM file describing the package. +.B --packaging-date date +Use provided string as the packaging date instead of the current date. +.SH "EXAMPLES" +Here is an example, assuming the user has a package image stored in a /home/joe/mysoft, +and he wants to generate a self-extracting package named mysoft.sh, which will launch +the "setup" script initially stored in /home/joe/mysoft: +.TP +makeself.sh /home/joe/mysoft mysoft.sh "Joe's Nice Software Package" ./setup +.TP +Here is also how I created the makeself.run archive which contains the Makeself distribution: +.TP +makeself.sh --notemp makeself makeself.run "Makeself by Stephane Peter" echo "Makeself has extracted itself" +.SH "AUTHORS" +Makeself has been written by Stéphane Peter . +.BR +This man page was originally written by Bartosz Fenski for the +Debian GNU/Linux distribution (but it may be used by others). 
diff --git a/csrc/deepep/ops2/cmake/util/makeself/makeself.lsm b/csrc/deepep/ops2/cmake/util/makeself/makeself.lsm new file mode 100644 index 000000000..802cada3a --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/makeself/makeself.lsm @@ -0,0 +1,16 @@ +Begin3 +Title: makeself.sh +Version: 2.4.5 +Description: makeself.sh is a shell script that generates a self-extractable + tar.gz archive from a directory. The resulting file appears as a shell + script, and can be launched as is. The archive will then uncompress + itself to a temporary directory and an arbitrary command will be + executed (for example an installation script). This is pretty similar + to archives generated with WinZip Self-Extractor in the Windows world. +Keywords: Installation archive tar winzip +Author: Stephane Peter (megastep@megastep.org) +Maintained-by: Stephane Peter (megastep@megastep.org) +Original-site: https://makeself.io/ +Platform: Unix +Copying-policy: GPL +End diff --git a/csrc/deepep/ops2/cmake/util/makeself/makeself.sh b/csrc/deepep/ops2/cmake/util/makeself/makeself.sh new file mode 100755 index 000000000..60ced4a8a --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/makeself/makeself.sh @@ -0,0 +1,822 @@ +#!/bin/sh +# +# Makeself version 2.4.x +# by Stephane Peter +# +# Utility to create self-extracting tar.gz archives. +# The resulting archive is a file holding the tar.gz archive with +# a small Shell script stub that uncompresses the archive to a temporary +# directory and then executes a given script from within that directory. +# +# Makeself home page: https://makeself.io/ +# +# Version 2.0 is a rewrite of version 1.0 to make the code easier to read and maintain. +# +# Version history : +# - 1.0 : Initial public release +# - 1.1 : The archive can be passed parameters that will be passed on to +# the embedded script, thanks to John C. Quillan +# - 1.2 : Package distribution, bzip2 compression, more command line options, +# support for non-temporary archives. 
Ideas thanks to Francois Petitjean +# - 1.3 : More patches from Bjarni R. Einarsson and Francois Petitjean: +# Support for no compression (--nocomp), script is no longer mandatory, +# automatic launch in an xterm, optional verbose output, and -target +# archive option to indicate where to extract the files. +# - 1.4 : Improved UNIX compatibility (Francois Petitjean) +# Automatic integrity checking, support of LSM files (Francois Petitjean) +# - 1.5 : Many bugfixes. Optionally disable xterm spawning. +# - 1.5.1 : More bugfixes, added archive options -list and -check. +# - 1.5.2 : Cosmetic changes to inform the user of what's going on with big +# archives (Quake III demo) +# - 1.5.3 : Check for validity of the DISPLAY variable before launching an xterm. +# More verbosity in xterms and check for embedded command's return value. +# Bugfix for Debian 2.0 systems that have a different "print" command. +# - 1.5.4 : Many bugfixes. Print out a message if the extraction failed. +# - 1.5.5 : More bugfixes. Added support for SETUP_NOCHECK environment variable to +# bypass checksum verification of archives. +# - 1.6.0 : Compute MD5 checksums with the md5sum command (patch from Ryan Gordon) +# - 2.0 : Brand new rewrite, cleaner architecture, separated header and UNIX ports. +# - 2.0.1 : Added --copy +# - 2.1.0 : Allow multiple tarballs to be stored in one archive, and incremental updates. +# Added --nochown for archives +# Stopped doing redundant checksums when not necessary +# - 2.1.1 : Work around insane behavior from certain Linux distros with no 'uncompress' command +# Cleaned up the code to handle error codes from compress. Simplified the extraction code. +# - 2.1.2 : Some bug fixes. Use head -n to avoid problems. +# - 2.1.3 : Bug fixes with command line when spawning terminals. +# Added --tar for archives, allowing to give arbitrary arguments to tar on the contents of the archive. +# Added --noexec to prevent execution of embedded scripts. 
+# Added --nomd5 and --nocrc to avoid creating checksums in archives. +# Added command used to create the archive in --info output. +# Run the embedded script through eval. +# - 2.1.4 : Fixed --info output. +# Generate random directory name when extracting files to . to avoid problems. (Jason Trent) +# Better handling of errors with wrong permissions for the directory containing the files. (Jason Trent) +# Avoid some race conditions (Ludwig Nussel) +# Unset the $CDPATH variable to avoid problems if it is set. (Debian) +# Better handling of dot files in the archive directory. +# - 2.1.5 : Made the md5sum detection consistent with the header code. +# Check for the presence of the archive directory +# Added --encrypt for symmetric encryption through gpg (Eric Windisch) +# Added support for the digest command on Solaris 10 for MD5 checksums +# Check for available disk space before extracting to the target directory (Andreas Schweitzer) +# Allow extraction to run asynchronously (patch by Peter Hatch) +# Use file descriptors internally to avoid error messages (patch by Kay Tiong Khoo) +# - 2.1.6 : Replaced one dot per file progress with a realtime progress percentage and a spinning cursor (Guy Baconniere) +# Added --noprogress to prevent showing the progress during the decompression (Guy Baconniere) +# Added --target dir to allow extracting directly to a target directory (Guy Baconniere) +# - 2.2.0 : Many bugfixes, updates and contributions from users. Check out the project page on Github for the details. +# - 2.3.0 : Option to specify packaging date to enable byte-for-byte reproducibility. (Marc Pawlowsky) +# - 2.4.0 : Optional support for SHA256 checksums in archives. +# - 2.4.2 : Add support for threads for several compressors. (M. Limber) +# Added zstd support. +# - 2.4.3 : Make explicit POSIX tar archives for increased compatibility. 
+# - 2.4.5 : Added --tar-format to override ustar tar archive format +# +# (C) 1998-2021 by Stephane Peter +# +# This software is released under the terms of the GNU GPL version 2 and above +# Please read the license at http://www.gnu.org/copyleft/gpl.html +# Self-extracting archives created with this script are explicitly NOT released under the term of the GPL +# + +MS_VERSION=2.4.5 +MS_COMMAND="$0" +unset CDPATH + +for f in ${1+"$@"}; do + MS_COMMAND="$MS_COMMAND \\\\ + \\\"$f\\\"" +done + +# For Solaris systems +if test -d /usr/xpg4/bin; then + PATH=/usr/xpg4/bin:$PATH + export PATH +fi + +# Procedures + +MS_Usage() +{ + echo "Usage: $0 [args] archive_dir file_name label startup_script [script_args]" + echo "args can be one or more of the following :" + echo " --version | -v : Print out Makeself version number and exit" + echo " --help | -h : Print out this help message" + echo " --tar-quietly : Suppress verbose output from the tar command" + echo " --quiet | -q : Do not print any messages other than errors." + echo " --gzip : Compress using gzip (default if detected)" + echo " --pigz : Compress with pigz" + echo " --zstd : Compress with zstd" + echo " --bzip2 : Compress using bzip2 instead of gzip" + echo " --pbzip2 : Compress using pbzip2 instead of gzip" + echo " --xz : Compress using xz instead of gzip" + echo " --lzo : Compress using lzop instead of gzip" + echo " --lz4 : Compress using lz4 instead of gzip" + echo " --compress : Compress using the UNIX 'compress' command" + echo " --complevel lvl : Compression level for gzip pigz zstd xz lzo lz4 bzip2 and pbzip2 (default 9)" + echo " --threads thds : Number of threads to be used by compressors that support parallelization." + echo " Omit to use compressor's default. Most useful (and required) for opting" + echo " into xz's threading, usually with '--threads=0' for all available cores." 
+ echo " pbzip2 and pigz are parallel by default, and setting this value allows" + echo " limiting the number of threads they use." + echo " --base64 : Instead of compressing, encode the data using base64" + echo " --gpg-encrypt : Instead of compressing, encrypt the data using GPG" + echo " --gpg-asymmetric-encrypt-sign" + echo " : Instead of compressing, asymmetrically encrypt and sign the data using GPG" + echo " --gpg-extra opt : Append more options to the gpg command line" + echo " --ssl-encrypt : Instead of compressing, encrypt the data using OpenSSL" + echo " --ssl-passwd pass : Use the given password to encrypt the data using OpenSSL" + echo " --ssl-pass-src src : Use the given src as the source of password to encrypt the data" + echo " using OpenSSL. See \"PASS PHRASE ARGUMENTS\" in man openssl." + echo " If this option is not supplied, the user will be asked to enter" + echo " encryption password on the current terminal." + echo " --ssl-no-md : Do not use \"-md\" option not supported by older OpenSSL." 
+ echo " --nochown : Do not give the target folder to the current user (default)" + echo " --chown : Give the target folder to the current user recursively" + echo " --nocomp : Do not compress the data" + echo " --notemp : The archive will create archive_dir in the" + echo " current directory and uncompress in ./archive_dir" + echo " --needroot : Check that the root user is extracting the archive before proceeding" + echo " --copy : Upon extraction, the archive will first copy itself to" + echo " a temporary directory" + echo " --append : Append more files to an existing Makeself archive" + echo " The label and startup scripts will then be ignored" + echo " --target dir : Extract directly to a target directory" + echo " directory path can be either absolute or relative" + echo " --nooverwrite : Do not extract the archive if the specified target directory exists" + echo " --current : Files will be extracted to the current directory" + echo " Both --current and --target imply --notemp" + echo " --tar-format opt : Specify a tar archive format (default is ustar)" + echo " --tar-extra opt : Append more options to the tar command line" + echo " --untar-extra opt : Append more options to the during the extraction of the tar archive" + echo " --nomd5 : Don't calculate an MD5 for archive" + echo " --nocrc : Don't calculate a CRC for archive" + echo " --sha256 : Compute a SHA256 checksum for the archive" + echo " --header file : Specify location of the header script" + echo " --cleanup file : Specify a cleanup script that executes on interrupt and when finished successfully." 
+ echo " --follow : Follow the symlinks in the archive" + echo " --noprogress : Do not show the progress during the decompression" + echo " --nox11 : Disable automatic spawn of a xterm" + echo " --nowait : Do not wait for user input after executing embedded" + echo " program from an xterm" + echo " --sign passphrase : Signature private key to sign the package with" + echo " --lsm file : LSM file describing the package" + echo " --license file : Append a license file" + echo " --help-header file : Add a header to the archive's --help output" + echo " --packaging-date date" + echo " : Use provided string as the packaging date" + echo " instead of the current date." + echo + echo " --keep-umask : Keep the umask set to shell default, rather than overriding when executing self-extracting archive." + echo " --export-conf : Export configuration variables to startup_script" + echo + echo "Do not forget to give a fully qualified startup script name" + echo "(i.e. with a ./ prefix if inside the archive)." + exit 1 +} + +# Default settings +if type gzip >/dev/null 2>&1; then + COMPRESS=gzip +elif type compress >/dev/null 2>&1; then + COMPRESS=compress +else + echo "ERROR: missing commands: gzip, compress" >&2 + MS_Usage +fi +ENCRYPT=n +PASSWD="" +PASSWD_SRC="" +OPENSSL_NO_MD=n +COMPRESS_LEVEL=9 +DEFAULT_THREADS=123456 # Sentinel value +THREADS=$DEFAULT_THREADS +KEEP=n +CURRENT=n +NOX11=n +NOWAIT=n +APPEND=n +TAR_QUIETLY=n +KEEP_UMASK=n +QUIET=n +NOPROGRESS=n +COPY=none +NEED_ROOT=n +TAR_ARGS=rvf +TAR_FORMAT=ustar +TAR_EXTRA="" +GPG_EXTRA="" +DU_ARGS=-ks +HEADER=`dirname "$0"`/makeself-header.sh +SIGNATURE="" +TARGETDIR="" +NOOVERWRITE=n +DATE=`LC_ALL=C date` +EXPORT_CONF=n +SHA256=n +OWNERSHIP=n +SIGN=n +GPG_PASSPHRASE="" + +# LSM file stuff +LSM_CMD="echo No LSM. 
>> \"\$archname\"" + +while true +do + case "$1" in + --version | -v) + echo Makeself version $MS_VERSION + exit 0 + ;; + --pbzip2) + COMPRESS=pbzip2 + shift + ;; + --bzip2) + COMPRESS=bzip2 + shift + ;; + --gzip) + COMPRESS=gzip + shift + ;; + --pigz) + COMPRESS=pigz + shift + ;; + --zstd) + COMPRESS=zstd + shift + ;; + --xz) + COMPRESS=xz + shift + ;; + --lzo) + COMPRESS=lzo + shift + ;; + --lz4) + COMPRESS=lz4 + shift + ;; + --compress) + COMPRESS=compress + shift + ;; + --base64) + COMPRESS=base64 + shift + ;; + --gpg-encrypt) + COMPRESS=gpg + shift + ;; + --gpg-asymmetric-encrypt-sign) + COMPRESS=gpg-asymmetric + shift + ;; + --gpg-extra) + GPG_EXTRA="$2" + shift 2 || { MS_Usage; exit 1; } + ;; + --ssl-encrypt) + ENCRYPT=openssl + shift + ;; + --ssl-passwd) + PASSWD=$2 + shift 2 || { MS_Usage; exit 1; } + ;; + --ssl-pass-src) + PASSWD_SRC=$2 + shift 2 || { MS_Usage; exit 1; } + ;; + --ssl-no-md) + OPENSSL_NO_MD=y + shift + ;; + --nocomp) + COMPRESS=none + shift + ;; + --complevel) + COMPRESS_LEVEL="$2" + shift 2 || { MS_Usage; exit 1; } + ;; + --threads) + THREADS="$2" + shift 2 || { MS_Usage; exit 1; } + ;; + --nochown) + OWNERSHIP=n + shift + ;; + --chown) + OWNERSHIP=y + shift + ;; + --notemp) + KEEP=y + shift + ;; + --copy) + COPY=copy + shift + ;; + --current) + CURRENT=y + KEEP=y + shift + ;; + --tar-format) + TAR_FORMAT="$2" + shift 2 || { MS_Usage; exit 1; } + ;; + --tar-extra) + TAR_EXTRA="$2" + shift 2 || { MS_Usage; exit 1; } + ;; + --untar-extra) + UNTAR_EXTRA="$2" + shift 2 || { MS_Usage; exit 1; } + ;; + --target) + TARGETDIR="$2" + KEEP=y + shift 2 || { MS_Usage; exit 1; } + ;; + --sign) + SIGN=y + GPG_PASSPHRASE="$2" + shift 2 || { MS_Usage; exit 1; } + ;; + --nooverwrite) + NOOVERWRITE=y + shift + ;; + --needroot) + NEED_ROOT=y + shift + ;; + --header) + HEADER="$2" + shift 2 || { MS_Usage; exit 1; } + ;; + --cleanup) + CLEANUP_SCRIPT="$2" + shift 2 || { MS_Usage; exit 1; } + ;; + --license) + # We need to escape all characters having a 
special meaning in double quotes + LICENSE=$(sed 's/\\/\\\\/g; s/"/\\\"/g; s/`/\\\`/g; s/\$/\\\$/g' "$2") + shift 2 || { MS_Usage; exit 1; } + ;; + --follow) + TAR_ARGS=rvhf + DU_ARGS=-ksL + shift + ;; + --noprogress) + NOPROGRESS=y + shift + ;; + --nox11) + NOX11=y + shift + ;; + --nowait) + NOWAIT=y + shift + ;; + --nomd5) + NOMD5=y + shift + ;; + --sha256) + SHA256=y + shift + ;; + --nocrc) + NOCRC=y + shift + ;; + --append) + APPEND=y + shift + ;; + --lsm) + LSM_CMD="cat \"$2\" >> \"\$archname\"" + shift 2 || { MS_Usage; exit 1; } + ;; + --packaging-date) + DATE="$2" + shift 2 || { MS_Usage; exit 1; } + ;; + --help-header) + HELPHEADER=`sed -e "s/'/'\\\\\''/g" $2` + shift 2 || { MS_Usage; exit 1; } + [ -n "$HELPHEADER" ] && HELPHEADER="$HELPHEADER +" + ;; + --tar-quietly) + TAR_QUIETLY=y + shift + ;; + --keep-umask) + KEEP_UMASK=y + shift + ;; + --export-conf) + EXPORT_CONF=y + shift + ;; + -q | --quiet) + QUIET=y + shift + ;; + -h | --help) + MS_Usage + ;; + -*) + echo Unrecognized flag : "$1" + MS_Usage + ;; + *) + break + ;; + esac +done + +if test $# -lt 1; then + MS_Usage +else + if test -d "$1"; then + archdir="$1" + else + echo "Directory $1 does not exist." >&2 + exit 1 + fi +fi +archname="$2" + +if test "$QUIET" = "y" || test "$TAR_QUIETLY" = "y"; then + if test "$TAR_ARGS" = "rvf"; then + TAR_ARGS="rf" + elif test "$TAR_ARGS" = "rvhf"; then + TAR_ARGS="rhf" + fi +fi + +if test "$APPEND" = y; then + if test $# -lt 2; then + MS_Usage + fi + + # Gather the info from the original archive + OLDENV=`sh "$archname" --dumpconf` + if test $? -ne 0; then + echo "Unable to update archive: $archname" >&2 + exit 1 + else + eval "$OLDENV" + OLDSKIP=`expr $SKIP + 1` + fi +else + if test "$KEEP" = n -a $# = 3; then + echo "ERROR: Making a temporary archive with no embedded command does not make sense!" >&2 + echo >&2 + MS_Usage + fi + # We don't want to create an absolute directory unless a target directory is defined + if test "$CURRENT" = y; then + archdirname="." 
+ elif test x"$TARGETDIR" != x; then + archdirname="$TARGETDIR" + else + archdirname=`basename "$1"` + fi + + if test $# -lt 3; then + MS_Usage + fi + + LABEL="$3" + SCRIPT="$4" + test "x$SCRIPT" = x || shift 1 + shift 3 + SCRIPTARGS="$*" +fi + +if test "$KEEP" = n -a "$CURRENT" = y; then + echo "ERROR: It is A VERY DANGEROUS IDEA to try to combine --notemp and --current." >&2 + exit 1 +fi + +case $COMPRESS in +gzip) + GZIP_CMD="gzip -c$COMPRESS_LEVEL" + GUNZIP_CMD="gzip -cd" + ;; +pigz) + GZIP_CMD="pigz -$COMPRESS_LEVEL" + if test $THREADS -ne $DEFAULT_THREADS; then # Leave as the default if threads not indicated + GZIP_CMD="$GZIP_CMD --processes $THREADS" + fi + GUNZIP_CMD="gzip -cd" + ;; +zstd) + GZIP_CMD="zstd -$COMPRESS_LEVEL" + if test $THREADS -ne $DEFAULT_THREADS; then # Leave as the default if threads not indicated + GZIP_CMD="$GZIP_CMD --threads=$THREADS" + fi + GUNZIP_CMD="zstd -cd" + ;; +pbzip2) + GZIP_CMD="pbzip2 -c$COMPRESS_LEVEL" + if test $THREADS -ne $DEFAULT_THREADS; then # Leave as the default if threads not indicated + GZIP_CMD="$GZIP_CMD -p$THREADS" + fi + GUNZIP_CMD="bzip2 -d" + ;; +bzip2) + GZIP_CMD="bzip2 -$COMPRESS_LEVEL" + GUNZIP_CMD="bzip2 -d" + ;; +xz) + GZIP_CMD="xz -c$COMPRESS_LEVEL" + # Must opt-in by specifying a value since not all versions of xz support threads + if test $THREADS -ne $DEFAULT_THREADS; then + GZIP_CMD="$GZIP_CMD --threads=$THREADS" + fi + GUNZIP_CMD="xz -d" + ;; +lzo) + GZIP_CMD="lzop -c$COMPRESS_LEVEL" + GUNZIP_CMD="lzop -d" + ;; +lz4) + GZIP_CMD="lz4 -c$COMPRESS_LEVEL" + GUNZIP_CMD="lz4 -d" + ;; +base64) + GZIP_CMD="base64" + GUNZIP_CMD="base64 --decode -i -" + ;; +gpg) + GZIP_CMD="gpg $GPG_EXTRA -ac -z$COMPRESS_LEVEL" + GUNZIP_CMD="gpg -d" + ENCRYPT="gpg" + ;; +gpg-asymmetric) + GZIP_CMD="gpg $GPG_EXTRA -z$COMPRESS_LEVEL -es" + GUNZIP_CMD="gpg --yes -d" + ENCRYPT="gpg" + ;; +compress) + GZIP_CMD="compress -fc" + GUNZIP_CMD="(type compress >/dev/null 2>&1 && compress -fcd || gzip -cd)" + ;; +none) + GZIP_CMD="cat" 
+ GUNZIP_CMD="cat" + ;; +esac + +if test x"$ENCRYPT" = x"openssl"; then + if test x"$APPEND" = x"y"; then + echo "Appending to existing archive is not compatible with OpenSSL encryption." >&2 + fi + + ENCRYPT_CMD="openssl enc -aes-256-cbc -salt" + DECRYPT_CMD="openssl enc -aes-256-cbc -d" + + if test x"$OPENSSL_NO_MD" != x"y"; then + ENCRYPT_CMD="$ENCRYPT_CMD -md sha256" + DECRYPT_CMD="$DECRYPT_CMD -md sha256" + fi + + if test -n "$PASSWD_SRC"; then + ENCRYPT_CMD="$ENCRYPT_CMD -pass $PASSWD_SRC" + elif test -n "$PASSWD"; then + ENCRYPT_CMD="$ENCRYPT_CMD -pass pass:$PASSWD" + fi +fi + +tmpfile="${TMPDIR:-/tmp}/mkself$$" + +if test -f "$HEADER"; then + oldarchname="$archname" + archname="$tmpfile" + # Generate a fake header to count its lines + SKIP=0 + . "$HEADER" + SKIP=`cat "$tmpfile" |wc -l` + # Get rid of any spaces + SKIP=`expr $SKIP` + rm -f "$tmpfile" + if test "$QUIET" = "n"; then + echo "Header is $SKIP lines long" >&2 + fi + archname="$oldarchname" +else + echo "Unable to open header file: $HEADER" >&2 + exit 1 +fi + +if test "$QUIET" = "n"; then + echo +fi + +if test "$APPEND" = n; then + if test -f "$archname"; then + echo "WARNING: Overwriting existing file: $archname" >&2 + fi +fi + +USIZE=`du $DU_ARGS "$archdir" | awk '{print $1}'` + +if test "." = "$archdirname"; then + if test "$KEEP" = n; then + archdirname="makeself-$$-`date +%Y%m%d%H%M%S`" + fi +fi + +test -d "$archdir" || { echo "Error: $archdir does not exist."; rm -f "$tmpfile"; exit 1; } +if test "$QUIET" = "n"; then + echo "About to compress $USIZE KB of data..." + echo "Adding files to archive named \"$archname\"..." 
+fi + +# See if we have GNU tar +TAR=`exec <&- 2>&-; which gtar || command -v gtar || type gtar` +test -x "$TAR" || TAR=tar + +tmparch="${TMPDIR:-/tmp}/mkself$$.tar" +( + if test "$APPEND" = "y"; then + tail -n "+$OLDSKIP" "$archname" | eval "$GUNZIP_CMD" > "$tmparch" + fi + cd "$archdir" + # "Determining if a directory is empty" + # https://www.etalabs.net/sh_tricks.html + find . \ + \( \ + ! -type d \ + -o \ + \( -links 2 -exec sh -c ' + is_empty () ( + cd "$1" + set -- .[!.]* ; test -f "$1" && return 1 + set -- ..?* ; test -f "$1" && return 1 + set -- * ; test -f "$1" && return 1 + return 0 + ) + is_empty "$0"' {} \; \ + \) \ + \) -print \ + | LC_ALL=C sort \ + | sed 's/./\\&/g' \ + | xargs $TAR $TAR_EXTRA --format $TAR_FORMAT -$TAR_ARGS "$tmparch" +) || { + echo "ERROR: failed to create temporary archive: $tmparch" + rm -f "$tmparch" "$tmpfile" + exit 1 +} + +USIZE=`du $DU_ARGS "$tmparch" | awk '{print $1}'` + +eval "$GZIP_CMD" <"$tmparch" >"$tmpfile" || { + echo "ERROR: failed to create temporary file: $tmpfile" + rm -f "$tmparch" "$tmpfile" + exit 1 +} +rm -f "$tmparch" + +if test x"$ENCRYPT" = x"openssl"; then + echo "About to encrypt archive \"$archname\"..." 
+ { eval "$ENCRYPT_CMD -in $tmpfile -out ${tmpfile}.enc" && mv -f ${tmpfile}.enc $tmpfile; } || \ + { echo Aborting: could not encrypt temporary file: "$tmpfile".; rm -f "$tmpfile"; exit 1; } +fi + +fsize=`cat "$tmpfile" | wc -c | tr -d " "` + +# Compute the checksums + +shasum=0000000000000000000000000000000000000000000000000000000000000000 +md5sum=00000000000000000000000000000000 +crcsum=0000000000 + +if test "$NOCRC" = y; then + if test "$QUIET" = "n"; then + echo "skipping crc at user request" + fi +else + crcsum=`CMD_ENV=xpg4 cksum < "$tmpfile" | sed -e 's/ /Z/' -e 's/ /Z/' | cut -dZ -f1` + if test "$QUIET" = "n"; then + echo "CRC: $crcsum" + fi +fi + +if test "$SHA256" = y; then + SHA_PATH=`exec <&- 2>&-; which shasum || command -v shasum || type shasum` + if test -x "$SHA_PATH"; then + shasum=`eval "$SHA_PATH -a 256" < "$tmpfile" | cut -b-64` + else + SHA_PATH=`exec <&- 2>&-; which sha256sum || command -v sha256sum || type sha256sum` + shasum=`eval "$SHA_PATH" < "$tmpfile" | cut -b-64` + fi + if test "$QUIET" = "n"; then + if test -x "$SHA_PATH"; then + echo "SHA256: $shasum" + else + echo "SHA256: none, SHA command not found" + fi + fi +fi +if test "$NOMD5" = y; then + if test "$QUIET" = "n"; then + echo "Skipping md5sum at user request" + fi +else + # Try to locate a MD5 binary + OLD_PATH=$PATH + PATH=${GUESS_MD5_PATH:-"$OLD_PATH:/bin:/usr/bin:/sbin:/usr/local/ssl/bin:/usr/local/bin:/opt/openssl/bin"} + MD5_ARG="" + MD5_PATH=`exec <&- 2>&-; which md5sum || command -v md5sum || type md5sum` + test -x "$MD5_PATH" || MD5_PATH=`exec <&- 2>&-; which md5 || command -v md5 || type md5` + test -x "$MD5_PATH" || MD5_PATH=`exec <&- 2>&-; which digest || command -v digest || type digest` + PATH=$OLD_PATH + if test -x "$MD5_PATH"; then + if test `basename ${MD5_PATH}`x = digestx; then + MD5_ARG="-a md5" + fi + md5sum=`eval "$MD5_PATH $MD5_ARG" < "$tmpfile" | cut -b-32` + if test "$QUIET" = "n"; then + echo "MD5: $md5sum" + fi + else + if test "$QUIET" = "n"; then + 
echo "MD5: none, MD5 command not found" + fi + fi +fi +if test "$SIGN" = y; then + GPG_PATH=`exec <&- 2>&-; which gpg || command -v gpg || type gpg` + if test -x "$GPG_PATH"; then + SIGNATURE=`$GPG_PATH --pinentry-mode=loopback --batch --yes --passphrase "$GPG_PASSPHRASE" --output - --detach-sig $tmpfile | base64 | tr -d \\\\n` + if test "$QUIET" = "n"; then + echo "Signature: $SIGNATURE" + fi + else + echo "Missing gpg command" >&2 + fi +fi + +totalsize=0 +for size in $fsize; +do + totalsize=`expr $totalsize + $size` +done + +if test "$APPEND" = y; then + mv "$archname" "$archname".bak || exit + + # Prepare entry for new archive + filesizes="$fsize" + CRCsum="$crcsum" + MD5sum="$md5sum" + SHAsum="$shasum" + Signature="$SIGNATURE" + # Generate the header + . "$HEADER" + # Append the new data + cat "$tmpfile" >> "$archname" + + chmod +x "$archname" + rm -f "$archname".bak + if test "$QUIET" = "n"; then + echo "Self-extractable archive \"$archname\" successfully updated." + fi +else + filesizes="$fsize" + CRCsum="$crcsum" + MD5sum="$md5sum" + SHAsum="$shasum" + Signature="$SIGNATURE" + + # Generate the header + . "$HEADER" + + # Append the compressed tar data after the stub + if test "$QUIET" = "n"; then + echo + fi + cat "$tmpfile" >> "$archname" + chmod +x "$archname" + if test "$QUIET" = "n"; then + echo Self-extractable archive \"$archname\" successfully created. + fi +fi +rm -f "$tmpfile" diff --git a/csrc/deepep/ops2/cmake/util/makeself/run-tests.sh b/csrc/deepep/ops2/cmake/util/makeself/run-tests.sh new file mode 100755 index 000000000..31ee16511 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/makeself/run-tests.sh @@ -0,0 +1,8 @@ +#!/bin/sh +# Run every available test - Bash needed +cd test +for test in *test; +do + echo "Running test $test ..." 
+ bash $test || { echo "*** ERROR: Test '$test' failed!"; exit 1; } +done diff --git a/csrc/deepep/ops2/cmake/util/merge_aicpu_info_json.sh b/csrc/deepep/ops2/cmake/util/merge_aicpu_info_json.sh new file mode 100755 index 000000000..970a44bf1 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/merge_aicpu_info_json.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +project_path=$1 +build_path=$2 +vendor_name=customize +echo $@ +if [[ ! -d "$project_path" ]]; then + echo "[ERROR] No project path is provided" + exit 1 +fi + +if [[ ! -d "$build_path" ]]; then + echo "[ERROR] No build path is provided" + exit 1 +fi + +if [[ ! -d "$ASCEND_OPP_PATH" ]]; then + echo "[ERROR] No opp install path is provided" + exit 1 +fi +custom_exist_info_json=$ASCEND_OPP_PATH/vendors/$vendor_name/op_impl/cpu/config/cust_aicpu_kernel.json +custom_new_info_json=$build_path/makepkg/packages/vendors/$vendor_name/op_impl/cpu/config/cust_aicpu_kernel.json +temp_info_json=$build_path/makepkg/packages/vendors/$vendor_name/op_impl/cpu/config/temp_cust_aicpu_kernel.json + +if [[ -f "$custom_exist_info_json" ]] && [[ -f "$custom_new_info_json" ]]; then + cp -f $custom_exist_info_json $temp_info_json + chmod +w $temp_info_json + python3 ${project_path}/cmake/util/insert_op_info.py ${custom_new_info_json} ${temp_info_json} + cp -f $temp_info_json $custom_new_info_json + rm -f $temp_info_json +fi diff --git a/csrc/deepep/ops2/cmake/util/opdesc_parser.py b/csrc/deepep/ops2/cmake/util/opdesc_parser.py new file mode 100755 index 000000000..7b789567a --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/opdesc_parser.py @@ -0,0 +1,415 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import os +import sys + +OP_ALL = "__ALLOP__" +SOC_ALL = "__ALLSOC__" +SOC_TO_SHORT_SOC_MAP = { + "ascend910a": "ascend910", + "ascend910proa": "ascend910", + "ascend910b": "ascend910", + "ascend910prob": "ascend910", + "ascend910premiuma": "ascend910", + "ascend910b1": "ascend910b", + "ascend910b2": "ascend910b", + "ascend910b2c": "ascend910b", 
+ "ascend910b3": "ascend910b", + "ascend910b4": "ascend910b", + "ascend910b4-1": "ascend910b", + "ascend910_9391": "ascend910_93", + "ascend910_9381": "ascend910_93", + "ascend910_9372": "ascend910_93", + "ascend910_9392": "ascend910_93", + "ascend910_9382": "ascend910_93", + "ascend910_9361": "ascend910_93", + "ascend310p1": "ascend310p", + "ascend310p3": "ascend310p", + "ascend310p3vir01": "ascend310p", + "ascend310p3vir02": "ascend310p", + "ascend310p3vir04": "ascend310p", + "ascend310p3vir08": "ascend310p", + "ascend310b1": "ascend310b", + "bs9sx1aa": "bs9sx1a", + "ascend610lite": "ascend610lite", +} +CONFLICT_KEYWORDS = { + "and", + "as", + "assert", + "break", + "class", + "continue", + "def", + "del", + "elif", + "else", + "except", + "finally", + "for", + "from", + "global", + "if", + "import", + "in", + "is", + "lambda", + "not", + "or", + "pass", + "raise", + "return", + "try", + "while", + "with", + "yield", + "False", + "None", + "True", + "nonlocal", + "arg", + "__inputs__", + "__outputs__", + "options", + "bisheng", + "bisheng_path", + "tikcpp_path", + "impl_mode", + "custom_compile_options", + "custom_all_compile_options", + "soc_version", + "soc_short", + "custom_compile_options_soc", + "custom_all_compile_options_soc", + "origin_func_name", + "ascendc_src_dir_ex", + "ascendc_src_dir", + "ascendc_src_file", + "src", + "op_type", + "code_channel", + "op_info", + "compile_op", + "get_code_channel", + "result", + "__attrs__", + "isinstance", + "attr", + "get_current_build_config", + "_build_args", + "get_dtype_fmt_options", + "shutil", + "os", + "get_kernel_source", +} + + +class OpDesc: + def __init__(self: any, op_type: str): + self.op_type = op_type + self.attr_list = [] + self.attr_val = {} + self.input_name = [] + self.input_ori_name = [] + self.input_type = [] + self.input_dtype = [] + self.input_dtype_for_bin_list = [] + self.input_dtype_for_bin = {} + self.input_fmt = [] + self.input_fmt_for_bin_list = [] + self.input_fmt_for_bin = {} + 
self.input_virt = {} + self.output_name = [] + self.output_ori_name = [] + self.output_type = [] + self.output_dtype = [] + self.output_dtype_for_bin_list = [] + self.output_dtype_for_bin = {} + self.output_fmt = [] + self.output_fmt_for_bin_list = [] + self.output_fmt_for_bin = {} + self.output_init_value = [] + self.output_shape_depend_on_compute = [] + self.op_fmt_sel = False + self.op_chk_support = False + self.op_intf = "" + self.kern_name = "" + self.op_file = "" + self.op_replay_flag = False + self.op_replay_batch = False + self.input_idx = -1 + self.output_idx = -1 + self.max_block_dim = 32 + self.max_shape_size = 268435456 + self.dynamic_shape = False + self.op_range_limit = "" + self.custom_compile_options = {} + self.custom_all_compile_options = {} + self.param_type_dynamic = False + self.mc2_ctx = [] + self.bin_cprs_list = [] + self.bin_cprs_head = [] + self.bin_save_list = [] + + @staticmethod + def _parse_digit(conf: str) -> int: + return int(conf.split("=")[1]) + + @staticmethod + def _parse_flag(conf: str) -> bool: + if "true" == conf.split("=")[1]: + return True + return False + + @staticmethod + def _parse_str(conf: str) -> str: + return conf.split("=")[1] + + @staticmethod + def _parse_list(conf: str) -> list: + return conf.split("=")[1].split(",") + + def parse_input(self: any, conf: str): + if conf.startswith("input{}.name".format(int(self.input_idx) + 1)): + self.input_idx += 1 + self.input_ori_name.append(self._parse_str(conf)) + self.input_name.append(self.input_ori_name[-1] + "_in__") + elif conf.startswith("input{}.paramType".format(int(self.input_idx))): + param_type = self._parse_str(conf) + self.input_type.append(param_type) + if param_type == "dynamic": + self.param_type_dynamic = True + elif conf.startswith("input{}.dtype".format(int(self.input_idx))): + self.input_dtype.append(self._parse_str(conf)) + elif conf.startswith("input{}.for_bin_dtype".format(int(self.input_idx))): + self.input_dtype_for_bin.update({self.input_idx: 
self._parse_str(conf)}) + elif conf.startswith("input{}.format".format(int(self.input_idx))): + self.input_fmt.append(self._parse_str(conf)) + elif conf.startswith("input{}.for_bin_format".format(int(self.input_idx))): + self.input_fmt_for_bin.update({self.input_idx: self._parse_str(conf)}) + elif conf.startswith("input{}.virtual".format(int(self.input_idx))): + self.input_virt[self.input_idx] = self._parse_str(conf) + elif conf.startswith("input{}.initValue".format(int(self.input_idx))): + raise Exception( + f"[ERROR]: Op: {{'{self.op_type}'}} input {self.input_ori_name[int(self.input_idx)]}\ + has InitValue, which is not support!" + ) + else: + return + + def parse_output(self: any, conf: str): + if conf.startswith("output{}.name".format(int(self.output_idx) + 1)): + self.output_idx += 1 + self.output_ori_name.append(self._parse_str(conf)) + self.output_name.append(self.output_ori_name[-1] + "_out_") + self.output_init_value.append(None) + elif conf.startswith("output{}.paramType".format(int(self.output_idx))): + param_type = self._parse_str(conf) + self.output_type.append(param_type) + if param_type == "dynamic": + self.param_type_dynamic = True + elif conf.startswith("output{}.dtype".format(int(self.output_idx))): + self.output_dtype.append(self._parse_str(conf)) + elif conf.startswith("output{}.for_bin_dtype".format(int(self.output_idx))): + self.output_dtype_for_bin.update({self.output_idx: self._parse_str(conf)}) + elif conf.startswith("output{}.format".format(int(self.output_idx))): + self.output_fmt.append(self._parse_str(conf)) + elif conf.startswith("output{}.for_bin_format".format(int(self.output_idx))): + self.output_fmt_for_bin.update({self.output_idx: self._parse_str(conf)}) + elif conf.startswith("output{}.initValue".format(int(self.output_idx))): + self.output_init_value[int(self.output_idx)] = self._parse_str(conf) + elif conf.startswith( + "output{}.outputShapeDependOnCompute=true".format(int(self.output_idx)) + ): + 
self.output_shape_depend_on_compute.append(int(self.output_idx)) + else: + return + + def parse_op_format(self: any, conf: str): + self.op_fmt_sel = self._parse_flag(conf) + + def parse_check_support(self: any, conf: str): + self.op_chk_support = self._parse_flag(conf) + + def parse_range_limit(self: any, conf: str): + self.op_range_limit = self._parse_str(conf) + + def parse_kern_name(self: any, conf: str): + self.kern_name = self._parse_str(conf) + + def parse_op_intf(self: any, conf: str): + self.op_intf = self._parse_str(conf) + + def parse_op_file(self: any, conf: str): + self.op_file = self._parse_str(conf) + + def parse_dynamic_shape(self: any, conf: str): + self.dynamic_shape = self._parse_flag(conf) + + def parse_attr_list(self: any, conf: str): + self.attr_list = self._parse_list(conf) + intersection_element = set(self.attr_list) & CONFLICT_KEYWORDS + if intersection_element: + raise Exception( + f"[ERROR]: The attribute name: {intersection_element} in op: {{'{self.op_type}'}} \ +conflicts with the built-in variable name. Use a complex name or prefix the operator name." 
+ ) + + def parse_mc2_ctx(self: any, conf: str): + self.mc2_ctx = self._parse_list(conf) + + @staticmethod + def _camel_to_snake(camel_case_str: str): + snake_case_str = "" + for i, c in enumerate(camel_case_str): + if i == 0: + snake_case_str += c.lower() + elif c.isupper(): + snake_case_str += "_" + c.lower() + else: + snake_case_str += c + return snake_case_str + + def parse_attr_val(self: any, conf: str): + for attr in self.attr_list: + if self.attr_val.get(attr) is None: + self.attr_val[attr] = {} + if conf.startswith("attr_{}.type".format(attr)): + self.attr_val.get(attr)["type"] = self._camel_to_snake( + self._parse_str(conf) + ) + elif conf.startswith("attr_{}.paramType".format(attr)): + self.attr_val.get(attr)["paramType"] = self._parse_str(conf) + elif conf.startswith("attr_{}.defaultValue".format(attr)): + self.attr_val.get(attr)["defaultValue"] = self._parse_str(conf) + + def parse_replay_val(self: any, batch_list: list, iterator_list: list): + if self.op_type in batch_list: + self.op_replay_flag = True + self.op_replay_batch = True + elif self.op_type in iterator_list: + self.op_replay_flag = True + self.op_replay_batch = False + + +def _is_op_type_in_opdesc(op_descs: list, op_type: str): + for op in op_descs: + if op_type == op.op_type: + return True + return False + + +def _set_all_options_to_opdescs(op_descs, soc_ver_compile_options): + for op in op_descs: + op.custom_all_compile_options = soc_ver_compile_options + + +def _set_options_to_opdesc(op_descs, op_type, soc_ver_compile_options): + for op in op_descs: + if op.op_type != op_type: + continue + op.custom_compile_options.update(soc_ver_compile_options) + + +def _trans_soc_ver_to_short(soc_ver: str): + low_soc_ver = soc_ver.lower() + if low_soc_ver not in SOC_TO_SHORT_SOC_MAP: + print( + f"WARNING: caution: {soc_ver} will trans into ascend910, if not your intention," + f"use ascend910b1~4 instead" + ) + return SOC_TO_SHORT_SOC_MAP[low_soc_ver] + + +def _get_op_custom_options(op_descs: list, 
auto_gen_dir: str): + if auto_gen_dir is None: + return {} + file = os.path.join(auto_gen_dir, "custom_compile_options.ini") + if not os.path.exists(file): + print(f"WARNING: cannot find {auto_gen_dir}/custom_compile_options.ini") + return {} + with open(file, "r") as fd: + lines = fd.readlines() + for line in lines: + param_list = str.split(line.rstrip("\n"), ",") + if len(param_list) != 3: + raise Exception( + f"ERROR: custom compile option {param_list} len is not 3" + ) + op_type = param_list[0] + if op_type.upper() == "ALL": + op_type = OP_ALL + if op_type != OP_ALL and _is_op_type_in_opdesc(op_descs, op_type) == False: + continue + soc_ver_compile_options = {} + soc_ver = param_list[1] + options_str = param_list[2] + options = str.split(options_str, ";") + if soc_ver == "": + soc_ver_compile_options[SOC_ALL] = options + else: + soc_ver_list = str.split(soc_ver, ";") + for ver in soc_ver_list: + short_ver = _trans_soc_ver_to_short(ver) + soc_ver_compile_options[short_ver] = options + if op_type == OP_ALL: + _set_all_options_to_opdescs(op_descs, soc_ver_compile_options) + else: + _set_options_to_opdesc(op_descs, op_type, soc_ver_compile_options) + + +def get_op_desc( + file: str, + batch_list: list, + iterator_list: list, + builder: any, + op_type: list, + auto_gen_dir: str = None, +) -> list: + op_descs = [] + op_match = False + with open(file, "r") as fd: + lines = fd.readlines() + for line in lines: + line = line.strip() + if line.startswith("["): + name = line[1:-1] + if op_type is None or name in op_type: + op_match = True + op_desc = builder(name) + op_desc.parse_replay_val(batch_list, iterator_list) + op_descs.append(op_desc) + else: + op_match = False + if op_type is not None and len(op_descs) == len(op_type): + break + continue + if not op_match: + continue + if line.startswith("input"): + op_desc.parse_input(line) + elif line.startswith("output"): + op_desc.parse_output(line) + elif line.startswith("dynamicFormat.flag"): + op_desc.parse_op_format(line) 
+ elif line.startswith("needCheckSupport.flag"): + op_desc.parse_check_support(line) + elif line.startswith("rangeLimit.value"): + op_desc.parse_range_limit(line) + elif line.startswith("opInterface.value"): + op_desc.parse_op_intf(line) + elif line.startswith("kernel.name"): + op_desc.parse_kern_name(line) + elif line.startswith("opFile.value"): + op_desc.parse_op_file(line) + elif line.startswith("dynamicShapeSupport.flag"): + op_desc.parse_dynamic_shape(line) + elif line.startswith("mc2.ctx"): + op_desc.parse_mc2_ctx(line) + elif line.startswith("attr.list"): + op_desc.parse_attr_list(line) + elif line.startswith("attr_"): + op_desc.parse_attr_val(line) + _get_op_custom_options(op_descs, auto_gen_dir) + return op_descs diff --git a/csrc/deepep/ops2/cmake/util/parse_ini_to_json.py b/csrc/deepep/ops2/cmake/util/parse_ini_to_json.py new file mode 100755 index 000000000..928acae6f --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/parse_ini_to_json.py @@ -0,0 +1,449 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +parser ini to json +""" + +import json +import os +import stat +import sys + +ATTR_TYPE_LIST = [ + "int", + "float", + "bool", + "str", + "listInt", + "listFloat", + "listBool", + "listStr", + "listListInt", + "type", + "listType", + "tensor", + "listTensor", +] +ATTR_PARAMTYPE_LIST = ["optional", "required"] +BOOL_FLAG_KEY = [ + "dynamicFormat", + "dynamicShapeSupport", + "dynamicRankSupport", + "precision_reduce", + "heavyOp", + "needCheckSupport", + "enableVectorCore", +] +BOOL_LIST = ["true", "false"] +DTYPE_LIST = [ + "float16", + "float", + "float32", + "int8", + "int16", + "int32", + "uint8", + "uint16", + "uint32", + "bool", + "int64", + "uint64", + "qint8", + "qint16", + "qint32", + "quint8", + "quint16", + "double", + "complex64", + "complex128", + "string", + "resource", + "dual", + "dual_sub_int8", + "dual_sub_uint8", + "string_ref", + "int4", + "bfloat16", + "uint1", +] +FORMAT_LIST = [ + "NCHW", + "NHWC", + "ND", + "NC1HWC0", + 
"FRACTAL_Z", + "NC1C0HWPAD", + "NHWC1C0", + "FSR_NCHW", + "FRACTAL_DECONV", + "C1HWNC0", + "FRACTAL_DECONV_TRANSPOSE", + "FRACTAL_DECONV_SP_STRIDE_TRANS", + "NC1HWC0_C04", + "FRACTAL_Z_C04", + "CHWN", + "FRACTAL_DECONV_SP_STRIDE8_TRANS", + "HWCN", + "NC1KHKWHWC0", + "BN_WEIGHT", + "FILTER_HWCK", + "HASHTABLE_LOOKUP_LOOKUPS", + "HASHTABLE_LOOKUP_KEYS", + "HASHTABLE_LOOKUP_VALUE", + "HASHTABLE_LOOKUP_OUTPUT", + "HASHTABLE_LOOKUP_HITS", + "C1HWNCoC0", + "MD", + "NDHWC", + "FRACTAL_ZZ", + "FRACTAL_NZ", + "NCDHW", + "DHWCN", + "NDC1HWC0", + "FRACTAL_Z_3D", + "CN", + "NC", + "DHWNC", + "FRACTAL_Z_3D_TRANSPOSE", + "FRACTAL_ZN_LSTM", + "FRACTAL_ZN_RNN", + "FRACTAL_Z_G", + "NULL", +] + + +def parse_ini_files(ini_files): + """ + parse ini files to json + Parameters: + ---------------- + ini_files:input file list + return:ops_info + ---------------- + """ + tbe_ops_info = {} + for ini_file in ini_files: + check_file_size(ini_file) + parse_ini_to_obj(ini_file, tbe_ops_info) + return tbe_ops_info + + +def check_file_size(input_file): + try: + file_size = os.path.getsize(input_file) + except OSError as os_error: + print('[ERROR] Failed to open "%s". %s' % (input_file, str(os_error))) + raise OSError from os_error + if file_size > 10 * 1024 * 1024: + print( + "[WARN] The size of %s exceeds 10MB, it may take more time to run, please wait." 
+ % input_file + ) + + +def parse_ini_to_obj(ini_file, tbe_ops_info): + """ + parse ini file to json obj + Parameters: + ---------------- + ini_file:ini file path + tbe_ops_info:ops_info + ---------------- + """ + with open(ini_file) as ini_file: + lines = ini_file.readlines() + op_dict = {} + op_name = "" + find_op_type = False + for line in lines: + line = line.rstrip() + if line == "": + continue + if line.startswith("["): + if line.endswith("]"): + op_name = line[1:-1] + op_dict = {} + tbe_ops_info[op_name] = op_dict + find_op_type = True + elif "=" in line: + key1 = line[: line.index("=")] + key2 = line[line.index("=") + 1 :] + key1_0, key1_1 = key1.split(".") + if key1_0 not in op_dict: + op_dict[key1_0] = {} + if key1_1 in op_dict.get(key1_0): + raise RuntimeError( + "Op:" + op_name + " " + key1_0 + " " + key1_1 + " is repeated!" + ) + dic_key = op_dict.get(key1_0) + dic_key[key1_1] = key2 + else: + continue + if not find_op_type: + raise RuntimeError("Not find OpType in .ini file.") + + +def check_output_exist(op_dict, is_valid): + """ + Function Description: + Check output is exist + Parameter: op_dict + Parameter: is_valid + """ + if "output0" in op_dict: + output0_dict = op_dict.get("output0") + if output0_dict.get("name", None) is None: + is_valid = False + print("output0.name is required in .ini file!") + else: + is_valid = False + print("output0 is required in .ini file!") + return is_valid + + +def check_attr_dict(attr_dict, is_valid, attr): + """ + Function Description: + Check attr_dict + Parameter: attr_dict + Parameter: is_valid + Parameter: attr + """ + attr_type = attr_dict.get("type") + value = attr_dict.get("value") + param_type = attr_dict.get("paramType") + if attr_type is None or value is None: + is_valid = False + print("If attr.list is exist, {0}.type and {0}.value is required".format(attr)) + if param_type and param_type not in ATTR_PARAMTYPE_LIST: + is_valid = False + print("{0}.paramType only support {1}.".format(attr, 
ATTR_PARAMTYPE_LIST)) + if attr_type and attr_type not in ATTR_TYPE_LIST: + is_valid = False + print("{0}.type only support {1}.".format(attr, ATTR_TYPE_LIST)) + return is_valid + + +def check_attr(op_dict, is_valid): + """ + Function Description: + Check attr + Parameter: op_dict + Parameter: is_valid + """ + if "attr" in op_dict: + attr_dict = op_dict.get("attr") + attr_list_str = attr_dict.get("list", None) + if attr_list_str is None: + is_valid = False + print("attr.list is required in .ini file!") + else: + attr_list = attr_list_str.split(",") + for attr_name in attr_list: + attr = "attr_" + attr_name.strip() + attr_dict = op_dict.get(attr) + if attr_dict: + is_valid = check_attr_dict(attr_dict, is_valid, attr) + else: + is_valid = False + print( + "%s is required in .ini file, when attr.list is %s!" + % (attr, attr_list_str) + ) + return is_valid + + +def check_bool_flag(op_dict, is_valid): + """ + Function Description: + check_bool_flag + Parameter: op_dict + Parameter: is_valid + """ + for key in BOOL_FLAG_KEY: + if key in op_dict: + op_bool_key = op_dict.get(key) + if op_bool_key.get("flag").strip() not in BOOL_LIST: + is_valid = False + print("{0}.flag only support {1}.".format(key, BOOL_LIST)) + return is_valid + + +def check_type_format(op_info, is_valid, op_info_key): + """ + Function Description: + Check type and format + Parameter: op_info + Parameter: is_valid + Parameter: op_info_key + """ + op_info_dtype_str = op_info.get("dtype") + op_info_dtype_num = 0 + op_info_format_num = 0 + if op_info_dtype_str: + op_info_dtype = op_info_dtype_str.split(",") + op_info_dtype_num = len(op_info_dtype) + for dtype in op_info_dtype: + if dtype.strip() not in DTYPE_LIST: + is_valid = False + print("{0}.dtype not support {1}.".format(op_info_key, dtype)) + op_info_format_str = op_info.get("format") + if op_info_format_str: + op_info_format = op_info_format_str.split(",") + op_info_format_num = len(op_info_format) + for op_format in op_info_format: + if 
op_format.strip() not in FORMAT_LIST: + is_valid = False + print("{0}.format not support {1}.".format(op_info_key, op_format)) + if op_info_dtype_num > 0 and op_info_format_num > 0: + if op_info_dtype_num != op_info_format_num: + is_valid = False + print( + "The number of {0}.dtype not match the number of {0}.format.".format( + op_info_key + ) + ) + return is_valid + + +def check_op_info(tbe_ops): + """ + Function Description: + Check info. + Parameter: tbe_ops + Return Value: is_valid + """ + print("\n\n==============check valid for ops info start==============") + required_op_input_info_keys = ["paramType", "name"] + required_op_output_info_keys = ["paramType", "name"] + param_type_valid_value = ["dynamic", "optional", "required"] + is_valid = True + for op_key in tbe_ops: + op_dict = tbe_ops[op_key] + for op_info_key in op_dict: + if op_info_key.startswith("input"): + op_input_info = op_dict[op_info_key] + missing_keys = [] + for required_op_input_info_key in required_op_input_info_keys: + if required_op_input_info_key not in op_input_info: + missing_keys.append(required_op_input_info_key) + if len(missing_keys) > 0: + print( + "op: " + + op_key + + " " + + op_info_key + + " missing: " + + ",".join(missing_keys) + ) + is_valid = False + else: + if not op_input_info["paramType"] in param_type_valid_value: + print( + "op: " + + op_key + + " " + + op_info_key + + " paramType not valid, valid key:[dynamic, " + "optional, required]" + ) + is_valid = False + is_valid = check_type_format(op_input_info, is_valid, op_info_key) + if op_info_key.startswith("output"): + op_input_info = op_dict[op_info_key] + missing_keys = [] + for required_op_input_info_key in required_op_output_info_keys: + if required_op_input_info_key not in op_input_info: + missing_keys.append(required_op_input_info_key) + if len(missing_keys) > 0: + print( + "op: " + + op_key + + " " + + op_info_key + + " missing: " + + ",".join(missing_keys) + ) + is_valid = False + else: + if not 
op_input_info["paramType"] in param_type_valid_value: + print( + "op: " + + op_key + + " " + + op_info_key + + " paramType not valid, valid key:[dynamic, " + "optional, required]" + ) + is_valid = False + is_valid = check_type_format(op_input_info, is_valid, op_info_key) + is_valid = check_attr(op_dict, is_valid) + is_valid = check_bool_flag(op_dict, is_valid) + print("==============check valid for ops info end================\n\n") + return is_valid + + +def write_json_file(tbe_ops_info, json_file_path): + """ + Save info to json file + Parameters: + ---------------- + tbe_ops_info: ops_info + json_file_path: json file path + ---------------- + """ + json_file_real_path = os.path.realpath(json_file_path) + wr_flag = os.O_WRONLY | os.O_CREAT + wr_mode = stat.S_IWUSR | stat.S_IRUSR + with os.fdopen(os.open(json_file_real_path, wr_flag, wr_mode), "w") as file_path: + # The owner have all rights£¬group only have read rights + os.chmod(json_file_real_path, stat.S_IWUSR + stat.S_IRGRP + stat.S_IRUSR) + json.dump( + tbe_ops_info, file_path, sort_keys=True, indent=4, separators=(",", ":") + ) + print("Compile op info cfg successfully.") + + +def parse_ini_to_json(ini_file_paths, outfile_path): + """ + parse ini files to json file + Parameters: + ---------------- + ini_file_paths: list of ini file path + outfile_path: output file path + ---------------- + """ + tbe_ops_info = parse_ini_files(ini_file_paths) + if not check_op_info(tbe_ops_info): + print("Compile op info cfg failed.") + return False + write_json_file(tbe_ops_info, outfile_path) + return True + + +if __name__ == "__main__": + args = sys.argv + + OUTPUT_FILE_PATH = "tbe_ops_info.json" + ini_file_path_list = [] + parse_ini_list = [] + + for arg in args: + if arg.endswith("ini"): + ini_file_path_list.append(arg) + OUTPUT_FILE_PATH = arg.replace(".ini", ".json") + if arg.endswith("json"): + OUTPUT_FILE_PATH = arg + + if not ini_file_path_list: + ini_file_path_list.append("tbe_ops_info.ini") + + for ini_file in 
ini_file_path_list: + if os.path.exists(ini_file): + parse_ini_list.append(ini_file) + + if parse_ini_list: + if not parse_ini_to_json(parse_ini_list, OUTPUT_FILE_PATH): + sys.exit(1) + sys.exit(0) diff --git a/csrc/deepep/ops2/cmake/util/preset_parse.py b/csrc/deepep/ops2/cmake/util/preset_parse.py new file mode 100755 index 000000000..983f16e74 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/preset_parse.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import json +import os +import sys + + +def read_json(file): + with open(file, "r") as fd: + config = json.load(fd) + return config + + +def get_config_opts(file): + config = read_json(file) + + src_dir = os.path.abspath(os.path.dirname(file)) + opts = "" + + for conf in config: + if conf == "configurePresets": + for node in config[conf]: + macros = node.get("cacheVariables") + if macros is not None: + for key in macros: + opts += "-D{}={} ".format(key, macros[key]["value"]) + + opts = opts.replace("${sourceDir}", src_dir) + print(opts) + + +if __name__ == "__main__": + get_config_opts(sys.argv[1]) diff --git a/csrc/deepep/ops2/cmake/util/replay_codegen.py b/csrc/deepep/ops2/cmake/util/replay_codegen.py new file mode 100755 index 000000000..6f896a098 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/replay_codegen.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import collections +import os +import stat + +import code_channel_infer +import const_var +import kernel_entry as keb +from tiling_data_def_build import gen_tiling + +PYF_PATH = os.path.dirname(__file__) + +ReplayCodeGenParams = collections.namedtuple( + "ReplayCodeGenParams", + [ + "op_type", + "impl", + "tiling_file", + "kernel", + "entry", + "argn", + "op_replay_batch", + "max_block_dim", + "max_shape_size", + ], +) + + +class ReplayCodeGen: + def __init__(self, replayCodeGenParams): + self.op_type = replayCodeGenParams.op_type + self.impl = replayCodeGenParams.impl + self.tiling_file = 
replayCodeGenParams.tiling_file + self.tiling_data_file = "" + self.kernel = replayCodeGenParams.kernel + self.entry = replayCodeGenParams.entry + self.argn = replayCodeGenParams.argn + self.batch = False + self.outdir = "" + self.data_type = "uint8_t" + self.blknum = 32 + self.op_replay_batch = replayCodeGenParams.op_replay_batch + self.max_block_dim = replayCodeGenParams.max_block_dim + self.max_shape_size = replayCodeGenParams.max_shape_size + + def set_batch(self, is_batch): + self.batch = is_batch + + def set_outdir(self, outdir): + self.outdir = outdir + + def gen_replay(self, ops_product: str): + kerentry = os.path.join(self.outdir, self.kernel + "_entry.cce") + kerimpl = os.path.join(self.outdir, self.kernel + "_impl.cpp") + replayimpl = os.path.join(self.outdir, self.kernel + "_replay.cpp") + if self.batch: + reptmp = os.path.join(PYF_PATH, "batch_replay_impl.temp") + else: + reptmp = os.path.join(PYF_PATH, "replay_impl.temp") + kertmp = os.path.join(PYF_PATH, "kernel_impl.temp") + self._gen_kentry(kerentry) + self._gen_kimpl_code(kerimpl, kertmp) + self._gen_tiling_data_header() + self._gen_replay_code(replayimpl, reptmp, ops_product) + + def _gen_tiling_data_header(self): + self.tiling_data_file = os.path.join( + self.outdir, self.kernel + "_tiling_data.h" + ) + gen_tiling(self.tiling_file, self.tiling_data_file) + + def _gen_kimpl_code(self, src, tmpfile): + with open(tmpfile, "r") as fd: + temp = fd.read() + temp = temp.replace("__CCE_FILE__", self.impl) + with os.fdopen(os.open(src, const_var.WFLAGS, const_var.WMODES), "w") as ofd: + ofd.write(temp) + + def _gen_replay_code(self, src, tmpfile, ops_product: str): + with open(tmpfile, "r") as fd: + temp = fd.read() + temp = temp.replace("__ARG_NUM__", str(self.argn)) + argdef = [] + kargs = [] + for i in range(0, self.argn): + argdef.append("{} *".format(self.data_type)) + kargs.append("({} *)GetArg({})".format(self.data_type, i)) + temp = temp.replace("__ARGS_DEF__", ", ".join(argdef)) + temp = 
temp.replace("__KERNEL_ARGS__", ", ".join(kargs)) + temp = temp.replace("__KERNEL_FUN__", self.entry) + core_type_infer = "core_type" + code_channel = code_channel_infer.infer_code_channel( + code_channel_infer.InfoCodeChanelParams( + self.impl, + self.tiling_data_file, + self.kernel, + self.outdir, + ops_product, + None, + ) + ) + if code_channel == code_channel_infer.CODE_VEC: + core_type_infer = "0" + elif code_channel == code_channel_infer.CODE_CUBE: + core_type_infer = "1" + temp = temp.replace("__CORE_TYPE__", core_type_infer) + # register function + temp = temp.replace("__OPS_PRODUCT__", ops_product) + temp = temp.replace("__OPTYPE__", self.op_type) + with os.fdopen(os.open(src, const_var.WFLAGS, const_var.WMODES), "w") as ofd: + ofd.write(temp) + + def _gen_kentry(self, src): + kf = "" + pre_alloc_str = "A" * 256 + if self.batch: + kf += keb.batch_code_gen( + "K{:02d}_{}{}".format(0, self.entry, pre_alloc_str), + self.argn, + self.data_type, + ) + else: + kf += keb.mc_code_gen( + "K{:02d}_{}{}".format(0, self.entry, pre_alloc_str), + self.argn, + self.data_type, + self.blknum, + ) + with os.fdopen(os.open(src, const_var.WFLAGS, const_var.WMODES), "w") as ofd: + ofd.write(kf) diff --git a/csrc/deepep/ops2/cmake/util/replay_impl.temp b/csrc/deepep/ops2/cmake/util/replay_impl.temp new file mode 100644 index 000000000..1d30dd865 --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/replay_impl.temp @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include "replay_def.h" +#include "code_gen.h" +#include "replay_fun.h" +#include "register/op_check.h" +#define __ASCENDC_REPLAY_CODE__ +using namespace std; +using namespace optiling; +using namespace AscendCReplay; + +extern "C" void __KERNEL_FUN__ (__ARGS_DEF__, const char *); +extern "C" int elf_append(char *elf, uint32_t elfSize, char *jit, int kernum, int blknum[], char *atext[], + int alen[], int atlen, const char* kernelname[]); + +#define KERNEL_N 1 +#define ARG_N (__ARG_NUM__) 
+#define MAX_L (1024 * 1024 * 100) +#define MAX_E (1024 * 1024) + +int __KERNEL_FUN___replay___OPS_PRODUCT__(ReplayFuncParam& param, const int core_type) +{ + // gen type 1 : direct call codes 0: load .o file + if (param.gentype < 0 || param.gentype > 1) { + printf("Error: call replay gen type is %d, should only be 1 or 0\n", param.gentype); + return 0; + } else if (param.gentype == 1 && param.objptr == nullptr) { + printf("Error: call replay with direct call mode, but code obj addr is null\n"); + return 0; + } else if (param.gentype == 0 && param.output_kernel_file == nullptr) { + printf("Error: call replay with object file mode, but object file path is null\n"); + return 0; + } + // core_type 0:MIX 1:CUBE 2:VEC + if (core_type < 0 || core_type > 2) { + printf("Error: call replay core type is %d !\n", core_type); + return 0; + } + g_coreType = __CORE_TYPE__; + g_taskRation = param.task_ration; + g_tilingKey = param.tiling_key; + + unsigned char *buf, *jit; + char *kernel[KERNEL_N * 32]; + int len[KERNEL_N * 32]; + int blknum[KERNEL_N]; + int max; + block_num = param.block_dim; + g_ubBase = block_num; + uint8_t *code = (uint8_t *)malloc(MAX_L); + uint8_t *pos = code; + struct timespec tp1, tp2; + + clock_gettime(CLOCK_MONOTONIC, &tp1); + if (block_num > 32) { + printf("Error: block_num > 32\n"); + return 0; + } + //__OP_FOPEN__ + for (int i = 0; i < KERNEL_N; i++) { + for (int j = 0; j < ARG_N; j++) + AddArg(j, ARG_STEP * (j + 1)); + for (block_idx = 0; block_idx < block_num; block_idx++) { + //__OP_SET_KERNEL__ + int code_idx = i * block_num + block_idx; +#ifdef FP_CEILING + SetCtrlFloatEnable(); +#else + SetCtrlFloatDisable(); +#endif + CodeInit(pos, false); + __KERNEL_FUN__(__KERNEL_ARGS__, param.tiling_data); + CodeEnd(); + kernel[code_idx] = (char *)pos; + len[code_idx] = CodeLen(); + pos += len[code_idx]; + printf("kernel %d core %ld code generated len %d\n", i, block_idx, len[code_idx]); + } + blknum[i] = block_num; + } + //__OP_FCLOSE__ + 
clock_gettime(CLOCK_MONOTONIC, &tp2); + buf = (unsigned char *)malloc(MAX_E); + int fd = open(param.entry_file, O_RDONLY); + if (fd < 0) { + printf("[error]: cannot find entry.o : %s\n", param.entry_file); + return 0; + } + uint32_t bufSize = read(fd, buf, MAX_E); + if (bufSize <= 0) { + printf("[error]: entry.o : %s is too small ! \n", param.entry_file); + } + close(fd); + jit = (unsigned char *)malloc(MAX_L); + printf("total code generated %ld\n", pos - code); + int sz = elf_append((char *)buf, bufSize, (char *)jit, KERNEL_N, blknum, kernel, len, pos - code, ¶m.kernel_name); + if (tp1.tv_sec != tp2.tv_sec) { + printf("%ld NS\n", tp2.tv_nsec + 1000000000 - tp1.tv_nsec); + } else { + printf("%ld NS\n", tp2.tv_nsec - tp1.tv_nsec); + } + printf("new elf size %d\n", sz); + if (param.gentype == 0) { + fd = open(param.output_kernel_file, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + (void)write(fd, jit, sz); + close(fd); + free(jit); + } else if (param.gentype == 1) { + *param.objptr = (char*)jit; + } + free(buf); + free(code); + return sz; +} + +REG_REPLAY_FUNC(__OPTYPE__, __OPS_PRODUCT__, __KERNEL_FUN___replay___OPS_PRODUCT__); diff --git a/csrc/deepep/ops2/cmake/util/tiling_data_def_build.py b/csrc/deepep/ops2/cmake/util/tiling_data_def_build.py new file mode 100755 index 000000000..0576b202d --- /dev/null +++ b/csrc/deepep/ops2/cmake/util/tiling_data_def_build.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +""" +Function: +The replay function entry +""" + +import os +import re +import stat +import sys + +import const_var + + +def gen_tiling(tiling_header_file: str, tiling_file_out: str): + if not os.path.exists(tiling_header_file): + print("warning: no userdef tiling header file: ", tiling_header_file) + return + print("generate tiling def header file: ", tiling_file_out) + tmp_name = os.path.splitext(os.path.basename(tiling_header_file))[0].upper() + tiling_source = "#ifndef __{}_H__\n".format(tmp_name) + tiling_source += "#define 
__{}_H__\n\n".format(tmp_name) + tiling_source += "#include <cstdint>\n" + tiling_source += "#include <cstring>\n\n" + tiling_source += '#include "kernel_tiling/kernel_tiling.h"\n\n' + end_source = "" + pattern = re.compile(r"[(](.*)[)]", re.S) + with open(tiling_header_file, "r") as fd: + lines = fd.readlines() + for line in lines: + line = line.strip() + if line.startswith("BEGIN_TILING_DATA_DEF"): + tiling_source += "#pragma pack(1)\n" + tiling_source += "struct " + struct_def = re.findall(pattern, line)[0] + tiling_source += struct_def + " {\n" + elif line.startswith("TILING_DATA_FIELD_DEF_ARR"): + field_params = re.findall(pattern, line)[0] + fds = field_params.split(",") + tiling_source += " {} {}[{}] = {{}};\n".format( + fds[0].strip(), fds[2].strip(), fds[1].strip() + ) + elif line.startswith("TILING_DATA_FIELD_DEF_STRUCT"): + field_params = re.findall(pattern, line)[0] + fds = field_params.split(",") + tiling_source += " {} {};\n".format(fds[0].strip(), fds[1].strip()) + elif line.startswith("TILING_DATA_FIELD_DEF"): + field_params = re.findall(pattern, line)[0] + fds = field_params.split(",") + tiling_source += " {} {} = 0;\n".format( + fds[0].strip(), fds[1].strip() + ) + elif line.startswith("END_TILING_DATA_DEF"): + tiling_source += "};\n" + tiling_source += "#pragma pack()\n\n" + tiling_source += "#ifdef __NPU_TILING__\n" + tiling_source += "inline [aicore] void Init{stru}(const __gm__ uint8_t* tiling, {stru}* const_data)\n".format( + stru=struct_def + ) + tiling_source += "{\n" + tiling_source += " const __gm__ uint32_t *src = (const __gm__ uint32_t *)tiling;\n" + tiling_source += " uint32_t *dst = (uint32_t *)const_data;\n" + tiling_source += " for (auto i = 0; i < sizeof({}) / 4; i++) *(dst + i) = *(src + i);\n".format( + struct_def + ) + tiling_source += "}\n" + tiling_source += "#else\n" + tiling_source += "inline void Init{stru}(uint8_t* tiling, {stru}* const_data)\n".format( + stru=struct_def + ) + tiling_source += "{\n" + tiling_source += " uint64_t *src = 
(uint64_t *)tiling;\n" + tiling_source += " uint64_t *dst = (uint64_t *)const_data;\n" + tiling_source += " for (auto i = 0; i < sizeof({}) / 8; i++) *(dst + i) = *(src + i);\n".format( + struct_def + ) + tiling_source += "}\n" + tiling_source += "#endif\n\n" + end_source = """ +#undef GET_TILING_DATA +#define GET_TILING_DATA(tiling_data, tiling_arg) \\ +{stru} tiling_data; \\ +Init{stru}(tiling_arg, &tiling_data)\n +""".format( + stru=struct_def + ) + tiling_source += end_source + tiling_source += "#endif" + with os.fdopen( + os.open(tiling_file_out, const_var.WFLAGS, const_var.WMODES), "w" + ) as ofd: + ofd.write(tiling_source) + + +if __name__ == "__main__": + if len(sys.argv) <= 2: + raise RuntimeError("arguments must greater than 2") + gen_tiling(sys.argv[1], sys.argv[2]) diff --git a/csrc/deepep/ops2/op_host/CMakeLists.txt b/csrc/deepep/ops2/op_host/CMakeLists.txt new file mode 100644 index 000000000..f40147f59 --- /dev/null +++ b/csrc/deepep/ops2/op_host/CMakeLists.txt @@ -0,0 +1,174 @@ + +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs) + +opbuild(OPS_SRC ${ops_srcs} + OUT_DIR ${ASCEND_AUTOGEN_PATH} +) + +file(GLOB group_proto_src ${ASCEND_AUTOGEN_PATH}/group_proto/*.cc) + +add_library(cust_op_proto SHARED + $<$:${group_proto_src}> + ${ops_srcs} + ${ASCEND_AUTOGEN_PATH}/op_proto.cc +) +target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) +target_compile_options(cust_op_proto PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_op_proto PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0 +) +file(GLOB fallback_src ${ASCEND_AUTOGEN_PATH}/fallback_*.cpp) +add_library(cust_optiling SHARED ${ops_srcs}) +if (${fallback_src}) + 
target_sources(cust_optiling PRIVATE ${fallback_src}) +endif() +target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) +target_compile_options(cust_optiling PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_optiling PRIVATE + nnopbase + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0 +) + +file(GLOB_RECURSE pregen_file + "${CMAKE_CURRENT_SOURCE_DIR}/op_api/*" +) + +file(COPY ${pregen_file} DESTINATION ${ASCEND_AUTOGEN_PATH}) +file(GLOB aclnn_src ${ASCEND_AUTOGEN_PATH}/aclnn*.cpp) +file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) +if(NOT ASCEND_PACK_SHARED_LIBRARY) + add_library(cust_opapi SHARED ${aclnn_src}) +else() + file(GLOB op_registry ${ASCEND_AUTOGEN_PATH}/custom_op_registry.cpp) + add_library(cust_opapi SHARED ${aclnn_src} ${op_registry}) + target_compile_definitions(cust_opapi PRIVATE ACLNN_WITH_BINARY) +endif() +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_opapi PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +if(NOT ASCEND_PACK_SHARED_LIBRARY) + target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase) +else() + add_library(cust_op_proto_obj OBJECT + $<$:${group_proto_src}> + ${ops_srcs} + ${ASCEND_AUTOGEN_PATH}/op_proto.cc + ) + target_compile_definitions(cust_op_proto_obj PRIVATE OP_PROTO_LIB) + target_compile_options(cust_op_proto_obj PRIVATE + -fvisibility=hidden + ) + if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto_obj PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) + endif() + target_link_libraries(cust_op_proto_obj PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + 
-Wl,--no-whole-archive + ) + add_library(cust_optiling_obj OBJECT ${ops_srcs}) + target_compile_definitions(cust_optiling_obj PRIVATE OP_TILING_LIB) + target_compile_options(cust_optiling_obj PRIVATE + -fvisibility=hidden + ) + if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling_obj PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) + endif() + target_link_libraries(cust_optiling_obj PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive + ) + target_compile_options(cust_opapi PRIVATE -DLOG_CPP) + target_include_directories(cust_opapi INTERFACE ${CMAKE_SOURCE_DIR}/build_out/library/) + target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase cust_optiling_obj cust_op_proto_obj ascend_opregistry ascend_kernels) + add_dependencies(cust_opapi ascend_opregistry) +endif() + +target_include_directories(cust_opapi PRIVATE $ENV{ASCEND_HOME_PATH}/${CANN_HOST_ARCH}-linux/include/experiment/platform/) +include_directories($ENV{ASCEND_HOME_PATH}/../opp/vendors/CAM/op_impl/ai_core/tbe/CAM_impl/dynamic/) + +add_custom_target(optiling_compat ALL + COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so +) +if(NOT ASCEND_PACK_SHARED_LIBRARY) + install(TARGETS cust_op_proto + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) + install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) + file(GLOB GROUP_PROTO_HEADERS ${ASCEND_AUTOGEN_PATH}/group_proto/*.h) + if (GROUP_PROTO_HEADERS) + install(FILES ${GROUP_PROTO_HEADERS} + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) + endif() + install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION 
packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) + install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) + install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) +else() + file(GLOB group_inc ${ASCEND_AUTOGEN_PATH}/group_proto/*.h) + install(TARGETS cust_opapi + LIBRARY DESTINATION op_api/lib) + install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION op_api/include) + install(FILES ${group_inc} + DESTINATION op_api/include) + install(FILES ${aclnn_inc} + DESTINATION op_api/include) +endif() diff --git a/csrc/deepep/ops2/op_host/error_log.h b/csrc/deepep/ops2/op_host/error_log.h new file mode 100644 index 000000000..84258321c --- /dev/null +++ b/csrc/deepep/ops2/op_host/error_log.h @@ -0,0 +1,48 @@ +#ifndef OPS_BUILT_IN_OP_TILING_ERROR_LOG_H_ +#define OPS_BUILT_IN_OP_TILING_ERROR_LOG_H_ + +#include <cstdio> +#include "toolchain/slog.h" + +#define OP_LOGI(opname, ...) +#define OP_LOGW(opname, ...)                    \ + do {                                        \ + printf("[WARN][%s] ", (opname));        \ + printf(__VA_ARGS__);                    \ + printf("\n");                           \ + } while (0) + +#define OP_LOGE_WITHOUT_REPORT(opname, ...)     \ + do {                                        \ + printf("[ERRORx][%s] ", (opname));      \ + printf(__VA_ARGS__);                    \ + printf("\n");                           \ + } while (0) + +#define OP_LOGE(opname, ...)                    \ + do {                                        \ + printf("[ERROR][%s] ", (opname));       \ + printf(__VA_ARGS__);                    \ + printf("\n");                           \ + } while (0) + +// #define OP_LOGD(opname, ...) printf("[DEBUG]" __VA_ARGS__); printf("\n"); +#define OP_LOGD(opname, ...) + +namespace optiling { + +#define VECTOR_INNER_ERR_REPORT_TILIING(op_name, err_msg, ...) 
\ + do { \ + OP_LOGE_WITHOUT_REPORT(op_name, err_msg, ##__VA_ARGS__); \ + } while (0) + +#define OP_TILING_CHECK(cond, log_func, expr) \ + do { \ + if (cond) { \ + log_func; \ + expr; \ + } \ + } while (0) +} // namespace optiling + +#endif // OPS_BUILT_IN_OP_TILING_ERROR_LOG_H_ diff --git a/csrc/deepep/ops2/op_host/mc2_tiling_utils.h b/csrc/deepep/ops2/op_host/mc2_tiling_utils.h new file mode 100644 index 000000000..6004ac4b0 --- /dev/null +++ b/csrc/deepep/ops2/op_host/mc2_tiling_utils.h @@ -0,0 +1,63 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! 
+ * \file mc2_tiling_utils.h + * \brief + */ + +#ifndef __MC2_TILING_UTILS_H__ +#define __MC2_TILING_UTILS_H__ + +#include +#include +#include + +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" +#include "error_log.h" + +namespace mc2tiling { + +constexpr uint32_t AICPU_BLOCK_DIM_A2 = 6U; +class Mc2TilingUtils +{ +public: +#define HCCL_BUFFSIZE "HCCL_BUFFSIZE" + static uint64_t GetMaxWindowSize() + { + uint16_t defaultWindowSize = 200; + if (getenv(HCCL_BUFFSIZE) == nullptr) { + OP_LOGD("", "Env HCCL_BUFFSIZE don't set"); + } else { + try { + std::string envStr(getenv(HCCL_BUFFSIZE)); + defaultWindowSize = std::stoi(envStr); + } catch (const std::invalid_argument &ia) { + OP_LOGE("", "Invalid argument when parsing HCCL_BUFFSIZE: %s", ia.what()); + } catch (const std::out_of_range &oor) { + OP_LOGE("", "Out of range when parsing HCCL_BUFFSIZE: %s", oor.what()); + } + } + const uint64_t maxWindowSize = static_cast(defaultWindowSize) * 1024UL * 1024UL; + OP_LOGI("", "Get maxWindowSize is %lu", maxWindowSize); + return maxWindowSize; + } +}; + +} // namespace mc2tiling + +#endif diff --git a/csrc/deepep/ops2/op_host/moe_distribute_combine_v2.cpp b/csrc/deepep/ops2/op_host/moe_distribute_combine_v2.cpp new file mode 100644 index 000000000..552b13e60 --- /dev/null +++ b/csrc/deepep/ops2/op_host/moe_distribute_combine_v2.cpp @@ -0,0 +1,177 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! + * \file moe_distribute_combine_v2.cpp + * \brief + */ + +#include "register/op_def_registry.h" + +namespace ops { +class MoeDistributeCombineV2 : public OpDef +{ +public: + explicit MoeDistributeCombineV2(const char *name) : OpDef(name) + { + this->Input("expand_x") + .ParamType(REQUIRED) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("expert_ids") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("assist_info_for_combine") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("ep_send_counts") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("expert_scales") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("tp_send_counts") + .ParamType(OPTIONAL) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + 
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("x_active_mask") + .ParamType(OPTIONAL) + .DataType({ge::DT_BOOL, ge::DT_BOOL, ge::DT_BOOL, ge::DT_BOOL}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("activation_scale") + .ParamType(OPTIONAL) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("weight_scale") + .ParamType(OPTIONAL) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("group_list") + .ParamType(OPTIONAL) + .DataType({ge::DT_INT64, ge::DT_INT64, ge::DT_INT64, ge::DT_INT64}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("expand_scales") + .ParamType(OPTIONAL) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("shared_expert_x") + .ParamType(OPTIONAL) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + 
this->Input("elastic_info") + .ParamType(OPTIONAL) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("ori_x") + .ParamType(OPTIONAL) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("const_expert_alpha_1") + .ParamType(OPTIONAL) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("const_expert_alpha_2") + .ParamType(OPTIONAL) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("const_expert_v") + .ParamType(OPTIONAL) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + + this->Output("x") + .ParamType(REQUIRED) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_BF16, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Attr("group_ep").AttrType(REQUIRED).String(); + this->Attr("ep_world_size").AttrType(REQUIRED).Int(); + this->Attr("ep_rank_id").AttrType(REQUIRED).Int(); + 
this->Attr("moe_expert_num").AttrType(REQUIRED).Int(); + this->Attr("group_tp").AttrType(OPTIONAL).String(""); + this->Attr("tp_world_size").AttrType(OPTIONAL).Int(0); + this->Attr("tp_rank_id").AttrType(OPTIONAL).Int(0); + this->Attr("expert_shard_type").AttrType(OPTIONAL).Int(0); + this->Attr("shared_expert_num").AttrType(OPTIONAL).Int(1); + this->Attr("shared_expert_rank_num").AttrType(OPTIONAL).Int(0); + this->Attr("global_bs").AttrType(OPTIONAL).Int(0); + this->Attr("out_dtype").AttrType(OPTIONAL).Int(0); + this->Attr("comm_quant_mode").AttrType(OPTIONAL).Int(0); + this->Attr("group_list_type").AttrType(OPTIONAL).Int(0); + this->Attr("comm_alg").AttrType(OPTIONAL).String(""); + this->Attr("zero_expert_num").AttrType(OPTIONAL).Int(0); + this->Attr("copy_expert_num").AttrType(OPTIONAL).Int(0); + this->Attr("const_expert_num").AttrType(OPTIONAL).Int(0); + + OpAICoreConfig aicore_config_A2; + aicore_config_A2.DynamicCompileStaticFlag(true) + .DynamicFormatFlag(true) + .DynamicRankSupportFlag(true) + .DynamicShapeSupportFlag(true) + .NeedCheckSupportFlag(false) + .PrecisionReduceFlag(true) + .ExtendCfgInfo("aclnnSupport.value", "support_aclnn") + .ExtendCfgInfo("prebuildPattern.value", "Opaque") + .ExtendCfgInfo("jitCompile.flag", "static_false") + .ExtendCfgInfo("multiKernelSupportDynamicGraph.value", "multi_kernel"); + + this->AICore().AddConfig("ascend910b", aicore_config_A2); + this->MC2().HcclGroup({"group_ep"}); + } +}; + +OP_ADD(MoeDistributeCombineV2); + +} // namespace ops diff --git a/csrc/deepep/ops2/op_host/moe_distribute_combine_v2_tiling.cc b/csrc/deepep/ops2/op_host/moe_distribute_combine_v2_tiling.cc new file mode 100644 index 000000000..8c0de0336 --- /dev/null +++ b/csrc/deepep/ops2/op_host/moe_distribute_combine_v2_tiling.cc @@ -0,0 +1,587 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! + * \file moe_distribute_combine_v2_tiling.cc + * \brief + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mc2_tiling_utils.h" +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" +#include "error_log.h" +#include "register/op_def_registry.h" +#include "experiment/platform/platform/platform_infos_def.h" +#include "../op_kernel/moe_distribute_combine_v2_tiling.h" + +using namespace AscendC; +using namespace ge; + +namespace { +constexpr uint32_t EXPAND_X_INDEX = 0; +constexpr uint32_t EXPERT_IDS_INDEX = 1; +constexpr uint32_t ASSIST_INFO_INDEX = 2; +constexpr uint32_t EP_SEND_COUNTS_INDEX = 3; +constexpr uint32_t EXPERT_SCALES_INDEX = 4; +constexpr uint32_t TP_SEND_COUNTS_INDEX = 5; +constexpr uint32_t X_ACTIVE_MASK_INDEX = 6; +constexpr uint32_t ACTIVATION_SCALE_INDEX = 7; +constexpr uint32_t WEIGHT_SCALE_INDEX = 8; +constexpr uint32_t GROUP_LIST_INDEX = 9; +constexpr uint32_t SHARED_EXPERT_X_INDEX = 11; +constexpr uint32_t ELASTIC_INFO_INDEX = 12; +constexpr uint32_t ORI_X_INDEX = 13; +constexpr uint32_t CONST_EXPERT_ALPHA_1_INDEX = 14; +constexpr uint32_t CONST_EXPERT_ALPHA_2_INDEX = 15; +constexpr uint32_t CONST_EXPERT_V_INDEX = 16; +constexpr uint32_t OUTPUT_X_INDEX = 0; + +constexpr uint32_t ATTR_GROUP_EP_INDEX = 0; +constexpr uint32_t ATTR_EP_WORLD_SIZE_INDEX = 1; 
+constexpr uint32_t ATTR_EP_RANK_ID_INDEX = 2; +constexpr uint32_t ATTR_MOE_EXPERT_NUM_INDEX = 3; +constexpr uint32_t ATTR_GROUP_TP_INDEX = 4; +constexpr uint32_t ATTR_TP_WORLD_SIZE_INDEX = 5; +constexpr uint32_t ATTR_TP_RANK_ID_INDEX = 6; +constexpr uint32_t ATTR_EXPERT_SHARD_TYPE_INDEX = 7; +constexpr uint32_t ATTR_SHARED_EXPERT_NUM_INDEX = 8; +constexpr uint32_t ATTR_SHARED_EXPERT_RANK_NUM_INDEX = 9; +constexpr uint32_t ATTR_GLOBAL_BS_INDEX = 10; +constexpr uint32_t ATTR_OUT_DTYPE_INDEX = 11; +constexpr uint32_t ATTR_COMM_QUANT_MODE_INDEX = 12; +constexpr uint32_t ATTR_GROUP_LIST_TYPE_INDEX = 13; +constexpr uint32_t ATTR_COMM_ALG_INDEX = 14; +constexpr uint32_t ATTR_ZERO_EXPERT_NUM_INDEX = 15; +constexpr uint32_t ATTR_COPY_EXPERT_NUM_INDEX = 16; +constexpr uint32_t ATTR_CONST_EXPERT_NUM_INDEX = 17; + +constexpr uint32_t INT8_COMM_QUANT = 2U; +constexpr uint64_t INIT_TILINGKEY = 10000; +constexpr uint64_t TILINGKEY_TP_WORLD_SIZE = 100; +constexpr uint64_t TP_WORLD_SIZE_TWO = 2; +constexpr uint64_t TILINGKEY_IS_SHARE_EXPERT = 1000; +constexpr uint32_t TILINGKEY_INT8_COMM_QUANT = 20U; + +constexpr uint32_t THREE_DIMS = 3U; +constexpr uint32_t TWO_DIMS = 2U; +constexpr uint32_t ONE_DIM = 1U; +constexpr uint32_t ASSIST_INFO_DIMS = 1U; +constexpr uint64_t TILING_KEY_BASE_A2 = 2000UL; +constexpr uint64_t TILING_KEY_LAYERED_COMM_A2 = 3000UL; +constexpr uint64_t TILING_KEY_INT8_COMM_QUANT_A2 = 100UL; +constexpr uint32_t ARR_LENGTH = 128U; +constexpr uint32_t OP_TYPE_ALL_TO_ALL = 8U; // numeric representation of AlltoAll +constexpr uint32_t OP_TYPE_REDUCE_SCATTER = 7U; // numeric representation of AlltoAll + +constexpr size_t MAX_GROUP_NAME_LENGTH = 128UL; +constexpr int64_t MAX_SHARED_EXPERT_NUM = 4; +constexpr int64_t MAX_EP_WORLD_SIZE = 768L; // 384 * 2 +constexpr int64_t MIN_EP_WORLD_SIZE = 2; +constexpr int64_t EP_RESTRICT_8 = 8; +constexpr int64_t MAX_TP_WORLD_SIZE = 2; +constexpr int64_t BS_UPPER_BOUND = 512; + +constexpr size_t SYSTEM_NEED_WORKSPACE = 16UL * 
1024UL * 1024UL; +constexpr size_t MASK_CALC_NEED_WORKSPACE = 10UL * 1024UL; +constexpr int32_t HCCL_BUFFER_SIZE_DEFAULT = 200 * 1024 * 1024; // Bytes +constexpr uint32_t VERSION_2 = 2; +constexpr uint32_t HCOMMCNT_2 = 2; +constexpr uint32_t RANK_LIST_NUM = 2; +constexpr int64_t MOE_EXPERT_MAX_NUM = 1024; +constexpr int64_t K_MAX = 16; +constexpr int64_t H_MIN = 1024; +constexpr int64_t H_MAX = 8192; +constexpr uint64_t MB_SIZE = 1024UL * 1024UL; +constexpr uint64_t TRIPLE = 3; +constexpr uint64_t ASSIST_NUM_PER_A = 128UL; +constexpr uint64_t WIN_ADDR_ALIGN = 512UL; +constexpr uint64_t SCALE_EXPAND_IDX_BUFFER = 44UL; // scale32B + 3*4expandIdx +constexpr uint64_t DOUBLE_DATA_BUFFER = 2UL; +constexpr uint64_t MAX_OUT_DTYPE_SIZE = 2UL; +constexpr uint64_t UB_ALIGN = 32UL; +constexpr int64_t DISPATCH_STATUS_MAX_SUPPORT_NUM = 1280UL; + +// A2 +constexpr int32_t MAX_EP_WORLD_SIZE_A2 = 256; +constexpr int32_t MAX_MOE_EXPERT_NUMS_A2 = 512; +constexpr int32_t MAX_HIDDEN_SIZE_A2 = 7168; +constexpr uint32_t MAX_BATCH_SIZE_A2 = 512; +constexpr uint32_t RANK_NUM_PER_NODE_A2 = 8; +constexpr uint32_t BLOCK_SIZE_A2 = 32; +constexpr uint32_t MAX_K_VALUE_A2 = 16; +const char *K_INNER_DEBUG = "MoeDistributeCombineV2 Tiling Debug"; + +enum class CommQuantMode : int32_t { NON_QUANT = 0, INT12_QUANT = 1, INT8_QUANT = 2 }; +using CommQuantModeType = std::underlying_type::type; +} // namespace + +namespace optiling { +// a2专有 +static void PrintA2TilingDataInfo(MoeDistributeCombineV2Info &info) +{ + OP_LOGD(K_INNER_DEBUG, "epWorldSize is %u.", info.epWorldSize); + OP_LOGD(K_INNER_DEBUG, "tpWorldSize is %u.", info.tpWorldSize); + OP_LOGD(K_INNER_DEBUG, "epRankId is %u.", info.epRankId); + OP_LOGD(K_INNER_DEBUG, "tpRankId is %u.", info.tpRankId); + OP_LOGD(K_INNER_DEBUG, "expertSharedType is %u.", info.expertSharedType); + OP_LOGD(K_INNER_DEBUG, "sharedExpertRankNum is %u.", info.sharedExpertRankNum); + OP_LOGD(K_INNER_DEBUG, "moeExpertNum is %u.", info.moeExpertNum); + 
OP_LOGD(K_INNER_DEBUG, "globalBs is %u.", info.globalBs); +} + +static ge::graphStatus MoeDistributeCombineA2CheckAttrAndSetTiling(gert::TilingContext *context, + MoeDistributeCombineV2Info &info, + int32_t &commQuantMode, const bool isLayered) +{ + auto attrs = context->GetAttrs(); + OP_TILING_CHECK(attrs == nullptr, OP_LOGE(K_INNER_DEBUG, "attrs is null."), return ge::GRAPH_FAILED); + + auto groupEpPtr = attrs->GetAttrPointer(static_cast(ATTR_GROUP_EP_INDEX)); + auto epWorldSizePtr = attrs->GetAttrPointer(ATTR_EP_WORLD_SIZE_INDEX); + auto epRankIdPtr = attrs->GetAttrPointer(ATTR_EP_RANK_ID_INDEX); + auto moeExpertNumPtr = attrs->GetAttrPointer(ATTR_MOE_EXPERT_NUM_INDEX); + auto tpWorldSizePtr = attrs->GetAttrPointer(ATTR_TP_WORLD_SIZE_INDEX); + auto tpRankIdPtr = attrs->GetAttrPointer(ATTR_TP_RANK_ID_INDEX); + auto expertSharedTypePtr = attrs->GetAttrPointer(ATTR_EXPERT_SHARD_TYPE_INDEX); + auto sharedExpertRankNumPtr = attrs->GetAttrPointer(ATTR_SHARED_EXPERT_RANK_NUM_INDEX); + auto globalBsPtr = attrs->GetAttrPointer(ATTR_GLOBAL_BS_INDEX); + auto commQuantModePtr = attrs->GetAttrPointer(ATTR_COMM_QUANT_MODE_INDEX); + + auto zeroExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_ZERO_EXPERT_NUM_INDEX)); + auto copyExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_COPY_EXPERT_NUM_INDEX)); + auto constExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_CONST_EXPERT_NUM_INDEX)); + + OP_TILING_CHECK(zeroExpertNumPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "zeroExpertNum is invalid."), + return GRAPH_FAILED); + OP_TILING_CHECK(copyExpertNumPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "copyExpertNum is invalid."), + return GRAPH_FAILED); + OP_TILING_CHECK(constExpertNumPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "constExpertNum is invalid."), + return GRAPH_FAILED); + + OP_TILING_CHECK((groupEpPtr == nullptr) || (strnlen(groupEpPtr, MAX_GROUP_NAME_LENGTH) == 0) || + (strnlen(groupEpPtr, MAX_GROUP_NAME_LENGTH) == MAX_GROUP_NAME_LENGTH), + OP_LOGE(K_INNER_DEBUG, "groupEp is 
invalid."), return ge::GRAPH_FAILED); + OP_TILING_CHECK(epWorldSizePtr == nullptr || *epWorldSizePtr <= 0 || *epWorldSizePtr > MAX_EP_WORLD_SIZE_A2 || + *epWorldSizePtr % RANK_NUM_PER_NODE_A2 != 0, + OP_LOGE(K_INNER_DEBUG, "epWorldSize is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(epRankIdPtr == nullptr || *epRankIdPtr < 0 || *epRankIdPtr >= *epWorldSizePtr, + OP_LOGE(K_INNER_DEBUG, "epRankId is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(moeExpertNumPtr == nullptr || *moeExpertNumPtr <= 0 || *moeExpertNumPtr > MAX_MOE_EXPERT_NUMS_A2 || + *moeExpertNumPtr % *epWorldSizePtr != 0, + OP_LOGE(K_INNER_DEBUG, "moeExpertNum is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(tpWorldSizePtr == nullptr, OP_LOGE(K_INNER_DEBUG, "tpWorldSize is null."), return GRAPH_FAILED); + OP_TILING_CHECK(tpRankIdPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "tpRankId is null."), return GRAPH_FAILED); + OP_TILING_CHECK(expertSharedTypePtr == nullptr, OP_LOGE(K_INNER_DEBUG, "expertSharedType is null."), + return GRAPH_FAILED); + OP_TILING_CHECK(sharedExpertRankNumPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "sharedExpertRankNum is null."), + return GRAPH_FAILED); + OP_TILING_CHECK(globalBsPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "globalBs is null."), return GRAPH_FAILED); + OP_TILING_CHECK(commQuantModePtr == nullptr, OP_LOGE(K_INNER_DEBUG, "commQuantMode is null."), return GRAPH_FAILED); + OP_TILING_CHECK(!isLayered && *commQuantModePtr != static_cast(CommQuantMode::NON_QUANT), + OP_LOGE(K_INNER_DEBUG, "commQuantMode is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(isLayered && *commQuantModePtr != static_cast(CommQuantMode::NON_QUANT) && + *commQuantModePtr != static_cast(CommQuantMode::INT8_QUANT), + OP_LOGE(K_INNER_DEBUG, "commQuantMode is invalid."), return GRAPH_FAILED); + + const gert::StorageShape *expertIdStorageShape = context->GetInputShape(EXPERT_IDS_INDEX); + OP_TILING_CHECK(expertIdStorageShape == nullptr, OP_LOGE(K_INNER_DEBUG, "xShape is null."), return false); + 
int32_t globalBs = *epWorldSizePtr * expertIdStorageShape->GetStorageShape().GetDim(0); + + // 判断是否满足uint32_t及其他限制 + int64_t moeExpertNum = static_cast(*moeExpertNumPtr); + int64_t zeroExpertNum = *zeroExpertNumPtr; + int64_t copyExpertNum = *copyExpertNumPtr; + int64_t constExpertNum = *constExpertNumPtr; + OP_TILING_CHECK( + (moeExpertNum + zeroExpertNum + copyExpertNum + constExpertNum) > INT32_MAX, + OP_LOGE(K_INNER_DEBUG, "moeExpertNum + zeroExpertNum + copyExpertNum + constExpertNum exceeds MAX_INT32."), + return ge::GRAPH_FAILED); + info.epWorldSize = *epWorldSizePtr; + info.tpWorldSize = static_cast(0); + info.epRankId = *epRankIdPtr; + info.tpRankId = static_cast(0); + info.expertSharedType = static_cast(0); + info.sharedExpertRankNum = static_cast(0); + info.moeExpertNum = *moeExpertNumPtr; + + info.zeroExpertNum = static_cast(zeroExpertNum); + info.copyExpertNum = static_cast(copyExpertNum); + info.constExpertNum = static_cast(constExpertNum); + + if (*globalBsPtr == 0) { + info.globalBs = static_cast(globalBs); + } else { + info.globalBs = *globalBsPtr; + } + commQuantMode = *commQuantModePtr; + PrintA2TilingDataInfo(info); + return ge::GRAPH_SUCCESS; +} + +static ge::graphStatus MoeDistributeCombineA2CheckShapeAndSetTiling(gert::TilingContext *context, + MoeDistributeCombineV2Info &info) +{ + const gert::StorageShape *expandXStorageShape = context->GetInputShape(EXPAND_X_INDEX); + const gert::StorageShape *expertIdStorageShape = context->GetInputShape(EXPERT_IDS_INDEX); + const gert::StorageShape *xActiveMaskStorageShape = context->GetOptionalInputShape(X_ACTIVE_MASK_INDEX); + + OP_TILING_CHECK(expandXStorageShape == nullptr, OP_LOGE(K_INNER_DEBUG, "expandXShape is null."), + return GRAPH_FAILED); + OP_TILING_CHECK(expertIdStorageShape == nullptr, OP_LOGE(K_INNER_DEBUG, "expertIdShape is null."), + return GRAPH_FAILED); + + // copy expert and const expert + const gert::StorageShape *oriXStorageShape = context->GetOptionalInputShape(ORI_X_INDEX); + 
const gert::StorageShape *constExpertAlpha1StorageShape = + context->GetOptionalInputShape(CONST_EXPERT_ALPHA_1_INDEX); + const gert::StorageShape *constExpertAlpha2StorageShape = + context->GetOptionalInputShape(CONST_EXPERT_ALPHA_2_INDEX); + const gert::StorageShape *constExpertVStorageShape = context->GetOptionalInputShape(CONST_EXPERT_V_INDEX); + + OP_TILING_CHECK(expandXStorageShape->GetStorageShape().GetDimNum() != TWO_DIMS, + OP_LOGE(K_INNER_DEBUG, "expandXshape is invalid"), return GRAPH_FAILED); + uint32_t h = expandXStorageShape->GetStorageShape().GetDim(1); + OP_TILING_CHECK(h <= 0 || h > MAX_HIDDEN_SIZE_A2 || h % BLOCK_SIZE_A2 != 0, + OP_LOGE(K_INNER_DEBUG, "hiddensize is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(expertIdStorageShape->GetStorageShape().GetDimNum() != TWO_DIMS, + OP_LOGE(K_INNER_DEBUG, "expertIdshape is invalid"), return GRAPH_FAILED); + uint32_t bs = expertIdStorageShape->GetStorageShape().GetDim(0); + OP_TILING_CHECK(bs <= 0 || bs > MAX_BATCH_SIZE_A2, OP_LOGE(K_INNER_DEBUG, "batchsize is invalid."), + return GRAPH_FAILED); + + uint32_t k = expertIdStorageShape->GetStorageShape().GetDim(1); + OP_TILING_CHECK(k <= 0 || k > MAX_K_VALUE_A2, OP_LOGE(K_INNER_DEBUG, "k is invalid."), return GRAPH_FAILED); + auto attrs = context->GetAttrs(); + auto moeExpertNumPtr = attrs->GetAttrPointer(ATTR_MOE_EXPERT_NUM_INDEX); + auto zeroExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_ZERO_EXPERT_NUM_INDEX)); + auto copyExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_COPY_EXPERT_NUM_INDEX)); + auto constExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_CONST_EXPERT_NUM_INDEX)); + // 判断是否满足uint32_t及其他限制 + int32_t moeExpertNum = *moeExpertNumPtr; + int32_t zeroExpertNum = static_cast(*zeroExpertNumPtr); + int32_t copyExpertNum = static_cast(*copyExpertNumPtr); + int32_t constExpertNum = static_cast(*constExpertNumPtr); + uint32_t totalExpertNum = static_cast(moeExpertNum + zeroExpertNum + copyExpertNum + constExpertNum); + 
OP_TILING_CHECK(k <= 0 || k > MAX_K_VALUE_A2 || k > totalExpertNum, OP_LOGE(K_INNER_DEBUG, "k is invalid."), + return GRAPH_FAILED); + + bool isActiveMask = (xActiveMaskStorageShape != nullptr); + if (isActiveMask) { + const int64_t xActiveMaskDimNums = xActiveMaskStorageShape->GetStorageShape().GetDimNum(); + OP_TILING_CHECK(((xActiveMaskDimNums != ONE_DIM) && (xActiveMaskDimNums != TWO_DIMS)), + OP_LOGE(K_INNER_DEBUG, "xActiveMask must be 1-dimension or 2-dimension, but got %ld dim", + xActiveMaskDimNums), + return GRAPH_FAILED); + + int64_t xActiveMaskDim0 = xActiveMaskStorageShape->GetStorageShape().GetDim(0); + OP_TILING_CHECK(xActiveMaskDim0 != static_cast(bs), + OP_LOGE(K_INNER_DEBUG, + "xActiveMask's dim0 not equal to expertIds's dim0, xActiveMask's dim0 is %ld, " + "expertIds's dim0 is %d", + xActiveMaskDim0, bs), + return GRAPH_FAILED); + + OP_TILING_CHECK(((xActiveMaskStorageShape->GetStorageShape().GetDimNum() == TWO_DIMS) && + (xActiveMaskStorageShape->GetStorageShape().GetDim(1) != static_cast(k))), + OP_LOGE(K_INNER_DEBUG, + "xActiveMask's dim1 not equal to expertIds's dim1, xActiveMask's dim1 is %lu, " + "expertIds's dim1 is %d", + xActiveMaskStorageShape->GetStorageShape().GetDim(1), k), + return GRAPH_FAILED); + } + + // copy expert and const expert + OP_TILING_CHECK(copyExpertNum > 0 && oriXStorageShape == nullptr, + OP_LOGE(K_INNER_DEBUG, "oriX must be exist when copyExpertNum > 0"), return GRAPH_FAILED); + OP_TILING_CHECK( + constExpertNum > 0 && (oriXStorageShape == nullptr || constExpertAlpha1StorageShape == nullptr || + constExpertAlpha2StorageShape == nullptr || constExpertVStorageShape == nullptr), + OP_LOGE(K_INNER_DEBUG, "oriX、alpha1、alpha2、V must be exist when constExpertNum > 0"), return GRAPH_FAILED); + + if (oriXStorageShape != nullptr) { + // 必须是2维 + OP_TILING_CHECK(oriXStorageShape->GetStorageShape().GetDimNum() != TWO_DIMS, + OP_LOGE(K_INNER_DEBUG, "ori_x must be 2-dimension, but got %lu dim", + 
oriXStorageShape->GetStorageShape().GetDimNum()), + return GRAPH_FAILED); + + // shape为(bs, h) + int64_t oriXDim0 = oriXStorageShape->GetStorageShape().GetDim(0); + int64_t oriXDim1 = oriXStorageShape->GetStorageShape().GetDim(1); + OP_TILING_CHECK(oriXDim0 != static_cast(bs), + OP_LOGE(K_INNER_DEBUG, "ori_x's dim0 not equal to bs, ori_x's dim0 = %ld, bs = %ld", oriXDim0, + static_cast(bs)), + return GRAPH_FAILED); + OP_TILING_CHECK(oriXDim1 != static_cast(h), + OP_LOGE(K_INNER_DEBUG, "ori_x's dim1 not equal to h, ori_x's dim1 = %ld, h = %ld", oriXDim1, + static_cast(h)), + return GRAPH_FAILED); + } + + if (constExpertAlpha1StorageShape != nullptr) { + // 必须是1维 + OP_TILING_CHECK(constExpertAlpha1StorageShape->GetStorageShape().GetDimNum() != ONE_DIM, + OP_LOGE(K_INNER_DEBUG, "const_expert_alpha_1 must be 1-dimension, but got %lu dim", + constExpertAlpha1StorageShape->GetStorageShape().GetDimNum()), + return GRAPH_FAILED); + + // shape为(constExpertNum) + int64_t constExpertAlpha1Dim0 = constExpertAlpha1StorageShape->GetStorageShape().GetDim(0); + OP_TILING_CHECK( + constExpertAlpha1Dim0 != *constExpertNumPtr, + OP_LOGE(K_INNER_DEBUG, + "const_expert_alpha_1's dim0 not equal to const_expert_num, const_expert_alpha_1's dim0 = %ld, " + "const_expert_num = %ld", + constExpertAlpha1Dim0, *constExpertNumPtr), + return GRAPH_FAILED); + } + + if (constExpertAlpha2StorageShape != nullptr) { + // 必须是1维 + OP_TILING_CHECK(constExpertAlpha2StorageShape->GetStorageShape().GetDimNum() != ONE_DIM, + OP_LOGE(K_INNER_DEBUG, "const_expert_alpha_2 must be 1-dimension, but got %lu dim", + constExpertAlpha2StorageShape->GetStorageShape().GetDimNum()), + return GRAPH_FAILED); + + // shape为(constExpertNum) + int64_t constExpertAlpha2Dim0 = constExpertAlpha2StorageShape->GetStorageShape().GetDim(0); + OP_TILING_CHECK( + constExpertAlpha2Dim0 != *constExpertNumPtr, + OP_LOGE(K_INNER_DEBUG, + "const_expert_alpha_2's dim0 not equal to const_expert_num, const_expert_alpha_2's dim0 = %ld, " + 
"const_expert_num = %ld", + constExpertAlpha2Dim0, *constExpertNumPtr), + return GRAPH_FAILED); + } + + if (constExpertVStorageShape != nullptr) { + // 必须是2维 + OP_TILING_CHECK(constExpertVStorageShape->GetStorageShape().GetDimNum() != TWO_DIMS, + OP_LOGE(K_INNER_DEBUG, "const_expert_v must be 2-dimension, but got %lu dim", + constExpertVStorageShape->GetStorageShape().GetDimNum()), + return GRAPH_FAILED); + // 必须是2维(constExpertNum, H) + int64_t constExpertVDim0 = constExpertVStorageShape->GetStorageShape().GetDim(0); + int64_t constExpertVDim1 = constExpertVStorageShape->GetStorageShape().GetDim(1); + OP_TILING_CHECK(constExpertVDim0 != *constExpertNumPtr, + OP_LOGE(K_INNER_DEBUG, + "const_expert_v's dim0 not equal to const_expert_num, const_expert_v's dim0 = %ld, " + "const_expert_num = %ld", + constExpertVDim0, *constExpertNumPtr), + return GRAPH_FAILED); + OP_TILING_CHECK( + constExpertVDim1 != static_cast(h), + OP_LOGE(K_INNER_DEBUG, "const_expert_v's dim1 not equal to h, const_expert_v's dim1 = %ld, h = %ld", + constExpertVDim1, static_cast(h)), + return GRAPH_FAILED); + } + + info.isTokenMask = ((isActiveMask) && (xActiveMaskStorageShape->GetStorageShape().GetDimNum() == ONE_DIM)); + info.isExpertMask = ((isActiveMask) && (xActiveMaskStorageShape->GetStorageShape().GetDimNum() == TWO_DIMS)); + info.bs = bs; + info.k = k; + info.h = h; + + OP_LOGD(K_INNER_DEBUG, "batchSize is %u", bs); + OP_LOGD(K_INNER_DEBUG, "k is %u", k); + OP_LOGD(K_INNER_DEBUG, "hiddenSize is %u", h); + OP_LOGD(K_INNER_DEBUG, "isTokenMask is %d", static_cast(info.isTokenMask)); + OP_LOGD(K_INNER_DEBUG, "isExpertMask is %d", static_cast(info.isExpertMask)); + + return ge::GRAPH_SUCCESS; +} + +static ge::graphStatus MoeDistributeCombineA2GetPlatformInfoAndSetTiling(gert::TilingContext *context, + MoeDistributeCombineV2Info &info) +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + uint32_t aivNum = ascendcPlatform.GetCoreNumAiv(); + uint64_t ubSize = 
0U; + ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize); + + info.aivNum = aivNum; + info.totalUbSize = ubSize; + + OP_LOGD(K_INNER_DEBUG, "aivNum=%d", info.aivNum); + OP_LOGD(K_INNER_DEBUG, "ubSize=%lu", info.totalUbSize); + + return ge::GRAPH_SUCCESS; +} + +// 为了兼容老版本,在未配置commAlg参数时,读取环境变量; +// commAlg参数当前支持"fullmesh"和"hierarchy"两种,其余报错。 +static ge::graphStatus MoeDistributeCombineCheckCommAlg(gert::TilingContext *context, bool &isLayered) +{ + isLayered = false; + auto attrs = context->GetAttrs(); + auto commAlg = attrs->GetAttrPointer(static_cast(ATTR_COMM_ALG_INDEX)); + + const char *hcclIntraPcieEnable = getenv("HCCL_INTRA_PCIE_ENABLE"); + const char *hcclIntraRoceEnable = getenv("HCCL_INTRA_ROCE_ENABLE"); + if (hcclIntraPcieEnable != nullptr && hcclIntraRoceEnable != nullptr && strcmp(hcclIntraPcieEnable, "1") == 0 && + strcmp(hcclIntraRoceEnable, "0") == 0) { + OP_LOGD(K_INNER_DEBUG, + "ENV HCCL_INTRA_PCIE_ENABLE = 1 and HCCL_INTRA_ROCE_ENABLE = 0, use hierarchy algorithm."); + isLayered = true; + return ge::GRAPH_SUCCESS; + } else { + OP_LOGD(K_INNER_DEBUG, + "ENV HCCL_INTRA_PCIE_ENABLE != 1 or HCCL_INTRA_ROCE_ENABLE != 0, use default fullmesh algorithm."); + } + + if (commAlg == nullptr || strlen(commAlg) == 0) { + OP_LOGE(K_INNER_DEBUG, "Attr commAlg is invalid, please configure fullmesh or hierarchy."); + return GRAPH_FAILED; + } + + OP_LOGI(K_INNER_DEBUG, "commAlg is %s", commAlg); + if (strcmp(commAlg, "fullmesh") == 0) { + return ge::GRAPH_SUCCESS; + } else if (strcmp(commAlg, "hierarchy") == 0) { + isLayered = true; + return ge::GRAPH_SUCCESS; + } else { + OP_LOGE(K_INNER_DEBUG, "commAlg is not support"); + return GRAPH_FAILED; + } +} + +static uint64_t MoeDistributeCombineA2CalcTilingKey(gert::TilingContext *context, const bool isLayered, + const int32_t commQuantMode) +{ + uint64_t tilingKey = TILING_KEY_BASE_A2; + if (isLayered) { + tilingKey = TILING_KEY_LAYERED_COMM_A2; + if (commQuantMode == 
static_cast(CommQuantMode::INT8_QUANT)) { + tilingKey += TILING_KEY_INT8_COMM_QUANT_A2; + } + } + OP_LOGD(K_INNER_DEBUG, "tilingKey=%lu", tilingKey); + return tilingKey; +} + +static ge::graphStatus MoeDistributeCombineA2TilingFuncImpl(gert::TilingContext *context) +{ + const char *nodeName = context->GetNodeName(); + OP_LOGI(nodeName, "Enter MoeDistributeCombineV2 tiling func."); + + // tilingData + MoeDistributeCombineV2TilingData *tilingData = context->GetTilingData(); + OP_TILING_CHECK(tilingData == nullptr, VECTOR_INNER_ERR_REPORT_TILIING(nodeName, "tilingData is nullptr."), + return ge::GRAPH_FAILED); + MoeDistributeCombineV2Info &info = tilingData->moeDistributeCombineV2Info; + + bool isLayered = false; + OP_TILING_CHECK( + MoeDistributeCombineCheckCommAlg(context, isLayered) != ge::GRAPH_SUCCESS, + VECTOR_INNER_ERR_REPORT_TILIING(context->GetNodeName(), "MoeDistributeCombineV2 CheckCommAlg Failed"), + return ge::GRAPH_FAILED); + int32_t commQuantMode = 0; + OP_TILING_CHECK( + MoeDistributeCombineA2CheckShapeAndSetTiling(context, info) != ge::GRAPH_SUCCESS, + VECTOR_INNER_ERR_REPORT_TILIING(context->GetNodeName(), "MoeDistributeCombineV2 CheckShapeAndSetTiling Failed"), + return ge::GRAPH_FAILED); + OP_TILING_CHECK( + MoeDistributeCombineA2CheckAttrAndSetTiling(context, info, commQuantMode, isLayered) != ge::GRAPH_SUCCESS, + VECTOR_INNER_ERR_REPORT_TILIING(context->GetNodeName(), "MoeDistributeCombineV2 CheckAttrAndSetTiling Failed"), + return ge::GRAPH_FAILED); + OP_TILING_CHECK(MoeDistributeCombineA2GetPlatformInfoAndSetTiling(context, info) != ge::GRAPH_SUCCESS, + VECTOR_INNER_ERR_REPORT_TILIING(context->GetNodeName(), + "MoeDistributeCombineV2 GetPlatformInfoAndSetTiling Failed"), + return ge::GRAPH_FAILED); + + uint32_t blockDim = 1U; + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + uint32_t aivNum = ascendcPlatform.GetCoreNumAiv(); + blockDim = ascendcPlatform.CalcTschBlockDim(aivNum, 0, aivNum); + 
context->SetBlockDim(blockDim); + context->SetAicpuBlockDim(mc2tiling::AICPU_BLOCK_DIM_A2); + + uint64_t tilingKey = MoeDistributeCombineA2CalcTilingKey(context, isLayered, commQuantMode); + context->SetTilingKey(tilingKey); + // 2. workspace + size_t *workSpaces = context->GetWorkspaceSizes(1); + OP_TILING_CHECK(workSpaces == nullptr, VECTOR_INNER_ERR_REPORT_TILIING(nodeName, "workSpaces is nullptr."), + return ge::GRAPH_FAILED); + size_t userWorkspaceSize = info.moeExpertNum * sizeof(uint32_t) * 2U; + workSpaces[0] = SYSTEM_NEED_WORKSPACE + userWorkspaceSize; + + // 3. communication + auto attrs = context->GetAttrs(); + auto group = attrs->GetAttrPointer(static_cast(ATTR_GROUP_EP_INDEX)); + std::string algConfig = isLayered ? "BatchWrite=level1:hierarchy" : "BatchWrite=level1:fullmesh"; + uint32_t opType = 18; // DispatchCombine + + AscendC::Mc2CcTilingConfig mc2CcTilingConfig(group, opType, algConfig); + mc2CcTilingConfig.GetTiling(tilingData->mc2InitTiling); + mc2CcTilingConfig.GetTiling(tilingData->mc2CcTiling); + + return ge::GRAPH_SUCCESS; +} + +static ge::graphStatus MoeDistributeCombineV2TilingFunc(gert::TilingContext *context) +{ + // 不支持 expandX数据类型为int32 type + auto expandXDesc = context->GetInputDesc(EXPAND_X_INDEX); + const char *nodeName = context->GetNodeName(); + OP_TILING_CHECK(expandXDesc == nullptr, OP_LOGE(nodeName, "expandxDesc is null."), return ge::GRAPH_FAILED); + // 检查expandX数据类型为DT_INT32 + OP_TILING_CHECK((expandXDesc->GetDataType() == ge::DT_INT32), + OP_LOGE(nodeName, "expandX dataType is invalid, dataType should be bf16 or float16, but is %d", + static_cast(expandXDesc->GetDataType())), + return ge::GRAPH_FAILED); + + fe::PlatFormInfos *platformInfoPtr = context->GetPlatformInfo(); + fe::PlatFormInfos &platformInfo = *platformInfoPtr; + + std::string socVersion; + (void)platformInfo.GetPlatformResWithLock("version", "Short_SoC_version", socVersion); + ge::graphStatus ret = ge::GRAPH_FAILED; + if (socVersion == "Ascend910B") { + ret = 
MoeDistributeCombineA2TilingFuncImpl(context); + } else { + // ret = MoeDistributeCombineA3TilingFuncImpl(context); + } + + return ret; +} + +struct MoeDistributeCombineCompileInfo {}; +ge::graphStatus TilingParseForMoeDistributeCombineV2(gert::TilingParseContext *context) +{ + (void)context; + return ge::GRAPH_SUCCESS; +} + +IMPL_OP_OPTILING(MoeDistributeCombineV2) + .Tiling(MoeDistributeCombineV2TilingFunc) + .TilingParse(TilingParseForMoeDistributeCombineV2); +} // namespace optiling diff --git a/csrc/deepep/ops2/op_host/moe_distribute_dispatch_v2.cpp b/csrc/deepep/ops2/op_host/moe_distribute_dispatch_v2.cpp new file mode 100644 index 000000000..b5022234c --- /dev/null +++ b/csrc/deepep/ops2/op_host/moe_distribute_dispatch_v2.cpp @@ -0,0 +1,142 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! 
+ * \file moe_distribute_dispatch_v2.cpp + * \brief + */ + +#include "register/op_def_registry.h" + +namespace ops { +class MoeDistributeDispatchV2 : public OpDef +{ +public: + explicit MoeDistributeDispatchV2(const char *name) : OpDef(name) + { + this->Input("x") + .ParamType(REQUIRED) + .DataType({ge::DT_BF16, ge::DT_BF16, ge::DT_FLOAT16, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("expert_ids") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("scales") + .ParamType(OPTIONAL) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("x_active_mask") + .ParamType(OPTIONAL) + .DataType({ge::DT_BOOL, ge::DT_BOOL, ge::DT_BOOL, ge::DT_BOOL}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("expert_scales") + .ParamType(OPTIONAL) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("elastic_info") + .ParamType(OPTIONAL) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, 
ge::FORMAT_ND}) + .AutoContiguous(); + + this->Output("expand_x") + .ParamType(REQUIRED) + .DataType({ge::DT_BF16, ge::DT_INT8, ge::DT_FLOAT16, ge::DT_INT8}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Output("dynamic_scales") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Output("assist_info_for_combine") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("expert_token_nums") + .ParamType(REQUIRED) + .DataType({ge::DT_INT64, ge::DT_INT64, ge::DT_INT64, ge::DT_INT64}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("ep_recv_count") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("tp_recv_count") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("expand_scales") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, 
ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Attr("group_ep").AttrType(REQUIRED).String(); + this->Attr("ep_world_size").AttrType(REQUIRED).Int(); + this->Attr("ep_rank_id").AttrType(REQUIRED).Int(); + this->Attr("moe_expert_num").AttrType(REQUIRED).Int(); + this->Attr("group_tp").AttrType(OPTIONAL).String(""); + this->Attr("tp_world_size").AttrType(OPTIONAL).Int(0); + this->Attr("tp_rank_id").AttrType(OPTIONAL).Int(0); + this->Attr("expert_shard_type").AttrType(OPTIONAL).Int(0); + this->Attr("shared_expert_num").AttrType(OPTIONAL).Int(1); + this->Attr("shared_expert_rank_num").AttrType(OPTIONAL).Int(0); + this->Attr("quant_mode").AttrType(OPTIONAL).Int(0); + this->Attr("global_bs").AttrType(OPTIONAL).Int(0); + this->Attr("expert_token_nums_type").AttrType(OPTIONAL).Int(1); + this->Attr("comm_alg").AttrType(OPTIONAL).String(""); + this->Attr("zero_expert_num").AttrType(OPTIONAL).Int(0); + this->Attr("copy_expert_num").AttrType(OPTIONAL).Int(0); + this->Attr("const_expert_num").AttrType(OPTIONAL).Int(0); + + OpAICoreConfig aicore_config_A2; + aicore_config_A2.DynamicCompileStaticFlag(true) + .DynamicFormatFlag(true) + .DynamicRankSupportFlag(true) + .DynamicShapeSupportFlag(true) + .NeedCheckSupportFlag(false) + .PrecisionReduceFlag(true) + .ExtendCfgInfo("aclnnSupport.value", "support_aclnn") + .ExtendCfgInfo("prebuildPattern.value", "Opaque") + .ExtendCfgInfo("jitCompile.flag", "static_false") + .ExtendCfgInfo("multiKernelSupportDynamicGraph.value", "multi_kernel"); + + this->AICore().AddConfig("ascend910b", aicore_config_A2); + this->MC2().HcclGroup({"group_ep"}); + } +}; + +OP_ADD(MoeDistributeDispatchV2); + +} // namespace ops diff --git a/csrc/deepep/ops2/op_host/moe_distribute_dispatch_v2_tiling.cc b/csrc/deepep/ops2/op_host/moe_distribute_dispatch_v2_tiling.cc new file mode 100644 index 000000000..b7f4b55be --- /dev/null +++ b/csrc/deepep/ops2/op_host/moe_distribute_dispatch_v2_tiling.cc @@ -0,0 +1,497 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 
2025-2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! + * \file moe_distribute_dispatch_v2_tiling.cc + * \brief + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "error_log.h" +#include "graph/utils/type_utils.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" +#include "tiling/hccl/hccl_tiling.h" +#include "mc2_tiling_utils.h" +#include "experiment/platform/platform/platform_infos_def.h" +#include "../op_kernel/moe_distribute_dispatch_v2_tiling.h" + +using namespace AscendC; +using namespace ge; +namespace { +constexpr uint32_t X_INDEX = 0U; +constexpr uint32_t EXPERT_IDS_INDEX = 1U; +constexpr uint32_t SCALES_INDEX = 2U; +constexpr uint32_t X_ACTIVE_MASK_INDEX = 3U; +constexpr uint32_t EXPERT_SCALES_INDEX = 4U; +constexpr uint32_t OUTPUT_EXPAND_X_INDEX = 0U; +constexpr uint32_t OUTPUT_DYNAMIC_SCALES_INDEX = 1U; +constexpr uint32_t OUTPUT_ASSIST_INFO_INDEX = 2U; +constexpr uint32_t OUTPUT_EXPERT_TOKEN_NUMS_INDEX = 3U; +constexpr uint32_t OUTPUT_EP_RECV_COUNTS_INDEX = 4U; +constexpr uint32_t OUTPUT_TP_RECV_COUNTS_INDEX = 5U; +constexpr uint32_t OUTPUT_EXPAND_SCALES_INDEX = 6U; + +constexpr uint32_t ATTR_GROUP_EP_INDEX = 0; +constexpr uint32_t ATTR_EP_WORLD_SIZE_INDEX = 1; +constexpr uint32_t ATTR_EP_RANK_ID_INDEX = 2; +constexpr uint32_t ATTR_MOE_EXPERT_NUM_INDEX = 3; +constexpr 
uint32_t ATTR_GROUP_TP_INDEX = 4; +constexpr uint32_t ATTR_TP_WORLD_SIZE_INDEX = 5; +constexpr uint32_t ATTR_TP_RANK_ID_INDEX = 6; +constexpr uint32_t ATTR_EXPERT_SHARD_TYPE_INDEX = 7; +constexpr uint32_t ATTR_SHARED_EXPERT_NUM_INDEX = 8; +constexpr uint32_t ATTR_SHARED_EXPERT_RANK_NUM_INDEX = 9; +constexpr uint32_t ATTR_QUANT_MODE_INDEX = 10; +constexpr uint32_t ATTR_GLOBAL_BS_INDEX = 11; +constexpr uint32_t ATTR_EXPERT_TOKEN_NUMS_TYPE_INDEX = 12; +constexpr uint32_t ATTR_COMM_ALG_INDEX = 13; +constexpr uint32_t ATTR_ZERO_EXPERT_NUM_INDEX = 14; +constexpr uint32_t ATTR_COPY_EXPERT_NUM_INDEX = 15; +constexpr uint32_t ATTR_CONST_EXPERT_NUM_INDEX = 16; + +constexpr uint32_t TWO_DIMS = 2; +constexpr uint32_t ONE_DIM = 1; +constexpr uint32_t DYN_SCALE_DIMS = 1; +constexpr uint32_t ASSIST_INFO_DIMS = 1; +constexpr uint32_t DYNAMIC_SCALE_DIM_NUM = 1; +constexpr uint64_t INIT_TILINGKEY = 10000; +constexpr uint32_t ARR_LENGTH = 128; +constexpr uint32_t OP_TYPE_ALL_TO_ALL = 8; +constexpr uint32_t NO_SCALES = 0; +constexpr uint32_t STATIC_SCALES = 1; +constexpr uint32_t DYNAMIC_SCALES = 2; +constexpr uint32_t OP_TYPE_ALL_GATHER = 6; + +constexpr uint32_t UNQUANT_MODE = 0; +constexpr uint32_t STATIC_QUANT_MODE = 1; +constexpr uint32_t DYNAMIC_QUANT_MODE = 2; +constexpr size_t MAX_GROUP_NAME_LENGTH = 128UL; +constexpr int64_t MAX_SHARED_EXPERT_NUM = 4; +constexpr int64_t MAX_EP_WORLD_SIZE = 768L; // 384 * 2 +constexpr int64_t MIN_EP_WORLD_SIZE = 2; +constexpr int64_t EP_RESTRICT_8 = 8; +constexpr int64_t MAX_TP_WORLD_SIZE = 2; +constexpr int64_t BS_UPPER_BOUND = 512; + +constexpr uint64_t NUM_10 = 10ULL; +constexpr uint32_t TILINGKEY_SCALES = 10; +constexpr uint32_t TILINGKEY_TP_WORLD_SIZE = 100; +constexpr uint32_t TP_WORLD_SIZE_TWO = 2; +constexpr uint32_t TILINGKEY_IS_SHARE_EXPERT = 1000; +constexpr uint32_t VERSION_2 = 2; +constexpr uint32_t HCOMMCNT_2 = 2; +constexpr int64_t MOE_EXPERT_MAX_NUM = 1024; +constexpr int64_t K_MAX = 16; +constexpr size_t SYSTEM_NEED_WORKSPACE 
= 16UL * 1024UL * 1024UL; +constexpr uint32_t WORKSPACE_ELEMENT_OFFSET = 512; +constexpr int32_t HCCL_BUFFER_SIZE_DEFAULT = 200 * 1024 * 1024; // Bytes +constexpr int64_t H_MIN = 1024; +constexpr int64_t H_MAX = 8192; +constexpr uint64_t MB_SIZE = 1024UL * 1024UL; +constexpr uint64_t TRIPLE = 3; +constexpr uint64_t WIN_ADDR_ALIGN = 512UL; +constexpr uint64_t SCALE_EXPAND_IDX_BUFFER = 44UL; // scale32B + 3*4expandIdx +constexpr uint64_t DOUBLE_DATA_BUFFER = 2UL; +constexpr uint64_t MAX_OUT_DTYPE_SIZE = 2UL; +constexpr uint64_t UB_ALIGN = 32UL; +constexpr int64_t DISPATCH_STATUS_MAX_SUPPORT_NUM = 1280UL; + +// A2定义 +const char *K_INNER_DEBUG = "MoeDistributeDispatchV2 Tiling Debug"; +constexpr uint32_t RANK_NUM_PER_NODE_A2 = 8; +constexpr uint32_t BLOCK_SIZE_A2 = 32; +constexpr uint32_t MAX_K_VALUE_A2 = 16; +constexpr int32_t MAX_HIDDEN_SIZE_A2 = 7168; +constexpr int32_t MAX_EP_WORLD_SIZE_A2 = 256; +constexpr int32_t MAX_MOE_EXPERT_NUMS_A2 = 512; +constexpr uint32_t MAX_BATCH_SIZE_A2 = 512; +constexpr size_t USER_WORKSPACE_A2 = 1UL * 1024UL * 1024UL; // moeExpertNum_ * sizeof(uint32_t) + epWorldSize_ * 2 * 32 +constexpr uint64_t TILING_KEY_BASE_A2 = 2000000000; +constexpr uint64_t TILING_KEY_LAYERED_COMM_A2 = 100000000; +constexpr uint64_t INIT_TILINGKEY_A2 = 1000; +} // namespace + +namespace optiling { +// a2函数 +static ge::graphStatus MoeDistributeDispatchA2CheckAttrAndSetTiling(gert::TilingContext *context, + MoeDistributeDispatchV2Info &info) +{ + auto attrs = context->GetAttrs(); + OP_TILING_CHECK(attrs == nullptr, OP_LOGE(K_INNER_DEBUG, "attrs is null."), return ge::GRAPH_FAILED); + + auto groupEpPtr = attrs->GetAttrPointer(static_cast(ATTR_GROUP_EP_INDEX)); + auto epWorldSizePtr = attrs->GetAttrPointer(ATTR_EP_WORLD_SIZE_INDEX); + auto epRankIdPtr = attrs->GetAttrPointer(ATTR_EP_RANK_ID_INDEX); + auto moeExpertNumPtr = attrs->GetAttrPointer(ATTR_MOE_EXPERT_NUM_INDEX); + auto tpWorldSizePtr = attrs->GetAttrPointer(ATTR_TP_WORLD_SIZE_INDEX); + auto tpRankIdPtr = 
attrs->GetAttrPointer(ATTR_TP_RANK_ID_INDEX); + auto expertSharedTypePtr = attrs->GetAttrPointer(ATTR_EXPERT_SHARD_TYPE_INDEX); + auto sharedExpertRankNumPtr = attrs->GetAttrPointer(ATTR_SHARED_EXPERT_RANK_NUM_INDEX); + auto quantModePtr = attrs->GetAttrPointer(ATTR_QUANT_MODE_INDEX); + auto globalBsPtr = attrs->GetAttrPointer(ATTR_GLOBAL_BS_INDEX); + auto expertTokenNumsTypePtr = attrs->GetAttrPointer(ATTR_EXPERT_TOKEN_NUMS_TYPE_INDEX); + auto zeroExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_ZERO_EXPERT_NUM_INDEX)); + auto copyExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_COPY_EXPERT_NUM_INDEX)); + auto constExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_CONST_EXPERT_NUM_INDEX)); + + const gert::StorageShape *expertIdStorageShape = context->GetInputShape(EXPERT_IDS_INDEX); + OP_TILING_CHECK(expertIdStorageShape == nullptr, OP_LOGE(K_INNER_DEBUG, "expertIdShape is null."), + return GRAPH_FAILED); + int32_t bs = expertIdStorageShape->GetStorageShape().GetDim(0); + + OP_TILING_CHECK((groupEpPtr == nullptr) || (strnlen(groupEpPtr, MAX_GROUP_NAME_LENGTH) == 0) || + (strnlen(groupEpPtr, MAX_GROUP_NAME_LENGTH) == MAX_GROUP_NAME_LENGTH), + OP_LOGE(K_INNER_DEBUG, "groupEp is invalid."), return ge::GRAPH_FAILED); + OP_TILING_CHECK(epWorldSizePtr == nullptr || *epWorldSizePtr <= 0 || *epWorldSizePtr > MAX_EP_WORLD_SIZE_A2 || + *epWorldSizePtr % RANK_NUM_PER_NODE_A2 != 0, + OP_LOGE(K_INNER_DEBUG, "epWorldSize is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(epRankIdPtr == nullptr || *epRankIdPtr < 0 || *epRankIdPtr >= *epWorldSizePtr, + OP_LOGE(K_INNER_DEBUG, "epRankId is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(moeExpertNumPtr == nullptr || *moeExpertNumPtr % *epWorldSizePtr != 0 || *moeExpertNumPtr <= 0 || + *moeExpertNumPtr > MAX_MOE_EXPERT_NUMS_A2, + OP_LOGE(K_INNER_DEBUG, "moeExpertNum is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(tpWorldSizePtr == nullptr, OP_LOGE(K_INNER_DEBUG, "tpWorldSize is null."), return 
GRAPH_FAILED); + OP_TILING_CHECK(tpRankIdPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "tpRankId is null."), return GRAPH_FAILED); + OP_TILING_CHECK(expertSharedTypePtr == nullptr, OP_LOGE(K_INNER_DEBUG, "expertSharedType is null."), + return GRAPH_FAILED); + OP_TILING_CHECK(sharedExpertRankNumPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "sharedExpertRankNum is null."), + return GRAPH_FAILED); + OP_TILING_CHECK(quantModePtr == nullptr || (*quantModePtr != UNQUANT_MODE && *quantModePtr != DYNAMIC_QUANT_MODE), + OP_LOGE(K_INNER_DEBUG, "quantMode is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(globalBsPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "globalBs is null."), return GRAPH_FAILED); + OP_TILING_CHECK(expertTokenNumsTypePtr == nullptr || *expertTokenNumsTypePtr < 0 || *expertTokenNumsTypePtr > 1, + OP_LOGE(K_INNER_DEBUG, "expertTokenNumsType is invalid. Must be 0 or 1. "), return GRAPH_FAILED); + OP_TILING_CHECK(zeroExpertNumPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "zeroExpertNumPtr is null."), + return ge::GRAPH_FAILED); + OP_TILING_CHECK(copyExpertNumPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "copyExpertNumPtr is null."), + return ge::GRAPH_FAILED); + OP_TILING_CHECK(constExpertNumPtr == nullptr, OP_LOGE(K_INNER_DEBUG, "constExpertNumPtr is null."), + return ge::GRAPH_FAILED); + + // 判断是否满足uint32_t及其他限制 + int64_t moeExpertNum = static_cast(*moeExpertNumPtr); + int64_t zeroExpertNum = *zeroExpertNumPtr; + int64_t copyExpertNum = *copyExpertNumPtr; + int64_t constExpertNum = *constExpertNumPtr; + int64_t zeroComputeExpertNum = zeroExpertNum + copyExpertNum + constExpertNum; + + OP_LOGD(K_INNER_DEBUG, "zeroExpertNum=%ld,copyExpertNum= %ld, constExpertNum=%ld", zeroExpertNum, copyExpertNum, + constExpertNum); + OP_TILING_CHECK( + zeroComputeExpertNum + moeExpertNum > INT32_MAX, + OP_LOGE(K_INNER_DEBUG, + "zeroExpertNum[%ld] + copyExpertNum[%ld] + constExpertNum[%ld] + moeExpertNum[%ld] exceed INT32_MAX.", + zeroExpertNum, copyExpertNum, constExpertNum, moeExpertNum), + return 
ge::GRAPH_FAILED); + + info.epWorldSize = *epWorldSizePtr; + info.tpWorldSize = static_cast(0); + info.epRankId = *epRankIdPtr; + info.tpRankId = static_cast(0); + info.expertSharedType = static_cast(0); + info.sharedExpertRankNum = static_cast(0); + info.moeExpertNum = *moeExpertNumPtr; + info.quantMode = *quantModePtr; + if (*globalBsPtr == 0) { + info.globalBs = *epWorldSizePtr * bs; + } else { + info.globalBs = *globalBsPtr; + } + info.expertTokenNumsType = *expertTokenNumsTypePtr; + info.zeroComputeExpertNum = static_cast(zeroComputeExpertNum); + OP_LOGD(K_INNER_DEBUG, "quantMode=%d", info.quantMode); + OP_LOGD(K_INNER_DEBUG, "globalBs=%d", info.globalBs); + OP_LOGD(K_INNER_DEBUG, "expertTokenNumsType=%d", info.expertTokenNumsType); + OP_LOGD(K_INNER_DEBUG, "expertSharedType=%d", info.expertSharedType); + OP_LOGD(K_INNER_DEBUG, "sharedExpertRankNum=%d", info.sharedExpertRankNum); + OP_LOGD(K_INNER_DEBUG, "moeExpertNum=%d", info.moeExpertNum); + OP_LOGD(K_INNER_DEBUG, "epWorldSize=%d", info.epWorldSize); + OP_LOGD(K_INNER_DEBUG, "tpWorldSize=%d", info.tpWorldSize); + OP_LOGD(K_INNER_DEBUG, "epRankId=%d", info.epRankId); + OP_LOGD(K_INNER_DEBUG, "tpRankId=%d", info.tpRankId); + OP_LOGD(K_INNER_DEBUG, "zeroComputeExpertNum=%d", info.zeroComputeExpertNum); + + return ge::GRAPH_SUCCESS; +} + +static ge::graphStatus MoeDistributeDispatchA2CheckShapeAndSetTiling(gert::TilingContext *context, + MoeDistributeDispatchV2Info &info, bool isLayered) +{ + const char *nodeName = context->GetNodeName(); + const gert::StorageShape *xStorageShape = context->GetInputShape(X_INDEX); + const gert::StorageShape *expertIdStorageShape = context->GetInputShape(EXPERT_IDS_INDEX); + const gert::StorageShape *scalesStorageShape = context->GetOptionalInputShape(SCALES_INDEX); + const gert::StorageShape *xActiveMaskStorageShape = context->GetOptionalInputShape(X_ACTIVE_MASK_INDEX); + const gert::StorageShape *expertScalesStorageShape = context->GetOptionalInputShape(EXPERT_SCALES_INDEX); + 
const gert::StorageShape *expandScalesStorageShape = context->GetOutputShape(OUTPUT_EXPAND_SCALES_INDEX); + + OP_TILING_CHECK(xStorageShape == nullptr, OP_LOGE(K_INNER_DEBUG, "xShape is null."), return GRAPH_FAILED); + OP_TILING_CHECK(expertIdStorageShape == nullptr, OP_LOGE(K_INNER_DEBUG, "expertIdShape is null."), + return GRAPH_FAILED); + OP_TILING_CHECK(isLayered && expertScalesStorageShape == nullptr, OP_LOGE(K_INNER_DEBUG, "expertScales is null."), + return GRAPH_FAILED); + OP_TILING_CHECK(isLayered && expandScalesStorageShape == nullptr, OP_LOGE(K_INNER_DEBUG, "expandScales is null."), + return GRAPH_FAILED); + OP_TILING_CHECK(xStorageShape->GetStorageShape().GetDimNum() != TWO_DIMS, + OP_LOGE(K_INNER_DEBUG, "x dims is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(expertIdStorageShape->GetStorageShape().GetDimNum() != TWO_DIMS, + OP_LOGE(K_INNER_DEBUG, "expertId dims is invalid."), return GRAPH_FAILED); + OP_LOGD(nodeName, "X dim0 = %ld", xStorageShape->GetStorageShape().GetDim(0)); + OP_LOGD(nodeName, "X dim1 = %ld", xStorageShape->GetStorageShape().GetDim(1)); + OP_LOGD(nodeName, "expertId dim0 = %ld", expertIdStorageShape->GetStorageShape().GetDim(0)); + OP_LOGD(nodeName, "expertId dim1 = %ld", expertIdStorageShape->GetStorageShape().GetDim(1)); + + uint32_t h = xStorageShape->GetStorageShape().GetDim(1); + uint32_t bs = expertIdStorageShape->GetStorageShape().GetDim(0); + uint32_t k = expertIdStorageShape->GetStorageShape().GetDim(1); + bool isScales = (scalesStorageShape != nullptr); + auto attrs = context->GetAttrs(); + OP_TILING_CHECK(attrs == nullptr, OP_LOGE(K_INNER_DEBUG, "attrs is null."), return ge::GRAPH_FAILED); + auto quantModePtr = attrs->GetAttrPointer(ATTR_QUANT_MODE_INDEX); + OP_TILING_CHECK(h % BLOCK_SIZE_A2 != 0 || h <= 0 || h > MAX_HIDDEN_SIZE_A2, + OP_LOGE(K_INNER_DEBUG, "hiddensize is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(bs <= 0 || bs > MAX_BATCH_SIZE_A2, OP_LOGE(K_INNER_DEBUG, "batchsize is invalid."), + return 
GRAPH_FAILED); + auto moeExpertNumPtr = attrs->GetAttrPointer(ATTR_MOE_EXPERT_NUM_INDEX); + auto zeroExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_ZERO_EXPERT_NUM_INDEX)); + auto copyExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_COPY_EXPERT_NUM_INDEX)); + auto constExpertNumPtr = attrs->GetAttrPointer(static_cast(ATTR_CONST_EXPERT_NUM_INDEX)); + // 判断是否满足uint32_t及其他限制 + int32_t moeExpertNum = *moeExpertNumPtr; + int32_t zeroExpertNum = static_cast(*zeroExpertNumPtr); + int32_t copyExpertNum = static_cast(*copyExpertNumPtr); + int32_t constExpertNum = static_cast(*constExpertNumPtr); + int32_t zeroComputeExpertNum = zeroExpertNum + copyExpertNum + constExpertNum; + OP_TILING_CHECK(k <= 0 || k > MAX_K_VALUE_A2 || k > static_cast(moeExpertNum + zeroComputeExpertNum), + OP_LOGE(K_INNER_DEBUG, "k is invalid."), return GRAPH_FAILED); + OP_TILING_CHECK(*quantModePtr == UNQUANT_MODE && isScales, + OP_LOGE(K_INNER_DEBUG, "scales should be null when quantMode is unQuant."), return GRAPH_FAILED); + + bool isActiveMask = (xActiveMaskStorageShape != nullptr); + if (isActiveMask) { + const int64_t xActiveMaskDimNums = xActiveMaskStorageShape->GetStorageShape().GetDimNum(); + OP_TILING_CHECK( + ((xActiveMaskDimNums != ONE_DIM) && (xActiveMaskDimNums != TWO_DIMS)), + OP_LOGE(nodeName, "xActiveMask must be 1-dimension or 2-dimension, but got %ld dim", xActiveMaskDimNums), + return GRAPH_FAILED); + + int64_t xActiveMaskDim0 = xActiveMaskStorageShape->GetStorageShape().GetDim(0); + OP_TILING_CHECK(xActiveMaskDim0 != static_cast(bs), + OP_LOGE(nodeName, + "xActiveMask's dim0 not equal to expertIds's dim0, xActiveMask's dim0 is %ld, " + "expertIds's dim0 is %d", + xActiveMaskDim0, bs), + return GRAPH_FAILED); + + OP_TILING_CHECK(((xActiveMaskStorageShape->GetStorageShape().GetDimNum() == TWO_DIMS) && + (xActiveMaskStorageShape->GetStorageShape().GetDim(1) != static_cast(k))), + OP_LOGE(nodeName, + "xActiveMask's dim1 not equal to expertIds's dim1, xActiveMask's dim1 is 
%lu, " + "expertIds's dim1 is %d", + xActiveMaskStorageShape->GetStorageShape().GetDim(1), k), + return GRAPH_FAILED); + } + + info.isTokenMask = ((isActiveMask) && (xActiveMaskStorageShape->GetStorageShape().GetDimNum() == ONE_DIM)); + info.isExpertMask = ((isActiveMask) && (xActiveMaskStorageShape->GetStorageShape().GetDimNum() == TWO_DIMS)); + + info.bs = bs; + info.k = k; + info.h = h; + + OP_LOGD(K_INNER_DEBUG, "isTokenMask is %d", static_cast(info.isTokenMask)); + OP_LOGD(K_INNER_DEBUG, "isExpertMask is %d", static_cast(info.isExpertMask)); + OP_LOGD(K_INNER_DEBUG, "batchSize is %u", info.bs); + OP_LOGD(K_INNER_DEBUG, "k is %u", info.k); + OP_LOGD(K_INNER_DEBUG, "hiddenSize is %u", info.h); + return ge::GRAPH_SUCCESS; +} + +static ge::graphStatus MoeDistributeDispatchA2GetPlatformInfoAndSetTiling(gert::TilingContext *context, + MoeDistributeDispatchV2Info &info) +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + uint32_t aivNum = ascendcPlatform.GetCoreNumAiv(); + uint64_t ubSize = 0U; + ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize); + info.aivNum = aivNum; + info.totalUbSize = ubSize; + + OP_LOGD(K_INNER_DEBUG, "aivNum=%d", info.aivNum); + OP_LOGD(K_INNER_DEBUG, "ubSize=%lu", info.totalUbSize); + + return ge::GRAPH_SUCCESS; +} + +// 为了兼容老版本,在未配置commAlg参数时,读取环境变量; +// commAlg参数当前支持"fullmesh"和"hierarchy"两种,其余使用默认fullmesh不分层方案。 +static ge::graphStatus MoeDistributeDispatchA2CheckCommAlg(gert::TilingContext *context, bool &isLayered) +{ + isLayered = false; + auto attrs = context->GetAttrs(); + auto commAlg = attrs->GetAttrPointer(static_cast(ATTR_COMM_ALG_INDEX)); + + const char *hcclIntraPcieEnable = getenv("HCCL_INTRA_PCIE_ENABLE"); + const char *hcclIntraRoceEnable = getenv("HCCL_INTRA_ROCE_ENABLE"); + if (hcclIntraPcieEnable != nullptr && hcclIntraRoceEnable != nullptr && strcmp(hcclIntraPcieEnable, "1") == 0 && + strcmp(hcclIntraRoceEnable, "0") == 0) { + OP_LOGD(K_INNER_DEBUG, + "ENV 
HCCL_INTRA_PCIE_ENABLE = 1 and HCCL_INTRA_ROCE_ENABLE = 0, use hierarchy algorithm."); + isLayered = true; + return ge::GRAPH_SUCCESS; + } else { + OP_LOGD(K_INNER_DEBUG, + "ENV HCCL_INTRA_PCIE_ENABLE != 1 or HCCL_INTRA_ROCE_ENABLE != 0, use default fullmesh algorithm."); + } + + if (commAlg == nullptr || strlen(commAlg) == 0) { + OP_LOGE(K_INNER_DEBUG, "Attr commAlg is invalid, please configure fullmesh or hierarchy."); + return GRAPH_FAILED; + } + + OP_LOGI(K_INNER_DEBUG, "commAlg is %s", commAlg); + if (strcmp(commAlg, "fullmesh") == 0) { + return ge::GRAPH_SUCCESS; + } else if (strcmp(commAlg, "hierarchy") == 0) { + isLayered = true; + return ge::GRAPH_SUCCESS; + } else { + OP_LOGE(K_INNER_DEBUG, "commAlg is not support"); + return GRAPH_FAILED; + } +} + +static uint64_t MoeDistributeDispatchA2CalcTilingKey(gert::TilingContext *context, const bool isLayered) +{ + uint64_t tilingKey = TILING_KEY_BASE_A2 + INIT_TILINGKEY_A2; + if (isLayered) { + tilingKey += TILING_KEY_LAYERED_COMM_A2; + } + + auto attrs = context->GetAttrs(); + auto quantModePtr = attrs->GetAttrPointer(ATTR_QUANT_MODE_INDEX); + tilingKey += static_cast(*quantModePtr); + + const gert::StorageShape *scalesStorageShape = context->GetOptionalInputShape(SCALES_INDEX); + bool isScales = (scalesStorageShape != nullptr); + if (isScales) { + tilingKey += NUM_10; + } + + OP_LOGD(K_INNER_DEBUG, "tilingKey=%lu", tilingKey); + + return tilingKey; +} + +static ge::graphStatus MoeDistributeDispatchA2TilingFuncImpl(gert::TilingContext *context) +{ + const char *nodeName = context->GetNodeName(); + OP_LOGI(nodeName, "Enter MoeDistributeDispatchV2 tiling func."); + + // 1. 
tilingData + MoeDistributeDispatchV2TilingData *tilingData = context->GetTilingData(); + OP_TILING_CHECK(tilingData == nullptr, VECTOR_INNER_ERR_REPORT_TILIING(nodeName, "tilingData is nullptr."), + return ge::GRAPH_FAILED); + MoeDistributeDispatchV2Info &info = tilingData->moeDistributeDispatchV2Info; + + bool isLayered = false; + OP_TILING_CHECK( + MoeDistributeDispatchA2CheckCommAlg(context, isLayered) != ge::GRAPH_SUCCESS, + VECTOR_INNER_ERR_REPORT_TILIING(context->GetNodeName(), "MoeDistributeDispatchV2 CheckCommAlg Failed"), + return ge::GRAPH_FAILED); + OP_TILING_CHECK(MoeDistributeDispatchA2CheckShapeAndSetTiling(context, info, isLayered) != ge::GRAPH_SUCCESS, + VECTOR_INNER_ERR_REPORT_TILIING(context->GetNodeName(), + "MoeDistributeDispatchV2 CheckShapeAndSetTiling Failed"), + return ge::GRAPH_FAILED); + OP_TILING_CHECK( + MoeDistributeDispatchA2CheckAttrAndSetTiling(context, info) != ge::GRAPH_SUCCESS, + VECTOR_INNER_ERR_REPORT_TILIING(context->GetNodeName(), "MoeDistributeDispatchV2 CheckAttrAndSetTiling Failed"), + return ge::GRAPH_FAILED); + OP_TILING_CHECK(MoeDistributeDispatchA2GetPlatformInfoAndSetTiling(context, info) != ge::GRAPH_SUCCESS, + VECTOR_INNER_ERR_REPORT_TILIING(context->GetNodeName(), + "MoeDistributeDispatchV2 GetPlatformInfoAndSetTiling Failed"), + return ge::GRAPH_FAILED); + + uint32_t blockDim = 1U; + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + uint32_t aivNum = ascendcPlatform.GetCoreNumAiv(); + blockDim = ascendcPlatform.CalcTschBlockDim(aivNum, 0, aivNum); + context->SetBlockDim(blockDim); + context->SetAicpuBlockDim(mc2tiling::AICPU_BLOCK_DIM_A2); + + uint64_t tilingKey = MoeDistributeDispatchA2CalcTilingKey(context, isLayered); + context->SetTilingKey(tilingKey); + // 2. 
workspace + size_t *workSpaces = context->GetWorkspaceSizes(1); + OP_TILING_CHECK(workSpaces == nullptr, VECTOR_INNER_ERR_REPORT_TILIING(nodeName, "workSpaces is nullptr."), + return ge::GRAPH_FAILED); + workSpaces[0] = SYSTEM_NEED_WORKSPACE + USER_WORKSPACE_A2; + + // 3. communication + auto attrs = context->GetAttrs(); + auto group = attrs->GetAttrPointer(static_cast(ATTR_GROUP_EP_INDEX)); + std::string algConfig = isLayered ? "BatchWrite=level1:hierarchy" : "BatchWrite=level1:fullmesh"; + uint32_t opType = 18; // BatchWrite + + AscendC::Mc2CcTilingConfig mc2CcTilingConfig(group, opType, algConfig); + mc2CcTilingConfig.GetTiling(tilingData->mc2InitTiling); + mc2CcTilingConfig.GetTiling(tilingData->mc2CcTiling); + + OP_LOGI(nodeName, "Leave MoeDistributeDispatchV2 tiling func."); + return ge::GRAPH_SUCCESS; +} + +static ge::graphStatus MoeDistributeDispatchV2TilingFunc(gert::TilingContext *context) +{ + fe::PlatFormInfos *platformInfoPtr = context->GetPlatformInfo(); + fe::PlatFormInfos &platformInfo = *platformInfoPtr; + + std::string socVersion; + (void)platformInfo.GetPlatformResWithLock("version", "Short_SoC_version", socVersion); + ge::graphStatus ret; + if (socVersion == "Ascend910B") { + ret = MoeDistributeDispatchA2TilingFuncImpl(context); + } else { + // ret = MoeDistributeDispatchA3TilingFuncImpl(context); + } + return ret; +} + +struct MoeDistributeDispatchCompileInfo {}; +ge::graphStatus TilingParseForMoeDistributeDispatchV2(gert::TilingParseContext *context) +{ + (void)context; + return ge::GRAPH_SUCCESS; +} + +IMPL_OP_OPTILING(MoeDistributeDispatchV2) + .Tiling(MoeDistributeDispatchV2TilingFunc) + .TilingParse(TilingParseForMoeDistributeDispatchV2); +} // namespace optiling diff --git a/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_combine_v2.cpp b/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_combine_v2.cpp new file mode 100644 index 000000000..0a1c32b28 --- /dev/null +++ 
b/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_combine_v2.cpp @@ -0,0 +1,70 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +#include "aclnn_moe_distribute_combine_v2.h" +#include "aclnnInner_moe_distribute_combine_v2.h" +#include "graph/types.h" +#include "aclnn/opdev/platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum NnopbaseHcclServerType { + NNOPBASE_HCCL_SERVER_TYPE_AICPU = 0, + NNOPBASE_HCCL_SERVER_TYPE_MTE, + NNOPBASE_HCCL_SERVER_TYPE_END +}; + +extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void *executor, NnopbaseHcclServerType sType); + +aclnnStatus aclnnMoeDistributeCombineV2GetWorkspaceSize( + const aclTensor *expandX, const aclTensor *expertIds, const aclTensor *assistInfoForCombine, + const aclTensor *epSendCounts, const aclTensor *expertScales, const aclTensor *tpSendCountsOptional, + const aclTensor *xActiveMaskOptional, const aclTensor *activationScaleOptional, + const aclTensor *weightScaleOptional, const aclTensor *groupListOptional, const aclTensor *expandScalesOptional, + const aclTensor *sharedExpertXOptional, char *groupEp, int64_t epWorldSize, int64_t epRankId, int64_t moeExpertNum, + char *groupTp, int64_t tpWorldSize, int64_t tpRankId, int64_t expertShardType, int64_t sharedExpertNum, + int64_t sharedExpertRankNum, int64_t globalBs, int64_t outDtype, int64_t commQuantMode, int64_t 
groupListType, + char *commAlg, aclTensor *xOut, uint64_t *workspaceSize, aclOpExecutor **executor) +{ + return aclnnInnerMoeDistributeCombineV2GetWorkspaceSize( + expandX, expertIds, assistInfoForCombine, epSendCounts, expertScales, tpSendCountsOptional, xActiveMaskOptional, + activationScaleOptional, weightScaleOptional, groupListOptional, expandScalesOptional, sharedExpertXOptional, + nullptr, nullptr, nullptr, nullptr, nullptr, groupEp, epWorldSize, epRankId, moeExpertNum, groupTp, tpWorldSize, + tpRankId, expertShardType, sharedExpertNum, sharedExpertRankNum, globalBs, outDtype, commQuantMode, + groupListType, commAlg, 0, 0, 0, xOut, workspaceSize, executor); +} + +aclnnStatus aclnnMoeDistributeCombineV2(void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, + aclrtStream stream) +{ + if (NnopbaseSetHcclServerType) { + if (op::GetCurrentPlatformInfo().GetSocVersion() == op::SocVersion::ASCEND910B) { + NnopbaseSetHcclServerType(executor, NNOPBASE_HCCL_SERVER_TYPE_AICPU); + } else { + NnopbaseSetHcclServerType(executor, NNOPBASE_HCCL_SERVER_TYPE_MTE); + } + } + + return aclnnInnerMoeDistributeCombineV2(workspace, workspaceSize, executor, stream); +} + +#ifdef __cplusplus +} +#endif diff --git a/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_combine_v2.h b/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_combine_v2.h new file mode 100644 index 000000000..120ac29a8 --- /dev/null +++ b/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_combine_v2.h @@ -0,0 +1,95 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef OP_API_INC_MOE_DISTRIBUTE_COMBINE_V2_ +#define OP_API_INC_MOE_DISTRIBUTE_COMBINE_V2_ + +#include + +#include "aclnn/aclnn_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * 算子功能:实现MoeDistributeCombineV2功能。 + * @brief aclnnMoeDistributeCombine的第一段接口,根据具体的计算流程,计算workspace大小。 + * @domain aclnn_ops_infer + * @param [in] expandX: 计算输入,Tensor,数据类型float16,bfloat16,必须为2维,数据格式支持ND。 + * @param [in] expertIds: 计算输入,Tensor,数据类型int32,必须为2维,数据格式支持ND。 + * @param [in] assistInfoForCombine: 计算输入,Tensor,数据类型int32,必须为1维,数据格式支持ND。 + * @param [in] epSendCounts: 计算输入,Tensor,数据类型int32,必须为1维,数据格式支持ND。 + * @param [in] expertScales: 计算输入,Tensor,数据类型float32,必须为2维,数据格式支持ND。 + * @param [in] tpSendCountsOptional: + * 计算输入,Tensor,数据类型int32,必须为1维,数据格式支持ND。若有TP域通信需要传参,若无TP域通信,传空指针即可。 + * @param [in] xActiveMaskOptional: 计算输入,Tensor,数据类型bool,必须为1维,数据格式支持ND。 + * @param [in] activationScaleOptional: + * 计算输入,Tensor,数据类型float32,必须为1维,数据格式支持ND。预留参数,当前版本不支持,传空指针即可。 + * @param [in] weightScaleOptional: + * 计算输入,Tensor,数据类型float32,必须为2维,数据格式支持ND。预留参数,暂未使用,传空即可。 + * @param [in] groupListOptional: + * 计算输入,Tensor,数据类型int64,必须为1维,数据格式支持ND。预留参数,暂未使用,传空即可。 + * @param [in] expandScalesOptional: 计算输入,Tensor,数据类型float32,必须为1维,数据格式支持ND。 + * @param [in] sharedExpertXOptional: 计算可选输入,Tensor,数据类型float16,bfloat16,必须为2维,数据格式支持ND。 + * @param [in] groupEp: 计算输入,str。ep通信域名称,专家并行的通信域。不能和groupTp相同。 + * @param [in] epWorldSize: 计算输入,int。ep通信域size。 + * @param [in] epRankId: 计算输入,int。ep本卡Id。同一个EP通信域中各卡的epRankId不重复。 + * @param [in] moeExpertNum: 计算输入,int。MOE专家数量。 + * @param [in] 
groupTp: 计算可选输入,str。tp通信域名称,数据并行的通信域。 + * @param [in] tpWorldSize: 计算可选输入,int。tp通信域size。 + * @param [in] tpRankId: 计算可选输入,int。tp本卡Id。同一个TP通信域中各卡的tpRankId不能重复。 + * @param [in] expertShardType: 计算可选输入,int。专家共享类型。当前仅支持传0。 + * @param [in] sharedExpertNum: 计算可选输入,int。共享专家数量。 + * @param [in] sharedExpertRankNum: 计算可选输入,int。共享专家卡数量。 + * @param [in] globalBs: 计算可选输入,int。 + * @param [in] outDtype: 计算可选输入,int。输出数据类型。预留参数,暂未使用,传0即可。 + * @param [in] commQuantMode: 计算可选输入,int。通信量化类型。 + * @param [in] groupListType: 计算可选输入,int。groupList格式。预留参数,暂未使用,传0即可。 + * @param [in] commAlg: 计算可选输入,str。 通信算法类型。预留参数,暂未使用。 + * @param [out] xOut: 计算输出,Tensor,必选输出,数据类型支持float16, bfloat16,仅支持2维,数据格式支持ND。 + * @param [out] workspaceSize: 出参,返回需要在npu device侧申请的workspace大小。 + * @param [out] executor: 出参,返回op执行器,包含了算子计算流程。 + * @return aclnnStatus: 返回值,返回状态码 + * + */ +__attribute__((visibility("default"))) aclnnStatus aclnnMoeDistributeCombineV2GetWorkspaceSize( + const aclTensor *expandX, const aclTensor *expertIds, const aclTensor *assistInfoForCombine, + const aclTensor *epSendCounts, const aclTensor *expertScales, const aclTensor *tpSendCountsOptional, + const aclTensor *xActiveMaskOptional, const aclTensor *activationScaleOptional, + const aclTensor *weightScaleOptional, const aclTensor *groupListOptional, const aclTensor *expandScalesOptional, + const aclTensor *sharedExpertXOptional, char *groupEp, int64_t epWorldSize, int64_t epRankId, int64_t moeExpertNum, + char *groupTp, int64_t tpWorldSize, int64_t tpRankId, int64_t expertShardType, int64_t sharedExpertNum, + int64_t sharedExpertRankNum, int64_t globalBs, int64_t outDtype, int64_t commQuantMode, int64_t groupListType, + char *commAlg, aclTensor *xOut, uint64_t *workspaceSize, aclOpExecutor **executor); + +/** + * @brief aclnnMoeDistributeCombine的第二段接口,用于执行计算。 + * @param [in] workspace: 在npu device侧申请的workspace内存起址。 + * @param [in] workspace_size: 在npu + * device侧申请的workspace大小,由第一段接口aclnnMoeDistributeCombineGetWorkspaceSize获取。 + * @param 
[in] executor: op执行器,包含了算子计算流程。 + * @param [in] stream: acl stream流。 + * @return aclnnStatus: 返回状态码 + */ +__attribute__((visibility("default"))) aclnnStatus aclnnMoeDistributeCombineV2(void *workspace, uint64_t workspaceSize, + aclOpExecutor *executor, + aclrtStream stream); + +#ifdef __cplusplus +} +#endif + +#endif // OP_API_INC_MOE_DISTRIBUTE_COMBINE_V2_ diff --git a/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_dispatch_v2.cpp b/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_dispatch_v2.cpp new file mode 100644 index 000000000..fbc212657 --- /dev/null +++ b/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_dispatch_v2.cpp @@ -0,0 +1,68 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include + +#include "aclnn_moe_distribute_dispatch_v2.h" +#include "aclnnInner_moe_distribute_dispatch_v2.h" +#include "aclnn/opdev/platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static constexpr int32_t DISPATCH_DYNAMIC_QUANT_MODE = 2; +enum NnopbaseHcclServerType { + NNOPBASE_HCCL_SERVER_TYPE_AICPU = 0, + NNOPBASE_HCCL_SERVER_TYPE_MTE, + NNOPBASE_HCCL_SERVER_TYPE_END +}; + +extern "C" void __attribute__((weak)) NnopbaseSetHcclServerType(void *executor, NnopbaseHcclServerType sType); + +aclnnStatus aclnnMoeDistributeDispatchV2GetWorkspaceSize( + const aclTensor *x, const aclTensor *expertIds, const aclTensor *scalesOptional, + const aclTensor *xActiveMaskOptional, const aclTensor *expertScalesOptional, char *groupEp, int64_t epWorldSize, + int64_t epRankId, int64_t moeExpertNum, char *groupTp, int64_t tpWorldSize, int64_t tpRankId, + int64_t expertShardType, int64_t sharedExpertNum, int64_t sharedExpertRankNum, int64_t quantMode, int64_t globalBs, + int64_t expertTokenNumsType, char *commAlg, aclTensor *expandXOut, aclTensor *dynamicScalesOut, + aclTensor *assistInfoForCombineOut, aclTensor *expertTokenNumsOut, aclTensor *epRecvCountsOut, + aclTensor *tpRecvCountsOut, aclTensor *expandScalesOut, uint64_t *workspaceSize, aclOpExecutor **executor) +{ + return aclnnInnerMoeDistributeDispatchV2GetWorkspaceSize( + x, expertIds, scalesOptional, xActiveMaskOptional, expertScalesOptional, nullptr, groupEp, epWorldSize, + epRankId, moeExpertNum, "", tpWorldSize, tpRankId, expertShardType, sharedExpertNum, sharedExpertRankNum, + quantMode, globalBs, expertTokenNumsType, commAlg, 0, 0, 0, expandXOut, dynamicScalesOut, + assistInfoForCombineOut, expertTokenNumsOut, epRecvCountsOut, tpRecvCountsOut, expandScalesOut, workspaceSize, + executor); +} + +aclnnStatus aclnnMoeDistributeDispatchV2(void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, + aclrtStream stream) +{ + if (NnopbaseSetHcclServerType) { + if 
(op::GetCurrentPlatformInfo().GetSocVersion() == op::SocVersion::ASCEND910B) { + NnopbaseSetHcclServerType(executor, NNOPBASE_HCCL_SERVER_TYPE_AICPU); + } else { + NnopbaseSetHcclServerType(executor, NNOPBASE_HCCL_SERVER_TYPE_MTE); + } + } + + return aclnnInnerMoeDistributeDispatchV2(workspace, workspaceSize, executor, stream); +} +#ifdef __cplusplus +} +#endif diff --git a/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_dispatch_v2.h b/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_dispatch_v2.h new file mode 100644 index 000000000..5b79d8299 --- /dev/null +++ b/csrc/deepep/ops2/op_host/op_api/aclnn_moe_distribute_dispatch_v2.h @@ -0,0 +1,94 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef OP_API_INC_MOE_DISTRIBUTE_DISPATCH_V2_ +#define OP_API_INC_MOE_DISTRIBUTE_DISPATCH_V2_ + +#include + +#include "aclnn/aclnn_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * 算子功能:实现MoeDistributeDispatch功能,对Token数据先进行量化,再进行EP域的alltoallv通信,再进行TP域的allgatherv通信。 + * @brief aclnnMoeDistributeDispatch的第一段接口,根据具体的计算流程,计算workspace大小。 + * @domain aclnn_ops_infer + * @param [in] x: 计算输入,Tensor,数据类型float16,bfloat16,必须为2维,数据格式支持ND。输入的token数据。 + * @param [in] expertIds: 计算输入,Tensor,数据类型int32,必须为2维,数据格式支持ND。每个token的topK个专家索引。 + * @param [in] scalesOptional: 计算可选输入,Tensor,数据类型float32,必须为2维,数据格式支持ND。每个专家的smooth权重。 + * @param [in] xActiveMaskOptional: 计算输入,Tensor,数据类型Bool,必须为1维,数据格式支持ND。 + * @param [in] expertScalesOptional: 计算输入,Tensor,必须为2维,数据格式支持ND。 + * @param [in] groupEp: 计算输入,str。ep通信域名称,专家并行的通信域。不能和groupTp相同。 + * @param [in] epWorldSize: 计算输入,int。ep通信域size。 + * @param [in] epRankId: 计算输入,int。ep本卡Id。同一个EP通信域中各卡的epRankId不能重复。 + * @param [in] moeExpertNum: 计算输入,int。MOE专家数量。 + * @param [in] groupTp: 计算可选输入,str。tp通信域名称,数据并行的通信域。无tp通信域时传空。 + * @param [in] tpWorldSize: 计算可选输入,int。tp通信域size。 + * @param [in] tpRankId: 计算可选输入,int。tp本卡Id。 + * @param [in] expertShardType: 计算可选输入,int。专家共享类型。 + * @param [in] sharedExpertNum: 计算可选输入,int。共享专家数量。 + * @param [in] sharedExpertRankNum: 计算可选输入,int。共享专家卡数量。 + * @param [in] quantMode: 计算可选输入,int,量化模式。 + * @param [in] globalBs: 计算可选输入,int。EP域全局的batch size大小。 + * @param [in] expertTokenNumsType: 计算可选输入,int。输出expertTokenNums中的值语义类型。 + * @param [in] commAlg: 计算可选输入,str。 通信算法类型。预留参数,暂未使用。 + * @param [out] expandXOut: 计算输出,Tensor,必选输出,数据类型支持float16, bfloat16, + int8,仅支持2维,数据格式支持ND。根据 expertIdx进行扩展过的token特征。 + * @param [out] dynamicScalesOut: + 计算输出,Tensor,必选输出,数据类型float32,仅支持1维,数据格式支持ND。quantMode为0时输出为空。 + * @param [out] assistInfoForCombineOut: + 计算输出,Tensor,必选输出,数据类型int32,仅支持1维,数据格式支持ND,传输给combine算子的辅助信息。 + * @param [out] expertTokenNumsOut: + 计算输出,Tensor,必选输出,数据类型int64,仅支持1维,数据格式支持ND。每个专家收到的token个数。 + * @param [out] 
epRecvCountsOut: + 计算输出,Tensor,必选输出,数据类型int32,仅支持1维,数据格式支持ND。表示从各卡接收的token数。 + * @param [out] tpRecvCountsOut: + 计算输出,Tensor,必选输出,数据类型int32,仅支持1维,数据格式支持ND。无tp通信域时输出为空。 + * @param [out] expandScalesOut: 计算输出,Tensor,必选输出,数据类型float32,仅支持1维,数据格式支持ND。 + * @param [out] workspaceSize: 出参,返回需要在npu device侧申请的workspace大小。 + * @param [out] executor: 出参,返回op执行器,包含了算子计算流程。 + * @return aclnnStatus: 返回值,返回状态码 + * + */ +__attribute__((visibility("default"))) aclnnStatus aclnnMoeDistributeDispatchV2GetWorkspaceSize( + const aclTensor *x, const aclTensor *expertIds, const aclTensor *scalesOptional, + const aclTensor *xActiveMaskOptional, const aclTensor *expertScalesOptional, char *groupEp, int64_t epWorldSize, + int64_t epRankId, int64_t moeExpertNum, char *groupTp, int64_t tpWorldSize, int64_t tpRankId, + int64_t expertShardType, int64_t sharedExpertNum, int64_t sharedExpertRankNum, int64_t quantMode, int64_t globalBs, + int64_t expertTokenNumsType, char *commAlg, aclTensor *expandXOut, aclTensor *dynamicScalesOut, + aclTensor *assistInfoForCombineOut, aclTensor *expertTokenNumsOut, aclTensor *epRecvCountsOut, + aclTensor *tpRecvCountsOut, aclTensor *expandScalesOut, uint64_t *workspaceSize, aclOpExecutor **executor); + +/** + * @brief aclnnMoeDistributeDispatch的第二段接口,用于执行计算。 + * @param [in] workspace: 在npu device侧申请的workspace内存起址。 + * @param [in] workspace_size: 在npu + * device侧申请的workspace大小,由第一段接口aclnnMoeDistributeDispatchGetWorkspaceSize获取。 + * @param [in] executor: op执行器,包含了算子计算流程。 + * @param [in] stream: acl stream流。 + * @return aclnnStatus: 返回状态码 + */ +__attribute__((visibility("default"))) aclnnStatus aclnnMoeDistributeDispatchV2(void *workspace, uint64_t workspaceSize, + aclOpExecutor *executor, + aclrtStream stream); + +#ifdef __cplusplus +} +#endif + +#endif // OP_API_INC_MOE_DISTRIBUTE_DISPATCH_ diff --git a/csrc/deepep/ops2/op_host/tiling_args.h b/csrc/deepep/ops2/op_host/tiling_args.h new file mode 100644 index 000000000..950cbe904 --- /dev/null +++ 
b/csrc/deepep/ops2/op_host/tiling_args.h @@ -0,0 +1,9 @@ +#ifndef TILING_ARGS_H +#define TILING_ARGS_H +#include + +namespace Moe { +constexpr uint64_t COMBINE_STATE_WIN_OFFSET = 3U * 1024UL * 1024UL; +constexpr uint64_t NOTIFY_DISPATCH_WIN_OFFSET = 204U * 1024UL * 1024UL; +} // namespace Moe +#endif // TILING_ARGS_H diff --git a/csrc/deepep/ops2/op_kernel/CMakeLists.txt b/csrc/deepep/ops2/op_kernel/CMakeLists.txt new file mode 100644 index 000000000..82b1ca48b --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/CMakeLists.txt @@ -0,0 +1,8 @@ +# set custom compile options +if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") + add_ops_compile_options(ALL OPTIONS -g -O0 ) +endif() + +add_ops_compile_options(ALL OPTIONS -DASCENDC_DUMP=1 --cce-auto-sync=off) + +add_kernels_compile() diff --git a/csrc/deepep/ops2/op_kernel/comm_args.h b/csrc/deepep/ops2/op_kernel/comm_args.h new file mode 100644 index 000000000..fcbf076ae --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/comm_args.h @@ -0,0 +1,84 @@ +#ifndef COMM_ARGS_H +#define COMM_ARGS_H +#include +#include + +#define FORCE_INLINE_AICORE __attribute__((always_inline)) inline __aicore__ +#include "kernel_operator.h" + +namespace Moe { +constexpr int CAM_MAX_RANK_SIZE = 384; // Maximum number of NPU cards supported by the communication library + +constexpr uint64_t NOTIFY_DISPATCH_BUFF_OFFSET = 404UL * 1024UL * 1024UL; +constexpr int64_t IPC_BUFF_MAX_SIZE = 200 * 1024 * 1024; +constexpr int64_t IPC_DATA_OFFSET = 2 * 1024 * 1024; // First 2MB as flag, then 100MB as data storage +constexpr int64_t PING_PONG_SIZE = 2; +constexpr int64_t UB_SINGLE_DMA_SIZE_MAX = 190 * 1024; +constexpr int64_t SMALL_DATA_SIZE = 1 * 1024 * 1024; +constexpr int64_t UB_SINGLE_PING_PONG_ADD_SIZE_MAX = UB_SINGLE_DMA_SIZE_MAX / 2; +constexpr int UB_ALIGN_SIZE = 32; +constexpr int64_t MAGIC_ALIGN_COUNT = UB_ALIGN_SIZE / sizeof(int32_t); + +constexpr uint8_t COMM_NUM = 2; // Size of communication domain +constexpr uint8_t COMM_EP_IDX = 0; +constexpr uint8_t 
COMM_TP_IDX = 1; + +constexpr int DFX_COUNT = 50; +constexpr int64_t WAIT_SUCCESS = 112233445566; +constexpr int64_t IPC_CHUNK_FLAG = 0; // Start offset for send recv, chunk flag region +constexpr int64_t MAX_WAIT_ROUND_UNIT = + 10 * 1000 * 1000; // Threshold for waiting to get Flag under normal conditions within the same SIO + +constexpr static int32_t UB_HEAD_OFFSET = 96; +constexpr static int32_t UB_MID_OFFSET = UB_HEAD_OFFSET + UB_SINGLE_PING_PONG_ADD_SIZE_MAX + UB_ALIGN_SIZE; +constexpr static int64_t UB_FLAG_SIZE = 2 * 1024; +constexpr static int64_t MAX_CORE_NUM = 48; +constexpr static uint64_t STATE_WIN_OFFSET = 900 * 1024; +constexpr static int64_t COMPARE_ALIGN_SIZE = 256; + +constexpr static int64_t UB_SINGLE_TOTAL_SIZE_MAX = 192 * 1024; +constexpr static int64_t START_OFFSET_FOR_SHARE = 512; + +enum Op : int { COPYONLY = -1, ADD = 0, MUL = 1, MAX = 2, MIN = 3 }; + +template +constexpr T T_MAX = std::numeric_limits::max(); + +template +inline __aicore__ T CeilDiv(const T dividend, const T divisor) +{ + static_assert(std::is_arithmetic::value, "T must be an arithmetic type"); + if (divisor == 0 || dividend + divisor - 1 < dividend) { + return T_MAX; + } + return (dividend + divisor - 1) / divisor; +} + +struct CommArgs { + int rank = 0; // attr rank_id, global rank + int localRank = -1; + int rankSize = 0; // global rank size + int localRankSize = -1; // This parameter refers to the number of cards interconnected in fullmesh + uint32_t extraFlag = 0; // 32 bit map, the specific meaning of each bit is above in this file + int testFlag = 0; + GM_ADDR peerMems[CAM_MAX_RANK_SIZE] = + {}; // Buffer obtained from initialization, all allreduce is the same parameter + /** + * @param sendCountMatrix One-dimensional array with a size of rankSize*rankSize + * eg: The value of sendCountMatrix[1] corresponds to the [0][1] of the two-dimensional array, indicating the number + * of data that card 0 needs to send to card 1 + */ + int64_t sendCountMatrix[CAM_MAX_RANK_SIZE 
* CAM_MAX_RANK_SIZE] = {}; // for all2allvc + int64_t sendCounts[CAM_MAX_RANK_SIZE] = {}; // for all2allv + int64_t sdispls[CAM_MAX_RANK_SIZE] = {}; // for all2allv + int64_t recvCounts[CAM_MAX_RANK_SIZE] = {}; // for all2allv + int64_t rdispls[CAM_MAX_RANK_SIZE] = {}; // for all2allv + int64_t batchSize; + int64_t hiddenSize; + int64_t topk; + int64_t sharedExpertRankNum; + int64_t expertNumPerRank; + int64_t dfx[DFX_COUNT] = {}; +}; +} // namespace Moe +#endif // COMM_ARGS_H diff --git a/csrc/deepep/ops2/op_kernel/data_copy.h b/csrc/deepep/ops2/op_kernel/data_copy.h new file mode 100644 index 000000000..47443e67c --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/data_copy.h @@ -0,0 +1,68 @@ +#ifndef CAM_DATACOPY_GM2GM_H +#define CAM_DATACOPY_GM2GM_H +#include +#include "comm_args.h" + +using namespace AscendC; +using namespace Moe; + +template +FORCE_INLINE_AICORE void SetAtomicOpType(int op) +{ + switch (op) { + case ADD: + AscendC::SetAtomicAdd(); + break; + case MUL: + // Ignore setting the atomic register when performing mul + break; + case MAX: + AscendC::SetAtomicMax(); + break; + case MIN: + AscendC::SetAtomicMin(); + break; + default: + AscendC::SetAtomicNone(); + } +} + +template +FORCE_INLINE_AICORE void CpUB2GM(__gm__ T *gmAddr, __ubuf__ T *ubAddr, uint32_t size) +{ + LocalTensor ubTensor; + GlobalTensor gmTensor; + DataCopyExtParams dataCopyParams(1, size, 0, 0, 0); + ubTensor.address_.logicPos = static_cast(TPosition::VECIN); + ubTensor.address_.bufferAddr = reinterpret_cast(ubAddr); + gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t *>(gmAddr)); + DataCopyPad(gmTensor, ubTensor, dataCopyParams); +} + +template +FORCE_INLINE_AICORE void CpGM2UB(__ubuf__ T *ubAddr, __gm__ T *gmAddr, uint32_t size) +{ + LocalTensor ubTensor; + GlobalTensor gmTensor; + DataCopyExtParams dataCopyParams(1, size, 0, 0, 0); + ubTensor.address_.logicPos = static_cast(TPosition::VECIN); + ubTensor.address_.bufferAddr = reinterpret_cast(ubAddr); + 
gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t *>(gmAddr)); + DataCopyPadExtParams padParams; + DataCopyPad(ubTensor, gmTensor, dataCopyParams, padParams); +} + +template +FORCE_INLINE_AICORE void CopyUB2UB(__ubuf__ T *dst, __ubuf__ T *src, const uint32_t calCount) +{ + LocalTensor srcTensor; + LocalTensor dstTensor; + TBuffAddr srcAddr, dstAddr; + srcAddr.bufferAddr = reinterpret_cast(src); + dstAddr.bufferAddr = reinterpret_cast(dst); + srcTensor.SetAddr(srcAddr); + dstTensor.SetAddr(dstAddr); + DataCopy(dstTensor, srcTensor, calCount); +} + +#endif // CAM_DATACOPY_GM2GM_H diff --git a/csrc/deepep/ops2/op_kernel/moe_distribute_base.h b/csrc/deepep/ops2/op_kernel/moe_distribute_base.h new file mode 100644 index 000000000..6cd683c74 --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/moe_distribute_base.h @@ -0,0 +1,371 @@ +/*! + * \file moe_distribute_base.h + * \brief + */ + +#ifndef MOE_DISTRIBUTE_BASE_H +#define MOE_DISTRIBUTE_BASE_H + +/* system tick: 50MHz */ +#define CAL_US(tick) (((tick) * 2) / 100) + +/* performance macro */ +// #define USE_256_TO_1__ +#ifdef USE_256_TO_1__ +#pragma message("use 256 to 1") +#else +#define USE_FOR_OPT__ +#define DISPATCH_USE_WRITE_SHUFFLE__ +#define USE_TOKEN_COUNT_SPLIT__ +#define USE_ONE_CORE_WAIT__ + +#ifdef USE_ONE_CORE_WAIT__ +#pragma message("use one core wait") + +// #define USE_ONE_CORE_GETCUMSUM__ +#endif +#ifdef USE_FOR_OPT__ +#pragma message("use for optimization") +#define FOR_OPT_MAX_BS__ 64 +#define FOR_OPT_MAX_MOE_RANK__ 256 +#endif +// #define COMBINE_USE_DYNAMIC_QUANT +#define OPT_RANK_OFFSET 512 +#define USE_WRITE_SHUFFLE +#endif + +constexpr uint32_t LOCAL_NOTIFY_MAX_NUM = 64; +constexpr uint32_t LOCAL_STREAM_MAX_NUM = 19; +constexpr uint32_t AICPU_OP_NOTIFY_MAX_NUM = 2; +constexpr uint32_t AICPU_MAX_RANK_NUM = 128 * 1024; + +struct HcclSignalInfo { + uint64_t resId; + uint64_t addr; + uint32_t devId; + uint32_t tsId; + uint32_t rankId; + uint32_t flag; +}; + +struct ListCommon { + uint64_t nextHost; 
+ uint64_t preHost; + uint64_t nextDevice; + uint64_t preDevice; +}; + +struct HcclStreamInfo { + int32_t streamIds; + uint32_t sqIds; + uint32_t cqIds; + uint32_t logicCqids; +}; + +struct LocalResInfoV2 { + uint32_t streamNum; + uint32_t signalNum; + HcclSignalInfo localSignals[LOCAL_NOTIFY_MAX_NUM]; + HcclStreamInfo streamInfo[LOCAL_STREAM_MAX_NUM]; + HcclStreamInfo mainStreamInfo; + HcclSignalInfo aicpuOpNotify[AICPU_OP_NOTIFY_MAX_NUM]; + ListCommon nextTagRes; // HccltagLocalResV2 +}; + +enum class rtFloatOverflowMode_t { + RT_OVERFLOW_MODE_SATURATION = 0, + RT_OVERFLOW_MODE_INFNAN, + RT_OVERFLOW_MODE_UNDEF, +}; + +struct AlgoTopoInfo { + uint32_t userRank; // RankID + uint32_t userRankSize; // Rank Number + int32_t deviceLogicId; + bool isSingleMeshAggregation; + uint32_t deviceNumPerAggregation; + uint32_t superPodNum; + uint32_t devicePhyId; + uint32_t topoType; // TopoType + uint32_t deviceType; + uint32_t serverNum; + uint32_t meshAggregationRankSize; + uint32_t multiModuleDiffDeviceNumMode; + uint32_t multiSuperPodDiffServerNumMode; + uint32_t realUserRank; + bool isDiffDeviceModule; + bool isDiffDeviceType; + uint32_t gcdDeviceNumPerAggregation; + uint32_t moduleNum; + uint32_t isUsedRdmaRankPairNum; + uint64_t isUsedRdmaRankPair; + uint32_t pairLinkCounterNum; + uint64_t pairLinkCounter; + uint32_t nicNum; + uint64_t nicList; + uint64_t complanRankLength; + uint64_t complanRank; + uint64_t bridgeRankNum; + uint64_t bridgeRank; + uint64_t serverAndsuperPodRankLength; + uint64_t serverAndsuperPodRank; +}; + +struct HcclOpConfig { + uint8_t deterministic; + uint8_t retryEnable; + uint8_t highPerfEnable; + uint8_t padding[5]; + uint8_t linkTimeOut[8]; + uint64_t notifyWaitTime; + uint32_t retryHoldTime; + uint32_t retryIntervalTime; + bool interHccsDisable = false; + rtFloatOverflowMode_t floatOverflowMode = rtFloatOverflowMode_t::RT_OVERFLOW_MODE_UNDEF; + uint32_t multiQpThreshold = 512; +}; + +struct HcclMC2WorkSpace { + uint64_t workSpace; + uint64_t 
workSpaceSize;
+};
+
+struct RemoteResPtr {
+    uint64_t nextHostPtr;
+    uint64_t nextDevicePtr;
+};
+
+struct HDCommunicateParams {
+    uint64_t hostAddr{0};
+    uint64_t deviceAddr{0};
+    uint64_t readCacheAddr{0};
+    uint32_t devMemSize{0};
+    uint32_t buffLen{0};
+    uint32_t flag{0};
+};
+
+struct HcclRankRelationResV2 {
+    uint32_t remoteUsrRankId;
+    uint32_t remoteWorldRank;
+    uint64_t windowsIn;
+    uint64_t windowsOut;
+    uint64_t windowsExp;
+    ListCommon nextTagRes;
+};
+
+struct HcclOpResParam {
+    // local resource
+    HcclMC2WorkSpace mc2WorkSpace;
+    uint32_t localUsrRankId; // usrrankid
+    uint32_t rankSize;
+    uint64_t winSize;
+    uint64_t localWindowsIn;
+    uint64_t localWindowsOut;
+    char hcomId[128];
+    // aicore detect remote window
+    uint64_t winExpSize;
+    uint64_t localWindowsExp;
+    uint32_t rWinStart;
+    uint32_t rWinOffset;
+    uint64_t version;
+    LocalResInfoV2 localRes;
+    AlgoTopoInfo topoInfo;
+
+    // config parameters
+    HcclOpConfig config;
+    uint64_t hostStateInfo;
+    uint64_t aicpuStateInfo;
+    uint64_t lockAddr;
+    uint32_t rsv[16];
+    uint32_t notifysize;
+    uint32_t remoteResNum;
+    RemoteResPtr remoteRes[AICPU_MAX_RANK_NUM];
+
+    // communicate retry
+    HDCommunicateParams kfcControlTransferH2DParams;
+    HDCommunicateParams kfcStatusTransferD2HParams;
+    uint64_t tinyMem; // for all2all
+    uint64_t tinyMemSize;
+    // zero-copy
+    uint64_t zeroCopyHeadPtr;
+    uint64_t zeroCopyTailPtr;
+    uint64_t zeroCopyRingBuffer;
+    uint64_t zeroCopyIpcPtrs[16];
+    uint32_t zeroCopyDevicePhyId[16];
+
+    bool utraceStatusFlag;
+};
+
+// Transport memory types
+enum class HcclAiRMAMemType : uint32_t {
+    LOCAL_INPUT = 0,
+    REMOTE_INPUT,
+
+    LOCAL_OUTPUT,
+    REMOTE_OUTPUT,
+
+    // New memory types must be appended before MAX_NUM, e.g.:
+    // LOCAL_EXP,
+    // REMOTE_EXP,
+    MAX_NUM
+};
+
+struct HcclAiRMAMemInfo {
+    uint32_t memMaxNum{0};        // max number of memory types, equals HcclAiRMAMemType::MAX_NUM
+    uint32_t sizeOfMemDetails{0}; // sizeof(MemDetails), used for offset calculation within the memory block
+    uint64_t memDetailPtr{0};     // base address of the MemDetails array, count:
HcclAiRMAMemType::MAX_NUM
+    // append future fields after this point
+};
+
+// Global transport QP/Mem information
+struct HcclAiRMAInfo {
+    uint32_t curRankId{0}; // current rank id
+    uint32_t rankNum{0};   // number of ranks
+    uint32_t qpNum{0};     // QP count per transport
+
+    uint32_t sizeOfAiRMAWQ{0};  // sizeof(HcclAiRMAWQ)
+    uint32_t sizeOfAiRMACQ{0};  // sizeof(HcclAiRMACQ)
+    uint32_t sizeOfAiRMAMem{0}; // sizeof(HcclAiRMAMemInfo)
+
+    // Base address of the 2-D HcclAiRMAWQ array
+    // QP count: rankNum * qpNum
+    // SQ pointer by offset: sqPtr + (dstRankId * qpNum + qpIndex) * sizeOfAiRMAWQ
+    // 0 <= qpIndex < qpNum
+    uint64_t sqPtr{0};
+
+    // Base address of the 2-D HcclAiRMACQ array
+    // QP count: rankNum * qpNum
+    // SCQ pointer by offset: scqPtr + (dstRankId * qpNum + qpIndex) * sizeOfAiRMACQ
+    // 0 <= qpIndex < qpNum
+    uint64_t scqPtr{0};
+
+    // Base address of the 2-D HcclAiRMAWQ array
+    // QP count: rankNum * qpNum
+    // RQ pointer by offset: rqPtr + (dstRankId * qpNum + qpIndex) * sizeOfAiRMAWQ
+    // 0 <= qpIndex < qpNum
+    uint64_t rqPtr{0};
+
+    // Base address of the 2-D HcclAiRMACQ array
+    // QP count: rankNum * qpNum
+    // RCQ pointer by offset: rcqPtr + (dstRankId * qpNum + qpIndex) * sizeOfAiRMACQ
+    // 0 <= qpIndex < qpNum
+    uint64_t rcqPtr{0};
+
+    // 1-D HcclAivMemInfo array
+    // entry count: rankNum
+    // memory-info pointer by offset: memPtr + rankId * sizeOfAiRMAMem
+    // use srcRankId for local memory info, dstRankId for transport memory info
+    uint64_t memPtr{0};
+    // append future fields after this point
+};
+
+struct CombinedCapability {
+    uint64_t dataplaneModeBitmap;
+};
+
+struct HcclA2CombineOpParam {
+    uint64_t workSpace;     // Address for communication between client and server,
+                            // hccl requests and clears
+    uint64_t workSpaceSize; // Space for communication between client and server
+    uint32_t rankId;        // id of this rank
+    uint32_t rankNum;       // num of ranks in this comm group
+    uint64_t winSize;       // size of each windows memory
+    uint64_t windowsIn[AscendC::HCCL_MAX_RANK_NUM]; // windows address for input, windowsIn[rankId] corresponds
+                                                    // to the local card address,
+                                                    // and others are cross-card mapping addresses.
+ uint64_t windowsOut[AscendC::HCCL_MAX_RANK_NUM]; // windows address for output, windowsOut[rankId] corresponds + // to the local card address, + // and others are cross-card mapping addresses. + uint8_t res[8328]; + uint8_t multiFlag; + __gm__ AscendC::IbVerbsData *data; + uint64_t dataSize; + // ׷���ֶ� + uint64_t sizeOfAiRMAInfo; // sizeof(HcclAiRMAInfo) + uint64_t aiRMAInfo; // HcclAiRMAInfo* �����ṹ��ָ�� + + CombinedCapability *capability; // address of the communication capability information structure on the Device + uint64_t capabilitySize; // size of the communication capability information structure +}; + +enum class DataplaneMode : uint32_t { + HOST = 0, + AICPU = 1, + AIV = 2, +}; + +enum class DBMode : int32_t { INVALID_DB = -1, HW_DB = 0, SW_DB }; + +struct HcclAiRMAWQ { + uint32_t wqn{0}; + uint64_t bufAddr{0}; + uint32_t wqeSize{0}; + uint32_t depth{0}; + uint64_t headAddr{0}; + uint64_t tailAddr{0}; + DBMode dbMode{DBMode::INVALID_DB}; // 0-hw/1-sw + uint64_t dbAddr{0}; + uint32_t sl{0}; +}; + +struct HcclAiRMACQ { + uint32_t cqn{0}; + uint64_t bufAddr{0}; + uint32_t cqeSize{0}; + uint32_t depth{0}; + uint64_t headAddr{0}; + uint64_t tailAddr{0}; + DBMode dbMode{DBMode::INVALID_DB}; // 0-hw/1-sw + uint64_t dbAddr{0}; +}; + +struct hns_roce_rc_sq_wqe { + uint32_t byte_4; + uint32_t msg_len; + uint32_t immtdata; + uint32_t byte_16; + uint32_t byte_20; + uint32_t rkey; + uint64_t remoteVA; +}; + +struct hns_roce_lite_wqe_data_seg { + uint32_t len; + uint32_t lkey; + uint64_t localVA; +}; + +__aicore__ inline void cacheWriteThrough(__gm__ uint8_t *sourceAddr, uint64_t length) +{ + __gm__ uint8_t *start = + (__gm__ uint8_t *)((uint64_t)sourceAddr / AscendC::CACHE_LINE_SIZE * AscendC::CACHE_LINE_SIZE); + __gm__ uint8_t *end = + (__gm__ uint8_t *)(((uint64_t)sourceAddr + length) / AscendC::CACHE_LINE_SIZE * AscendC::CACHE_LINE_SIZE); + AscendC::GlobalTensor global; + global.SetGlobalBuffer(start); + for (uint32_t i = 0; i <= end - start; i += 
AscendC::CACHE_LINE_SIZE) { + AscendC::DataCacheCleanAndInvalid(global[i]); + } +} +__aicore__ inline DataplaneMode GetDataplaneMode(GM_ADDR contextGM0) +{ + __gm__ HcclA2CombineOpParam *winContext_ = (__gm__ HcclA2CombineOpParam *)contextGM0; + CombinedCapability *capability = winContext_->capability; + uint64_t capabilitySize = winContext_->capabilitySize; + DataplaneMode dataplaneMode = DataplaneMode::HOST; + if (capability == 0) { + dataplaneMode = DataplaneMode::AICPU; + return dataplaneMode; + } + uint64_t dataplaneModeBitmap = capability->dataplaneModeBitmap; + if ((dataplaneModeBitmap & 0x02) == 0x02) { + dataplaneMode = DataplaneMode::AICPU; + } + if ((dataplaneModeBitmap & 0x04) == 0x04) { + dataplaneMode = DataplaneMode::AIV; + } + return dataplaneMode; +} + +#endif // MOE_DISTRIBUTE_BASE_H diff --git a/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2.cpp b/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2.cpp new file mode 100644 index 000000000..2fc1e43ba --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2.cpp @@ -0,0 +1,89 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! 
+ * \file moe_distribute_combine_v2.cpp + * \brief + */ +#include "kernel_operator.h" +#include "lib/matmul_intf.h" +#include "moe_distribute_combine_v2_tiling.h" +#include "moe_distribute_combine_v2.h" +#include "moe_distribute_combine_v2_layered.h" + +using namespace AscendC; +using namespace MoeDistributeCombineA2Impl; + +/* + 2000 A2 + 3000 A2+layered + 100 quant +*/ +extern "C" __global__ __aicore__ void moe_distribute_combine_v2( + GM_ADDR expandX, GM_ADDR expertIds, GM_ADDR assistInfoForCombine, GM_ADDR epSendCount, GM_ADDR scales, + GM_ADDR tpSendCount, GM_ADDR xActiveMask, GM_ADDR activationScale, GM_ADDR weightScale, GM_ADDR groupList, + GM_ADDR expandScales, GM_ADDR sharedExpertX, GM_ADDR elasticInfo, GM_ADDR oriX, GM_ADDR constExpertAlpha1, + GM_ADDR constExpertAlpha2, GM_ADDR constExpertV, GM_ADDR XOut, GM_ADDR workspaceGM, GM_ADDR tilingGM) + +{ + REGISTER_TILING_DEFAULT(MoeDistributeCombineV2TilingData); + REGISTER_TILING_FOR_TILINGKEY("TILING_KEY_VAR < 10000", MoeDistributeCombineV2TilingData); + TPipe pipe; + +#if (ORIG_DTYPE_EXPAND_X == DT_BF16 || ORIG_DTYPE_EXPAND_X == DT_FLOAT16) + if (TILING_KEY_IS(2000)) { + GET_TILING_DATA_WITH_STRUCT(MoeDistributeCombineV2TilingData, tilingData, tilingGM); + auto tiling = (__gm__ MoeDistributeCombineV2TilingData *)tilingGM; + __gm__ void *mc2InitTiling = (__gm__ void *)(&(tiling->mc2InitTiling)); + __gm__ void *mc2CcTiling = (__gm__ void *)(&(tiling->mc2CcTiling)); + MoeDistributeCombineV2 op; + op.Init(expandX, expertIds, assistInfoForCombine, epSendCount, scales, xActiveMask, oriX, constExpertAlpha1, + constExpertAlpha2, constExpertV, XOut, workspaceGM, &pipe, &tilingData, mc2InitTiling, mc2CcTiling); + op.Process(); + } else if (TILING_KEY_IS(3000)) { + GET_TILING_DATA_WITH_STRUCT(MoeDistributeCombineV2TilingData, tilingData, tilingGM); + auto tiling = (__gm__ MoeDistributeCombineV2TilingData *)tilingGM; + __gm__ void *mc2InitTiling = (__gm__ void *)(&(tiling->mc2InitTiling)); + __gm__ void *mc2CcTiling = 
(__gm__ void *)(&(tiling->mc2CcTiling)); + auto contextGM0 = AscendC::GetHcclContext(); + DataplaneMode dataplaneMode = GetDataplaneMode(contextGM0); + if (dataplaneMode == DataplaneMode::AIV) { + MoeDistributeCombineV2Layered op; + op.Init(expandX, expertIds, assistInfoForCombine, epSendCount, expandScales, XOut, workspaceGM, &pipe, + &tilingData, mc2InitTiling, mc2CcTiling, contextGM0); + op.Process(); + } else { + assert(false, "The driver version is too low and does not support layered mode.\n"); + } + } else if (TILING_KEY_IS(3100)) { + GET_TILING_DATA_WITH_STRUCT(MoeDistributeCombineV2TilingData, tilingData, tilingGM); + auto tiling = (__gm__ MoeDistributeCombineV2TilingData *)tilingGM; + __gm__ void *mc2InitTiling = (__gm__ void *)(&(tiling->mc2InitTiling)); + __gm__ void *mc2CcTiling = (__gm__ void *)(&(tiling->mc2CcTiling)); + + auto contextGM0 = AscendC::GetHcclContext(); + DataplaneMode dataplaneMode = GetDataplaneMode(contextGM0); + if (dataplaneMode == DataplaneMode::AIV) { + MoeDistributeCombineV2Layered op; + op.Init(expandX, expertIds, assistInfoForCombine, epSendCount, expandScales, XOut, workspaceGM, &pipe, + &tilingData, mc2InitTiling, mc2CcTiling, contextGM0); + op.Process(); + } else { + assert(false, "The driver version is too low. It should not be lower than 25.0.rc1.1.\n"); + } + } +#endif +} diff --git a/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2.h b/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2.h new file mode 100644 index 000000000..aaf86271c --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2.h @@ -0,0 +1,710 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/*! + * \file moe_distribute_combine_a2.h + * \brief + */ +#ifndef MOE_DISTRIBUTE_COMBINE_V2_H +#define MOE_DISTRIBUTE_COMBINE_V2_H +#include "kernel_operator.h" +#include "kernel_tiling/kernel_tiling.h" +#include "moe_distribute_combine_v2_tiling.h" +#include "moe_distribute_base.h" +namespace { +constexpr uint8_t BUFFER_NUM = 2; // 多buf +constexpr uint32_t STATE_OFFSET = 512; // 状态空间偏移地址 +constexpr uint32_t STATE_SPACE_SIZE = 1024 * 1024; // 1M +constexpr uint32_t UB_ALIGN = 32; // UB按32字节对齐 +constexpr uint32_t SELF_STATE_OFFSET = 512 * 1024; // 本卡状态空间偏移地址 +constexpr uint32_t BATCH_WRITE_ITEM_OFFSET = 8 * 1024; // batchWriteInfo结构体地址相对于windowOut最后1M的偏移 +constexpr uint32_t BATCH_WRITE_ITEM_SIZE = 32; +constexpr uint32_t BLOCK_SIZE = 32; +constexpr uint32_t B32_PER_BLOCK = 8; +constexpr uint32_t B64_PER_BLOCK = 4; +constexpr uint32_t SKIP_OFFSET = 32; +constexpr uint32_t FLAG_VALUE = 0xFFFFFFFF; +constexpr uint32_t REPEAT_BYTES = 256; +constexpr uint64_t MB_SIZE = 1024 * 1024; +template +__aicore__ inline void SyncFunc() +{ + int32_t eventID = static_cast(GetTPipePtr()->FetchEventID(event)); + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); +} +template +inline __aicore__ T RoundUp(const T val, const T align) +{ + if (align == 0 || val + align - 1 < val) { + return val; + } + return (val + align - 1) / align * align; +} + +struct TaskInfo { + uint32_t startTaskId; + uint32_t endTaskId; + uint32_t taskNum; + + __aicore__ inline TaskInfo() {} + __aicore__ inline void SplitCore(uint32_t taskNumTotal, uint32_t aivNum, uint32_t aivId) + { 
+ if (aivNum == 0) { + startTaskId = 0; + endTaskId = 0; + taskNum = 0; + return; + } + + uint32_t formerNum = taskNumTotal / aivNum; + uint32_t tailNum = taskNumTotal % aivNum; + startTaskId = formerNum * aivId; + if (aivId < tailNum) { + formerNum++; + startTaskId += aivId; + } else { + startTaskId += tailNum; + } + taskNum = formerNum; + endTaskId = startTaskId + taskNum; + } +}; + +} // namespace +namespace MoeDistributeCombineA2Impl { +#define TemplateMC2TypeA2Class typename ExpandXType, typename ExpandIdxType +#define TemplateMC2TypeA2Func ExpandXType, ExpandIdxType +using namespace AscendC; +template +class MoeDistributeCombineV2 +{ +public: + __aicore__ inline MoeDistributeCombineV2(){}; + __aicore__ inline void Init(GM_ADDR expandX, GM_ADDR expertIds, GM_ADDR expandIdx, GM_ADDR sendCount, + GM_ADDR scales, GM_ADDR xActiveMask, GM_ADDR oriX, GM_ADDR constExpertAlpha1, + GM_ADDR constExpertAlpha2, GM_ADDR constExpertV, GM_ADDR XOut, GM_ADDR workspaceGM, + TPipe *pipe, const MoeDistributeCombineV2TilingData *tilingData, + __gm__ void *mc2InitTiling, __gm__ void *mc2CcTiling); + __aicore__ inline void Process(); + +private: + __aicore__ inline void LocalWindowCopy(); + __aicore__ inline void AlltoAllDispatch(); + __aicore__ inline void BuffInit(); + __aicore__ inline void SplitCoreCal(); + __aicore__ inline void Preload(); + __aicore__ inline void WaitDispatch(); + __aicore__ inline void TokenActiveMaskCal(); + __aicore__ inline void ProcessMoeAndCopyExpert(uint32_t tokenIdx, uint32_t topKIdx); + __aicore__ inline void ProcessConstantExpert(uint32_t tokenIdx, uint32_t topKIdx); + __aicore__ inline void ProcessCopyExpert(uint32_t tokenIdx, uint32_t topKIdx); + TPipe *tpipe_{nullptr}; + GlobalTensor expandXGlobal_; + GlobalTensor expertIdsGlobal_; + GlobalTensor expandIdxGlobal_; + GlobalTensor sendCountGlobal_; + GlobalTensor expandScalesGlobal_; + GlobalTensor expandOutGlobal_; + GlobalTensor rankWindow_; // 用于存对端window的变量 + GlobalTensor localOutWindow_; + 
GlobalTensor localInWindow_; + GlobalTensor windowInstatusTensor_; + GlobalTensor bufferIdGlobal_; // win区状态位置拷入相关参数 + GlobalTensor workspaceGlobal_; // 存储batchWriteInfo结构体信息 + GlobalTensor workspaceGlobal32_; // 存储batchWriteInfo结构体信息 + GlobalTensor flagGlobal_; + GlobalTensor xActiveMaskGlobal_; // xActiveMask int8 代替 bool + GlobalTensor + oriXGlobal_; // 表示未经过FFN的token数据,在使能copyExpert或使能constExpert的场景下需要本输入数据 + GlobalTensor constExpertAlpha1Global_; // 在使能constExpert的场景下需要输入的计算系数alpha1 + GlobalTensor constExpertAlpha2Global_; // 在使能constExpert的场景下需要输入的计算系数alpha2 + GlobalTensor constExpertVGlobal_; // 在使能constExpert的场景下需要输入的计算系数v + LocalTensor batchWriteItemLocalB64; + LocalTensor batchWriteItemLocalB32; + LocalTensor recvCountLocal_; + LocalTensor expertWindowOffsetLocal_; + LocalTensor sumFloatLocal_; + LocalTensor expertIdsSegLocal_; + LocalTensor expandScalesSegLocal_; + LocalTensor indexCountsSegLocal_; + LocalTensor expertMaskTensor_; + LocalTensor tmpUb_; + LocalTensor statusTensor_; + GM_ADDR windowInGM_; + GM_ADDR windowOutGM_; + GM_ADDR expandXGM_; + GM_ADDR expertIdsGM_; + GM_ADDR expandIdxGM_; + GM_ADDR sendCountGM_; + GM_ADDR scalesGM_; + GM_ADDR XOutGM_; + GM_ADDR oriXGM_; + // tiling侧已确保数据上限,相乘不会越界,因此统一采用uint32_t进行处理 + uint32_t axisBS_{0}; + uint32_t axisH_{0}; + uint32_t axisK_{0}; // topK + uint32_t aivNum_{0}; + uint32_t worldSize_{0}; + uint32_t rankId_{0}; + uint32_t coreIdx_{0}; // aiv id + uint32_t sharedExpertRankNum_{0}; // 共享专家卡数 + uint32_t moeExpertNum_{0}; // moe专家数, 等于worldSize_ - 共享专家卡数 + uint32_t localMoeExpertNum_{0}; // 每张卡的专家数 + uint32_t zeroExpertNum_{0}; + uint32_t copyExpertNum_{0}; + uint32_t constExpertNum_{0}; + uint64_t rankSizeOnWin_{0}; + uint64_t dataOffsetOnWin_{0}; + uint64_t stateOffsetOnWin_{0}; + uint32_t axisHFloatSize_{0}; + uint32_t axisHExpandXTypeSize_{0}; + uint32_t bsKAlign_{0}; + uint32_t startRankId_{0}; + uint32_t endRankId_{0}; + uint32_t sendRankNum_{0}; + uint32_t halfWinSize_{0}; + uint32_t 
dataSpaceSize_{0}; + uint32_t bufferId_{0}; + uint32_t tokenNumPerCore_{0}; + // 分核片上相对偏移 + uint32_t tokenBeginIndex_{0}; + uint32_t expertIdsSegBaseOffset_{0}; + uint32_t expandScalesSegBaseOffset_{0}; + uint32_t indexCountsSegBaseOffset_{0}; + + bool isInputTokenMaskFlag_ = false; + bool isInputExpertMaskFlag_ = false; + TQueBind moeQueue_; + TBuf<> expertIdsBuf_; + TBuf<> expandScalesBuf_; + TBuf<> rowTmpFloatBuf_; + TBuf<> sumFloatBuf_; + TBuf<> indexCountsBuf_; + TBuf<> tokenBuf_; + TBuf<> batchWriteItemBuf_; + // 二维expertMaske + TBuf<> expertMaskBuf_; + + TaskInfo taskInfo_; + + GlobalTensor expertRecvCountGlobal_; + GlobalTensor expertWindowOffsetGlobal_; + + Hccl hccl_; + __gm__ HcclOpResParam *winContext_{nullptr}; +}; +template +__aicore__ inline void MoeDistributeCombineV2::Init( + GM_ADDR expandX, GM_ADDR expertIds, GM_ADDR expandIdx, GM_ADDR sendCount, GM_ADDR scales, GM_ADDR xActiveMask, + GM_ADDR oriX, GM_ADDR constExpertAlpha1, GM_ADDR constExpertAlpha2, GM_ADDR constExpertV, GM_ADDR XOut, + GM_ADDR workspaceGM, TPipe *pipe, const MoeDistributeCombineV2TilingData *tilingData, __gm__ void *mc2InitTiling, + __gm__ void *mc2CcTiling) +{ + tpipe_ = pipe; + expandXGM_ = expandX; + expertIdsGM_ = expertIds; + expandIdxGM_ = expandIdx; + sendCountGM_ = sendCount; + scalesGM_ = scales; + oriXGM_ = oriX; + XOutGM_ = XOut; + rankId_ = tilingData->moeDistributeCombineV2Info.epRankId; + axisBS_ = tilingData->moeDistributeCombineV2Info.bs; + axisH_ = tilingData->moeDistributeCombineV2Info.h; + axisK_ = tilingData->moeDistributeCombineV2Info.k; + aivNum_ = tilingData->moeDistributeCombineV2Info.aivNum; + moeExpertNum_ = tilingData->moeDistributeCombineV2Info.moeExpertNum; + zeroExpertNum_ = tilingData->moeDistributeCombineV2Info.zeroExpertNum; + copyExpertNum_ = tilingData->moeDistributeCombineV2Info.copyExpertNum; + constExpertNum_ = tilingData->moeDistributeCombineV2Info.constExpertNum; + worldSize_ = tilingData->moeDistributeCombineV2Info.epWorldSize; + 
isInputTokenMaskFlag_ = tilingData->moeDistributeCombineV2Info.isTokenMask; + isInputExpertMaskFlag_ = tilingData->moeDistributeCombineV2Info.isExpertMask; + auto contextGM = AscendC::GetHcclContext(); + winContext_ = (__gm__ HcclOpResParam *)contextGM; + hccl_.Init(contextGM, mc2InitTiling); + hccl_.SetCcTiling(mc2CcTiling); + halfWinSize_ = winContext_->winSize / 2; + dataSpaceSize_ = halfWinSize_ - STATE_SPACE_SIZE; + windowInGM_ = hccl_.GetWindowsInAddr(rankId_); + bufferIdGlobal_.SetGlobalBuffer((__gm__ uint32_t *)(windowInGM_ + dataSpaceSize_)); + bufferId_ = bufferIdGlobal_.GetValue(0); + windowInGM_ = windowInGM_ + halfWinSize_ * bufferId_; + windowOutGM_ = hccl_.GetWindowsOutAddr(rankId_) + halfWinSize_ * bufferId_; + coreIdx_ = GetBlockIdx(); + windowInstatusTensor_.SetGlobalBuffer((__gm__ uint32_t *)(windowInGM_)); + expandXGlobal_.SetGlobalBuffer((__gm__ ExpandXType *)expandX); + expertIdsGlobal_.SetGlobalBuffer((__gm__ ExpandIdxType *)expertIds); + expandIdxGlobal_.SetGlobalBuffer((__gm__ ExpandIdxType *)expandIdx); + sendCountGlobal_.SetGlobalBuffer((__gm__ int32_t *)sendCount); + expandScalesGlobal_.SetGlobalBuffer((__gm__ float *)scales); + expandOutGlobal_.SetGlobalBuffer((__gm__ ExpandXType *)XOut); + workspaceGlobal_.SetGlobalBuffer((__gm__ uint64_t *)(windowOutGM_ + dataSpaceSize_ + BATCH_WRITE_ITEM_OFFSET)); + workspaceGlobal32_.SetGlobalBuffer((__gm__ uint32_t *)(windowOutGM_ + dataSpaceSize_ + BATCH_WRITE_ITEM_OFFSET)); + + expertRecvCountGlobal_.SetGlobalBuffer((__gm__ uint32_t *)workspaceGM); + expertWindowOffsetGlobal_.SetGlobalBuffer((__gm__ uint32_t *)(workspaceGM + moeExpertNum_ * sizeof(uint32_t))); + xActiveMaskGlobal_.SetGlobalBuffer((__gm__ bool *)xActiveMask); + oriXGlobal_.SetGlobalBuffer((__gm__ ExpandXType *)oriX); + constExpertAlpha1Global_.SetGlobalBuffer((__gm__ ExpandXType *)constExpertAlpha1); + constExpertAlpha2Global_.SetGlobalBuffer((__gm__ ExpandXType *)constExpertAlpha2); + constExpertVGlobal_.SetGlobalBuffer((__gm__ 
ExpandXType *)constExpertV); + localMoeExpertNum_ = moeExpertNum_ / worldSize_; + rankSizeOnWin_ = dataSpaceSize_ / worldSize_ / BLOCK_SIZE * BLOCK_SIZE; + dataOffsetOnWin_ = rankId_ * rankSizeOnWin_; + stateOffsetOnWin_ = dataSpaceSize_ + rankId_ * STATE_OFFSET; + axisHFloatSize_ = axisH_ * sizeof(float); + axisHExpandXTypeSize_ = axisH_ * sizeof(ExpandXType); + bsKAlign_ = RoundUp(axisBS_ * axisK_, B32_PER_BLOCK); + + uint64_t stateSizeMaxSize = 2 * STATE_SPACE_SIZE; // 2: 实际上是(DATA_OFFSET+SKIP_OFFSET+sizeof(uint32)) + + // STATE_SPACE_SIZE,近似计算使用2 * STATE_SPACE_SIZE + uint64_t winSizeMin = (axisBS_ * worldSize_ * (localMoeExpertNum_ > axisK_ ? axisK_ : localMoeExpertNum_) * axisH_ * + sizeof(uint16_t) + + stateSizeMaxSize) * + BUFFER_NUM; // 考虑负载极其不均衡时,HCCL BUFFSIZE需要开的大小 + + BuffInit(); + + if (isInputTokenMaskFlag_) { + TokenActiveMaskCal(); // 计算一维mask + } + if (isInputExpertMaskFlag_) { + tpipe_->InitBuffer(expertMaskBuf_, Ceil(axisBS_ * axisK_ * sizeof(bool), UB_ALIGN) * UB_ALIGN); + expertMaskTensor_ = expertMaskBuf_.Get(); + DataCopyPadExtParams maskCopyPadParams{false, 0U, 0U, 0U}; + DataCopyExtParams maskParams{1U, static_cast(axisBS_ * axisK_ * sizeof(bool)), 0U, 0U, 0U}; + DataCopyPad(expertMaskTensor_, xActiveMaskGlobal_, maskParams, maskCopyPadParams); + } + SplitCoreCal(); +} +template +__aicore__ inline void MoeDistributeCombineV2::BuffInit() +{ + uint32_t expertIdsNumPerCore = RoundUp(axisBS_, aivNum_) * axisK_; // 每个核分配到的task处理个数 + uint32_t expertIdsBufSizePerCore = + RoundUp(expertIdsNumPerCore * static_cast(sizeof(int32_t)), B32_PER_BLOCK); + uint32_t moeExpertNumInt32Size = RoundUp(moeExpertNum_ * static_cast(sizeof(int32_t)), B32_PER_BLOCK); + tpipe_->InitBuffer(moeQueue_, BUFFER_NUM, axisHExpandXTypeSize_); + tpipe_->InitBuffer(expertIdsBuf_, Std::max(expertIdsBufSizePerCore, REPEAT_BYTES)); + tpipe_->InitBuffer(expandScalesBuf_, expertIdsBufSizePerCore); + tpipe_->InitBuffer(tokenBuf_, Std::max(axisHFloatSize_, moeExpertNumInt32Size)); + 
tpipe_->InitBuffer(rowTmpFloatBuf_, Std::max(axisHFloatSize_, moeExpertNumInt32Size)); + tpipe_->InitBuffer(sumFloatBuf_, Std::max(axisHFloatSize_, moeExpertNumInt32Size)); + tpipe_->InitBuffer(indexCountsBuf_, Std::max(expertIdsBufSizePerCore, REPEAT_BYTES)); + tpipe_->InitBuffer(batchWriteItemBuf_, Std::max(BATCH_WRITE_ITEM_SIZE * worldSize_, moeExpertNumInt32Size)); + batchWriteItemLocalB64 = batchWriteItemBuf_.Get(); + batchWriteItemLocalB32 = batchWriteItemLocalB64.template ReinterpretCast(); +} + +template +__aicore__ inline void MoeDistributeCombineV2::TokenActiveMaskCal() +{ + LocalTensor xActiveMaskTensor; + LocalTensor xActiveMaskInt8Tensor; + LocalTensor xActiveMaskHalfTensor; + LocalTensor sumOutTensor; + LocalTensor tempTensor; + uint32_t axisBsAlignSize = RoundUp(axisBS_, UB_ALIGN); + xActiveMaskTensor = expertIdsBuf_.Get(axisBsAlignSize); + xActiveMaskHalfTensor = expertIdsBuf_.GetWithOffset(axisBsAlignSize, axisBsAlignSize); + sumOutTensor = expertIdsBuf_.Get(UB_ALIGN); + tempTensor = indexCountsBuf_.Get(); + DataCopyExtParams xActiveMaskParams = {1U, static_cast(axisBS_ * sizeof(bool)), 0U, 0U, 0U}; + DataCopyPadExtParams xActiveMaskCopyPadParams{false, 0U, 0U, 0U}; + DataCopyPad(xActiveMaskTensor, xActiveMaskGlobal_, xActiveMaskParams, xActiveMaskCopyPadParams); + SyncFunc(); + xActiveMaskInt8Tensor = xActiveMaskTensor.ReinterpretCast(); + Cast(xActiveMaskHalfTensor, xActiveMaskInt8Tensor, RoundMode::CAST_NONE, axisBS_); + PipeBarrier(); + SumParams params{1, axisBsAlignSize, axisBS_}; + Sum(sumOutTensor, xActiveMaskHalfTensor, tempTensor, params); + SyncFunc(); + axisBS_ = static_cast(sumOutTensor.GetValue(0)); + bsKAlign_ = RoundUp(axisBS_ * axisK_, B32_PER_BLOCK); +} + +template +__aicore__ inline void MoeDistributeCombineV2::SplitCoreCal() +{ + // 对worldSize按卡分核,得到每个核上处理的卡的数量 + sendRankNum_ = worldSize_ / aivNum_; + uint32_t remainderRankNum = worldSize_ % aivNum_; + startRankId_ = sendRankNum_ * coreIdx_; + if (coreIdx_ < remainderRankNum) { 
+ sendRankNum_++; + startRankId_ += coreIdx_; + } else { + startRankId_ += remainderRankNum; + } + endRankId_ = startRankId_ + sendRankNum_; +} +template +__aicore__ inline void MoeDistributeCombineV2::AlltoAllDispatch() +{ + if (sendRankNum_ == 0) { + SyncAll(); + return; + } + LocalTensor sendCountLocal = tokenBuf_.Get(); // 复用tokenBuf_ + DataCopy(sendCountLocal, sendCountGlobal_, RoundUp(moeExpertNum_, B32_PER_BLOCK)); + SyncFunc(); + for (uint32_t dstRankId = startRankId_; dstRankId < endRankId_; ++dstRankId) { + localOutWindow_.SetGlobalBuffer((__gm__ ExpandXType *)(windowOutGM_ + dstRankId * rankSizeOnWin_)); + uint32_t rankTokenNum = 0; + for (uint32_t expertId = 0; expertId < localMoeExpertNum_; ++expertId) { + uint32_t preCount = 0; + if (expertId != 0 || dstRankId != 0) { + preCount = static_cast(sendCountLocal.GetValue(expertId * worldSize_ + dstRankId - 1)); + } + uint32_t startTokenAddr = preCount * axisH_; + uint32_t tokenNum = sendCountLocal(expertId * worldSize_ + dstRankId) - preCount; + for (uint32_t tokenId = 0; tokenId < tokenNum; ++tokenId) { + LocalTensor InUb = moeQueue_.AllocTensor(); + DataCopy(InUb, expandXGlobal_[startTokenAddr], axisH_); + moeQueue_.EnQue(InUb); + LocalTensor OutUb = moeQueue_.DeQue(); + DataCopy(localOutWindow_[rankTokenNum * axisH_], OutUb, axisH_); + moeQueue_.FreeTensor(OutUb); + startTokenAddr += axisH_; + rankTokenNum++; + } + } + flagGlobal_.SetGlobalBuffer( + (__gm__ uint32_t *)(localOutWindow_.GetPhyAddr(rankTokenNum * axisH_) + SKIP_OFFSET / sizeof(ExpandXType))); + flagGlobal_(0) = FLAG_VALUE; + uint32_t rankIdOffset = dstRankId - startRankId_; + batchWriteItemLocalB64(rankIdOffset * 4) = (uint64_t)(localOutWindow_.GetPhyAddr()); + batchWriteItemLocalB64(rankIdOffset * 4 + 1) = + (uint64_t)(hccl_.GetWindowsInAddr(dstRankId) + halfWinSize_ * bufferId_ + dataOffsetOnWin_); + batchWriteItemLocalB64(rankIdOffset * 4 + 2) = rankTokenNum * axisH_ + SKIP_OFFSET / sizeof(ExpandXType) + 2; + 
batchWriteItemLocalB32(rankIdOffset * 8 + 6) = HcclDataType::HCCL_DATA_TYPE_FP16; + batchWriteItemLocalB32(rankIdOffset * 8 + 7) = dstRankId; + DataCacheCleanAndInvalid( + flagGlobal_); + } + SyncFunc(); + DataCopy(workspaceGlobal_[startRankId_ * 4], batchWriteItemLocalB64, sendRankNum_ * 4); + SyncFunc(); + SyncAll(); + if ASCEND_IS_AIV { + if (coreIdx_ == 0) { + HcclHandle handleId = hccl_.BatchWrite((GM_ADDR)(workspaceGlobal_.GetPhyAddr()), worldSize_); + bufferIdGlobal_(0) = bufferId_ ^ 1; + } + if (rankId_ >= startRankId_ && rankId_ < endRankId_) { + localOutWindow_.SetGlobalBuffer((__gm__ ExpandXType *)(windowOutGM_ + dataOffsetOnWin_)); + localInWindow_.SetGlobalBuffer((__gm__ ExpandXType *)(windowInGM_ + dataOffsetOnWin_)); + uint32_t rankIdOffset = rankId_ - startRankId_; + uint64_t rankTokenNum = + (batchWriteItemLocalB64(rankIdOffset * 4 + 2) - SKIP_OFFSET / sizeof(ExpandXType) - 2) / axisH_; + for (uint32_t tokenId = 0; tokenId < rankTokenNum; ++tokenId) { + LocalTensor InUb = moeQueue_.AllocTensor(); + DataCopy(InUb, localOutWindow_[tokenId * axisH_], axisH_); + moeQueue_.EnQue(InUb); + LocalTensor OutUb = moeQueue_.DeQue(); + DataCopy(localInWindow_[tokenId * axisH_], OutUb, axisH_); + moeQueue_.FreeTensor(OutUb); + } + flagGlobal_.SetGlobalBuffer((__gm__ uint32_t *)localInWindow_.GetPhyAddr( + rankTokenNum * axisH_ + SKIP_OFFSET / sizeof(ExpandXType))); + flagGlobal_(0) = FLAG_VALUE; + DataCacheCleanAndInvalid( + flagGlobal_); + } + } +} +template +__aicore__ inline void MoeDistributeCombineV2::Preload() +{ + recvCountLocal_ = rowTmpFloatBuf_.Get(); // 复用rowTmpFloatBuf_ + expertWindowOffsetLocal_ = batchWriteItemBuf_.Get(); // 复用batchWriteItemBuf_ + // 缩减UB占用,只读取1/AivNum的专家序号片段,其他核处理部分不读取 + taskInfo_.SplitCore(axisBS_ * axisK_, aivNum_, coreIdx_); + expertIdsSegLocal_ = expertIdsBuf_.Get(); + DataCopyPad(expertIdsSegLocal_, expertIdsGlobal_[taskInfo_.startTaskId], + {1, static_cast(taskInfo_.taskNum * sizeof(uint32_t)), 0, 0, 0}, {false, 0, 0, 0}); + 
expertIdsSegBaseOffset_ = taskInfo_.startTaskId; + + Duplicate(recvCountLocal_, (uint32_t)0, moeExpertNum_); + Duplicate(expertWindowOffsetLocal_, (uint32_t)0, moeExpertNum_); + + SyncFunc(); + + if (coreIdx_ == aivNum_ - 1) { + DataCopyPad(expertRecvCountGlobal_, recvCountLocal_, + {1, static_cast(moeExpertNum_ * sizeof(uint32_t)), 0, 0, 0}); + } + + SyncAll(); + + if (isInputExpertMaskFlag_) { + // 需要额外校验Mask,Mask表为全量表,专家表为片段表 + for (uint32_t i = taskInfo_.startTaskId; i < taskInfo_.endTaskId; ++i) { + if (expertMaskTensor_(i) == false) // 全量表,用[0-bs*k]做索引 + continue; + uint32_t expId = expertIdsSegLocal_.GetValue(i - taskInfo_.startTaskId); // 片段表,用[0-taskNum]做索引 + if (expId < moeExpertNum_) recvCountLocal_(expId) += 1; + } + } else { + // 无需校验Mask,直接用片段表 + for (uint32_t i = 0; i < taskInfo_.taskNum; ++i) { + uint32_t expId = expertIdsSegLocal_.GetValue(i); + if (expId < moeExpertNum_) recvCountLocal_(expId) += 1; + } + } + SyncFunc(); + + SetAtomicAdd(); + DataCopyPad(expertRecvCountGlobal_, recvCountLocal_, + {1, static_cast(moeExpertNum_ * sizeof(uint32_t)), 0, 0, 0}); + SetAtomicNone(); + + SyncAll(); + + DataCopyPad(recvCountLocal_, expertRecvCountGlobal_, + {1, static_cast(moeExpertNum_ * sizeof(uint32_t)), 0, 0, 0}, {false, 0, 0, 0}); + + SyncFunc(); + + taskInfo_.SplitCore(moeExpertNum_ / localMoeExpertNum_, aivNum_, coreIdx_); + for (uint32_t groupIdx = taskInfo_.startTaskId; groupIdx < taskInfo_.endTaskId; ++groupIdx) { + uint32_t start = groupIdx * localMoeExpertNum_; + uint32_t end = start + localMoeExpertNum_; + uint32_t prefixSum = 0; + for (uint32_t i = start; i < end; ++i) { + expertWindowOffsetLocal_(i - start) = prefixSum; + prefixSum += recvCountLocal_.GetValue(i); + } + SyncFunc(); + DataCopyPad(expertWindowOffsetGlobal_[start], expertWindowOffsetLocal_, + {1, static_cast(localMoeExpertNum_ * sizeof(uint32_t)), 0, 0, 0}); + SyncFunc(); + } + SyncAll(); + + DataCopyPad(expertWindowOffsetLocal_, expertWindowOffsetGlobal_, + {1, 
static_cast(moeExpertNum_ * sizeof(uint32_t)), 0, 0, 0}, {false, 0, 0, 0}); + + tokenNumPerCore_ = axisBS_ / aivNum_; // 64/48 = 1 + uint32_t undoTokenNum = axisBS_ % aivNum_; // 64 % 48 = 16 + tokenBeginIndex_ = 0; + if (coreIdx_ < undoTokenNum) { + tokenNumPerCore_ = tokenNumPerCore_ + 1; + tokenBeginIndex_ = coreIdx_ * tokenNumPerCore_; + } else { + tokenBeginIndex_ = (undoTokenNum + coreIdx_ * tokenNumPerCore_); + } + if (tokenNumPerCore_ == 0) { + return; + } + + // 缩减UB占用,只读取1/AivNum的Scale\IndexCounts片段,其他核处理部分不读取 + expandScalesSegLocal_ = expandScalesBuf_.Get(); + indexCountsSegLocal_ = indexCountsBuf_.Get(); + DataCopyPad(expandScalesSegLocal_, expandScalesGlobal_[tokenBeginIndex_ * axisK_], + {1, static_cast(tokenNumPerCore_ * axisK_ * sizeof(uint32_t)), 0, 0, 0}, {false, 0, 0, 0}); + DataCopyPad(indexCountsSegLocal_, expandIdxGlobal_[tokenBeginIndex_ * axisK_], + {1, static_cast(tokenNumPerCore_ * axisK_ * sizeof(ExpandIdxType)), 0, 0, 0}, + {false, 0, 0, 0}); + DataCopyPad(expertIdsSegLocal_, expertIdsGlobal_[tokenBeginIndex_ * axisK_], + {1, static_cast(tokenNumPerCore_ * axisK_ * sizeof(uint32_t)), 0, 0, 0}, {false, 0, 0, 0}); + + expandScalesSegBaseOffset_ = tokenBeginIndex_ * axisK_; + indexCountsSegBaseOffset_ = tokenBeginIndex_ * axisK_; + expertIdsSegBaseOffset_ = tokenBeginIndex_ * axisK_; +} + +template +__aicore__ inline void MoeDistributeCombineV2::WaitDispatch() +{ + if (startRankId_ >= worldSize_) { + SyncAll(); + return; + } + SyncFunc(); + for (uint32_t waitFlagNum = 0; waitFlagNum < sendRankNum_;) { + waitFlagNum = 0; + for (uint32_t rankId = startRankId_; rankId < endRankId_; ++rankId) { + uint32_t tokenIdx = (rankId + 1) * localMoeExpertNum_ - 1; + GM_ADDR wAddr = windowInGM_ + rankSizeOnWin_ * rankId + SKIP_OFFSET + + (recvCountLocal_(tokenIdx) + expertWindowOffsetLocal_(tokenIdx)) * axisHExpandXTypeSize_; + flagGlobal_.SetGlobalBuffer((__gm__ uint32_t *)wAddr); + DataCacheCleanAndInvalid( + flagGlobal_); + uint32_t flag = 
flagGlobal_(0); + if (flag == FLAG_VALUE) { + waitFlagNum++; + } + } + } + for (uint32_t rankId = startRankId_; rankId < endRankId_; ++rankId) { + uint32_t tokenIdx = (rankId + 1) * localMoeExpertNum_ - 1; + GM_ADDR wAddr = windowInGM_ + rankSizeOnWin_ * rankId + SKIP_OFFSET + + (recvCountLocal_(tokenIdx) + expertWindowOffsetLocal_(tokenIdx)) * axisHExpandXTypeSize_; + flagGlobal_.SetGlobalBuffer((__gm__ uint32_t *)wAddr); + flagGlobal_(0) = 0; + } + SyncAll(); +} + +template +__aicore__ inline void MoeDistributeCombineV2::Process() +{ + if ASCEND_IS_AIV { + AlltoAllDispatch(); + Preload(); + WaitDispatch(); + LocalWindowCopy(); + hccl_.Finalize(); + } +} +template +__aicore__ inline void MoeDistributeCombineV2::LocalWindowCopy() +{ + sumFloatLocal_ = sumFloatBuf_.Get(); + if (tokenNumPerCore_ == 0) { + return; + } + // step 4 & step 5 + int32_t expId = 0; + float scaleVal = 0.0; + for (uint32_t i = 0; i < tokenNumPerCore_; i++) { + uint32_t tokenIdx = tokenBeginIndex_ + i; + Duplicate(sumFloatLocal_, 0.0f, axisH_); // 28K接收最终输出 + for (uint32_t topKIdx = 0; topKIdx < axisK_; topKIdx++) { + uint32_t tokentopKIdx = tokenIdx * axisK_ + topKIdx; + if (isInputExpertMaskFlag_) { + bool maskExpertFlag = expertMaskTensor_.GetValue(tokentopKIdx); + if (!maskExpertFlag) { + continue; + } + } + expId = expertIdsSegLocal_.GetValue(tokentopKIdx - expertIdsSegBaseOffset_); // bs*k*4 //256*16*4=16k + if (expId < moeExpertNum_) { + ProcessMoeAndCopyExpert(tokenIdx, topKIdx); + } else if (expId < moeExpertNum_ + zeroExpertNum_) { + continue; // 零专家不需要任何操作 + } else if (expId < moeExpertNum_ + zeroExpertNum_ + copyExpertNum_) { + ProcessMoeAndCopyExpert(tokenIdx, topKIdx); + } else if (expId < moeExpertNum_ + zeroExpertNum_ + copyExpertNum_ + constExpertNum_) { + ProcessConstantExpert(tokenIdx, topKIdx); + } + } + // 结果搬出 + PipeBarrier(); + LocalTensor sumBufLocal_ = tokenBuf_.Get(); + SyncFunc(); + Cast(sumBufLocal_, sumFloatLocal_, AscendC::RoundMode::CAST_RINT, axisH_); // 28k + 
SyncFunc(); + DataCopy(expandOutGlobal_[tokenIdx * axisH_], sumBufLocal_, axisH_); + } + PipeBarrier(); +} + +template +__aicore__ inline void MoeDistributeCombineV2::ProcessMoeAndCopyExpert(uint32_t tokenIdx, + uint32_t topKIdx) +{ + GM_ADDR wAddr; + LocalTensor rowTmpFloatLocal = rowTmpFloatBuf_.Get(); + uint32_t tokentopKIdx = tokenIdx * axisK_ + topKIdx; + float scaleVal = expandScalesSegLocal_.GetValue(tokentopKIdx - expandScalesSegBaseOffset_); + int32_t expId = expertIdsSegLocal_.GetValue(tokentopKIdx - expertIdsSegBaseOffset_); + if (expId < moeExpertNum_) { + uint32_t rank = expId / localMoeExpertNum_; + wAddr = (__gm__ uint8_t *)(windowInGM_) + rankSizeOnWin_ * rank + + expertWindowOffsetLocal_.GetValue(expId) * axisHExpandXTypeSize_ + + indexCountsSegLocal_.GetValue(tokentopKIdx - indexCountsSegBaseOffset_) * axisHExpandXTypeSize_; + } else { + wAddr = (__gm__ uint8_t *)(oriXGM_) + tokenIdx * axisHExpandXTypeSize_; + } + // copy experts from window + rankWindow_.SetGlobalBuffer((__gm__ ExpandXType *)wAddr); + tmpUb_ = moeQueue_.AllocTensor(); + SyncFunc(); + DataCopy(tmpUb_, rankWindow_, axisH_); + SyncFunc(); + // cast before muls + Cast(rowTmpFloatLocal, tmpUb_, AscendC::RoundMode::CAST_NONE, axisH_); + PipeBarrier(); + // muls expert and scaleVal, use inplace scalar muls, do not need extra buf + AscendC::Muls(rowTmpFloatLocal, rowTmpFloatLocal, scaleVal, axisH_); // tokenXscale + PipeBarrier(); + // add rowTmpFloatLocal to sumFloatBufLocal + AscendC::Add(sumFloatLocal_, sumFloatLocal_, rowTmpFloatLocal, axisH_); + moeQueue_.FreeTensor(tmpUb_); +} + +template +__aicore__ inline void MoeDistributeCombineV2::ProcessConstantExpert(uint32_t tokenIdx, + uint32_t topKIdx) +{ + PipeBarrier(); + uint32_t tokentopKIdx = tokenIdx * axisK_ + topKIdx; + float scaleVal = expandScalesSegLocal_.GetValue(tokentopKIdx - expandScalesSegBaseOffset_); + int32_t expId = expertIdsSegLocal_.GetValue(tokentopKIdx - expertIdsSegBaseOffset_); + uint32_t constExpertIdx = expId - 
(moeExpertNum_ + zeroExpertNum_ + copyExpertNum_); + + LocalTensor constVFloatLocal = tokenBuf_.Get(); + LocalTensor constXFloatLocal = rowTmpFloatBuf_.Get(); + LocalTensor constVInUB = moeQueue_.AllocTensor(); + LocalTensor constXInUB = moeQueue_.AllocTensor(); + + DataCopyPadExtParams copyPadExtParams{false, 0U, 0U, 0U}; + DataCopyExtParams expandXCopyParams{1U, static_cast(axisHExpandXTypeSize_), 0U, 0U, 0U}; + + // 直接从GM读取当前常量专家的alpha1和alpha2参数 + ExpandXType alpha1 = constExpertAlpha1Global_.GetValue(constExpertIdx); + ExpandXType alpha2 = constExpertAlpha2Global_.GetValue(constExpertIdx); + + float alpha1Float; + float alpha2Float; + if constexpr (std::is_same_v) { + alpha1Float = ToFloat(alpha1); + alpha2Float = ToFloat(alpha2); + } else { + alpha1Float = static_cast(alpha1); + alpha2Float = static_cast(alpha2); + } + + // 读取输入token并转float + DataCopyPad(constVInUB, constExpertVGlobal_[constExpertIdx * axisH_], expandXCopyParams, copyPadExtParams); + DataCopyPad(constXInUB, oriXGlobal_[tokenIdx * axisH_], expandXCopyParams, copyPadExtParams); + SyncFunc(); + SyncFunc(); + Cast(constXFloatLocal, constXInUB, AscendC::RoundMode::CAST_NONE, axisH_); + Cast(constVFloatLocal, constVInUB, AscendC::RoundMode::CAST_NONE, axisH_); + PipeBarrier(); + moeQueue_.FreeTensor(constVInUB); + moeQueue_.FreeTensor(constXInUB); + + // 计算 alpha1 * x + alpha2 * v ,结果存放到x + AscendC::Muls(constXFloatLocal, constXFloatLocal, alpha1Float, axisH_); + AscendC::Muls(constVFloatLocal, constVFloatLocal, alpha2Float, axisH_); + PipeBarrier(); + AscendC::Add(constXFloatLocal, constXFloatLocal, constVFloatLocal, axisH_); + PipeBarrier(); + + // 乘以专家权重 + AscendC::Muls(constXFloatLocal, constXFloatLocal, scaleVal, axisH_); + PipeBarrier(); + AscendC::Add(sumFloatLocal_, sumFloatLocal_, constXFloatLocal, axisH_); + PipeBarrier(); +} + +} // namespace MoeDistributeCombineA2Impl +#endif // MOE_DISTRIBUTE_COMBINE_V2_H diff --git a/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2_layered.h 
b/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2_layered.h new file mode 100644 index 000000000..1bbeb02e7 --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2_layered.h @@ -0,0 +1,1212 @@ +/** + * This program is free software, you can redistribute it and/or modify. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING +BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +/*! + * \file moe_distribute_combine_a2_layered.h + * \brief + */ +#ifndef MOE_DISTRIBUTE_COMBINE_V2_LAYERED_H +#define MOE_DISTRIBUTE_COMBINE_V2_LAYERED_H +#include "kernel_operator.h" +#include "kernel_tiling/kernel_tiling.h" +#include "moe_distribute_combine_v2_tiling.h" +#include "moe_distribute_base.h" + +namespace MoeDistributeCombineA2Impl { + +constexpr int UB_ALIGN_SIZE = 32; +constexpr uint64_t CACHELINE_SIZE = 64; + +#define TemplateMC2TypeA2layeredClass typename ExpandXType, typename ExpandIdxType, typename ExpandXTransType +#define TemplateMC2TypeA2layeredFunc ExpandXType, ExpandIdxType, ExpandXTransType + +template +struct OutputType { + using type = T; +}; +// 针对float16_t的特化 +template <> +struct OutputType { + using type = half; +}; +// 针对bfloat16_t的特化 +template <> +struct OutputType { + using type = float; +}; +// 辅助类型别名(C++11起支持) +template +using OutputType_t = typename OutputType::type; + +using namespace AscendC; +template +class MoeDistributeCombineV2Layered +{ +public: + constexpr static uint32_t BUFFER_NUM = 2U; // 多buf + constexpr static uint32_t STATE_OFFSET 
= 512U; // 状态空间偏移地址 + constexpr static uint32_t STATE_SPACE_SIZE = 1024U * 1024U; // 1M + constexpr static uint32_t UB_ALIGN = 32U; // UB按32字节对齐 + constexpr static uint32_t SELF_STATE_OFFSET = 512U * 1024U; // 本卡状态空间偏移地址 + + constexpr static uint32_t BLOCK_SIZE = 32U; + constexpr static uint32_t B16_PER_BLOCK = 16U; + constexpr static uint32_t B32_PER_BLOCK = 8U; + constexpr static uint32_t B64_PER_BLOCK = 4U; + constexpr static uint32_t SERVER_RANK_SIZE = 8U; + constexpr static uint32_t IPC_DATA_OFFSET = 4U * 1024U * 1024U; + constexpr static uint32_t RDMA_DATA_SIZE = 100U * 1024U * 1024U; + constexpr static uint32_t VEC_LEN = 256U; + constexpr static uint32_t MAGIC_OFFSET = 2U * 1024U * 1024U - 32U * 32U; + constexpr static uint32_t EXTRA_TOKEN_INFO_NUM = 4U; // 专家信息 权重信息 量化Scale 到达标志位 + constexpr static uint64_t MB_SIZE = 1024UL * 1024UL; + constexpr static bool DynamicQuant = std::is_same::value; + constexpr static uint32_t TBUF_SIZE = 185U * 1024U; + constexpr static uint32_t TBUF_TEMP_OFFSET = 0U; + constexpr static uint32_t IPC_REDUCE_USED_CORE_NUM = 32U; // 拉起远端IPC和机内reduce需要的核数 + constexpr static uint32_t WEIGHT_VALUE_NUM = 16U; // token(h * sizeof(bf/fp16)) + scale(32B) = (h + 16) * 2B + constexpr static uint64_t GM2IPC_SYNC_FLAG = 12345ULL; + constexpr static uint64_t RDMA_TOKEN_ARRIVED_FLAG = 123ULL; + constexpr static uint64_t RDMA_TOKEN_END_FLAG = 321ULL; + constexpr static uint32_t MAX_BS_NUM = 512U; // 适配bs=512 + constexpr static uint32_t FLAG_SINGLE_CNT = 4; + constexpr static uint32_t FLAG_TOTAL_SIZE = + MAX_BS_NUM * FLAG_SINGLE_CNT > IPC_DATA_OFFSET + ? 
IPC_DATA_OFFSET + : MAX_BS_NUM *FLAG_SINGLE_CNT; // AlltoAllServerDispatch会放弃bs>1024的部分 + + template + __aicore__ inline void SyncFunc() + { + int32_t eventID = static_cast(GetTPipePtr()->FetchEventID(event)); + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + } + template + inline __aicore__ T RoundUp(const T val, const T align) + { + static_assert(std::is_arithmetic::value, "T must be an arithmetic type"); + if (align == 0 || val + align - 1 < val) { + return val; + } + return (val + align - 1) / align * align; + } + template + inline __aicore__ T CeilDiv(const T dividend, const T divisor) + { + return (divisor == 0) ? 0 : ((dividend + divisor - 1) / divisor); + } + + __aicore__ inline MoeDistributeCombineV2Layered(){}; + __aicore__ inline void Init(GM_ADDR expandX, GM_ADDR expertIds, GM_ADDR expandIdx, GM_ADDR sendCount, + GM_ADDR scales, GM_ADDR XOut, GM_ADDR workspaceGM, TPipe *pipe, + const MoeDistributeCombineV2TilingData *tilingData, __gm__ void *mc2InitTiling, + __gm__ void *mc2CcTiling, GM_ADDR contextGM); + __aicore__ inline void Process(); + __aicore__ inline void AIVRDMAPostSend(GM_ADDR srcDmaAddr, GM_ADDR destDmaAddr, uint64_t destRankId, + uint64_t messageLen, __gm__ HcclAiRMAInfo *QpInfo); + +private: + __aicore__ inline void BuffInit(); + __aicore__ inline void SplitCoreCal(); + __aicore__ inline void GM2IPC(); + __aicore__ inline void WaitIPC(); + __aicore__ inline void SumToWindow(); + __aicore__ inline void WaitDispatch(); + __aicore__ inline void AlltoAllServerDispatch(); + __aicore__ inline void SumToServer(); + __aicore__ inline void Preload(); + __aicore__ inline void ToWindowPreload(); + + TPipe *tpipe_{nullptr}; + GlobalTensor expandXGlobal_; + GlobalTensor expertIdsGlobal_; + GlobalTensor expandIdxGlobal_; + GlobalTensor sendCountGlobal_; + GlobalTensor bkCountGlobal_; + GlobalTensor expandScalesGlobal_; + GlobalTensor expandOutGlobal_; + + GlobalTensor localOutWindow_; + GlobalTensor localInWindow_; + GlobalTensor 
bufferIdGlobal_; // 用于存对端状态window的变量 + GlobalTensor statusSpaceGlobal_; // win区状态位置拷入相关参数 + GlobalTensor readStateGlobal_; + + uint64_t shareAddreRank[8]; + + // 低精度需要用到的变量 + GlobalTensor scaleOutWindow_; // 第一层输出的scale值和offset,都是fp16格式 + GlobalTensor localInScaleWindow_; + OutputType_t scaleMulVal; + uint32_t mask; + + GM_ADDR windowInGM_; + GM_ADDR windowOutGM_; + GM_ADDR statusSpaceGm_; + GM_ADDR expandXGM_; + GM_ADDR expertIdsGM_; + GM_ADDR expandIdxGM_; + GM_ADDR sendCountGM_; + GM_ADDR scalesGM_; + GM_ADDR XOutGM_; + __gm__ HcclAiRMAInfo *qp_info_; + + // 分层所需的参数 + GM_ADDR shareAddrGM_; + GM_ADDR offsetInnerGM_; + GM_ADDR countInnerGM_; + GM_ADDR offsetOuterGM_; + GM_ADDR countOuterGM_; + + GlobalTensor shareAddrGlobal_; + GlobalTensor shareFlagGlobal_; + GlobalTensor shareMemGlobal_; + GlobalTensor dstshareMemGlobal_; + GlobalTensor magicGlobal_; + GlobalTensor offsetInnerGlobal_; + GlobalTensor countInnerGlobal_; + GlobalTensor offsetOuterGlobal_; + GlobalTensor countOuterGlobal_; + + // tiling侧已确保数据上限,相乘不会越界,因此统一采用uint32_t进行处理 + uint32_t countReL{0}; + uint32_t axisBS_{0}; + uint32_t globalBs{0}; + uint32_t axisH_{0}; + uint32_t axisK_{0}; // topK + uint32_t aivNum_{0}; + uint32_t worldSize_{0}; + uint32_t rankId_{0}; + uint32_t coreIdx_{0}; // aiv id + uint32_t sharedExpertRankNum_{0}; // 共享专家卡数 + __gm__ HcclA2CombineOpParam *winContext_{nullptr}; + uint32_t moeExpertNum_{0}; // moe专家数, 等于worldSize_ - 共享专家卡数 + uint32_t localMoeExpertNum_{0}; // 每张卡的专家数 + uint32_t expandXRows_; + uint64_t rankSizeOnWin_{0}; + Hccl hccl_; + uint64_t dataOffsetOnWin_{0}; + uint64_t stateOffsetOnWin_{0}; + uint32_t axisHFloatSize_{0}; + uint32_t axisHExpandXTypeSize_{0}; + uint32_t startRankId_{0}; + uint32_t endRankId_{0}; + uint32_t sendRankNum_{0}; + uint32_t halfWinSize_{0}; + uint32_t dataSpaceSize_{0}; + uint32_t bufferId_{0}; + uint32_t tokenNumPerCore_{0}; + uint32_t tokenIndex_{0}; + uint32_t serverNum{0}; + uint32_t ipcSliceSize{0}; + uint32_t ipcSliceNodeSize{0}; + 
uint64_t send_counts_inner_offset{0}; + uint64_t offset_inner_offset{0}; + uint64_t send_counts_outer_offset{0}; + uint64_t offset_outer_offset{0}; + uint64_t share_offset{0}; + uint32_t IPC_DATA_SIZE{0}; + TBuf tBuf; + TBuf rdmaInBuf_; + TBuf rdmaInBuf2_; + TBuf<> statusBuf_; + + int32_t sumTarget_{0}; + int32_t stateValue_{0}; + uint32_t startBs{0}; + uint32_t endBs{0}; + uint32_t processNum{0}; + uint32_t resNum{0}; + uint32_t resLen{0}; + uint32_t offsetIndex{0}; + uint32_t maxLocalBs{0}; + uint32_t stepCoreNum{0}; + uint64_t magicValue{0}; + LocalTensor offsetReduceLocal_; + LocalTensor countReduceLocal_; + LocalTensor ubLocal; + LocalTensor ubLocalHead; + // 低精度相关 + uint32_t repeatNum{0}; + uint32_t scaleNum; + uint32_t scaleNumAlign; + uint32_t SCALE_GRANU; + uint32_t lastRepeatNum{0}; +}; + +template +__aicore__ inline void MoeDistributeCombineV2Layered::AIVRDMAPostSend( + GM_ADDR srcDmaAddr, GM_ADDR destDmaAddr, uint64_t destRankId, uint64_t messageLen, __gm__ HcclAiRMAInfo *QpInfo) +{ + auto qpNum = ((__gm__ HcclAiRMAInfo *)QpInfo)->qpNum; + auto qp_ctx_entry = + (__gm__ HcclAiRMAWQ *)(((__gm__ HcclAiRMAInfo *)QpInfo)->sqPtr + + destRankId * qpNum * (uint64_t)(((__gm__ HcclAiRMAInfo *)QpInfo)->sizeOfAiRMAWQ)); + auto mem_info_table = ((__gm__ HcclAiRMAInfo *)QpInfo)->memPtr; + auto sizeof_memdetail = ((__gm__ HcclAiRMAInfo *)QpInfo)->sizeOfAiRMAMem; + auto cur_rank_id = (((__gm__ HcclAiRMAInfo *)QpInfo)->curRankId); + auto sqBaseAddr = qp_ctx_entry->bufAddr; + auto wqeSize = qp_ctx_entry->wqeSize; + auto curHardwareHead = qp_ctx_entry->headAddr; + cacheWriteThrough((__gm__ uint8_t *)curHardwareHead, 8); + uint64_t curHead = *(__gm__ uint32_t *)(curHardwareHead); + auto curHardwareTailAddr = qp_ctx_entry->tailAddr; + uint64_t shift = 15U; + auto QP_DEPTH = qp_ctx_entry->depth; + + PipeBarrier(); + + // Make sure we don't overflow the SQ in an infinite loop - no need to mitigate endless loop as the host + // will timeout and kill the kernel, same as all2all 
kernel if it fails to complete (e.g. in case of link loss) + while (1) { + cacheWriteThrough((__gm__ uint8_t *)curHardwareTailAddr, 8); + if ((curHead - *(__gm__ uint32_t *)(curHardwareTailAddr)) < QP_DEPTH - 1) { + break; + } + int64_t systemCycleAfter = AscendC::GetSystemCycle(); // add this line to solve slow poll CQ issue + } + + __gm__ uint8_t *wqeAddr = (__gm__ uint8_t *)(sqBaseAddr + wqeSize * (curHead % QP_DEPTH)); + + // Write the WQE to GM + uint64_t ownBit = (curHead >> shift) & 1U; + uint32_t byte_4 = 3U; // [0:4] opcode=0x3(RDMA_WRITE) + byte_4 |= ((~ownBit) << 7U) & (1U << 7U); // [7] owner_bit + byte_4 |= 1U << 8U; // [8:8] IBV_SEND_SIGNALED + + *(__gm__ uint32_t *)(wqeAddr) = byte_4; // Control set by local parameter see above lines + *(__gm__ uint32_t *)(wqeAddr + 4) = messageLen; // message size + *(__gm__ uint32_t *)(wqeAddr + 8) = 0; // immtdata is always 0 till we provide poll CQ flow in AIV + *(__gm__ uint32_t *)(wqeAddr + 12) = 1U << 24U; // [120:127] num_sge = 1 + *(__gm__ uint32_t *)(wqeAddr + 16) = 0; // [128:151] start_sge_idx = 0; + __gm__ HcclAiRMAMemInfo *memDetail = (__gm__ HcclAiRMAMemInfo *)(mem_info_table + sizeof_memdetail * destRankId); + *(__gm__ uint32_t *)(wqeAddr + 20) = + ((__gm__ MemDetails *)(memDetail->memDetailPtr + + memDetail->sizeOfMemDetails * static_cast(HcclAiRMAMemType::REMOTE_INPUT))) + ->key; + *(__gm__ uint64_t *)(wqeAddr + 24) = (uint64_t)destDmaAddr; // destination VA + + // Setup SGE and write to GM + __gm__ uint8_t *sgeAddr = wqeAddr + sizeof(struct hns_roce_rc_sq_wqe); + *(__gm__ uint32_t *)(sgeAddr) = messageLen; + memDetail = (__gm__ HcclAiRMAMemInfo *)(mem_info_table + sizeof_memdetail * destRankId); + *(__gm__ uint32_t *)(sgeAddr + sizeof(uint32_t)) = + ((__gm__ MemDetails *)(memDetail->memDetailPtr + + memDetail->sizeOfMemDetails * static_cast(HcclAiRMAMemType::LOCAL_OUTPUT))) + ->key; // L_Key + *(__gm__ uint64_t *)(sgeAddr + 2 * sizeof(uint32_t)) = + (uint64_t)srcDmaAddr; // src VA addr memory 
registered by RNIC + + // wqe & sge cache flush + cacheWriteThrough(wqeAddr, sizeof(struct hns_roce_rc_sq_wqe) + sizeof(struct hns_roce_lite_wqe_data_seg)); + PipeBarrier(); + curHead++; + + uint64_t doorBellInfo = 0; + doorBellInfo |= qp_ctx_entry->wqn; // [0:23] DB_TAG (qp_num) + doorBellInfo |= 0UL << 24UL; // [24:27] DB_CMD = HNS_ROCE_V2_SQ_DB (0) + doorBellInfo |= (curHead % 65536UL) << 32UL; // [32:47] DB_PI = sq.head + doorBellInfo |= (uint64_t)(qp_ctx_entry->sl) << 48UL; // [48:50] DB_SL = qp.sl + + __gm__ uint64_t *doorBellAddr = (__gm__ uint64_t *)(qp_ctx_entry->dbAddr); + PipeBarrier(); + + ubLocal.SetValue(0, doorBellInfo); + AscendC::GlobalTensor DBGlobalTensor; + DBGlobalTensor.SetGlobalBuffer(doorBellAddr); + AscendC::DataCopyExtParams copyParams{1, 1 * sizeof(uint64_t), 0, 0, 0}; + PipeBarrier(); + AscendC::DataCopyPad(DBGlobalTensor, ubLocal, copyParams); + PipeBarrier(); + + ubLocalHead.SetValue(0, (uint32_t)curHead); + AscendC::GlobalTensor HeadGlobalTensor; + HeadGlobalTensor.SetGlobalBuffer((__gm__ uint32_t *)curHardwareHead); + AscendC::DataCopyExtParams copyParamsHead{1, 1 * sizeof(uint32_t), 0, 0, 0}; + PipeBarrier(); + AscendC::DataCopyPad(HeadGlobalTensor, ubLocalHead, copyParamsHead); + PipeBarrier(); +} + +template +__aicore__ inline void MoeDistributeCombineV2Layered::Init( + GM_ADDR expandX, GM_ADDR expertIds, GM_ADDR expandIdx, GM_ADDR sendCount, GM_ADDR scales, GM_ADDR XOut, + GM_ADDR workspaceGM, TPipe *pipe, const MoeDistributeCombineV2TilingData *tilingData, __gm__ void *mc2InitTiling, + __gm__ void *mc2CcTiling, GM_ADDR contextGM) +{ + tpipe_ = pipe; + expandXGM_ = expandX; + expertIdsGM_ = expertIds; + expandIdxGM_ = expandIdx; + sendCountGM_ = sendCount; + scalesGM_ = scales; + XOutGM_ = XOut; + rankId_ = tilingData->moeDistributeCombineV2Info.epRankId; + axisBS_ = tilingData->moeDistributeCombineV2Info.bs; + axisH_ = tilingData->moeDistributeCombineV2Info.h; + axisK_ = tilingData->moeDistributeCombineV2Info.k; + aivNum_ = 
tilingData->moeDistributeCombineV2Info.aivNum; + moeExpertNum_ = tilingData->moeDistributeCombineV2Info.moeExpertNum; + worldSize_ = tilingData->moeDistributeCombineV2Info.epWorldSize; + + globalBs = tilingData->moeDistributeCombineV2Info.globalBs; + if (globalBs >= MAX_BS_NUM) { + maxLocalBs = MAX_BS_NUM; + } else { + maxLocalBs = globalBs; + } + + if constexpr (std::is_same::value) { // fp16 + SCALE_GRANU = 16U; + scaleNum = axisH_ / SCALE_GRANU; + scaleNumAlign = RoundUp(scaleNum, (uint32_t)(UB_ALIGN / sizeof(ExpandXType))); + repeatNum = CeilDiv(axisH_, (VEC_LEN / static_cast(sizeof(ExpandXType)))); + uint32_t vecNum = VEC_LEN / static_cast(sizeof(ExpandXType)); + if (axisH_ >= vecNum) { + mask = vecNum; + } else { + mask = axisH_; + } + + } else { // bf16 + SCALE_GRANU = 8U; + scaleNum = axisH_ / SCALE_GRANU; + scaleNumAlign = RoundUp(scaleNum, (uint32_t)(UB_ALIGN / sizeof(ExpandXType))); + repeatNum = CeilDiv(axisH_, (VEC_LEN / static_cast(sizeof(float)))); + uint32_t vecNum = VEC_LEN / static_cast(sizeof(float)); // Brcb 8个datablock(32Bytes) + if (axisH_ >= vecNum) { + mask = vecNum; + } else { + mask = axisH_; + } + } + scaleMulVal = 1 / 127.; + + winContext_ = (__gm__ HcclA2CombineOpParam *)contextGM; + hccl_.Init(contextGM, mc2InitTiling); + hccl_.SetCcTiling(mc2CcTiling); + qp_info_ = (__gm__ HcclAiRMAInfo *)(((__gm__ HcclA2CombineOpParam *)contextGM)->aiRMAInfo); + + halfWinSize_ = RDMA_DATA_SIZE / 2U; + IPC_DATA_SIZE = + winContext_->winSize - RDMA_DATA_SIZE - IPC_DATA_OFFSET; // HCCL_BUFFSIZE - 100 - 4 = 2048 - 104 = 1944M + dataSpaceSize_ = halfWinSize_ - STATE_SPACE_SIZE; // 50-1 = 49M + windowInGM_ = hccl_.GetWindowsInAddr(rankId_); + bufferIdGlobal_.SetGlobalBuffer((__gm__ uint32_t *)(windowInGM_ + dataSpaceSize_ + worldSize_ * STATE_OFFSET)); + // winIn + 49M + 16*0.5k = winIn + 49.0078125M + bufferId_ = bufferIdGlobal_(0); + windowInGM_ = windowInGM_ + halfWinSize_ * bufferId_; + windowOutGM_ = hccl_.GetWindowsOutAddr(rankId_) + halfWinSize_ * 
bufferId_; + coreIdx_ = GetBlockIdx(); + + serverNum = worldSize_ / SERVER_RANK_SIZE; + expandXGlobal_.SetGlobalBuffer((__gm__ ExpandXType *)expandX); + expertIdsGlobal_.SetGlobalBuffer((__gm__ ExpandIdxType *)expertIds); + expandIdxGlobal_.SetGlobalBuffer((__gm__ ExpandIdxType *)expandIdx); + sendCountGlobal_.SetGlobalBuffer((__gm__ int32_t *)sendCount); + bkCountGlobal_.SetGlobalBuffer((__gm__ int32_t *)(sendCount + worldSize_ * localMoeExpertNum_ * 4)); + expandScalesGlobal_.SetGlobalBuffer((__gm__ float *)scales); + expandOutGlobal_.SetGlobalBuffer((__gm__ ExpandXType *)XOut); + readStateGlobal_.SetGlobalBuffer((__gm__ int32_t *)(windowOutGM_ + dataSpaceSize_)); + localMoeExpertNum_ = moeExpertNum_ / worldSize_; + expandXRows_ = localMoeExpertNum_ * axisBS_ * worldSize_; + rankSizeOnWin_ = static_cast(dataSpaceSize_ / worldSize_ / BLOCK_SIZE * BLOCK_SIZE); + // rankSizeOnWin_ = 49M / 16 = 3.0625M + statusSpaceGm_ = windowInGM_ + dataSpaceSize_; + statusSpaceGlobal_.SetGlobalBuffer((__gm__ int32_t *)statusSpaceGm_); + dataOffsetOnWin_ = rankId_ * rankSizeOnWin_; + stateOffsetOnWin_ = static_cast(dataSpaceSize_ + rankId_ * STATE_OFFSET); + axisHFloatSize_ = axisH_ * static_cast(sizeof(float)); + axisHExpandXTypeSize_ = axisH_ * static_cast(sizeof(ExpandXType)); + + uint64_t winSizeMin = + moeExpertNum_ * axisBS_ * (axisHExpandXTypeSize_ + EXTRA_TOKEN_INFO_NUM * axisK_ * sizeof(uint32_t)) + + IPC_DATA_OFFSET + RDMA_DATA_SIZE; // 考虑负载极其不均衡时,HCCL BUFFSIZE需要开的大小 + + assert(winContext_->winSize >= winSizeMin, + "The HCCL_BUFFSIZE is %lluMB, the min value should be %lluMB. 
\ + epWorldSize:%u, epRankId:%u, moeExpertNum:%u, globalBs:%u, bs:%u, k:%u, h:%u, aivNum:%u, \ + totalUbSize:%llu\n", + winContext_->winSize / MB_SIZE, winSizeMin / MB_SIZE, tilingData->moeDistributeCombineV2Info.epWorldSize, + tilingData->moeDistributeCombineV2Info.epRankId, tilingData->moeDistributeCombineV2Info.moeExpertNum, + tilingData->moeDistributeCombineV2Info.globalBs, tilingData->moeDistributeCombineV2Info.bs, + tilingData->moeDistributeCombineV2Info.k, tilingData->moeDistributeCombineV2Info.h, + tilingData->moeDistributeCombineV2Info.aivNum, tilingData->moeDistributeCombineV2Info.totalUbSize); + + GlobalTensor selfStatusTensor; + selfStatusTensor.SetGlobalBuffer((__gm__ int32_t *)(statusSpaceGm_ + SELF_STATE_OFFSET)); + // selfStatusTensor: winIn + 49M+0.5M = winIn + 49.5M + // coreIdx_ < serverNum + int32_t state = selfStatusTensor(coreIdx_ * UB_ALIGN); + + if (state == 0) { + sumTarget_ = static_cast(1); + selfStatusTensor(coreIdx_ * UB_ALIGN) = 1; + stateValue_ = 1; + } else { + sumTarget_ = 0; + selfStatusTensor(coreIdx_ * UB_ALIGN) = 0; + stateValue_ = 0; + } + + BuffInit(); + + SplitCoreCal(); + + if (coreIdx_ == 0U) { + readStateGlobal_.SetValue(0, stateValue_); + DataCacheCleanAndInvalid( + readStateGlobal_); + } + send_counts_inner_offset = static_cast(worldSize_ * localMoeExpertNum_); + offset_inner_offset = send_counts_inner_offset + static_cast(globalBs * serverNum / 2); + send_counts_outer_offset = offset_inner_offset + static_cast(globalBs * axisK_ * serverNum); + offset_outer_offset = send_counts_outer_offset + static_cast(axisBS_); + share_offset = offset_outer_offset + static_cast(axisBS_ * serverNum); + + shareAddrGM_ = sendCount + share_offset; + offsetInnerGM_ = sendCount + offset_inner_offset; + countInnerGM_ = sendCount + send_counts_inner_offset; + offsetOuterGM_ = sendCount + offset_outer_offset; + countOuterGM_ = sendCount + send_counts_outer_offset; + + shareAddrGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t 
*>(sendCount) + share_offset); + countInnerGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ int16_t *>(sendCount) + send_counts_inner_offset * 2); + // why *2 + offsetInnerGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(sendCount) + offset_inner_offset); + countOuterGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(sendCount) + send_counts_outer_offset); + offsetOuterGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(sendCount) + offset_outer_offset); + + // epSendCount |countInnerGlobal_ | offsetInnerGlobal_ | countOuterGlobal_ | offsetOuterGlobal_ | shareAddrGlobal_ | + // ws*epOnRank | ws*epOnRank | gbs*serverNum / 2 | gbs*k*serverNum | bs | bs * serverNum | + + PipeBarrier(); + for (int i = 0; i < 8; i++) { + shareAddreRank[i] = reinterpret_cast( + RDMA_DATA_SIZE + hccl_.GetWindowsInAddr(rankId_ / SERVER_RANK_SIZE * SERVER_RANK_SIZE + i)); + } + magicGlobal_.SetGlobalBuffer((__gm__ uint64_t *)(shareAddreRank[rankId_ % SERVER_RANK_SIZE])); + magicValue = magicGlobal_.GetValue(MAGIC_OFFSET / sizeof(uint64_t)); + PipeBarrier(); +} + +template +__aicore__ inline void MoeDistributeCombineV2Layered::BuffInit() +{ + // 状态tBuf + tpipe_->InitBuffer(statusBuf_, worldSize_ * UB_ALIGN); + + // AIVRDMAPostSend函数需要的tBuf + tpipe_->InitBuffer(rdmaInBuf_, UB_ALIGN_SIZE); + ubLocal = rdmaInBuf_.Get(); + + tpipe_->InitBuffer(rdmaInBuf2_, UB_ALIGN_SIZE); + ubLocalHead = rdmaInBuf2_.Get(); + + // 总tBuf + tpipe_->InitBuffer(tBuf, TBUF_SIZE); // 185KB +} +template +__aicore__ inline void MoeDistributeCombineV2Layered::SplitCoreCal() +{ + // 对worldSize按卡分核,得到每个核上处理的卡的数量 + sendRankNum_ = worldSize_ / aivNum_; + uint32_t remainderRankNum = worldSize_ % aivNum_; + startRankId_ = sendRankNum_ * coreIdx_; + if (coreIdx_ < remainderRankNum) { + sendRankNum_++; + startRankId_ += coreIdx_; + } else { + startRankId_ += remainderRankNum; + } + endRankId_ = startRankId_ + sendRankNum_; +} + +template +__aicore__ inline void MoeDistributeCombineV2Layered::GM2IPC() +{ + 
ipcSliceSize = IPC_DATA_SIZE / worldSize_ / BLOCK_SIZE * BLOCK_SIZE; + // HCCL_BUFFSIZE - 100 - 4 = 2048 - 104 = 1944M + // ipcSliceSize = 1944M / 16 =121.5M + + // rdma 8*512*7k*2*8= rank * bs*h*typeof*topk = + // 16*512*8*7k*2 + ipcSliceNodeSize = ipcSliceSize * SERVER_RANK_SIZE; + // 121.5M*8 = 1G + + // 初始化baseBuffOffset + uint32_t baseBuffOffset = TBUF_TEMP_OFFSET; // 0 + // 申请LocalTensor : sendCount 以及计算偏移 256 * 4 = 1KB + LocalTensor sendCountLocal = + tBuf.GetWithOffset(RoundUp(moeExpertNum_, B32_PER_BLOCK), baseBuffOffset); + baseBuffOffset += sizeof(int32_t) * RoundUp(moeExpertNum_, B32_PER_BLOCK); + + // 申请LocalTensor : expandScales 以及计算偏移 512 * 4 = 2KB (+ 1KB = 3KB) + LocalTensor expandScalesLocal = + tBuf.GetWithOffset((maxLocalBs + UB_ALIGN - 1) / UB_ALIGN * UB_ALIGN, baseBuffOffset); + baseBuffOffset += sizeof(float) * ((maxLocalBs + UB_ALIGN - 1) / UB_ALIGN * UB_ALIGN); + + // 申请LocalTensor : InUb。 token:【data】(H * fp16/bf16) + expandScales(32B) 14KB + 32B + (3KB = 17KB + 32B) + LocalTensor inUb = tBuf.GetWithOffset(axisH_ + WEIGHT_VALUE_NUM, baseBuffOffset); + LocalTensor inUbTemp = inUb[axisH_].template ReinterpretCast(); + + DataCopy(sendCountLocal, sendCountGlobal_, RoundUp(moeExpertNum_, B32_PER_BLOCK)); // mte2 + PipeBarrier(); + SyncFunc(); + uint64_t localShareAddr = shareAddreRank[rankId_ % SERVER_RANK_SIZE]; + for (uint32_t dstRankId = startRankId_; dstRankId < endRankId_; ++dstRankId) { + uint64_t targetRankAddr = localShareAddr + static_cast(dstRankId * ipcSliceSize + IPC_DATA_OFFSET); + + dstshareMemGlobal_.SetGlobalBuffer((__gm__ ExpandXType *)(targetRankAddr)); + + uint32_t rankTokenNum = 0U; + + for (uint32_t expertId = 0U; expertId < localMoeExpertNum_; ++expertId) { + uint32_t preCount = 0U; + + if (expertId != 0U || dstRankId != 0U) { + preCount = static_cast(sendCountLocal.GetValue(expertId * worldSize_ + dstRankId - 1)); + } + uint32_t tokenNum = sendCountLocal.GetValue(expertId * worldSize_ + dstRankId) - preCount; + // 
tokenNum: + uint32_t startTokenAddr = preCount * axisH_; + PipeBarrier(); + // DataCopy(expandScalesLocal, expandScalesGlobal_[preCount], tokenNum); + DataCopy(expandScalesLocal, expandScalesGlobal_[preCount], (tokenNum + UB_ALIGN - 1) / UB_ALIGN * UB_ALIGN); + SyncFunc(); + for (uint32_t tokenId = 0U; tokenId < tokenNum; ++tokenId) { + float scaleVal = expandScalesLocal.GetValue(tokenId); + inUbTemp(0) = scaleVal; + SyncFunc(); + SyncFunc(); + DataCopy(inUb, expandXGlobal_[startTokenAddr], axisH_); + SyncFunc(); + DataCopy(dstshareMemGlobal_[rankTokenNum * (axisH_ + WEIGHT_VALUE_NUM)], inUb, + axisH_ + WEIGHT_VALUE_NUM); + startTokenAddr += axisH_; + rankTokenNum++; + PipeBarrier(); + } + } + } + SyncAll(); + if (coreIdx_ < SERVER_RANK_SIZE) { + uint64_t targetAddr = shareAddreRank[coreIdx_ % SERVER_RANK_SIZE]; + shareFlagGlobal_.SetGlobalBuffer((__gm__ uint64_t *)targetAddr); + LocalTensor inUb = statusBuf_.Get(); + inUb(0) = GM2IPC_SYNC_FLAG + magicValue; + uint32_t flagOffset = rankId_ % SERVER_RANK_SIZE; + PipeBarrier(); + DataCopy(shareFlagGlobal_[flagOffset * FLAG_SINGLE_CNT], inUb, + FLAG_SINGLE_CNT); // *4是因为单次拷贝256byte = 4*int64 + + PipeBarrier(); + } + SyncAll(); +} + +template +__aicore__ inline void MoeDistributeCombineV2Layered::WaitIPC() +{ + ///*** + uint32_t stepCoreNum_ = SERVER_RANK_SIZE; + shareFlagGlobal_.SetGlobalBuffer((__gm__ uint64_t *)shareAddreRank[rankId_ % SERVER_RANK_SIZE]); + // 只要8个core分别wait 来自8卡的flag,然后sync一下 再进行流水 + + if (coreIdx_ < stepCoreNum_) { + LocalTensor inUb = statusBuf_.Get(); + uint32_t waitFlagAddr = coreIdx_ % stepCoreNum_; + while (true) { + DataCopy(inUb, shareFlagGlobal_[waitFlagAddr * FLAG_SINGLE_CNT], FLAG_SINGLE_CNT); + PipeBarrier(); + if (inUb(0) >= (GM2IPC_SYNC_FLAG + magicValue)) { + break; + } + } + inUb(0) = 0; + PipeBarrier(); + DataCopy(shareFlagGlobal_[waitFlagAddr * FLAG_SINGLE_CNT], inUb, + FLAG_SINGLE_CNT); // *4是因为单次拷贝256byte = 4*int64 + PipeBarrier(); + } + SyncAll(); +} + +template +__aicore__ 
inline void MoeDistributeCombineV2Layered::SumToWindow() +{ + // 32core流水并行 + uint32_t coreNumPerServer = stepCoreNum / serverNum; + uint32_t serverId_ = coreIdx_ / coreNumPerServer; + uint32_t targetRankId_ = rankId_ % SERVER_RANK_SIZE + serverId_ * SERVER_RANK_SIZE; + + // 初始baseBuffOffset + uint32_t baseBuffOffset = TBUF_TEMP_OFFSET; + // 初始化 countReduce 所需tBuf的大小 512 * 2 = 1KB + LocalTensor countReduceLocal = + tBuf.GetWithOffset(RoundUp(maxLocalBs, B16_PER_BLOCK), baseBuffOffset); + baseBuffOffset += sizeof(int16_t) * RoundUp(maxLocalBs, B16_PER_BLOCK); // 需要32字节对齐 + + LocalTensor offsetReduceLocal = + tBuf.GetWithOffset(RoundUp(maxLocalBs * axisK_, B32_PER_BLOCK), baseBuffOffset); + + // 量化和不量化都要用 512 * 8 * 4 = 16KB (+ 1KB = 17KB) + baseBuffOffset += sizeof(int32_t) * RoundUp(maxLocalBs * axisK_, B32_PER_BLOCK); + LocalTensor dataInLocal = tBuf.GetWithOffset( + axisH_ + WEIGHT_VALUE_NUM, baseBuffOffset); // 6. dataIn: (7k + 16) * 2 = 14KB + 32 + baseBuffOffset += sizeof(ExpandXType) * (axisH_ + WEIGHT_VALUE_NUM); + + // 初始化 fp16所需tBuf偏移的base Offset + uint32_t fp16baseBuffOffset = baseBuffOffset; + + // 量化和不量化都要用 同时也为bf16的Brcb函数扩充复用,扩充到H个,至少要256B对齐 28KB (+ 17 KB = 45KB) + LocalTensor castInFloatLocal = + tBuf.GetWithOffset(RoundUp(axisHFloatSize_, VEC_LEN) / sizeof(float), baseBuffOffset); + baseBuffOffset += RoundUp(axisHFloatSize_, VEC_LEN); + + // 量化和不量化都要用 + LocalTensor sumFloatLocal = tBuf.GetWithOffset(axisH_, baseBuffOffset); + + // token格式: data(H*sizeof(ExpandXType)) + weight值(32B) + LocalTensor inUbTemp = dataInLocal[axisH_].template ReinterpretCast(); + + // 量化 dataInLocal复用 存放 int8的data fp/bf16的scale + LocalTensor castDataInt8 = dataInLocal.template ReinterpretCast(); + LocalTensor scaleData = dataInLocal[axisH_ / 2].template ReinterpretCast(); + + // 量化fp16 14KB (+ 45KB = 59KB) + LocalTensor sumHalfLocal = + tBuf.GetWithOffset(axisH_, fp16baseBuffOffset); // 复用castInFloatLocal + fp16baseBuffOffset += axisH_ * sizeof(ExpandXType); + + // 16个数取最大值 + 
LocalTensor reduceMaxOutTensor = tBuf.GetWithOffset(scaleNum, fp16baseBuffOffset); + + // 将scale利用Brcb函数扩充到H个,至少要256B对齐 复用reduceMaxOutTensor + LocalTensor absScaleTensor = tBuf.GetWithOffset( + RoundUp(axisHExpandXTypeSize_, VEC_LEN) / sizeof(ExpandXType), fp16baseBuffOffset); + + // 量化 bf16 复用sumFloatLocal + LocalTensor halfLocal = tBuf.GetWithOffset(axisH_, baseBuffOffset); + + baseBuffOffset += sizeof(float) * (axisH_); // 复用sumFloatLocal,但是offset要加上sumFloatLocal大小 + LocalTensor reduceMaxTensorFloat = tBuf.GetWithOffset(scaleNum, baseBuffOffset); + + DataCopy(countReduceLocal, countInnerGlobal_[globalBs * serverId_], RoundUp(maxLocalBs, B16_PER_BLOCK)); + DataCopy(offsetReduceLocal, offsetInnerGlobal_[globalBs * axisK_ * serverId_], + RoundUp(maxLocalBs * axisK_, B32_PER_BLOCK)); + PipeBarrier(); + SyncFunc(); + uint64_t rdmaAddr = (uint64_t)(hccl_.GetWindowsOutAddr(rankId_) + halfWinSize_ * bufferId_ + + serverId_ * rankSizeOnWin_ * SERVER_RANK_SIZE); + scaleOutWindow_.SetGlobalBuffer((__gm__ ExpandXType *)rdmaAddr); // 16bit + localOutWindow_.SetGlobalBuffer((__gm__ ExpandXType *)rdmaAddr); + LocalTensor rdmaFlagLocal = statusBuf_.Get(); + rdmaFlagLocal(0) = RDMA_TOKEN_ARRIVED_FLAG + magicValue; + PipeBarrier(); + int offsetPre = 0; + offsetIndex = 0U; + + // 计算offsetIndex,copyNum,dataOffset,scaleOffset + uint32_t listLen = 64; // maxLocalBs / coreNumPerServer; + uint32_t offsetIndexs[65]; + uint32_t copyNums[65]; + uint32_t dataOffsets[65]; + uint32_t scaleOffsets[65]; + uint32_t totalCopyLen = 0; + uint32_t processNum_ = 0; + // 每个核使用的链路要岔开,不能有冲突 + for (uint32_t i = 0U; i < maxLocalBs; i++) { + if ((i % coreNumPerServer) == (coreIdx_ % coreNumPerServer)) { + int offsetCur = static_cast(countReduceLocal.GetValue(i)); + uint32_t dataOffset = i * (axisH_ / 2U + scaleNumAlign); // uint8的数据 + if (i != 0U) { + offsetPre = static_cast(countReduceLocal.GetValue(i - 1)); + } + int copyNum = offsetCur - offsetPre; + if (copyNum <= 0) { + break; + } + offsetIndex = 
static_cast(offsetPre); + + offsetIndexs[processNum_] = offsetIndex; + copyNums[processNum_] = static_cast(copyNum); + dataOffsets[processNum_] = dataOffset; + totalCopyLen += static_cast(copyNum); + processNum_++; + } + // processNum -> 512 / 16 = 2**(9-4) = 32 + } + + uint32_t processTokenNum = 0; + uint32_t offsetIndexStart = offsetIndexs[processTokenNum]; + offsetIndex = offsetIndexs[processTokenNum]; + uint32_t copyNum = copyNums[processTokenNum]; + uint32_t dataOffset = dataOffsets[processTokenNum]; + + uint32_t tokenOffset = 0; + for (uint32_t i = 0U; i < totalCopyLen; i++) { + uint32_t targetIpcRank = offsetReduceLocal.GetValue(offsetIndex) / (globalBs * axisK_); + uint32_t targetIpcOffset = + offsetReduceLocal.GetValue(offsetIndex) % (globalBs * axisK_) * (axisH_ + WEIGHT_VALUE_NUM); + + uint64_t copyAddr = shareAddreRank[targetIpcRank % SERVER_RANK_SIZE] + + static_cast(targetRankId_ * ipcSliceSize) + + static_cast(IPC_DATA_OFFSET); + shareMemGlobal_.SetGlobalBuffer((__gm__ ExpandXType *)copyAddr); + SyncFunc(); + DataCopy(dataInLocal, shareMemGlobal_[targetIpcOffset], axisH_ + WEIGHT_VALUE_NUM); // mte2 + SyncFunc(); + float scaleVal = inUbTemp(0); + SyncFunc(); + Cast(castInFloatLocal, dataInLocal, AscendC::RoundMode::CAST_NONE, axisH_); + PipeBarrier(); + if ((offsetIndex - offsetIndexStart) == 0U) { + Muls(sumFloatLocal, castInFloatLocal, scaleVal, axisH_); + } else { + Axpy(sumFloatLocal, castInFloatLocal, scaleVal, axisH_); + } + + offsetIndex += 1U; + + PipeBarrier(); + if ((offsetIndex - offsetIndexStart) == copyNum) { + tokenOffset = coreNumPerServer * processTokenNum + coreIdx_ % coreNumPerServer; + if constexpr (DynamicQuant && std::is_same::value) { + if constexpr (std::is_same::value) { + Cast(sumHalfLocal, sumFloatLocal, AscendC::RoundMode::CAST_RINT, axisH_); + PipeBarrier(); + Abs(absScaleTensor, sumHalfLocal, axisH_); + PipeBarrier(); + BlockReduceMax(reduceMaxOutTensor, absScaleTensor, repeatNum, mask, 1, 1, 8); // g16 + PipeBarrier(); + 
SyncFunc(); + Muls(scaleData, reduceMaxOutTensor, scaleMulVal, scaleNum); // 1/scale = dmax / 127 + PipeBarrier(); + Brcb(absScaleTensor, scaleData, repeatNum, {1, 8}); // 填充scale值 + PipeBarrier(); + + Div(sumHalfLocal, sumHalfLocal, absScaleTensor, axisH_); // data_fp16/(1/scale) + PipeBarrier(); + Cast(castDataInt8, sumHalfLocal, RoundMode::CAST_RINT, axisH_); // fp16->int8 四舍六入五成双 + PipeBarrier(); + + SyncFunc(); + DataCopy(localOutWindow_[dataOffset], dataInLocal, axisH_ / 2 + scaleNumAlign); // int8数据+scale值 + PipeBarrier(); + DataCopy(shareFlagGlobal_[(serverId_ + 1) * FLAG_TOTAL_SIZE + tokenOffset * FLAG_SINGLE_CNT], + rdmaFlagLocal, FLAG_SINGLE_CNT); + } else { + PipeBarrier(); + Abs(castInFloatLocal, sumFloatLocal, axisH_); // 求fp32张量的绝对值 + PipeBarrier(); + BlockReduceMax(reduceMaxTensorFloat, castInFloatLocal, repeatNum, mask, 1, 1, 8); // fp32的g16 + PipeBarrier(); + Muls(reduceMaxTensorFloat, reduceMaxTensorFloat, scaleMulVal, scaleNum); // scale = dmax * 1/127 + PipeBarrier(); + Brcb(castInFloatLocal, reduceMaxTensorFloat, repeatNum, {1, 8}); // 填充fp32的scale值 + PipeBarrier(); + Div(sumFloatLocal, sumFloatLocal, castInFloatLocal, axisH_); // data_fp32/(1/scale) + PipeBarrier(); + SyncFunc(); + Cast(scaleData, reduceMaxTensorFloat, RoundMode::CAST_RINT, scaleNum); // 1/scale从fp32量化成bf16 + PipeBarrier(); + Cast(halfLocal, sumFloatLocal, RoundMode::CAST_RINT, axisH_); // token数据fp32->bf16 四舍六入五成双 + PipeBarrier(); + Cast(castDataInt8, halfLocal, RoundMode::CAST_RINT, axisH_); // token数据bf16->int8 四舍六入五成双 + PipeBarrier(); + SyncFunc(); + DataCopy(localOutWindow_[dataOffset], dataInLocal, axisH_ / 2 + scaleNumAlign); // int8数据+scale值 + PipeBarrier(); + DataCopy(shareFlagGlobal_[(serverId_ + 1) * FLAG_TOTAL_SIZE + tokenOffset * FLAG_SINGLE_CNT], + rdmaFlagLocal, FLAG_SINGLE_CNT); + } + } else { + PipeBarrier(); + Cast(dataInLocal, sumFloatLocal, AscendC::RoundMode::CAST_RINT, axisH_); + SyncFunc(); + DataCopy(localOutWindow_[tokenOffset * axisH_], dataInLocal, 
axisH_); // int8数据+scale值 + PipeBarrier(); + DataCopy(shareFlagGlobal_[(serverId_ + 1) * FLAG_TOTAL_SIZE + tokenOffset * FLAG_SINGLE_CNT], + rdmaFlagLocal, FLAG_SINGLE_CNT); + } + processTokenNum++; + offsetIndex = offsetIndexs[processTokenNum]; + copyNum = copyNums[processTokenNum]; + dataOffset = dataOffsets[processTokenNum]; + offsetIndexStart = offsetIndex; + } + } + PipeBarrier(); + rdmaFlagLocal(0) = RDMA_TOKEN_END_FLAG + magicValue; + tokenOffset = coreNumPerServer * processTokenNum + coreIdx_ % coreNumPerServer; + DataCopy(shareFlagGlobal_[(serverId_ + 1) * FLAG_TOTAL_SIZE + tokenOffset * FLAG_SINGLE_CNT], rdmaFlagLocal, + FLAG_SINGLE_CNT); + SyncAll(); +} + +template +__aicore__ inline void MoeDistributeCombineV2Layered::AlltoAllServerDispatch() +{ + LocalTensor checkRdmaLocal = statusBuf_.Get(); + LocalTensor tmpLowUb_ = tBuf.Get(); + uint32_t checkServer = coreIdx_ - stepCoreNum; + GlobalTensor aivSrcGlobal; + GlobalTensor aivDstGlobal; + uint32_t tragRankId = rankId_ % SERVER_RANK_SIZE + SERVER_RANK_SIZE * checkServer; + uint32_t copySum = 0; + uint32_t copyOnceNum = 1; + uint32_t copyLen_; + uint32_t copyLenAlign_; + uint32_t selfServerID = rankId_ / SERVER_RANK_SIZE; + bool stopFlag = false; + uint32_t cpNum = 0; + + if constexpr (DynamicQuant && std::is_same::value) { + copyLen_ = axisH_ * static_cast(sizeof(ExpandXTransType)) + + scaleNum * static_cast(sizeof(ExpandXType)); + copyLenAlign_ = axisH_ * static_cast(sizeof(ExpandXTransType)) + + scaleNumAlign * static_cast(sizeof(ExpandXType)); + } else { + copyLen_ = axisH_ * static_cast(sizeof(ExpandXType)); + copyLenAlign_ = copyLen_; + } + uint64_t srcrdmaAddr = (uint64_t)(hccl_.GetWindowsOutAddr(rankId_) + halfWinSize_ * bufferId_ + + checkServer * rankSizeOnWin_ * SERVER_RANK_SIZE); + uint64_t dstrdmaAddr = (uint64_t)(hccl_.GetWindowsInAddr(tragRankId) + halfWinSize_ * bufferId_ + + (rankId_ / SERVER_RANK_SIZE) * rankSizeOnWin_ * SERVER_RANK_SIZE); + while (!stopFlag) { + for (uint32_t i = 0U; i < 
copyOnceNum; i++) { + while (true) { + DataCopy(checkRdmaLocal[64], + shareFlagGlobal_[(checkServer + 1) * FLAG_TOTAL_SIZE + copySum * FLAG_SINGLE_CNT], + FLAG_SINGLE_CNT); + PipeBarrier(); + if (checkRdmaLocal.GetValue(64) == (RDMA_TOKEN_ARRIVED_FLAG + magicValue)) { + copySum++; + break; + } else if (checkRdmaLocal.GetValue(64) == (RDMA_TOKEN_END_FLAG + magicValue) || copySum == maxLocalBs) { + stopFlag = true; + break; + } + } + PipeBarrier(); + if (stopFlag) { + break; + } + } + if (copySum > 0U) { + if (rankId_ != tragRankId) { + aivSrcGlobal.SetGlobalBuffer((__gm__ ExpandXTransType *)(srcrdmaAddr)); + aivDstGlobal.SetGlobalBuffer((__gm__ ExpandXTransType *)(dstrdmaAddr)); + AIVRDMAPostSend((GM_ADDR)(srcrdmaAddr + copyLenAlign_ * (copySum - copyOnceNum)), + (GM_ADDR)(dstrdmaAddr + copyLenAlign_ * (copySum - copyOnceNum)), tragRankId, + copyLen_ * copyOnceNum, qp_info_); + } else { + aivSrcGlobal.SetGlobalBuffer((__gm__ ExpandXTransType *)(srcrdmaAddr)); + aivDstGlobal.SetGlobalBuffer((__gm__ ExpandXTransType *)(dstrdmaAddr)); + if constexpr (DynamicQuant && std::is_same::value) { + cpNum = axisH_ + scaleNumAlign * static_cast(sizeof(ExpandXType)) / + static_cast(sizeof(ExpandXTransType)); + } else { + cpNum = axisH_ * static_cast(sizeof(ExpandXType)) / + static_cast(sizeof(ExpandXTransType)); + } + for (uint32_t k = 0U; k < copyOnceNum; k++) { + DataCopy(tmpLowUb_, + aivSrcGlobal[copyLenAlign_ * (copySum - copyOnceNum + k) / sizeof(ExpandXTransType)], + cpNum); + PipeBarrier(); + DataCopy(aivDstGlobal[copyLenAlign_ * (copySum - copyOnceNum + k) / sizeof(ExpandXTransType)], + tmpLowUb_, cpNum); + } + } + } + } + if (rankId_ != tragRankId) { + AIVRDMAPostSend((GM_ADDR)((uint64_t)(readStateGlobal_.GetPhyAddr())), + (GM_ADDR)((uint64_t)(hccl_.GetWindowsInAddr(tragRankId) + halfWinSize_ * bufferId_ + + dataSpaceSize_ + selfServerID * STATE_OFFSET)), + tragRankId, 32, qp_info_); + } + SyncAll(); +} + +template +__aicore__ inline void 
MoeDistributeCombineV2Layered::WaitDispatch() +{ + if ((coreIdx_ < serverNum) && (coreIdx_ != (rankId_ / SERVER_RANK_SIZE))) { + uint32_t targetRank = rankId_ % SERVER_RANK_SIZE + (coreIdx_)*SERVER_RANK_SIZE; + LocalTensor statusTensor = statusBuf_.Get(); + uint32_t readNum = 1U; + DataCopyParams intriParams{static_cast(readNum), 1, 15, 0}; // srcStride为15个block + while (true) { + DataCopy(statusTensor, statusSpaceGlobal_[(coreIdx_)*STATE_OFFSET / sizeof(int32_t)], intriParams); + PipeBarrier(); + int32_t sumOfFlag = statusTensor.GetValue(0); + if (sumOfFlag == sumTarget_) { + break; + } + } + } + PipeBarrier(); + SyncAll(); +} + +template +__aicore__ inline void MoeDistributeCombineV2Layered::Preload() +{ + uint32_t reduceCore = 8U; + if (coreIdx_ >= reduceCore) { + return; + } + processNum = axisBS_ / reduceCore; + resNum = axisBS_ - processNum * reduceCore; + resLen = (resNum == 0U) ? 0U : 1U; + startBs = 0U; + endBs = 0U; + if (coreIdx_ < resNum) { + processNum += 1U; + startBs = coreIdx_ * processNum; + endBs = startBs + processNum; + } else { + startBs = coreIdx_ * processNum + resNum; + endBs = startBs + processNum; + } + uint64_t selfRankAddr = (uint64_t)(hccl_.GetWindowsInAddr(rankId_) + halfWinSize_ * bufferId_); + localInWindow_.SetGlobalBuffer((__gm__ ExpandXTransType *)(selfRankAddr)); + + // 低精度需要用到的变量 + if constexpr (DynamicQuant && std::is_same::value) { + localInScaleWindow_.SetGlobalBuffer((__gm__ ExpandXType *)(selfRankAddr)); + } + + // 初始化offset + uint32_t baseBuffOffset = TBUF_TEMP_OFFSET; + offsetReduceLocal_ = tBuf.GetWithOffset( + RoundUp(axisBS_ * serverNum, (uint32_t)(UB_ALIGN / sizeof(int32_t))), baseBuffOffset); + baseBuffOffset += sizeof(uint32_t) * RoundUp(axisBS_ * serverNum, (uint32_t)(UB_ALIGN / sizeof(int32_t))); + + countReduceLocal_ = + tBuf.GetWithOffset(RoundUp(axisBS_, (uint32_t)(UB_ALIGN / sizeof(int32_t))), baseBuffOffset); + + DataCopy(offsetReduceLocal_, offsetOuterGlobal_, + RoundUp(axisBS_ * serverNum, 
(uint32_t)(UB_ALIGN / sizeof(int32_t)))); + DataCopy(countReduceLocal_, countOuterGlobal_, RoundUp(axisBS_, (uint32_t)(UB_ALIGN / sizeof(int32_t)))); // 256 * + // 4 + SyncFunc(); + offsetIndex = 0U; + if (startBs != 0U) { + offsetIndex = countReduceLocal_.GetValue(startBs - 1U); + } +} +template +__aicore__ inline void MoeDistributeCombineV2Layered::SumToServer() +{ + uint32_t reduceCore = 8U; + if (coreIdx_ >= reduceCore) { + SyncAll(); + return; + } + // 初始化 fp16 bf16的offset + uint32_t baseBuffOffset = sizeof(uint32_t) * RoundUp(axisBS_ * serverNum, (uint32_t)(UB_ALIGN / sizeof(int32_t))) + + sizeof(int32_t) * RoundUp(axisBS_, (uint32_t)(UB_ALIGN / sizeof(int32_t))); + uint32_t fpBaseBuffOffset = baseBuffOffset; + uint32_t bfBaseBuffOffset = baseBuffOffset; + + // 不量化 + LocalTensor sumFloatLocal = tBuf.GetWithOffset(axisH_, baseBuffOffset); + LocalTensor sumFpAndBfLocal = tBuf.GetWithOffset(axisH_, baseBuffOffset); + baseBuffOffset += axisH_ * sizeof(float); + + LocalTensor dataIn = tBuf.GetWithOffset(axisH_, baseBuffOffset); + baseBuffOffset += axisH_ * sizeof(ExpandXType); + LocalTensor castFp32 = tBuf.GetWithOffset(axisH_, baseBuffOffset); + + // 量化 fp16 + LocalTensor sumFp16Local = tBuf.GetWithOffset(axisH_, fpBaseBuffOffset); + fpBaseBuffOffset += axisH_ * sizeof(ExpandXType); + + LocalTensor dataInt8 = tBuf.GetWithOffset(axisH_, fpBaseBuffOffset); + fpBaseBuffOffset += axisH_ * sizeof(ExpandXTransType); + + LocalTensor scaleData = tBuf.GetWithOffset(scaleNumAlign, fpBaseBuffOffset); + fpBaseBuffOffset += scaleNumAlign * sizeof(ExpandXType); + + LocalTensor castFp16 = tBuf.GetWithOffset(axisH_, fpBaseBuffOffset); + fpBaseBuffOffset += axisH_ * sizeof(ExpandXType); + + LocalTensor scaleDup = tBuf.GetWithOffset(axisH_, fpBaseBuffOffset); + + // 量化 bf16 + LocalTensor sumFloatLocal1 = tBuf.GetWithOffset(axisH_, bfBaseBuffOffset); + LocalTensor sumBf16Local = tBuf.GetWithOffset(axisH_, bfBaseBuffOffset); + bfBaseBuffOffset += axisH_ * sizeof(float); + + 
LocalTensor dataInUbInt8 = tBuf.GetWithOffset(axisH_, bfBaseBuffOffset); + bfBaseBuffOffset += axisH_ * sizeof(ExpandXTransType); + + LocalTensor scaleDataBf16 = tBuf.GetWithOffset(scaleNumAlign, bfBaseBuffOffset); + bfBaseBuffOffset += scaleNumAlign * sizeof(ExpandXType); + + LocalTensor castDataHalf = tBuf.GetWithOffset(axisH_, bfBaseBuffOffset); // Bf16 用half代替 + bfBaseBuffOffset += axisH_ * sizeof(half); + + LocalTensor castDataFp32 = tBuf.GetWithOffset(axisH_, bfBaseBuffOffset); + bfBaseBuffOffset += axisH_ * sizeof(float); + + LocalTensor castFp32scale = tBuf.GetWithOffset(scaleNum, bfBaseBuffOffset); + bfBaseBuffOffset += scaleNumAlign * sizeof(float); + + LocalTensor castFp32ScaleBrcb = tBuf.GetWithOffset(axisH_, bfBaseBuffOffset); + + for (uint32_t i = startBs; i < endBs; i++) { + int offsetPre = 0; + int offsetCur = countReduceLocal_.GetValue(i); + if (i != 0U) { + offsetPre = countReduceLocal_.GetValue(i - 1); + } + PipeBarrier(); // 高精度为了同步加入的 PIPE_ALL + int copyNum = offsetCur - offsetPre; + if (!copyNum) { + break; + } + if constexpr (DynamicQuant && std::is_same::value) { + if constexpr (std::is_same::value) { // fp16 + SyncFunc(); + Duplicate(sumFp16Local, static_cast(0.0), axisH_); + for (int j = 0; j < copyNum; j++) { + int offsetOnIpc = + (offsetReduceLocal_.GetValue(offsetIndex) / axisBS_ * rankSizeOnWin_ * SERVER_RANK_SIZE + + offsetReduceLocal_.GetValue(offsetIndex) % axisBS_ * + (axisH_ * sizeof(ExpandXTransType) + scaleNumAlign * sizeof(ExpandXType))) / + sizeof(ExpandXTransType); + SyncFunc(); // 下一个token用的buffer和上一个token用的buffer之间进行同步 + DataCopy(dataInt8, localInWindow_[offsetOnIpc], axisH_); + DataCopy( + scaleData, + localInScaleWindow_[((offsetOnIpc + axisH_) * sizeof(ExpandXTransType)) / sizeof(ExpandXType)], + scaleNumAlign); + + SyncFunc(); + Cast(castFp16, dataInt8, AscendC::RoundMode::CAST_NONE, axisH_); + PipeBarrier(); + Brcb(scaleDup, scaleData, repeatNum, {1, 8}); // 填充scale值 + PipeBarrier(); + MulAddDst(sumFp16Local, castFp16, 
scaleDup, axisH_); // fp16乘加scale值 + PipeBarrier(); + + offsetIndex++; + } + PipeBarrier(); + SyncFunc(); + DataCopy(expandOutGlobal_[i * axisH_], sumFp16Local, axisH_); + PipeBarrier(); + } else { // bf16 + SyncFunc(); + Duplicate(sumFloatLocal1, 0.0f, axisH_); + + for (int j = 0; j < copyNum; j++) { + int offsetOnIpc = + (offsetReduceLocal_.GetValue(offsetIndex) / axisBS_ * rankSizeOnWin_ * SERVER_RANK_SIZE + + offsetReduceLocal_.GetValue(offsetIndex) % axisBS_ * + (axisH_ * sizeof(ExpandXTransType) + scaleNumAlign * sizeof(ExpandXType))) / + sizeof(ExpandXTransType); + SyncFunc(); // 下一个token用的buffer和上一个token用的buffer之间进行同步 + DataCopy(dataInUbInt8, localInWindow_[offsetOnIpc], axisH_); + DataCopy( + scaleDataBf16, + localInScaleWindow_[((offsetOnIpc + axisH_) * sizeof(ExpandXTransType)) / sizeof(ExpandXType)], + scaleNumAlign); + + SyncFunc(); + // cast before muls + Cast(castDataHalf, dataInUbInt8, AscendC::RoundMode::CAST_NONE, axisH_); // data:int8->fp16 + PipeBarrier(); + Cast(castDataFp32, castDataHalf, AscendC::RoundMode::CAST_NONE, axisH_); // data:fp16->fp32 + PipeBarrier(); + Cast(castFp32scale, scaleDataBf16, AscendC::RoundMode::CAST_NONE, scaleNum); // scale:bf16->fp32 + PipeBarrier(); + Brcb(castFp32ScaleBrcb, castFp32scale, repeatNum, {1, 8}); // 填充fp32的scale值 + PipeBarrier(); + MulAddDst(sumFloatLocal1, castDataFp32, castFp32ScaleBrcb, axisH_); // fp16乘加scale值 + PipeBarrier(); + offsetIndex++; + } + PipeBarrier(); + Cast(sumBf16Local, sumFloatLocal1, AscendC::RoundMode::CAST_RINT, axisH_); + SyncFunc(); + DataCopy(expandOutGlobal_[i * axisH_], sumBf16Local, axisH_); + PipeBarrier(); + } + } else { + Duplicate(sumFloatLocal, 0.0f, axisH_); + for (int j = 0; j < copyNum; j++) { + int offsetOnIpc = + (offsetReduceLocal_.GetValue(offsetIndex) / axisBS_ * rankSizeOnWin_ * SERVER_RANK_SIZE + + offsetReduceLocal_.GetValue(offsetIndex) % axisBS_ * axisH_ * sizeof(ExpandXType)) / + sizeof(ExpandXType); + SyncFunc(); // 下一个token用的buffer和上一个token用的buffer之间进行同步 
+ DataCopy(dataIn, localInWindow_[offsetOnIpc], axisH_); + SyncFunc(); + // cast before muls + Cast(castFp32, dataIn, AscendC::RoundMode::CAST_NONE, axisH_); + PipeBarrier(); + // add mulBufLocal to sumFloatBufLocal + AscendC::Add(sumFloatLocal, sumFloatLocal, castFp32, axisH_); + offsetIndex++; + } + PipeBarrier(); + SyncFunc(); + Cast(sumFpAndBfLocal, sumFloatLocal, AscendC::RoundMode::CAST_RINT, axisH_); + SyncFunc(); + DataCopy(expandOutGlobal_[i * axisH_], sumFpAndBfLocal, axisH_); + PipeBarrier(); + } + } + + SyncAll(); +} + +template +__aicore__ inline void MoeDistributeCombineV2Layered::Process() +{ + if ASCEND_IS_AIV { + // 所有核执行 + GM2IPC(); + WaitIPC(); + stepCoreNum = IPC_REDUCE_USED_CORE_NUM; + if (coreIdx_ < stepCoreNum) { + SumToWindow(); + } else if (coreIdx_ < (stepCoreNum + serverNum)) { + AlltoAllServerDispatch(); + } else { + SyncAll(); + } + if (coreIdx_ == 0U) { + magicGlobal_.SetValue(MAGIC_OFFSET / sizeof(uint64_t), magicValue + 1); + PipeBarrier(); + AscendC::DataCacheCleanAndInvalid( + magicGlobal_[MAGIC_OFFSET / sizeof(uint64_t)]); + bufferIdGlobal_(0) = bufferId_ ^ 1; + PipeBarrier(); + AscendC::DataCacheCleanAndInvalid(bufferIdGlobal_[0]); + } + Preload(); + WaitDispatch(); + SumToServer(); + hccl_.Finalize(); + } +} +} // namespace MoeDistributeCombineA2Impl +#endif // MOE_DISTRIBUTE_COMBINE_V2_LAYERED_H diff --git a/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2_tiling.h b/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2_tiling.h new file mode 100644 index 000000000..b4b00fb84 --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/moe_distribute_combine_v2_tiling.h @@ -0,0 +1,55 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! + * \file moe_distribute_combine_tiling.h + * \brief + */ +#ifndef MOE_DISTRIBUTE_CMOBINE_A2_TILING_H +#define MOE_DISTRIBUTE_CMOBINE_A2_TILING_H + +#include +#include "kernel_tiling/kernel_tiling.h" + +struct MoeDistributeCombineV2Info { + uint32_t epWorldSize; // epWorldSize + uint32_t tpWorldSize; // tpWorldSize + uint32_t epRankId; // epRankId + uint32_t tpRankId; // tpRankId + uint32_t expertSharedType; // expert type + uint32_t sharedExpertRankNum; // shared expert number + uint32_t moeExpertNum; // moe expert number + uint32_t zeroExpertNum; // zero expert number + uint32_t copyExpertNum; // copy expert number + uint32_t constExpertNum; // const expert number + uint32_t globalBs; // globalBs = BS * worldSize + uint32_t bs; // bs + uint32_t k; // k + uint32_t h; // h + uint32_t aivNum; // aivNum + uint64_t totalUbSize; // epWorldSize + bool isTokenMask; // input active mask 1dims or not + bool isExpertMask; // input active mask 2dims or not + int8_t reserved[7]; // Pad 7 int8 for memory alignment +}; + +struct MoeDistributeCombineV2TilingData { + Mc2InitTiling mc2InitTiling; + Mc2CcTiling mc2CcTiling; + MoeDistributeCombineV2Info moeDistributeCombineV2Info; +}; + +#endif //__MOE_DISTRIBUTE_CMOBINE_A2_TILING_H__ diff --git a/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2.cpp b/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2.cpp new file mode 100644 index 000000000..172689f4f --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2.cpp @@ -0,0 +1,103 @@ +/** + * Copyright (c) Huawei Technologies Co., 
Ltd. 2025-2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! + * \file moe_distribute_dispatch_v2.cpp + * \brief + */ +#include "kernel_operator.h" +#include "moe_distribute_dispatch_v2_tiling.h" +#include "moe_distribute_dispatch_v2.h" +#include "moe_distribute_dispatch_v2_layered.h" + +using namespace AscendC; +using namespace MoeDistributeDispatchA2Impl; + +/* + 2000000000 A2 + 100000000 layered + 1000 init + 10 isScales + 2 quantMode +*/ +extern "C" __global__ __aicore__ void moe_distribute_dispatch_v2( + GM_ADDR x, GM_ADDR expertIds, GM_ADDR scales, GM_ADDR xActiveMask, GM_ADDR expertScales, GM_ADDR elasticInfo, + GM_ADDR expandXOut, GM_ADDR dynamicScalesOut, GM_ADDR assistInfoOut, GM_ADDR expertTokenNumsOut, + GM_ADDR epSendCountsOut, GM_ADDR tpSendCountsOut, GM_ADDR expandScalesOut, GM_ADDR workspaceGM, GM_ADDR tilingGM) +{ + REGISTER_TILING_DEFAULT(MoeDistributeDispatchV2TilingData); + REGISTER_TILING_FOR_TILINGKEY("TILING_KEY_VAR >= 2000000000", MoeDistributeDispatchV2TilingData); + TPipe pipe; +#if (ORIG_DTYPE_EXPAND_X == DT_BF16 || ORIG_DTYPE_EXPAND_X == DT_FLOAT16) + if (TILING_KEY_IS(2000001000)) { + GET_TILING_DATA_WITH_STRUCT(MoeDistributeDispatchV2TilingData, tilingData, tilingGM); + MoeDistributeDispatchV2 op; + op.Init(x, expertIds, scales, xActiveMask, expandXOut, dynamicScalesOut, assistInfoOut, expertTokenNumsOut, + epSendCountsOut, workspaceGM, &pipe, tilingGM); + op.Process(); + } else if 
(TILING_KEY_IS(2100001000)) { + GET_TILING_DATA_WITH_STRUCT(MoeDistributeDispatchV2TilingData, tilingData, tilingGM); + GM_ADDR contextGM0 = AscendC::GetHcclContext(); + DataplaneMode dataplaneMode = GetDataplaneMode(contextGM0); + if (dataplaneMode == DataplaneMode::AIV) { + MoeDistributeDispatchV2Layered op; + op.Init(x, expertIds, scales, expertScales, expandXOut, dynamicScalesOut, assistInfoOut, expertTokenNumsOut, + epSendCountsOut, expandScalesOut, workspaceGM, &pipe, tilingGM, contextGM0); + op.Process(); + } else { + assert(false, "The driver version is too low and does not support layered mode.\n"); + } + } +#elif (ORIG_DTYPE_EXPAND_X == DT_INT8) + if (TILING_KEY_IS(2000001002)) { + GET_TILING_DATA_WITH_STRUCT(MoeDistributeDispatchV2TilingData, tilingData, tilingGM); + MoeDistributeDispatchV2 op; + op.Init(x, expertIds, scales, xActiveMask, expandXOut, dynamicScalesOut, assistInfoOut, expertTokenNumsOut, + epSendCountsOut, workspaceGM, &pipe, tilingGM); + op.Process(); + } else if (TILING_KEY_IS(2000001012)) { + GET_TILING_DATA_WITH_STRUCT(MoeDistributeDispatchV2TilingData, tilingData, tilingGM); + MoeDistributeDispatchV2 op; + op.Init(x, expertIds, scales, xActiveMask, expandXOut, dynamicScalesOut, assistInfoOut, expertTokenNumsOut, + epSendCountsOut, workspaceGM, &pipe, tilingGM); + op.Process(); + } else if (TILING_KEY_IS(2100001002)) { + GET_TILING_DATA_WITH_STRUCT(MoeDistributeDispatchV2TilingData, tilingData, tilingGM); + GM_ADDR contextGM0 = AscendC::GetHcclContext(); + DataplaneMode dataplaneMode = GetDataplaneMode(contextGM0); + if (dataplaneMode == DataplaneMode::AIV) { + MoeDistributeDispatchV2Layered op; + op.Init(x, expertIds, scales, expertScales, expandXOut, dynamicScalesOut, assistInfoOut, expertTokenNumsOut, + epSendCountsOut, expandScalesOut, workspaceGM, &pipe, tilingGM, contextGM0); + op.Process(); + } else { + assert(false, "The driver version is too low and does not support layered mode.\n"); + } + } else if 
(TILING_KEY_IS(2100001012)) { + GET_TILING_DATA_WITH_STRUCT(MoeDistributeDispatchV2TilingData, tilingData, tilingGM); + GM_ADDR contextGM0 = AscendC::GetHcclContext(); + DataplaneMode dataplaneMode = GetDataplaneMode(contextGM0); + if (dataplaneMode == DataplaneMode::AIV) { + MoeDistributeDispatchV2Layered op; + op.Init(x, expertIds, scales, expertScales, expandXOut, dynamicScalesOut, assistInfoOut, expertTokenNumsOut, + epSendCountsOut, expandScalesOut, workspaceGM, &pipe, tilingGM, contextGM0); + op.Process(); + } else { + assert(false, "The driver version is too low and does not support layered mode.\n"); + } + } +#endif +} diff --git a/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2.h b/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2.h new file mode 100644 index 000000000..82ea0b90c --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2.h @@ -0,0 +1,1026 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! 
+ * \file moe_distribute_dispatch_a2.h + * \brief + */ + +#ifndef MOE_DISTRIBUTE_DISPATCH_V2_H +#define MOE_DISTRIBUTE_DISPATCH_V2_H + +#include "kernel_operator.h" +#include "kernel_tiling/kernel_tiling.h" +#include "moe_distribute_dispatch_v2_tiling.h" +#include "moe_distribute_base.h" + +namespace MoeDistributeDispatchA2Impl { +constexpr static uint8_t BUFFER_NUM = 2; // 多buf +constexpr static uint32_t DATA_OFFSET = 512; // 数据空间起始偏移 +constexpr static uint32_t STATE_SIZE = 1024 * 1024; // 1M +constexpr static uint32_t STATUS_ENTRY_COUNT = 32; +constexpr static uint32_t STATUS_SIZE = STATUS_ENTRY_COUNT * sizeof(int32_t); +constexpr static uint32_t UB_ALIGN = 32; // UB按32字节对齐 +constexpr static uint32_t BITS32_PER_BLOCK = UB_ALIGN / 4; +constexpr static uint32_t STATUS_BLOCK_COUNT = STATUS_ENTRY_COUNT / BITS32_PER_BLOCK; +constexpr static uint32_t FLAG_OFFSET = 24; +constexpr static uint32_t BW_ITEM_SIZE = 32; // = sizeof(BatchWriteItem) +constexpr static uint32_t U64_PER_ITEM = BW_ITEM_SIZE / sizeof(uint64_t); +constexpr static uint32_t U32_PER_ITEM = BW_ITEM_SIZE / sizeof(uint32_t); +constexpr static uint32_t SKIP_OFFSET = 512; +constexpr static int32_t FLAG_VALUE = 0xFFFFFFFF; +constexpr uint64_t MB_SIZE = 1024 * 1024; +template +__aicore__ inline T RoundUp(const T val, const T align) +{ + static_assert(std::is_arithmetic::value, "T must be an arithmetic type"); + if (align == 0 || val + align - 1 < val) { + return val; + } + return (val + align - 1) / align * align; +} +template +__aicore__ inline void SyncFunc() +{ + int32_t eventID = static_cast(GetTPipePtr()->FetchEventID(event)); + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); +} + +#define TemplateMC2TypeA2Class \ + typename XType, typename ExpandXOutType, bool StaticQuant, bool DynamicQuant, bool IsSmoothScaleExist +#define TemplateMC2TypeA2Func XType, ExpandXOutType, StaticQuant, DynamicQuant, IsSmoothScaleExist + +using namespace AscendC; +template +class MoeDistributeDispatchV2 +{ +private: + 
constexpr static uint32_t TBUF_SIZE = 190 * 1024; + constexpr static uint64_t ALIGNED_LEN_256 = 256UL; + constexpr static int32_t BITS_PER_BYTE = 8; + constexpr static uint32_t REPEAT_BYTES = 256; + constexpr static uint32_t BITS16_PER_BLOCK = UB_ALIGN / sizeof(int16_t); + +public: + __aicore__ inline MoeDistributeDispatchV2(){}; + __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIds, GM_ADDR scales, GM_ADDR xActiveMask, GM_ADDR expandXOut, + GM_ADDR dynamicScalesOut, GM_ADDR expandIdxOut, GM_ADDR expertTokenNumsOut, + GM_ADDR epRecvCountsOut, GM_ADDR workspaceGM, TPipe *pipe, GM_ADDR tilingGM); + __aicore__ inline void Process(); + +private: + __aicore__ inline void IndexSort(); + __aicore__ inline void SendToMoeExpert(); + __aicore__ inline void LocalWindowCopy(); + __aicore__ inline void GetStatusCumSum(); + __aicore__ inline void WaitDispatch(); + __aicore__ inline void ConstructBatchWriteInfo(); + __aicore__ inline void ReorderTokens(); + __aicore__ inline void ReorderTokensPipeSet(); + __aicore__ inline void ReorderTokensPipeReset(); + __aicore__ inline void QuantProcess(uint32_t expertIndex, TEventID eventId); + __aicore__ inline void TokenActiveMaskCal(); + __aicore__ inline void ExpertActiveMaskCal(); + __aicore__ inline void CalVaildExpIdx(LocalTensor maskInputTensor); + __aicore__ inline void GenerateGatherMaskTensor(uint32_t maskCnt); + __aicore__ inline void MaskZeroComputeExpert(uint32_t maskCnt); + __aicore__ inline void ZeroComputeExpertMaskCal(); + TPipe *tpipe_{nullptr}; + GlobalTensor xGMTensor_; + GlobalTensor expertIdsGMTensor_; + GlobalTensor scalesGMTensor_; + GlobalTensor expandXOutGMTensor_; + GlobalTensor dynamicScalesOutGMTensor_; + GlobalTensor windowInTensor_; + GlobalTensor windowInQuantTensor_; + GlobalTensor windowInstatusTensor_; + GlobalTensor sendTokensTensor_; + GlobalTensor batchWriteInfoTensor_; + GlobalTensor sendStatusTensor_; + GlobalTensor bufferChosenGlobal_; + GlobalTensor xActiveMaskGMTensor_; + + LocalTensor 
xTmpTensor_; + LocalTensor xInTensor_; + LocalTensor xOutTensor_; + LocalTensor xOutPingTensor_; + LocalTensor xOutPongTensor_; + LocalTensor xOutFp32Tensor_; + LocalTensor expertCountTensor_; + LocalTensor expertIdsTensor_; + LocalTensor rowMaxTensor_; + LocalTensor statusTensor_; + LocalTensor statusFp32Tensor_; + LocalTensor batchWriteU64Tensor_; + LocalTensor batchWriteU32Tensor_; + LocalTensor expertTokenNumsW64Tensor_; + LocalTensor expertCumsumTensor_; + LocalTensor vaildExpIndexTensor_; + LocalTensor gatherMaskTensor_; + TBuf<> dynamicScalesBuf_; + TBuf<> expertCountBuf_; + TBuf<> expertIdsBuf_; + TBuf<> statusBuf_; + TBuf<> gatherMaskOutBuf_; // gather mask输出buf + TBuf<> rowMaxBuf_; + TBuf<> receiveDataCastFloatBuf_; + TBuf<> smoothScalesBuf_; + TBuf<> batchWriteInfoBuf_; + TBuf<> tBuf_; + + GM_ADDR expandIdxOutGM_; + GM_ADDR expertTokenNumsOutGM_; // 这个输出没有使用 + GM_ADDR epRecvCountsGM_; + GM_ADDR windowInGM_; + GM_ADDR windowOutGM_; + GM_ADDR batchWriteInfo_; + + // tiling侧已确保数据上限,相乘不会越界,因此统一采用uint32_t进行处理 + uint32_t axisBS_{0}; + uint32_t axisH_{0}; + uint32_t axisK_{0}; + uint32_t aivNum_{0}; + uint32_t expertIdsCnt_{0}; + uint32_t worldSize_{0}; + uint32_t rankId_{0}; + uint32_t aivId_{0}; // aiv id + uint32_t moeExpertNum_{0}; // moe专家卡数, 等于worldSize_ - 共享专家卡数 + uint32_t bufferSizePerRank_{0}; + uint32_t hSize_{0}; + uint32_t hQuantSize_{0}; + uint32_t hCommuSize_{0}; + uint32_t scaleParamPad_{0}; + uint32_t axisHCommu_{0}; + uint32_t localMoeExpertNum_{0}; + uint32_t localMoeExpertNumAlign_{0}; + uint32_t dataSizePerRank_{0}; + uint32_t dataSize_{0}; + uint32_t bufferChosen_{0}; + uint32_t totalSize_{0}; + uint64_t activeMaskBsCnt_{0}; + uint32_t expertTokenNumsType_{0}; + int32_t zeroComputeExpertNum_{0}; + uint64_t sendToMoeExpTokenCnt_{0}; + uint32_t leftUbSize_{TBUF_SIZE}; + uint32_t baseBuffOffset_{0}; + uint32_t xActiveMaskSize_{0}; + bool isTokenMaskFlag_ = false; + bool isExpertMaskFlag_ = false; + bool isQuant_ = false; + Hccl hccl_; + __gm__ 
HcclOpResParam *winContext_{nullptr}; +}; + +template +__aicore__ inline void MoeDistributeDispatchV2::Init( + GM_ADDR x, GM_ADDR expertIds, GM_ADDR scales, GM_ADDR xActiveMask, GM_ADDR expandXOut, GM_ADDR dynamicScalesOut, + GM_ADDR expandIdxOut, GM_ADDR expertTokenNumsOut, GM_ADDR epRecvCountsOut, GM_ADDR workspaceGM, TPipe *pipe, + GM_ADDR tilingGM) +{ + tpipe_ = pipe; + REGISTER_TILING_DEFAULT(MoeDistributeDispatchV2TilingData); + auto tiling = (__gm__ MoeDistributeDispatchV2TilingData *)tilingGM; + __gm__ void *mc2InitTiling = (__gm__ void *)(&(tiling->mc2InitTiling)); + __gm__ void *mc2CcTiling = (__gm__ void *)(&(tiling->mc2CcTiling)); + GET_TILING_DATA_WITH_STRUCT(MoeDistributeDispatchV2TilingData, tilingData, tilingGM); + + auto contextGM0 = AscendC::GetHcclContext(); + hccl_.Init(contextGM0, mc2InitTiling); + hccl_.SetCcTiling(mc2CcTiling); + + winContext_ = (__gm__ HcclOpResParam *)contextGM0; + rankId_ = tilingData.moeDistributeDispatchV2Info.epRankId; + windowInGM_ = hccl_.GetWindowsInAddr(rankId_); + windowOutGM_ = hccl_.GetWindowsOutAddr(rankId_); + + axisBS_ = tilingData.moeDistributeDispatchV2Info.bs; + axisH_ = tilingData.moeDistributeDispatchV2Info.h; + axisK_ = tilingData.moeDistributeDispatchV2Info.k; + aivNum_ = tilingData.moeDistributeDispatchV2Info.aivNum; + worldSize_ = tilingData.moeDistributeDispatchV2Info.epWorldSize; + expertTokenNumsType_ = tilingData.moeDistributeDispatchV2Info.expertTokenNumsType; + isTokenMaskFlag_ = tilingData.moeDistributeDispatchV2Info.isTokenMask; + isExpertMaskFlag_ = tilingData.moeDistributeDispatchV2Info.isExpertMask; + zeroComputeExpertNum_ = tilingData.moeDistributeDispatchV2Info.zeroComputeExpertNum; + totalSize_ = winContext_->winSize / 2; // 2G / 2 = 1G + dataSize_ = totalSize_ - STATE_SIZE; // 1G - 1M + dataSizePerRank_ = dataSize_ / worldSize_; + moeExpertNum_ = tilingData.moeDistributeDispatchV2Info.moeExpertNum; + localMoeExpertNum_ = moeExpertNum_ / worldSize_; + aivId_ = GetBlockIdx(); + 
expertIdsCnt_ = axisBS_ * axisK_; + localMoeExpertNumAlign_ = (localMoeExpertNum_ + BITS32_PER_BLOCK - 1) / BITS32_PER_BLOCK * BITS32_PER_BLOCK; + + bufferChosenGlobal_.SetGlobalBuffer((__gm__ uint32_t *)(windowInGM_ + dataSize_)); + bufferChosen_ = bufferChosenGlobal_(0); // 魔数 + + windowInGM_ = windowInGM_ + totalSize_ * bufferChosen_; + windowOutGM_ = windowOutGM_ + totalSize_ * bufferChosen_; + + xGMTensor_.SetGlobalBuffer((__gm__ XType *)x); + expertIdsGMTensor_.SetGlobalBuffer((__gm__ int32_t *)expertIds); + expandXOutGMTensor_.SetGlobalBuffer((__gm__ ExpandXOutType *)(expandXOut), + worldSize_ * axisBS_ * localMoeExpertNum_ * axisH_); + dynamicScalesOutGMTensor_.SetGlobalBuffer((__gm__ float *)(dynamicScalesOut)); + windowInTensor_.SetGlobalBuffer((__gm__ XType *)(windowInGM_)); + windowInstatusTensor_.SetGlobalBuffer((__gm__ int32_t *)(windowInGM_)); + windowInQuantTensor_.SetGlobalBuffer((__gm__ ExpandXOutType *)(windowInGM_)); + sendTokensTensor_.SetGlobalBuffer((__gm__ ExpandXOutType *)(windowOutGM_)); + sendStatusTensor_.SetGlobalBuffer((__gm__ int32_t *)(windowOutGM_)); + + xActiveMaskGMTensor_.SetGlobalBuffer((__gm__ int8_t *)xActiveMask); + expandIdxOutGM_ = expandIdxOut; + expertTokenNumsOutGM_ = expertTokenNumsOut; + epRecvCountsGM_ = epRecvCountsOut; + + isQuant_ = StaticQuant | DynamicQuant; + hSize_ = axisH_ * sizeof(XType); + hQuantSize_ = axisH_ * sizeof(ExpandXOutType); // 如有量化,需要量化后通信 + scaleParamPad_ = (isQuant_ ? 
32 : 0); // 预留32B给量化参数,实际只使用了4B(fp32) + hCommuSize_ = hQuantSize_ + scaleParamPad_; + axisHCommu_ = hCommuSize_ / sizeof(ExpandXOutType); + bufferSizePerRank_ = 32 * hSize_; + + batchWriteInfo_ = workspaceGM; + batchWriteInfoTensor_.SetGlobalBuffer((__gm__ uint32_t *)(batchWriteInfo_), worldSize_ * U32_PER_ITEM); + + tpipe_->InitBuffer(statusBuf_, worldSize_ * STATUS_ENTRY_COUNT * sizeof(int32_t)); // worldsize * 32B + leftUbSize_ -= worldSize_ * STATUS_ENTRY_COUNT * sizeof(int32_t); + statusTensor_ = statusBuf_.Get(); // 保存发送数据量及flag,同时用于计算windows中的偏移 + Duplicate(statusTensor_, 0, worldSize_ * STATUS_ENTRY_COUNT); // 8 = UB_ALIGN / sizeof(int32_t) + + uint64_t mask[2] = {0x0100000001000000, 0}; + Duplicate(statusTensor_, FLAG_VALUE, mask, worldSize_ * STATUS_ENTRY_COUNT / 64, 1, 8); + if (isQuant_) { + scalesGMTensor_.SetGlobalBuffer((__gm__ float *)scales); + } + + tpipe_->InitBuffer(batchWriteInfoBuf_, worldSize_ * BW_ITEM_SIZE); + leftUbSize_ -= worldSize_ * BW_ITEM_SIZE; + // Ensure not less than REPEAT_BYTES for TokenActiveMaskCal + uint32_t expertIdsBufSize = Std::max( + static_cast((expertIdsCnt_ * sizeof(int32_t) + UB_ALIGN - 1) / UB_ALIGN * UB_ALIGN), REPEAT_BYTES); + tpipe_->InitBuffer(expertIdsBuf_, expertIdsBufSize); + leftUbSize_ -= expertIdsBufSize; + expertIdsTensor_ = expertIdsBuf_.Get(); + + tpipe_->InitBuffer(expertCountBuf_, expertIdsBufSize); + leftUbSize_ -= expertIdsBufSize; + expertCountTensor_ = expertCountBuf_.Get(); + + tpipe_->InitBuffer(gatherMaskOutBuf_, (localMoeExpertNumAlign_ * worldSize_ + moeExpertNum_) * sizeof(float)); + leftUbSize_ -= (localMoeExpertNumAlign_ * worldSize_ + moeExpertNum_) * sizeof(float); + tpipe_->InitBuffer(tBuf_, leftUbSize_); + + uint64_t stateSizeMaxSize = + 2 * STATE_SIZE; // 2: 实际上是(DATA_OFFSET+SKIP_OFFSET+sizeof(uint32)) + STATE_SIZE,近似计算使用2 * STATE_SIZE + uint64_t winSizeMin = (axisBS_ * worldSize_ * (localMoeExpertNum_ > axisK_ ? 
axisK_ : localMoeExpertNum_) * axisH_ * + sizeof(uint16_t) + + stateSizeMaxSize) * + BUFFER_NUM; // 考虑负载极其不均衡时,HCCL BUFFSIZE需要开的大小 + + assert(winContext_->winSize >= winSizeMin, + "The HCCL_BUFFSIZE is %lluMB, the min value should be %lluMB. \ + epWorldSize:%u, epRankId:%u, moeExpertNum:%u, quantMode:%u, globalBs:%u, bs:%u, k:%u, h:%u, aivNum:%u, \ + isQuant:%d, totalUbSize:%llu, expertTokenNumsType:%u\n", + winContext_->winSize / MB_SIZE, winSizeMin / MB_SIZE, tilingData.moeDistributeDispatchV2Info.epWorldSize, + tilingData.moeDistributeDispatchV2Info.epRankId, tilingData.moeDistributeDispatchV2Info.moeExpertNum, + tilingData.moeDistributeDispatchV2Info.quantMode, tilingData.moeDistributeDispatchV2Info.globalBs, + tilingData.moeDistributeDispatchV2Info.bs, tilingData.moeDistributeDispatchV2Info.k, + tilingData.moeDistributeDispatchV2Info.h, tilingData.moeDistributeDispatchV2Info.aivNum, + tilingData.moeDistributeDispatchV2Info.isQuant, tilingData.moeDistributeDispatchV2Info.totalUbSize, + tilingData.moeDistributeDispatchV2Info.expertTokenNumsType); + activeMaskBsCnt_ = axisBS_; + sendToMoeExpTokenCnt_ = axisBS_ * axisK_; + if (tilingData.moeDistributeDispatchV2Info.isTokenMask) { + TokenActiveMaskCal(); + } + vaildExpIndexTensor_ = tBuf_.GetWithOffset(RoundUp(expertIdsCnt_, BITS32_PER_BLOCK), baseBuffOffset_); + CreateVecIndex(vaildExpIndexTensor_, 0, RoundUp(expertIdsCnt_, BITS32_PER_BLOCK)); + baseBuffOffset_ += RoundUp(expertIdsCnt_, BITS32_PER_BLOCK) * sizeof(int32_t); + xActiveMaskSize_ = Ceil(expertIdsCnt_, ALIGNED_LEN_256) * ALIGNED_LEN_256 / BITS_PER_BYTE; + LocalTensor gatherMaskTensorInt8 = tBuf_.GetWithOffset(xActiveMaskSize_, baseBuffOffset_); + baseBuffOffset_ += xActiveMaskSize_; + gatherMaskTensor_ = gatherMaskTensorInt8.template ReinterpretCast(); + + if (isExpertMaskFlag_) { + ExpertActiveMaskCal(); + } + + if (activeMaskBsCnt_ == 0) { + baseBuffOffset_ = RoundUp(expertIdsCnt_, BITS32_PER_BLOCK) * sizeof(uint32_t); + return; + } + + if 
(zeroComputeExpertNum_ != 0) { + ZeroComputeExpertMaskCal(); + } + baseBuffOffset_ = RoundUp(expertIdsCnt_, BITS32_PER_BLOCK) * sizeof(uint32_t); +} + +template +__aicore__ inline void MoeDistributeDispatchV2::QuantProcess(uint32_t expertIndex, + TEventID eventId) +{ + float dynamicScale = 0.0; + uint32_t baseBuffOffset = baseBuffOffset_; + LocalTensor floatLocalTemp = tBuf_.GetWithOffset(axisH_, baseBuffOffset); + baseBuffOffset += axisH_ * sizeof(float); + LocalTensor smoothScalesTensor = tBuf_.GetWithOffset(axisH_, baseBuffOffset); + baseBuffOffset += axisH_ * sizeof(float); + + /* + xInTensor_ --> floatLocalTemp --> int32LocalTemp --> halfLocalTemp + fp32先转int32再转fp16 -- 对标A3实现 + */ + SyncFunc(); // QuantProcess没开ping-pong XOut的ping-pong未生效 + WaitFlag(eventId); + Cast(floatLocalTemp, xInTensor_, RoundMode::CAST_NONE, axisH_); + SetFlag(eventId); + PipeBarrier(); + if constexpr (IsSmoothScaleExist) { + DataCopy(smoothScalesTensor, scalesGMTensor_[expertIndex * axisH_], axisH_); + SyncFunc(); + Mul(floatLocalTemp, floatLocalTemp, smoothScalesTensor, axisH_); + PipeBarrier(); + } + + if constexpr (DynamicQuant) { + LocalTensor floatLocalAbsTemp = smoothScalesTensor; // 复用 + rowMaxTensor_ = tBuf_.GetWithOffset(BITS32_PER_BLOCK, baseBuffOffset); + baseBuffOffset += UB_ALIGN; + Abs(floatLocalAbsTemp, floatLocalTemp, axisH_); + PipeBarrier(); + ReduceMax(rowMaxTensor_, floatLocalAbsTemp, floatLocalAbsTemp, axisH_, false); + + SyncFunc(); + dynamicScale = float(127.0) / rowMaxTensor_.GetValue(0); + SyncFunc(); + Muls(floatLocalTemp, floatLocalTemp, dynamicScale, axisH_); + PipeBarrier(); + } + LocalTensor halfLocalTemp = floatLocalTemp.ReinterpretCast(); + LocalTensor int32LocalTemp = floatLocalTemp.ReinterpretCast(); + Cast(int32LocalTemp, floatLocalTemp, RoundMode::CAST_RINT, axisH_); + PipeBarrier(); + SetDeqScale((half)1.000000e+00f); + PipeBarrier(); + + Cast(halfLocalTemp, int32LocalTemp, RoundMode::CAST_ROUND, axisH_); + + PipeBarrier(); + Cast(xOutTensor_, 
halfLocalTemp, RoundMode::CAST_TRUNC, axisH_); + SetFlag(eventId); + + floatLocalTemp = xOutTensor_.template ReinterpretCast(); + dynamicScale = 1 / dynamicScale; + floatLocalTemp.SetValue(axisH_ / sizeof(float), dynamicScale); // int8->float32 + SetFlag(eventId); +} + +template +__aicore__ inline void MoeDistributeDispatchV2::IndexSort() +{ + uint32_t activeExpertIds = activeMaskBsCnt_ * axisK_; + DataCopyExtParams copyExpertIdsParams{1, static_cast(activeExpertIds * sizeof(int32_t)), 0, 0, 0}; + DataCopyPadExtParams padParams{false, 0, 0, 0}; + DataCopyPad(expertIdsTensor_, expertIdsGMTensor_, copyExpertIdsParams, padParams); + Duplicate(expertCountTensor_, 0, RoundUp(activeExpertIds, BITS32_PER_BLOCK)); + SyncFunc(); + SyncFunc(); + + // 24个核 + for (uint32_t tokenIndex = 0; tokenIndex < sendToMoeExpTokenCnt_; ++tokenIndex) { + int32_t expertIdx = vaildExpIndexTensor_(tokenIndex); + int32_t expertId = expertIdsTensor_(expertIdx); + int32_t rankId = expertId / localMoeExpertNum_; + int32_t expertOffsetInRank = expertId % localMoeExpertNum_; + expertCountTensor_(expertIdx) = statusTensor_(rankId * STATUS_ENTRY_COUNT + expertOffsetInRank); + statusTensor_(rankId * STATUS_ENTRY_COUNT + expertOffsetInRank)++; + } + uint32_t baseBuffOffset = baseBuffOffset_; + expertCumsumTensor_ = gatherMaskOutBuf_.Get(); + expertCumsumTensor_.SetValue(0, 0); + for (uint32_t expertId = 1; expertId < moeExpertNum_; expertId++) { + int32_t rankId = (expertId - 1) / localMoeExpertNum_; + int32_t expertOffsetInRank = (expertId - 1) % localMoeExpertNum_; + uint32_t count = statusTensor_(rankId * STATUS_ENTRY_COUNT + expertOffsetInRank); + uint32_t preSum = expertCumsumTensor_(expertId - 1); + expertCumsumTensor_(expertId) = count + preSum; + } + + expertCumsumTensor_(moeExpertNum_) = sendToMoeExpTokenCnt_; + + if (aivId_ == aivNum_ - 1) { // 最后一个核 + SyncFunc(); + + GlobalTensor expandIdxGMTensor; + expandIdxGMTensor.SetGlobalBuffer((__gm__ int32_t *)expandIdxOutGM_); + 
DataCopyPad(expandIdxGMTensor, expertCountTensor_, copyExpertIdsParams); + + DataCopy(windowInstatusTensor_[rankId_ * dataSizePerRank_ / sizeof(int32_t)], + statusTensor_[rankId_ * STATUS_ENTRY_COUNT], STATUS_ENTRY_COUNT); + + LocalTensor flagTmpLocal = tBuf_.GetWithOffset(BITS32_PER_BLOCK, baseBuffOffset); + Duplicate(flagTmpLocal, FLAG_VALUE, UB_ALIGN / sizeof(int32_t)); + + for (uint32_t rankId = 0; rankId < worldSize_; rankId++) { + uint64_t rankOffset = rankId * dataSizePerRank_ / sizeof(int32_t); + DataCopy(sendStatusTensor_[rankOffset], statusTensor_[rankId * STATUS_ENTRY_COUNT], STATUS_ENTRY_COUNT); + + uint32_t startExpertId = rankId * localMoeExpertNum_; + uint32_t tokenCount = + expertCumsumTensor_(startExpertId + localMoeExpertNum_) - expertCumsumTensor_(startExpertId); + uint64_t dataFlagOffset = + rankOffset + (DATA_OFFSET + tokenCount * hCommuSize_ + SKIP_OFFSET) / sizeof(int32_t); + SyncFunc(); + DataCopyExtParams copyFlagParams{1, static_cast(sizeof(int32_t)), 0, 0, 0}; + DataCopyPad(sendStatusTensor_[dataFlagOffset], flagTmpLocal, copyFlagParams); + } + SyncFunc(); + } +} + +template +__aicore__ inline void MoeDistributeDispatchV2::ReorderTokensPipeSet() +{ + if constexpr (StaticQuant || DynamicQuant) { + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + } else { + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + } +} + +template +__aicore__ inline void MoeDistributeDispatchV2::ReorderTokensPipeReset() +{ + if constexpr (StaticQuant || DynamicQuant) { + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + } else { + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + } +} + +template +__aicore__ inline void MoeDistributeDispatchV2::ReorderTokens() +{ + uint32_t sendTokenNum = sendToMoeExpTokenCnt_ / aivNum_; + uint32_t remainderTokenNum = sendToMoeExpTokenCnt_ % aivNum_; + uint32_t startTokenId = sendTokenNum * aivId_; + if (aivId_ < remainderTokenNum) { // 前remainderRankNum个aiv需要多发1个卡的数据 + sendTokenNum += 1; + startTokenId += aivId_; + } else { + startTokenId += 
remainderTokenNum; + } + uint32_t endTokenId = startTokenId + sendTokenNum; + + GlobalTensor sendTokensGlobal; + ReorderTokensPipeSet(); + uint32_t baseBuffOffset = baseBuffOffset_; + LocalTensor xInPingTensor; + LocalTensor xInPongTensor; + if (isQuant_) { + xInPingTensor = tBuf_.GetWithOffset(axisH_, baseBuffOffset); + baseBuffOffset += axisH_ * sizeof(XType); + xInPongTensor = tBuf_.GetWithOffset(axisH_, baseBuffOffset); + baseBuffOffset += axisH_ * sizeof(XType); + } + LocalTensor xOutPingTensor = tBuf_.GetWithOffset(axisHCommu_, baseBuffOffset); + baseBuffOffset += hCommuSize_; + LocalTensor xOutPongTensor = tBuf_.GetWithOffset(axisHCommu_, baseBuffOffset); + baseBuffOffset += hCommuSize_; + baseBuffOffset_ = baseBuffOffset; + + int32_t expertId = 0; + int32_t expertIdx = 0; + for (uint32_t tokenIndex = startTokenId; tokenIndex < endTokenId; ++tokenIndex) { + TEventID eventId = (tokenIndex & 1) ? EVENT_ID0 : EVENT_ID1; + expertIdx = vaildExpIndexTensor_(tokenIndex); + expertId = expertIdsTensor_(expertIdx); + int32_t rankId = expertId / localMoeExpertNum_; + int32_t startExpertId = rankId * localMoeExpertNum_; + uint32_t expertOffset = expertCumsumTensor_(expertId) - expertCumsumTensor_(startExpertId); + SyncFunc(); + int32_t tokenOffset = expertCountTensor_(expertIdx); + sendTokensGlobal.SetGlobalBuffer( + (__gm__ ExpandXOutType *)(windowOutGM_ + rankId * dataSizePerRank_ + DATA_OFFSET)); + if constexpr (StaticQuant || DynamicQuant) { + xInTensor_ = (eventId & 1) ? xInPingTensor : xInPongTensor; + xOutTensor_ = (eventId & 1) ? xOutPingTensor : xOutPongTensor; + WaitFlag(eventId); + DataCopy(xInTensor_, xGMTensor_[expertIdx / axisK_ * axisH_], axisH_); // 约束对齐 + SetFlag(eventId); + + QuantProcess(expertId, eventId); + WaitFlag(eventId); + WaitFlag(eventId); + DataCopy(sendTokensGlobal[(expertOffset + tokenOffset) * axisHCommu_], xOutTensor_, axisHCommu_); + } else { + xTmpTensor_ = (eventId & 1) ? 
xOutPingTensor : xOutPongTensor; + WaitFlag(eventId); + DataCopy(xTmpTensor_, xGMTensor_[expertIdx / axisK_ * axisH_], + axisH_); // 约束对齐 tokenIndex / axisK_ * axisH_ + SetFlag(eventId); + WaitFlag(eventId); + DataCopy(sendTokensGlobal[(expertOffset + tokenOffset) * axisHCommu_], xTmpTensor_, axisHCommu_); + SetFlag(eventId); + } + } + baseBuffOffset_ = 0; // 释放零和专家相关ub + ReorderTokensPipeReset(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2::ConstructBatchWriteInfo() +{ + uint32_t batchWriteItemNum = worldSize_ / aivNum_; + uint32_t remainderItemNum = worldSize_ % aivNum_; + uint32_t startRankId = batchWriteItemNum * aivId_; + if (aivId_ < remainderItemNum) { // 前remainderRankNum个aiv需要多发1个卡的数据 + batchWriteItemNum += 1; + startRankId += aivId_; + } else { + startRankId += remainderItemNum; + } + uint32_t endRankId = startRankId + batchWriteItemNum; + + batchWriteU32Tensor_ = batchWriteInfoBuf_.Get(); + batchWriteU64Tensor_ = batchWriteInfoBuf_.Get(); + + uint32_t batchWriteDataType = static_cast(AscendC::HcclDataType::HCCL_DATA_TYPE_INT8); + + SyncFunc(); + + for (uint32_t rankIndex = startRankId; rankIndex < endRankId; ++rankIndex) { + uint32_t startExpertId = rankIndex * localMoeExpertNum_; + uint32_t currentIndex = rankIndex - startRankId; + uint32_t tokenCount = + expertCumsumTensor_(startExpertId + localMoeExpertNum_) - expertCumsumTensor_(startExpertId); + GM_ADDR rankGM = (__gm__ uint8_t *)(hccl_.GetWindowsInAddr(rankIndex) + totalSize_ * bufferChosen_ + + (dataSizePerRank_ * rankId_)); + GM_ADDR localBuf = (__gm__ uint8_t *)(windowOutGM_ + dataSizePerRank_ * rankIndex); + uint64_t batchWriteDataSize = DATA_OFFSET + tokenCount * hCommuSize_ + sizeof(int32_t) + SKIP_OFFSET; + batchWriteU64Tensor_(currentIndex * U64_PER_ITEM) = (uint64_t)localBuf; + batchWriteU64Tensor_(currentIndex * U64_PER_ITEM + 1) = (uint64_t)rankGM; + batchWriteU64Tensor_(currentIndex * U64_PER_ITEM + 2) = batchWriteDataSize; + batchWriteU32Tensor_(currentIndex * 
U32_PER_ITEM + 6) = batchWriteDataType; + batchWriteU32Tensor_(currentIndex * U32_PER_ITEM + 7) = rankIndex; + } + + SyncFunc(); + DataCopy(batchWriteInfoTensor_[startRankId * U32_PER_ITEM], batchWriteU32Tensor_, batchWriteItemNum * U32_PER_ITEM); + PipeBarrier(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2::TokenActiveMaskCal() +{ + LocalTensor xActiveMaskInt8Tensor; + LocalTensor xActiveMaskHalfTensor; + LocalTensor sumOutTensor; + LocalTensor tempTensor; + uint32_t axisBsAlignSize = (axisBS_ + UB_ALIGN - 1) / UB_ALIGN * UB_ALIGN; + uint32_t baseBuffOffset = baseBuffOffset_; + xActiveMaskInt8Tensor = tBuf_.GetWithOffset(axisBsAlignSize, baseBuffOffset); + baseBuffOffset += axisBsAlignSize * sizeof(int8_t); + xActiveMaskHalfTensor = tBuf_.GetWithOffset(axisBsAlignSize, baseBuffOffset); + baseBuffOffset += axisBsAlignSize * sizeof(half); + sumOutTensor = tBuf_.GetWithOffset(UB_ALIGN, baseBuffOffset); + baseBuffOffset += UB_ALIGN * sizeof(half); + tempTensor = expertCountBuf_.Get(); + DataCopyExtParams xActiveMaskParams = {1U, axisBS_, 0U, 0U, 0U}; + DataCopyPadExtParams xActiveMaskCopyPadParams{false, 0U, 0U, 0U}; + DataCopyPad(xActiveMaskInt8Tensor, xActiveMaskGMTensor_, xActiveMaskParams, xActiveMaskCopyPadParams); + SyncFunc(); + Cast(xActiveMaskHalfTensor, xActiveMaskInt8Tensor, RoundMode::CAST_NONE, axisBS_); + PipeBarrier(); + SumParams params{1, axisBsAlignSize, axisBS_}; + Sum(sumOutTensor, xActiveMaskHalfTensor, tempTensor, params); + SyncFunc(); + activeMaskBsCnt_ = static_cast(sumOutTensor.GetValue(0)); + sendToMoeExpTokenCnt_ = activeMaskBsCnt_ * axisK_; +} + +template +__aicore__ inline void +MoeDistributeDispatchV2::CalVaildExpIdx(LocalTensor maskInputTensor) +{ + uint32_t mask = expertIdsCnt_; + uint32_t curMaskCnt = axisBS_ * axisK_; + uint32_t calCnt = Ceil(curMaskCnt * sizeof(half), ALIGNED_LEN_256) * ALIGNED_LEN_256 / sizeof(half); + uint32_t baseBuffOffset = baseBuffOffset_; + LocalTensor tempTensor = 
tBuf_.GetWithOffset(calCnt, baseBuffOffset); + baseBuffOffset += calCnt * sizeof(half); + LocalTensor gatherMaskTensorInt8 = gatherMaskTensor_.template ReinterpretCast(); + LocalTensor expertsIndexTensor = + tBuf_.GetWithOffset(RoundUp(curMaskCnt, BITS32_PER_BLOCK), baseBuffOffset); + + Duplicate(tempTensor, (half)0, calCnt); + PipeBarrier(); + SyncFunc(); + LocalTensor maskInputInt8Tensor = maskInputTensor.ReinterpretCast(); + Cast(tempTensor, maskInputInt8Tensor, RoundMode::CAST_NONE, curMaskCnt); + PipeBarrier(); + Duplicate(gatherMaskTensor_, 0, + Ceil(expertIdsCnt_, ALIGNED_LEN_256) * ALIGNED_LEN_256 / BITS_PER_BYTE / sizeof(uint32_t)); + PipeBarrier(); + CompareScalar(gatherMaskTensorInt8, tempTensor, static_cast(1), AscendC::CMPMODE::EQ, calCnt); + CreateVecIndex(expertsIndexTensor, 0, RoundUp(curMaskCnt, BITS32_PER_BLOCK)); + PipeBarrier(); + GatherMask(vaildExpIndexTensor_, expertsIndexTensor, gatherMaskTensor_, true, mask, {1, 1, 0, 0}, + sendToMoeExpTokenCnt_); +} + +template +__aicore__ inline void MoeDistributeDispatchV2::ExpertActiveMaskCal() +{ + // 计算vaildExpIndexTensor, 连续搬入xActiveMask进行GatherMask计算, 用于moe专家的发送 + uint32_t tempSize = ((expertIdsCnt_ * sizeof(int8_t) + 1) / UB_ALIGN + 1) * UB_ALIGN / sizeof(int8_t); + LocalTensor maskInputTensor = tBuf_.GetWithOffset(tempSize, baseBuffOffset_); + baseBuffOffset_ += tempSize; + DataCopyPadExtParams maskCopyPadParams{false, 0U, 0U, 0U}; + DataCopyExtParams maskParams{1U, static_cast(expertIdsCnt_ * sizeof(int8_t)), 0U, 0U, 0U}; + DataCopyPad(maskInputTensor, xActiveMaskGMTensor_, maskParams, maskCopyPadParams); + CalVaildExpIdx(maskInputTensor); + baseBuffOffset_ -= tempSize; + SyncFunc(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2::MaskZeroComputeExpert(uint32_t maskCnt) +{ + sendToMoeExpTokenCnt_ = activeMaskBsCnt_ * axisK_; + uint32_t tmpTokenCnt = static_cast(sendToMoeExpTokenCnt_); + uint32_t baseBuffOffset = baseBuffOffset_; + LocalTensor expertsIndexTensor = + 
tBuf_.GetWithOffset(RoundUp(tmpTokenCnt, BITS32_PER_BLOCK), baseBuffOffset); + baseBuffOffset += RoundUp(tmpTokenCnt, BITS32_PER_BLOCK) * sizeof(int32_t); + int32_t maskTensorInt16Cnt = Ceil(tmpTokenCnt, UB_ALIGN / 2); + LocalTensor maskTensorInt32 = + tBuf_.GetWithOffset(RoundUp(tmpTokenCnt, UB_ALIGN), baseBuffOffset); // expertCountBuf_ + LocalTensor maskTensorInt8 = maskTensorInt32.template ReinterpretCast(); + baseBuffOffset += RoundUp(tmpTokenCnt, UB_ALIGN) * sizeof(uint32_t); + LocalTensor expertIdsTensorCast = + tBuf_.GetWithOffset(RoundUp(tmpTokenCnt, BITS16_PER_BLOCK), baseBuffOffset); // expertCountBuf_ + baseBuffOffset += RoundUp(tmpTokenCnt, BITS16_PER_BLOCK) * sizeof(half); + int32_t moeExpertNumInt32 = static_cast(moeExpertNum_); + + DataCopyExtParams expertIdsCntParams = { + 1U, static_cast(RoundUp(tmpTokenCnt, BITS32_PER_BLOCK) * sizeof(uint32_t)), 0U, 0U, 0U}; + DataCopyPadExtParams expertIdsCntCopyPadParams{false, 0U, 0U, 0U}; + DataCopyPad(expertIdsTensor_, expertIdsGMTensor_, expertIdsCntParams, expertIdsCntCopyPadParams); + SyncFunc(); + PipeBarrier(); + SetDeqScale((half)1.000000e+00f); + PipeBarrier(); + Cast(expertIdsTensorCast, expertIdsTensor_, RoundMode::CAST_NONE, RoundUp(tmpTokenCnt, BITS32_PER_BLOCK)); + PipeBarrier(); + Duplicate(maskTensorInt32, 0, Ceil(tmpTokenCnt, UB_ALIGN)); + PipeBarrier(); + // CompareScalar需要保证元素所占空间256字节对齐。 + uint32_t calcCnt = Ceil(sendToMoeExpTokenCnt_ * sizeof(half), ALIGNED_LEN_256) * ALIGNED_LEN_256 / sizeof(half); + // 逐元素比较一个tensor中的元素和另一个Scalar的大小,如果比较后的结果为真,则输出结果的对应比特位为1,否则为0。筛掉零计算量专家 + CompareScalar(maskTensorInt8, expertIdsTensorCast, static_cast(moeExpertNumInt32), AscendC::CMPMODE::LT, + calcCnt); + PipeBarrier(); + // ? 
+ LocalTensor maskTensorInt16 = maskTensorInt32.template ReinterpretCast(); // 空间bs*k*1 + LocalTensor gatherMaskTensorint16 = gatherMaskTensor_.template ReinterpretCast(); // 空间bs*k*4 + /* 特殊专家的maskTensorInt16和之前的gatherMaskTensor_结果按位相与,AND 支持uint16, + * gatherMaskTensor_和gatherMaskTensorint16是同一个地址 */ + And(gatherMaskTensorint16, gatherMaskTensorint16, maskTensorInt16, maskTensorInt16Cnt); + PipeBarrier(); + // 再筛一次 + CreateVecIndex(expertsIndexTensor, 0, RoundUp(tmpTokenCnt, BITS32_PER_BLOCK)); + PipeBarrier(); + GatherMask(vaildExpIndexTensor_, expertsIndexTensor, gatherMaskTensor_, true, maskCnt, {1, 1, 0, 0}, + sendToMoeExpTokenCnt_); + SyncFunc(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2::GenerateGatherMaskTensor(uint32_t maskCnt) +{ + Duplicate(gatherMaskTensor_, 0, Ceil(expertIdsCnt_, UB_ALIGN)); + PipeBarrier(); + Duplicate(gatherMaskTensor_, 0xFFFFFFFF, Ceil(maskCnt, UB_ALIGN)); + PipeBarrier(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2::ZeroComputeExpertMaskCal() +{ + uint32_t maskCnt = expertIdsCnt_; + if (isTokenMaskFlag_) { // 一维 + maskCnt = activeMaskBsCnt_ * axisK_; + } + + if (!isExpertMaskFlag_) { // 非二维要生成gatherMaskTensor_ + GenerateGatherMaskTensor(maskCnt); + } + + // 零计算量专家剪枝 + MaskZeroComputeExpert(maskCnt); + isExpertMaskFlag_ = true; +} + +template +__aicore__ inline void MoeDistributeDispatchV2::SendToMoeExpert() +{ + ConstructBatchWriteInfo(); + SyncAll(); + + if (aivId_ == 0) { + HcclHandle batchWriteResult = hccl_.BatchWrite(batchWriteInfo_, worldSize_); + bufferChosenGlobal_(0) = bufferChosen_ ^ 1; + } + if (aivId_ == aivNum_ - 1) { + uint32_t baseBuffOffset = baseBuffOffset_; + LocalTensor xOutPingTensor = tBuf_.GetWithOffset(axisHCommu_, baseBuffOffset); + baseBuffOffset += hCommuSize_; + LocalTensor xOutPongTensor = tBuf_.GetWithOffset(axisHCommu_, baseBuffOffset); + baseBuffOffset += hCommuSize_; + + uint32_t startExpertId = rankId_ * localMoeExpertNum_; + uint32_t tokenCount = + 
expertCumsumTensor_(startExpertId + localMoeExpertNum_) - expertCumsumTensor_(startExpertId); + GlobalTensor currRankWindowInGlobal; + GlobalTensor currRankWindowOutGlobal; + currRankWindowInGlobal.SetGlobalBuffer( + (__gm__ ExpandXOutType *)(windowInGM_ + rankId_ * dataSizePerRank_ + DATA_OFFSET)); + currRankWindowOutGlobal.SetGlobalBuffer( + (__gm__ ExpandXOutType *)(windowOutGM_ + rankId_ * dataSizePerRank_ + DATA_OFFSET)); + SyncFunc(); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + for (uint32_t currTokenIdx = 0; currTokenIdx < tokenCount; currTokenIdx++) { + TEventID eventId = (currTokenIdx & 1) ? EVENT_ID0 : EVENT_ID1; + xTmpTensor_ = (eventId & 1) ? xOutPingTensor : xOutPongTensor; + WaitFlag(eventId); + DataCopy(xTmpTensor_, currRankWindowOutGlobal[currTokenIdx * axisHCommu_], axisHCommu_); + SetFlag(eventId); + WaitFlag(eventId); + DataCopy(currRankWindowInGlobal[currTokenIdx * axisHCommu_], xTmpTensor_, axisHCommu_); + SetFlag(eventId); + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + uint64_t dataFlagOffset = + (rankId_ * dataSizePerRank_ + DATA_OFFSET + tokenCount * hCommuSize_ + SKIP_OFFSET) / sizeof(int32_t); + SyncFunc(); + windowInstatusTensor_(dataFlagOffset) = FLAG_VALUE; + DataCacheCleanAndInvalid( + windowInstatusTensor_[dataFlagOffset]); + } +} + +template +__aicore__ inline void MoeDistributeDispatchV2::WaitDispatch() +{ + uint32_t batchWriteItemNum = worldSize_ / aivNum_; + uint32_t remainderItemNum = worldSize_ % aivNum_; + uint32_t startRankId = batchWriteItemNum * aivId_; + if (aivId_ < remainderItemNum) { // 前remainderRankNum个aiv需要多发1个卡的数据 + batchWriteItemNum += 1; + startRankId += aivId_; + } else { + startRankId += remainderItemNum; + } + uint32_t endRankId = startRankId + batchWriteItemNum; + + if (batchWriteItemNum == 0) { + SyncAll(); + return; + } + + DataCopyExtParams copyFlagParams{1, static_cast(sizeof(int32_t)), 0, 0, 0}; + DataCopyPadExtParams padParams{false, 0, 0, 0}; + LocalTensor dataFlagLocal = 
tBuf_.GetWithOffset(BITS32_PER_BLOCK, baseBuffOffset_); + SyncFunc(); + + for (uint32_t rankId = startRankId; rankId < endRankId; rankId++) { + int32_t statusFlag = 0; + int32_t dataFlag = 0; + while (statusFlag != FLAG_VALUE) { + DataCopy(statusTensor_[rankId * STATUS_ENTRY_COUNT], + windowInstatusTensor_[rankId * dataSizePerRank_ / sizeof(int32_t)], STATUS_ENTRY_COUNT); + SyncFunc(); + statusFlag = statusTensor_(rankId * STATUS_ENTRY_COUNT + FLAG_OFFSET); + PipeBarrier(); + } + + uint32_t tokenCount = 0; + for (int32_t expertOffset = 0; expertOffset < localMoeExpertNum_; expertOffset++) { + tokenCount += statusTensor_(rankId * STATUS_ENTRY_COUNT + expertOffset); + } + + uint64_t dataFlagOffset = + (rankId * dataSizePerRank_ + DATA_OFFSET + tokenCount * hCommuSize_ + SKIP_OFFSET) / sizeof(int32_t); + while (dataFlag != FLAG_VALUE) { + DataCopyPad(dataFlagLocal, windowInstatusTensor_[dataFlagOffset], copyFlagParams, padParams); + SyncFunc(); + dataFlag = dataFlagLocal(0); + PipeBarrier(); + } + windowInstatusTensor_(dataFlagOffset) = 0; + } + SyncAll(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2::GetStatusCumSum() +{ + uint32_t baseBuffOffset = baseBuffOffset_; + uint32_t srcStrideU32 = dataSizePerRank_ - STATUS_SIZE; + DataCopyExtParams copyStatusParams{static_cast(worldSize_), STATUS_SIZE, srcStrideU32, 0, 0}; + DataCopyPadExtParams padParams{false, 0, 0, 0}; + DataCopyPad(statusTensor_, windowInstatusTensor_, copyStatusParams, padParams); + SyncFunc(); + LocalTensor epRecvCountsTempLocal = + gatherMaskOutBuf_.GetWithOffset(localMoeExpertNumAlign_ * worldSize_, 0); + LocalTensor epRecvCountsOutLocal = + gatherMaskOutBuf_.GetWithOffset(moeExpertNum_, localMoeExpertNumAlign_ * worldSize_ * sizeof(int32_t)); + uint16_t srcStrideU16 = (STATUS_ENTRY_COUNT - localMoeExpertNumAlign_) / BITS32_PER_BLOCK; + uint16_t worldSizeU16 = (uint16_t)worldSize_; + DataCopyParams copyParamsMultiple{worldSizeU16, static_cast(localMoeExpertNumAlign_ / 
BITS32_PER_BLOCK), + srcStrideU16, 0}; + DataCopy(epRecvCountsTempLocal, statusTensor_, copyParamsMultiple); + uint64_t mask4Adds = localMoeExpertNum_; + PipeBarrier(); + for (uint32_t rankIndex = 1; rankIndex < worldSize_; ++rankIndex) { + uint32_t statusOffset = rankIndex * localMoeExpertNumAlign_; + Add(epRecvCountsTempLocal[statusOffset], epRecvCountsTempLocal[statusOffset - localMoeExpertNumAlign_], + epRecvCountsTempLocal[statusOffset], mask4Adds, 1, {1, 1, 1, 8, 8, 8}); + PipeBarrier(); + } + LocalTensor patternLocal = tBuf_.GetWithOffset(localMoeExpertNumAlign_, baseBuffOffset); + baseBuffOffset += localMoeExpertNumAlign_ * sizeof(uint32_t); + Duplicate(patternLocal, 0, localMoeExpertNumAlign_); + SyncFunc(); + patternLocal(0) = 1; + srcStrideU16 = localMoeExpertNumAlign_ * sizeof(int32_t) / UB_ALIGN; + int32_t previousSum = 0; + uint64_t rsvdCnt = 0; + mask4Adds = worldSize_; + uint32_t mask4Gather = localMoeExpertNumAlign_; + for (uint32_t expertIndex = 0; expertIndex < localMoeExpertNum_; expertIndex++) { + SyncFunc(); + GatherMask(epRecvCountsOutLocal[expertIndex * worldSize_], epRecvCountsTempLocal, patternLocal, true, + mask4Gather, {1, worldSizeU16, srcStrideU16, 0}, rsvdCnt); + PipeBarrier(); + Adds(epRecvCountsOutLocal[expertIndex * worldSize_], epRecvCountsOutLocal[expertIndex * worldSize_], + previousSum, worldSize_); + SyncFunc(); + previousSum = epRecvCountsOutLocal(expertIndex * worldSize_ + worldSize_ - 1); + patternLocal(0) = patternLocal(0) << 1; + } + if (aivId_ == aivNum_ - 1) { + LocalTensor expertTokenNumsW64Tensor = + tBuf_.GetWithOffset(localMoeExpertNum_ * 2, baseBuffOffset); + if (expertTokenNumsType_ == 0) { + mask4Gather = worldSize_; + patternLocal(0) = 0; + patternLocal((worldSize_ - 1) / 32) = 1 << ((worldSize_ - 1) % 32); + srcStrideU16 = worldSize_ * sizeof(int32_t) / UB_ALIGN; + SyncFunc(); + GatherMask(epRecvCountsTempLocal, epRecvCountsOutLocal, patternLocal, true, mask4Gather, + {1, static_cast(localMoeExpertNum_), 
srcStrideU16, 0}, rsvdCnt); + SyncFunc(); + for (int i = 0; i < localMoeExpertNum_; i++) { + expertTokenNumsW64Tensor(i * 2) = epRecvCountsTempLocal(i); + expertTokenNumsW64Tensor(i * 2 + 1) = 0; + } + } else { + uint32_t tokenCountOffset = (worldSize_ - 1) * localMoeExpertNumAlign_; + for (int i = 0; i < localMoeExpertNum_; i++) { + expertTokenNumsW64Tensor(i * 2) = epRecvCountsTempLocal(tokenCountOffset + i); + expertTokenNumsW64Tensor(i * 2 + 1) = 0; + } + } + SyncFunc(); + GlobalTensor expertTokenNumsGlobal; + expertTokenNumsGlobal.SetGlobalBuffer((__gm__ int32_t *)(expertTokenNumsOutGM_)); + DataCopyExtParams copyPadParams{1, static_cast(localMoeExpertNum_ * sizeof(int64_t)), 0, 0, 0}; + DataCopyPad(expertTokenNumsGlobal, expertTokenNumsW64Tensor, copyPadParams); + + GlobalTensor epRecvCountsGlobal; + epRecvCountsGlobal.SetGlobalBuffer((__gm__ int32_t *)(epRecvCountsGM_)); + SyncFunc(); + DataCopy(epRecvCountsGlobal, epRecvCountsOutLocal, moeExpertNum_); + } +} + +template +__aicore__ inline void MoeDistributeDispatchV2::LocalWindowCopy() +{ + uint32_t dynamicScalesLocalIdx = 0; + GetStatusCumSum(); + LocalTensor epRecvCountsOutLocal = + gatherMaskOutBuf_.GetWithOffset(moeExpertNum_, localMoeExpertNumAlign_ * worldSize_ * sizeof(int32_t)); + uint32_t dealRankNum = worldSize_ / aivNum_; + uint32_t remainderRankNum = worldSize_ % aivNum_; + uint32_t startRankId = dealRankNum * aivId_; + if (aivId_ < remainderRankNum) { // 前remainderRankNum个aiv需要多发1个卡的数据 + dealRankNum += 1; + startRankId += aivId_; + } else { + startRankId += remainderRankNum; + } + uint32_t endRankId = startRankId + dealRankNum; + + GlobalTensor currRankWindowGlobal; + uint32_t baseBuffOffset = baseBuffOffset_; + LocalTensor dynamicScalesTensor = + tBuf_.GetWithOffset(RoundUp(axisBS_, BITS32_PER_BLOCK), baseBuffOffset); + baseBuffOffset += RoundUp(axisBS_, BITS32_PER_BLOCK) * sizeof(float); + LocalTensor xOutPingTensor = tBuf_.GetWithOffset(axisHCommu_, baseBuffOffset); + baseBuffOffset += 
hCommuSize_; + LocalTensor xOutPongTensor = tBuf_.GetWithOffset(axisHCommu_, baseBuffOffset); + baseBuffOffset += hCommuSize_; + + for (uint32_t index = startRankId; index < endRankId; index++) { + GM_ADDR wAddr = + (__gm__ uint8_t *)(windowInGM_) + index * dataSizePerRank_ + DATA_OFFSET; // * bufferSizePerRank_; + currRankWindowGlobal.SetGlobalBuffer((__gm__ ExpandXOutType *)(wAddr)); + uint32_t currRankDataOffset = 0; + uint32_t currRankStatusOffset = index * STATUS_ENTRY_COUNT; + + for (uint32_t j = 0; j < localMoeExpertNum_; j++) { + // 将数据从Window拷贝到UB + uint32_t currTokensCount = statusTensor_(currRankStatusOffset + j); + uint32_t currTokensOffset = epRecvCountsOutLocal(j * worldSize_ + index) - currTokensCount; + dynamicScalesLocalIdx = 0; + SyncFunc(); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + for (uint32_t k = 0; k < currTokensCount; k++) { + TEventID eventId = (k & 1) ? EVENT_ID0 : EVENT_ID1; + xTmpTensor_ = (eventId & 1) ? xOutPingTensor : xOutPongTensor; + WaitFlag(eventId); + DataCopy(xTmpTensor_, currRankWindowGlobal[(currRankDataOffset + k) * axisHCommu_], axisHCommu_); + SetFlag(eventId); + if constexpr (DynamicQuant) { + PipeBarrier(); + xOutFp32Tensor_ = xTmpTensor_.template ReinterpretCast(); + dynamicScalesTensor.SetValue(dynamicScalesLocalIdx++, + xOutFp32Tensor_.GetValue(axisH_ / sizeof(float))); // int8->float32 + PipeBarrier(); + } + WaitFlag(eventId); + DataCopy(expandXOutGMTensor_[(currTokensOffset + k) * axisH_], xTmpTensor_, axisH_); + SetFlag(eventId); + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + currRankDataOffset += currTokensCount; + PipeBarrier(); + if constexpr (DynamicQuant) { + DataCopyExtParams scalesCopyParams{1U, static_cast(dynamicScalesLocalIdx * sizeof(float)), 0U, + 0U, 0U}; + DataCopyPad(dynamicScalesOutGMTensor_[currTokensOffset], dynamicScalesTensor, scalesCopyParams); + } + } + } +} + +template +__aicore__ inline void MoeDistributeDispatchV2::Process() +{ + if ASCEND_IS_AIV { + IndexSort(); + ReorderTokens(); 
+ SendToMoeExpert(); + WaitDispatch(); + LocalWindowCopy(); + SyncAll(); + if (aivId_ == 0) { + Duplicate(statusTensor_, 0, worldSize_ * STATUS_ENTRY_COUNT); // 8 = UB_ALIGN / sizeof(int32_t) + SyncFunc(); + uint32_t dstStrideU32 = dataSizePerRank_ - STATUS_SIZE; + DataCopyExtParams copyStatusParams{static_cast(worldSize_), STATUS_SIZE, 0, dstStrideU32, 0}; + DataCopyPad(windowInstatusTensor_, statusTensor_, copyStatusParams); + } + hccl_.Finalize(); + } +} +} // namespace MoeDistributeDispatchA2Impl +#endif // MOE_DISTRIBUTE_DISPATCH_V2_H diff --git a/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2_layered.h b/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2_layered.h new file mode 100644 index 000000000..ca44ff5ba --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2_layered.h @@ -0,0 +1,1424 @@ +/** + * This program is free software, you can redistribute it and/or modify. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING +BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file moe_distribute_dispatch_a2_layered.h + * \brief + */ + +#ifndef MOE_DISTRIBUTE_DISPATCH_A2_LAYERED_H +#define MOE_DISTRIBUTE_DISPATCH_A2_LAYERED_H + +#include "kernel_operator.h" +#include "kernel_tiling/kernel_tiling.h" +#include "moe_distribute_dispatch_v2_tiling.h" +#include "moe_distribute_base.h" + +namespace MoeDistributeDispatchA2Impl { +#define TemplateMC2TypeA2layeredClass \ + typename XType, typename ExpandXOutType, bool StaticQuant, bool DynamicQuant, bool IsSmoothScaleExist +#define TemplateMC2TypeA2layeredFunc XType, ExpandXOutType, StaticQuant, DynamicQuant, IsSmoothScaleExist + +using namespace AscendC; +template +class MoeDistributeDispatchV2Layered +{ +public: + constexpr static uint32_t STATE_OFFSET = 512; // 状态空间偏移地址 + constexpr static uint32_t STATUS_SIZE_LAYERED = 1024 * 1024; // 1M + constexpr static uint32_t RDMA_BUFFER_ALIGN = 4 * 1024; + constexpr static uint32_t SERVER_RANK_SIZE = 8; + constexpr static uint32_t B64_PER_BLOCK = 4; + constexpr static uint32_t B16_PER_BLOCK = 16; + constexpr static uint32_t UB_32B_ALIGN = 32; + constexpr static uint32_t EXP_TOKEN_COUNT_FLAG_CNT = UB_32B_ALIGN / sizeof(int32_t); // 8 + constexpr static uint32_t TBUF_SIZE = 190 * 1024; + constexpr static uint32_t IPC_MAGIC_OFFSET = 2 * 1024 * 1024 - 128 * 32; + constexpr static uint32_t IPC_FLAG_OFFSET = 1 * 1024 * 1024; + constexpr static uint32_t IPC_TOKEN_CNT_OFFSET = 2 * 1024 * 1024; + constexpr static uint32_t IPC_DATA_OFFSET = 4 * 1024 * 1024; + constexpr static uint32_t MTU_SIZE = 4 * 1024; + constexpr static uint32_t IPC_BUFF_ALIGN = 512; + constexpr static int32_t IPC_FLAG_STEP_1 = 0x0d0d0d0d; + constexpr static uint32_t TBUF_TEMP_OFFSET = 8 * 1024; + constexpr static uint32_t MAX_BS_NUM = 512U; // 适配bs=512 + constexpr static uint32_t TBUF_OFFSET_ALIGN_B32_CNT = 2 * 1024 / sizeof(int32_t); + constexpr static uint64_t SHOULD_SEND_FLAG_VALUE = 0x0f0f0f0f; + constexpr static uint64_t END_OF_WRITE_FLAG_VALUE = 0xffffffff; + constexpr static 
uint32_t FLAG_SIZE = 64; + constexpr static uint32_t FINISH_STATUS = 0; + constexpr static uint32_t WAIT_STATUS = 1; + constexpr static uint32_t ARRIVAL_STATUS = 2; + constexpr static uint32_t SKIP_STATUS = 3; + constexpr static uint32_t RDMA_DATA_SIZE = 100U * 1024U * 1024U; + constexpr static uint32_t EXTRA_TOKEN_INFO_NUM = 4U; // 专家信息 权重信息 量化Scale 到达标志位 + + template + __aicore__ inline void SyncFunc() + { + int32_t eventID = static_cast(GetTPipePtr()->FetchEventID(event)); + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + } + + template + inline __aicore__ T RoundUp(const T val, const T align) + { + static_assert(std::is_arithmetic::value, "T must be an arithmetic type"); + if (align == 0 || val + align - 1 < val) { + return val; + } + return (val + align - 1) / align * align; + } + +public: + __aicore__ inline MoeDistributeDispatchV2Layered(){}; + __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIds, GM_ADDR scales, GM_ADDR expertScales, GM_ADDR expandXOut, + GM_ADDR dynamicScalesOut, GM_ADDR expandIdxOut, GM_ADDR expertTokenNumsOut, + GM_ADDR epRecvCountsOut, GM_ADDR expandScales, GM_ADDR workspaceGM, TPipe *pipe, + GM_ADDR tilingGM, GM_ADDR contextGM0); + __aicore__ inline void Process(); + +private: + __aicore__ inline void ReorderTokens(); + __aicore__ inline void SendDataToServer(uint32_t destServerId); + __aicore__ inline void CreateInnerReduceInfo(uint32_t serverIdx); + __aicore__ inline void CreateOuterReduceInfo(); + __aicore__ inline void Win2Ipc(); + __aicore__ inline void Ipc2Out(); + __aicore__ inline void WaitIpcFlag(int32_t flagVal = 1); + __aicore__ inline void SetIpcFlag(int32_t flagVal = 1); + __aicore__ inline void CleanUp(); + + __aicore__ inline uint32_t GetExpRank(uint32_t expertId); + __aicore__ inline void QuantProcess(uint32_t sendTokenNum, LocalTensor xTokenLt, + LocalTensor tokenCastLt); + __aicore__ inline uint32_t GetArrivedTokenInfo(uint32_t serverIdx, uint32_t tokenIdx, bool justExpInfo, + LocalTensor localUB_U8); + 
__aicore__ inline void AIVRDMAPostSend(GM_ADDR srcDmaAddr, GM_ADDR destDmaAddr, uint64_t destRankId, + uint64_t messageLen, __gm__ HcclAiRMAInfo *QpInfo); + __aicore__ inline uint32_t GetSelfServerTokenInfo(uint32_t tokenIdx, bool justExpInfo, + LocalTensor localUB_U8); + + TPipe *tpipe_{nullptr}; + GlobalTensor expertIdsGMTensor_; + GlobalTensor expandXOutGMTensor_; + GlobalTensor dynamicScalesOutGMTensor_; + GlobalTensor weightsOutGt; + GlobalTensor sendStatusTensor_; + GlobalTensor sendTokensU8Tensor_; + GlobalTensor bufferChosenGlobal_; + GlobalTensor expertToServerGlobalTensor_; + GlobalTensor readStatusTensor_; + GlobalTensor tokenAddrFlagStructGlobalU64Tensor_; + + LocalTensor expertCountTensor_; + LocalTensor expertIdsI16Tensor_; + LocalTensor batchWriteU64Tensor_; + LocalTensor batchWriteU32Tensor_; + LocalTensor expertToServerCntTensor_; + LocalTensor expertToServerIdxTensor_; + LocalTensor ubLocal; + LocalTensor ubLocalHead; + + TBuf<> statusBuf_; + TBuf tBuf; + TBuf rdmaInBuf_; + TBuf rdmaInBuf2_; + + __gm__ HcclAiRMAInfo *qp_info_; + GM_ADDR expandXGM_; + GM_ADDR expandIdxGM_; + GM_ADDR weightsGM_; + GM_ADDR expertTokenNumsOutGM_; + GM_ADDR epRecvCountsGM_; + GM_ADDR windowInGM_; + GM_ADDR windowOutGM_; + GM_ADDR dataBatchWriteInfo_; + GM_ADDR expertToServerCntGM_; + GM_ADDR shareAddrs[8]; + GM_ADDR tokenAddrFlagStructGM_; + + // tiling侧已确保数据上限,相乘不会越界,因此统一采用uint32_t进行处理 + uint32_t axisBS_{0}; + uint32_t globalBs_{0}; + uint32_t axisH_{0}; + uint32_t axisK_{0}; // 真实的K值 + uint32_t alignK_{0}; // axisK_与 BITS32_PER_BLOCK 对齐 + uint32_t aivNum_{0}; + uint32_t expertIdsCnt_{0}; + uint32_t worldSize_{0}; + uint32_t rankId_{0}; + uint32_t serverId_{0}; + uint32_t aivId_{0}; // aiv id + uint32_t moeExpertNum_{0}; // moe专家卡数, 等于worldSize_ - 共享专家卡数 + uint32_t moeExpertNumInServer_{0}; + uint32_t localMoeExpertNum_{0}; + uint32_t SERVER_SIZE_ON_WIN{0}; + uint32_t RANK_SIZE_ON_IPC{0}; + uint32_t WIN_SIZE{0}; + uint32_t bufferId_{0}; + uint32_t totalSize_{0}; + 
uint32_t totalWinSize_{0}; + uint32_t halfWinSize_{0}; + uint32_t serverNum{0}; + uint32_t expertTokenNumsType_{0}; + uint32_t shareMemOffset_{0}; + uint32_t tokenUbSize_{0}; + + // TokenStruck + uint32_t tokenGapInStruct_{0}; + uint32_t infoGapInStruct_{0}; + uint32_t tokenStructLen_{0}; + uint32_t tokenLenInStruct_{0}; + uint32_t expLenInStruct_{0}; + uint32_t weightLenInStruct_{0}; + uint32_t realLenInStruct_{0}; + uint32_t cntLenInStruct_{0}; + uint32_t tokenOffsetInStruct_{0}; + uint32_t expOffsetInStruct_{0}; + uint32_t weightOffsetInStruct_{0}; + uint32_t cntOffsetInStruct_{0}; + uint32_t scaleOffsetInStruct_{0}; + uint32_t scaleLenInStruct_{0}; + uint32_t flagLenInStruct_{0}; + uint32_t flagOffsetInStruct_{0}; + uint64_t magicVal_{0}; + + uint64_t combineInnerCntOffset; + uint64_t combineInnerCntIndexOffset; + uint64_t combineOuterCntOffset; + uint64_t combineOuterCntIndexOffset; + + Hccl hccl_; + __gm__ HcclA2CombineOpParam *winContext_{nullptr}; +}; + +template +__aicore__ inline void MoeDistributeDispatchV2Layered::Init( + GM_ADDR x, GM_ADDR expertIds, GM_ADDR scales, GM_ADDR expertScales, GM_ADDR expandXOut, GM_ADDR dynamicScalesOut, + GM_ADDR expandIdxOut, GM_ADDR expertTokenNumsOut, GM_ADDR epRecvCountsOut, GM_ADDR expandScales, + GM_ADDR workspaceGM, TPipe *pipe, GM_ADDR tilingGM, GM_ADDR contextGM0) +{ + tpipe_ = pipe; + REGISTER_TILING_DEFAULT(MoeDistributeDispatchV2TilingData); + auto tiling = (__gm__ MoeDistributeDispatchV2TilingData *)tilingGM; + __gm__ void *mc2InitTiling = (__gm__ void *)(&(tiling->mc2InitTiling)); + __gm__ void *mc2CcTiling = (__gm__ void *)(&(tiling->mc2CcTiling)); + GET_TILING_DATA_WITH_STRUCT(MoeDistributeDispatchV2TilingData, tilingData, tilingGM); + + hccl_.Init(contextGM0, mc2InitTiling); + hccl_.SetCcTiling(mc2CcTiling); + + winContext_ = (__gm__ HcclA2CombineOpParam *)contextGM0; + rankId_ = tilingData.moeDistributeDispatchV2Info.epRankId; + serverId_ = rankId_ / SERVER_RANK_SIZE; + windowInGM_ = 
hccl_.GetWindowsInAddr(rankId_); + windowOutGM_ = hccl_.GetWindowsOutAddr(rankId_); + qp_info_ = (__gm__ HcclAiRMAInfo *)(((__gm__ HcclA2CombineOpParam *)contextGM0)->aiRMAInfo); + + axisBS_ = tilingData.moeDistributeDispatchV2Info.bs; + globalBs_ = tilingData.moeDistributeDispatchV2Info.globalBs; + axisH_ = tilingData.moeDistributeDispatchV2Info.h; + axisK_ = tilingData.moeDistributeDispatchV2Info.k; + alignK_ = RoundUp(axisK_, BITS32_PER_BLOCK); + aivNum_ = tilingData.moeDistributeDispatchV2Info.aivNum; + worldSize_ = tilingData.moeDistributeDispatchV2Info.epWorldSize; + moeExpertNum_ = tilingData.moeDistributeDispatchV2Info.moeExpertNum; + localMoeExpertNum_ = moeExpertNum_ / worldSize_; + totalSize_ = winContext_->winSize; + totalWinSize_ = 100 * 1024 * 1024; // 100 MB for RDMA + shareMemOffset_ = totalWinSize_; + halfWinSize_ = totalWinSize_ / 2; + WIN_SIZE = halfWinSize_ - STATUS_SIZE_LAYERED; + expertTokenNumsType_ = tilingData.moeDistributeDispatchV2Info.expertTokenNumsType; + aivId_ = GetBlockIdx(); + expertIdsCnt_ = axisBS_ * axisK_; + serverNum = worldSize_ / SERVER_RANK_SIZE; + + uint64_t winSizeMin = + moeExpertNum_ * axisBS_ * (axisH_ * sizeof(XType) + EXTRA_TOKEN_INFO_NUM * alignK_ * sizeof(uint32_t)) + + IPC_DATA_OFFSET + RDMA_DATA_SIZE; // 考虑负载极其不均衡时,HCCL BUFFSIZE需要开的大小 + assert(winContext_->winSize >= winSizeMin, + "The HCCL_BUFFSIZE is %lluMB, the min value should be %lluMB. 
\ + epWorldSize:%u, epRankId:%u, moeExpertNum:%u, quantMode:%u, globalBs:%u, bs:%u, k:%u, h:%u, aivNum:%u, \ + isQuant:%d, totalUbSize:%llu, expertTokenNumsType:%u\n", + winContext_->winSize / MB_SIZE, winSizeMin / MB_SIZE, tilingData.moeDistributeDispatchV2Info.epWorldSize, + tilingData.moeDistributeDispatchV2Info.epRankId, tilingData.moeDistributeDispatchV2Info.moeExpertNum, + tilingData.moeDistributeDispatchV2Info.quantMode, tilingData.moeDistributeDispatchV2Info.globalBs, + tilingData.moeDistributeDispatchV2Info.bs, tilingData.moeDistributeDispatchV2Info.k, + tilingData.moeDistributeDispatchV2Info.h, tilingData.moeDistributeDispatchV2Info.aivNum, + tilingData.moeDistributeDispatchV2Info.isQuant, tilingData.moeDistributeDispatchV2Info.totalUbSize, + tilingData.moeDistributeDispatchV2Info.expertTokenNumsType); + + // RDMA buffer init + bufferChosenGlobal_.SetGlobalBuffer((__gm__ uint32_t *)(windowInGM_ + WIN_SIZE + worldSize_ * STATE_OFFSET)); + bufferId_ = bufferChosenGlobal_(0); + windowInGM_ = windowInGM_ + halfWinSize_ * bufferId_; + windowOutGM_ = windowOutGM_ + halfWinSize_ * bufferId_; + RANK_SIZE_ON_IPC = (totalSize_ - totalWinSize_ - IPC_DATA_OFFSET) / (localMoeExpertNum_ * worldSize_); + RANK_SIZE_ON_IPC = (RANK_SIZE_ON_IPC / IPC_BUFF_ALIGN) * IPC_BUFF_ALIGN; + + // IPC buffer init + for (int i = 0; i < SERVER_RANK_SIZE; i++) { + shareAddrs[i] = (__gm__ uint8_t *)(reinterpret_cast( + hccl_.GetWindowsInAddr(rankId_ / SERVER_RANK_SIZE * SERVER_RANK_SIZE + i) + shareMemOffset_)); + } + SERVER_SIZE_ON_WIN = WIN_SIZE / serverNum; + SERVER_SIZE_ON_WIN = (SERVER_SIZE_ON_WIN / RDMA_BUFFER_ALIGN) * RDMA_BUFFER_ALIGN; + + // TokenStruct info init + tokenLenInStruct_ = axisH_ * sizeof(ExpandXOutType); + expLenInStruct_ = alignK_ * sizeof(uint32_t); // 为了对齐,使用 alignK_ 计算tokenStruct中的内存 + weightLenInStruct_ = alignK_ * sizeof(uint32_t); + realLenInStruct_ = axisK_ * sizeof(uint32_t); // 内存中实际有效部分,跟 axisK_ 有关 + scaleLenInStruct_ = UB_32B_ALIGN; + flagLenInStruct_ = 
UB_32B_ALIGN; + tokenStructLen_ = flagLenInStruct_ + tokenLenInStruct_ + expLenInStruct_ + weightLenInStruct_ + scaleLenInStruct_; + + /* 注意:flag必须放置在整个token struct的最前端,而且token和token之间不能连续发送。 + 原因:两条ROCE消息通过PCIE总线写到GM内存时,只有第二条消息的第一个分片的写操作和上一条消息保证是保序的,其余分片可能比第一条消息更早写入。 + 后续需要通过下一个token的flag到达来校验第一个token是否收到。 + 满足条件:寄存器默认配置保证消息第一个分片写操作保序 */ + + /* struct结构如下: + | flag: 32B | token(data): H * dtype | exp: alignK * uint32 | weight: alignK * uint32 | scale: 32B | + */ + flagOffsetInStruct_ = 0; + tokenOffsetInStruct_ = flagLenInStruct_; + expOffsetInStruct_ = tokenOffsetInStruct_ + tokenLenInStruct_; + weightOffsetInStruct_ = expOffsetInStruct_ + expLenInStruct_; + scaleOffsetInStruct_ = weightOffsetInStruct_ + weightLenInStruct_; + + tokenGapInStruct_ = (tokenStructLen_ - tokenLenInStruct_) / UB_32B_ALIGN; + infoGapInStruct_ = (tokenStructLen_ - expLenInStruct_) / UB_32B_ALIGN; + + // Input/Output global tensor init + expertIdsGMTensor_.SetGlobalBuffer((__gm__ int32_t *)expertIds); + expandXOutGMTensor_.SetGlobalBuffer((__gm__ ExpandXOutType *)(expandXOut), + worldSize_ * axisBS_ * localMoeExpertNum_ * axisH_); + dynamicScalesOutGMTensor_.SetGlobalBuffer((__gm__ float *)(dynamicScalesOut)); + weightsOutGt.SetGlobalBuffer((__gm__ float *)(expandScales)); + expertTokenNumsOutGM_ = expertTokenNumsOut; // 无GlobalTensor + epRecvCountsGM_ = epRecvCountsOut; // 无GlobalTensor + expandXGM_ = x; + expandIdxGM_ = expertIds; + weightsGM_ = expertScales; + + // RDMA send/recv global tensor init + sendTokensU8Tensor_.SetGlobalBuffer((__gm__ uint8_t *)(windowOutGM_)); + sendStatusTensor_.SetGlobalBuffer((__gm__ uint64_t *)(windowOutGM_ + WIN_SIZE)); + readStatusTensor_.SetGlobalBuffer((__gm__ uint64_t *)(windowInGM_ + WIN_SIZE)); + + // Global work space init + tokenAddrFlagStructGM_ = workspaceGM; + tokenAddrFlagStructGlobalU64Tensor_.SetGlobalBuffer((__gm__ uint64_t *)(tokenAddrFlagStructGM_), + axisBS_ * FLAG_SIZE); + + // Combine info offset init + combineInnerCntOffset = 
localMoeExpertNum_ * serverNum * SERVER_RANK_SIZE * sizeof(int32_t); + combineInnerCntIndexOffset = combineInnerCntOffset + globalBs_ * serverNum * sizeof(int16_t); + combineOuterCntOffset = combineInnerCntIndexOffset + globalBs_ * axisK_ * serverNum * sizeof(int32_t); + combineOuterCntIndexOffset = combineOuterCntOffset + axisBS_ * sizeof(int32_t); + moeExpertNumInServer_ = SERVER_RANK_SIZE * localMoeExpertNum_; + + // UB init + tpipe_->InitBuffer(statusBuf_, FLAG_SIZE); + + tpipe_->InitBuffer(rdmaInBuf_, UB_32B_ALIGN); + ubLocal = rdmaInBuf_.Get(); + + tpipe_->InitBuffer(rdmaInBuf2_, UB_32B_ALIGN); + ubLocalHead = rdmaInBuf2_.Get(); + + tpipe_->InitBuffer(tBuf, TBUF_SIZE); + + // The maximum value of expertIdsCnt_ is 512 * 16, so there is no integer wrap. + uint32_t expertIdsSize = RoundUp(expertIdsCnt_ * static_cast(sizeof(int16_t)), UB_32B_ALIGN); + tokenUbSize_ = TBUF_SIZE - TBUF_TEMP_OFFSET - expertIdsSize; // = 190KB-8KB-16KB=166KB + expertIdsI16Tensor_ = + tBuf.GetWithOffset(axisBS_ * alignK_, tokenUbSize_ + TBUF_TEMP_OFFSET); // 512 * 16 * 2 = 16KB + + // RDMA发送完成标志初始化 + if (aivId_ == 0) { + sendStatusTensor_.SetValue(0, END_OF_WRITE_FLAG_VALUE); + DataCacheCleanAndInvalid( + sendStatusTensor_); + } + + // 每次调用magic++,用来区分不同轮次 + LocalTensor tempLocal = tBuf.Get(); + GlobalTensor magicGt; + magicGt.SetGlobalBuffer((__gm__ uint64_t *)(shareAddrs[rankId_ % SERVER_RANK_SIZE] + IPC_MAGIC_OFFSET) + + aivId_ * UB_32B_ALIGN / sizeof(uint64_t)); + DataCopy(tempLocal, magicGt, UB_32B_ALIGN / sizeof(uint64_t)); + PipeBarrier(); + tempLocal(0) += 1; + magicVal_ = tempLocal(0); + DataCopy(magicGt, tempLocal, UB_32B_ALIGN / sizeof(uint64_t)); + PipeBarrier(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2Layered::AIVRDMAPostSend( + GM_ADDR srcDmaAddr, GM_ADDR destDmaAddr, uint64_t destRankId, uint64_t messageLen, __gm__ HcclAiRMAInfo *QpInfo) +{ + auto qpNum = ((__gm__ HcclAiRMAInfo *)QpInfo)->qpNum; + auto qp_ctx_entry = + (__gm__ HcclAiRMAWQ 
*)(((__gm__ HcclAiRMAInfo *)QpInfo)->sqPtr + + destRankId * qpNum * (uint64_t)(((__gm__ HcclAiRMAInfo *)QpInfo)->sizeOfAiRMAWQ)); + auto mem_info_table = ((__gm__ HcclAiRMAInfo *)QpInfo)->memPtr; + auto sizeof_memdetail = ((__gm__ HcclAiRMAInfo *)QpInfo)->sizeOfAiRMAMem; + auto cur_rank_id = (((__gm__ HcclAiRMAInfo *)QpInfo)->curRankId); + auto sqBaseAddr = qp_ctx_entry->bufAddr; + auto wqeSize = qp_ctx_entry->wqeSize; + auto curHardwareHead = qp_ctx_entry->headAddr; + cacheWriteThrough((__gm__ uint8_t *)curHardwareHead, 8); + uint64_t curHead = *(__gm__ uint32_t *)(curHardwareHead); + auto curHardwareTailAddr = qp_ctx_entry->tailAddr; + uint64_t shift = 15U; + auto QP_DEPTH = qp_ctx_entry->depth; + + PipeBarrier(); + + // Make sure we don't overflow the SQ in an infinite loop - no need to mitigate endless loop as the host + // will timeout and kill the kernel, same as all2all kernel if it fails to complete (e.g. in case of link loss) + while (1) { + cacheWriteThrough((__gm__ uint8_t *)curHardwareTailAddr, 8); + if ((curHead - *(__gm__ uint32_t *)(curHardwareTailAddr)) < QP_DEPTH - 1) { + break; + } + int64_t systemCycleAfter = AscendC::GetSystemCycle(); // add this line to solve slow poll CQ issue + } + + __gm__ uint8_t *wqeAddr = (__gm__ uint8_t *)(sqBaseAddr + wqeSize * (curHead % QP_DEPTH)); + + // Write the WQE to GM + uint64_t ownBit = (curHead >> shift) & 1U; + uint32_t byte_4 = 3U; // [0:4] opcode=0x3(RDMA_WRITE) + byte_4 |= ((~ownBit) << 7U) & (1U << 7U); // [7] owner_bit + byte_4 |= 1U << 8U; // [8:8] IBV_SEND_SIGNALED + + *(__gm__ uint32_t *)(wqeAddr) = byte_4; // Control set by local parameter see above lines + *(__gm__ uint32_t *)(wqeAddr + 4) = messageLen; // message size + *(__gm__ uint32_t *)(wqeAddr + 8) = 0; // immtdata is always 0 till we provide poll CQ flow in AIV + *(__gm__ uint32_t *)(wqeAddr + 12) = 1U << 24U; // [120:127] num_sge = 1 + *(__gm__ uint32_t *)(wqeAddr + 16) = 0; // [128:151] start_sge_idx = 0; + __gm__ HcclAiRMAMemInfo 
*memDetail = (__gm__ HcclAiRMAMemInfo *)(mem_info_table + sizeof_memdetail * destRankId); + *(__gm__ uint32_t *)(wqeAddr + 20) = + ((__gm__ MemDetails *)(memDetail->memDetailPtr + + memDetail->sizeOfMemDetails * static_cast(HcclAiRMAMemType::REMOTE_INPUT))) + ->key; + *(__gm__ uint64_t *)(wqeAddr + 24) = (uint64_t)destDmaAddr; // destination VA + + // Setup SGE and write to GM + __gm__ uint8_t *sgeAddr = wqeAddr + sizeof(struct hns_roce_rc_sq_wqe); + *(__gm__ uint32_t *)(sgeAddr) = messageLen; + memDetail = (__gm__ HcclAiRMAMemInfo *)(mem_info_table + sizeof_memdetail * destRankId); + *(__gm__ uint32_t *)(sgeAddr + sizeof(uint32_t)) = + ((__gm__ MemDetails *)(memDetail->memDetailPtr + + memDetail->sizeOfMemDetails * static_cast(HcclAiRMAMemType::LOCAL_OUTPUT))) + ->key; // L_Key + *(__gm__ uint64_t *)(sgeAddr + 2 * sizeof(uint32_t)) = + (uint64_t)srcDmaAddr; // src VA addr memory registered by RNIC + + // wqe & sge cache flush + cacheWriteThrough(wqeAddr, sizeof(struct hns_roce_rc_sq_wqe) + sizeof(struct hns_roce_lite_wqe_data_seg)); + PipeBarrier(); + curHead++; + + uint64_t doorBellInfo = 0; + doorBellInfo |= qp_ctx_entry->wqn; // [0:23] DB_TAG (qp_num) + doorBellInfo |= 0UL << 24UL; // [24:27] DB_CMD = HNS_ROCE_V2_SQ_DB (0) + doorBellInfo |= (curHead % 65536UL) << 32UL; // [32:47] DB_PI = sq.head + doorBellInfo |= (uint64_t)(qp_ctx_entry->sl) << 48UL; // [48:50] DB_SL = qp.sl + + __gm__ uint64_t *doorBellAddr = (__gm__ uint64_t *)(qp_ctx_entry->dbAddr); + PipeBarrier(); + + ubLocal.SetValue(0, doorBellInfo); + AscendC::GlobalTensor DBGlobalTensor; + DBGlobalTensor.SetGlobalBuffer(doorBellAddr); + AscendC::DataCopyExtParams copyParams{1, 1 * sizeof(uint64_t), 0, 0, 0}; + PipeBarrier(); + AscendC::DataCopyPad(DBGlobalTensor, ubLocal, copyParams); + PipeBarrier(); + + ubLocalHead.SetValue(0, (uint32_t)curHead); + AscendC::GlobalTensor HeadGlobalTensor; + HeadGlobalTensor.SetGlobalBuffer((__gm__ uint32_t *)curHardwareHead); + AscendC::DataCopyExtParams 
copyParamsHead{1, 1 * sizeof(uint32_t), 0, 0, 0}; + PipeBarrier(); + AscendC::DataCopyPad(HeadGlobalTensor, ubLocalHead, copyParamsHead); + PipeBarrier(); +} + +template +__aicore__ inline void +MoeDistributeDispatchV2Layered::CreateInnerReduceInfo(uint32_t serverIdx) +{ + // 最后serverNum个Core加入本函数 + uint32_t curServerId = serverIdx; + uint32_t currServerExpBegin = rankId_ / 8 * moeExpertNumInServer_; // 目标Server的起始专家 + uint32_t currServerExpEnd = currServerExpBegin + moeExpertNumInServer_; // 目标Server的结束专家 + uint32_t tokenOccurNum = 0; + uint32_t expOccurNum = 0; + uint32_t baseBuffOffset = TBUF_TEMP_OFFSET; + + uint32_t tokenStatus = WAIT_STATUS; + uint32_t selfTokenIdx = 0; + LocalTensor localUB_U8 = + tBuf.GetWithOffset(MAX_BS_NUM * alignK_ * sizeof(int32_t), IPC_BUFF_ALIGN); + LocalTensor localUB_32 = localUB_U8.template ReinterpretCast(); + + uint32_t tokenIdx = 0; + while (tokenStatus != FINISH_STATUS) { + if (serverId_ == serverIdx) { + tokenStatus = GetSelfServerTokenInfo(selfTokenIdx, true, localUB_U8[tokenIdx * expLenInStruct_]); + if (tokenStatus == SKIP_STATUS || tokenStatus == ARRIVAL_STATUS) selfTokenIdx++; + } else { + tokenStatus = GetArrivedTokenInfo(curServerId, tokenIdx, true, localUB_U8[tokenIdx * expLenInStruct_]); + } + PipeBarrier(); + if (tokenStatus != ARRIVAL_STATUS) { + continue; + } else { + tokenIdx += 1; + } + } + + uint32_t realBS = tokenIdx; + if (realBS == 0) { + uint32_t copyTokenNum = aivNum_ < globalBs_ ? 
aivNum_ : globalBs_; + LocalTensor zeroTemp = tBuf.GetWithOffset(copyTokenNum * sizeof(int16_t), 0); + Duplicate(zeroTemp, 0, RoundUp(copyTokenNum, B16_PER_BLOCK)); + PipeBarrier(); + GlobalTensor combineInnerCnt; + combineInnerCnt.SetGlobalBuffer( + (__gm__ int16_t *)(epRecvCountsGM_ + combineInnerCntOffset + globalBs_ * curServerId * sizeof(int16_t))); + DataCopyExtParams innerCntWriteCountsParams{1, static_cast(copyTokenNum * sizeof(int16_t)), 0, 0, 0}; + SyncFunc(); + DataCopyPad(combineInnerCnt, zeroTemp, innerCntWriteCountsParams); + PipeBarrier(); + return; + } + + LocalTensor localUB = + tBuf.GetWithOffset(RoundUp(realBS * alignK_, BITS32_PER_BLOCK), baseBuffOffset); + + baseBuffOffset += sizeof(int32_t) * RoundUp(realBS * alignK_, TBUF_OFFSET_ALIGN_B32_CNT); + LocalTensor combineReduceInfo = + tBuf.GetWithOffset(moeExpertNumInServer_ * realBS, baseBuffOffset); + + baseBuffOffset += sizeof(int16_t) * RoundUp(moeExpertNumInServer_ * realBS, TBUF_OFFSET_ALIGN_B32_CNT); + LocalTensor expCntMap = tBuf.GetWithOffset(moeExpertNumInServer_, baseBuffOffset); + + baseBuffOffset += sizeof(int32_t) * RoundUp(moeExpertNumInServer_, TBUF_OFFSET_ALIGN_B32_CNT); + LocalTensor tokenOffset = + tBuf.GetWithOffset(RoundUp(realBS * alignK_, BITS32_PER_BLOCK), baseBuffOffset); + + baseBuffOffset += sizeof(int32_t) * RoundUp(realBS * alignK_, TBUF_OFFSET_ALIGN_B32_CNT); + LocalTensor innerOffsetLt = + tBuf.GetWithOffset(RoundUp(realBS * alignK_, BITS32_PER_BLOCK), baseBuffOffset); + + baseBuffOffset += sizeof(int32_t) * RoundUp(realBS * alignK_, TBUF_OFFSET_ALIGN_B32_CNT); + LocalTensor innerCntLt = + tBuf.GetWithOffset(RoundUp(realBS + aivNum_, B16_PER_BLOCK), baseBuffOffset); + + Duplicate(combineReduceInfo, int16_t(-1), moeExpertNumInServer_ * realBS); + Duplicate(expCntMap, int32_t(0), moeExpertNumInServer_); + Duplicate(tokenOffset, int32_t(0), realBS * alignK_); + Duplicate(innerCntLt, 0, RoundUp(realBS + aivNum_, B16_PER_BLOCK)); + Duplicate(innerOffsetLt, 0, 
(realBS)*alignK_); + + SyncFunc(); + SyncFunc(); + + for (uint32_t tokenIdx = 0; tokenIdx < realBS; tokenIdx++) { + for (uint32_t expIdx = 0; expIdx < axisK_; expIdx++) { + int32_t expId = localUB_32(tokenIdx * alignK_ + expIdx); + if (expId >= currServerExpBegin && expId < currServerExpEnd) { + int32_t expIdInServer = expId % moeExpertNumInServer_; + uint32_t offsetInExp = expCntMap(expIdInServer); + expCntMap(expIdInServer) += 1; + combineReduceInfo(expIdInServer * realBS + offsetInExp) = static_cast(tokenIdx); + tokenOffset(tokenIdx * axisK_ + expIdx) = offsetInExp; + } + } + } + + for (uint32_t expIdx = 0; expIdx < moeExpertNumInServer_; expIdx++) { + if (expIdx % localMoeExpertNum_ == 0) { + continue; + } + expCntMap(expIdx) += expCntMap(expIdx - 1); + } + + for (uint32_t expBlockId = 0; expBlockId < moeExpertNumInServer_; expBlockId++) { + uint32_t validCnt = (expBlockId % localMoeExpertNum_ == 0) + ? expCntMap(expBlockId) + : (expCntMap(expBlockId) - expCntMap(expBlockId - 1)); + for (uint32_t tokenIdx = 0; tokenIdx < validCnt; tokenIdx++) { + uint32_t tokenId = static_cast(combineReduceInfo(expBlockId * realBS + tokenIdx)); + if (tokenId == -1) { + continue; + } + for (uint32_t expIdx = 0; expIdx < axisK_; expIdx++) { + uint32_t expId = localUB_32(tokenId * alignK_ + expIdx); + if (expId >= currServerExpBegin && expId < currServerExpEnd) { + uint32_t expIdInServer = expId % moeExpertNumInServer_; + uint32_t rankIdInServer = expIdInServer / localMoeExpertNum_; + combineReduceInfo(expIdInServer * realBS + tokenOffset(tokenId * axisK_ + expIdx)) = -1; + innerCntLt(tokenOccurNum) += 1; + innerOffsetLt(expOccurNum) = + (expIdInServer % localMoeExpertNum_ == 0) ? 
0 : expCntMap(expIdInServer - 1); + innerOffsetLt(expOccurNum) += rankIdInServer * globalBs_ * axisK_; + innerOffsetLt(expOccurNum) += tokenOffset(tokenId * axisK_ + expIdx); + expOccurNum += 1; + } + } + tokenOccurNum += 1; + } + } + for (uint32_t tokenIdx = 1; tokenIdx < realBS; ++tokenIdx) { + innerCntLt(tokenIdx) += innerCntLt(tokenIdx - 1); + } + PipeBarrier(); + GlobalTensor combineInnerCnt; + + combineInnerCnt.SetGlobalBuffer( + (__gm__ int16_t *)(epRecvCountsGM_ + combineInnerCntOffset + globalBs_ * curServerId * sizeof(int16_t))); + uint32_t copyTokenNum = (realBS + aivNum_) < globalBs_ ? (realBS + aivNum_) : globalBs_; + DataCopyExtParams innerCntWriteCountsParams{1, static_cast(copyTokenNum * sizeof(int16_t)), 0, 0, 0}; + SyncFunc(); + DataCopyPad(combineInnerCnt, innerCntLt, innerCntWriteCountsParams); + PipeBarrier(); + GlobalTensor combineInnerOffset; + combineInnerOffset.SetGlobalBuffer((__gm__ int32_t *)(epRecvCountsGM_ + combineInnerCntIndexOffset + + globalBs_ * axisK_ * curServerId * sizeof(int32_t))); + + DataCopyExtParams innerOffsetWriteCountsParams{1, static_cast(realBS * axisK_ * sizeof(int32_t)), 0, 0, + 0}; + SyncFunc(); + DataCopyPad(combineInnerOffset, innerOffsetLt, innerOffsetWriteCountsParams); + PipeBarrier(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2Layered::CreateOuterReduceInfo() +{ + // 仅一个核进去该逻辑 + uint32_t baseBuffOffset = TBUF_TEMP_OFFSET; + + LocalTensor miniExpIds = tBuf.GetWithOffset(RoundUp(axisBS_, BITS32_PER_BLOCK), baseBuffOffset); + baseBuffOffset += sizeof(int32_t) * RoundUp(axisBS_, TBUF_OFFSET_ALIGN_B32_CNT); + + LocalTensor miniServerExpIds = + tBuf.GetWithOffset(RoundUp(axisBS_ * serverNum, BITS32_PER_BLOCK), baseBuffOffset); + baseBuffOffset += sizeof(int32_t) * RoundUp(axisBS_ * serverNum, TBUF_OFFSET_ALIGN_B32_CNT); + + LocalTensor combineCnt_ = tBuf.GetWithOffset(moeExpertNum_, baseBuffOffset); + baseBuffOffset += sizeof(int32_t) * RoundUp(moeExpertNum_, TBUF_OFFSET_ALIGN_B32_CNT); + + 
LocalTensor combineCntIdx_ = + tBuf.GetWithOffset(RoundUp(axisBS_, BITS32_PER_BLOCK), baseBuffOffset); + baseBuffOffset += sizeof(int32_t) * RoundUp(axisBS_, TBUF_OFFSET_ALIGN_B32_CNT); + + LocalTensor combineOffset_ = tBuf.GetWithOffset(moeExpertNum_, baseBuffOffset); + baseBuffOffset += sizeof(int32_t) * RoundUp(moeExpertNum_, TBUF_OFFSET_ALIGN_B32_CNT); + + LocalTensor combineOffsetIdx_ = + tBuf.GetWithOffset(RoundUp(axisBS_ * serverNum, BITS32_PER_BLOCK), baseBuffOffset); + baseBuffOffset += sizeof(int32_t) * RoundUp(axisBS_ * serverNum, TBUF_OFFSET_ALIGN_B32_CNT); + + LocalTensor outerCntLt = tBuf.GetWithOffset(RoundUp(axisBS_, BITS32_PER_BLOCK), baseBuffOffset); + baseBuffOffset += sizeof(int32_t) * RoundUp(axisBS_, TBUF_OFFSET_ALIGN_B32_CNT); + + LocalTensor outerOffsetLt = + tBuf.GetWithOffset(RoundUp(axisBS_ * axisK_, BITS32_PER_BLOCK), baseBuffOffset); + baseBuffOffset += sizeof(int32_t) * RoundUp(axisBS_ * axisK_, TBUF_OFFSET_ALIGN_B32_CNT); + + LocalTensor expertIdsI32Tensor = + tBuf.GetWithOffset(RoundUp(axisBS_ * axisK_, BITS32_PER_BLOCK), baseBuffOffset); + + DataCopyExtParams expCopyParams{1, static_cast(axisBS_ * axisK_ * sizeof(int32_t)), 0, 0, 0}; + DataCopyPadExtParams expPadParams; + DataCopyPad(expertIdsI32Tensor, expertIdsGMTensor_, expCopyParams, expPadParams); + + Duplicate(miniExpIds, int32_t(moeExpertNum_), RoundUp(axisBS_, BITS32_PER_BLOCK)); + Duplicate(miniServerExpIds, int32_t(moeExpertNum_), RoundUp(axisBS_ * serverNum, BITS32_PER_BLOCK)); + Duplicate(combineCnt_, int32_t(0), moeExpertNum_); + Duplicate(combineOffset_, int32_t(0), moeExpertNum_); + Duplicate(outerCntLt, 0, RoundUp(axisBS_, BITS32_PER_BLOCK)); + Duplicate(outerOffsetLt, 0, RoundUp(axisBS_ * axisK_, BITS32_PER_BLOCK)); + + SyncFunc(); + SyncFunc(); + + // ServerIdx,统计token去往了哪些server,以及在server上的偏移,统计目的专家信息 + for (uint32_t expertIndex = 0; expertIndex < expertIdsCnt_; ++expertIndex) { + uint32_t tokenIdx = expertIndex / axisK_; + uint32_t expId = 
expertIdsI32Tensor(expertIndex); + uint32_t expServerId = expId / moeExpertNumInServer_; // 专家在第几个server + + // 获取当前token中最小的一个expId,用于后续计算该token出现的位置 + uint32_t miniExpId = miniExpIds(tokenIdx); + miniExpIds(tokenIdx) = (expId < miniExpId) ? expId : miniExpId; + + // 当前token每个目的server,统计其最小expId + if (miniServerExpIds(tokenIdx * serverNum + expServerId) > expId) { + miniServerExpIds(tokenIdx * serverNum + expServerId) = expId; + } + + if (expertIndex % axisK_ != axisK_ - 1) { + continue; + } + // token的最后一个expID,将上述信息进行记录 + combineCntIdx_(tokenIdx) = combineCnt_(miniExpId); + combineCnt_(miniExpId) += 1; + + for (uint32_t serverIdx = 0; serverIdx < serverNum; ++serverIdx) { + uint32_t miniServerExpId = miniServerExpIds(tokenIdx * serverNum + serverIdx); + if (miniServerExpId != moeExpertNum_) { + combineOffsetIdx_(tokenIdx * serverNum + serverIdx) = combineOffset_(miniServerExpId); + combineOffset_(miniServerExpId) += 1; + } + } + } + // 计算前序和 + for (uint32_t expertIndex = 1; expertIndex < moeExpertNum_; ++expertIndex) { + combineCnt_(expertIndex) += combineCnt_(expertIndex - 1); + combineOffset_(expertIndex) += combineOffset_(expertIndex - 1); + } + + // 第三次遍历,填充bs个token的Reduceinfo + uint32_t outerOffsetIdx = 0; + for (uint32_t tokenIdx = 0; tokenIdx < axisBS_; ++tokenIdx) { + uint32_t miniExpId = miniExpIds(tokenIdx); + // 将cnt,offset填写到InfoTensor对应的位置 + for (uint32_t serverIdx = 0; serverIdx < serverNum; ++serverIdx) { + // 对于无效server跳过 + uint32_t miniServerExpId = miniServerExpIds(tokenIdx * serverNum + serverIdx); + if (miniServerExpId == moeExpertNum_) { + continue; + } + outerCntLt(tokenIdx) += 1; + uint32_t preServerCnt = (serverIdx == 0) ? 0 : combineOffset_(serverIdx * moeExpertNumInServer_ - 1); + uint32_t serverBaseCnt = serverIdx * axisBS_; + uint32_t preTokenCnt = (miniServerExpId == 0) ? 
0 : combineOffset_(miniServerExpId - 1); + uint32_t tokenOffset = + preTokenCnt - preServerCnt + combineOffsetIdx_(tokenIdx * serverNum + serverIdx) + serverBaseCnt; + outerOffsetLt(outerOffsetIdx) = tokenOffset; + outerOffsetIdx++; + } + } + + // 第四次遍历获取累加和 + for (uint32_t tokenIdx = 1; tokenIdx < axisBS_; ++tokenIdx) { + outerCntLt(tokenIdx) += outerCntLt(tokenIdx - 1); + } + + GlobalTensor combineOuterCnt; + combineOuterCnt.SetGlobalBuffer((__gm__ int32_t *)(epRecvCountsGM_ + combineOuterCntOffset)); + + DataCopyExtParams outerCntWriteCountsParams{1, static_cast(axisBS_ * sizeof(int32_t)), 0, 0, 0}; + SyncFunc(); + DataCopyPad(combineOuterCnt, outerCntLt, outerCntWriteCountsParams); + + GlobalTensor combineOuterOffset; + + PipeBarrier(); + combineOuterOffset.SetGlobalBuffer((__gm__ int32_t *)(epRecvCountsGM_ + combineOuterCntIndexOffset)); + + DataCopyExtParams outerOffsetWriteCountsParams{1, static_cast(axisBS_ * axisK_ * sizeof(int32_t)), 0, 0, + 0}; + SyncFunc(); + DataCopyPad(combineOuterOffset, outerOffsetLt, outerOffsetWriteCountsParams); + PipeBarrier(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2Layered::ReorderTokens() +{ + uint32_t sendTokenNum = axisBS_ / aivNum_; + uint32_t remainderTokenNum = axisBS_ % aivNum_; + uint32_t startTokenId = sendTokenNum * aivId_; + // 分核,每个Core处理sendTokenNum个Token的遍历 + if (aivId_ < remainderTokenNum) { // 前remainderRankNum个aiv需要多发1个卡的数据 + sendTokenNum += 1; + startTokenId += aivId_; + } else { + startTokenId += remainderTokenNum; + } + uint32_t endTokenId = startTokenId + sendTokenNum; + + if (sendTokenNum == 0) { + return; + } + + LocalTensor expertIdsI32Tensor = tBuf.Get(RoundUp(axisBS_ * axisK_, BITS32_PER_BLOCK)); + + DataCopyExtParams expCopyParams{1, static_cast(axisBS_ * axisK_ * sizeof(int32_t)), 0, 0, 0}; + DataCopyPadExtParams expPadParams; + DataCopyPad(expertIdsI32Tensor, expertIdsGMTensor_, expCopyParams, expPadParams); + SyncFunc(); + Cast(expertIdsI16Tensor_, expertIdsI32Tensor, 
RoundMode::CAST_NONE, axisBS_ * axisK_); + SyncFunc(); + + // 计算单个token在ub中占用buffer大小,量化情况下还包含量化所学workspace + uint32_t singleTokenUBSize = tokenStructLen_; + uint32_t quantTokenUBSize = 0; + if constexpr (DynamicQuant) { + quantTokenUBSize = tokenStructLen_ > axisH_ * sizeof(XType) ? tokenStructLen_ : axisH_ * sizeof(XType); + singleTokenUBSize = quantTokenUBSize + axisH_ * sizeof(float); + } + uint32_t maxTokenNumInUB = tokenUbSize_ / singleTokenUBSize; + uint32_t batchNum = (sendTokenNum + maxTokenNumInUB - 1) / maxTokenNumInUB; + + LocalTensor tokenTensorU8_ = + tBuf.GetWithOffset(maxTokenNumInUB * tokenStructLen_, TBUF_TEMP_OFFSET); + LocalTensor tokenTempTensorU64_ = tokenTensorU8_.template ReinterpretCast(); + LocalTensor tokenLt = tokenTensorU8_.template ReinterpretCast(); + LocalTensor tokenCastLt; // 仅量化使用 + GlobalTensor expertIdsGMTensorU8_; + GlobalTensor weightGt; + GlobalTensor xGMtU8; + xGMtU8.SetGlobalBuffer((__gm__ uint8_t *)expandXGM_); + weightGt.SetGlobalBuffer((__gm__ uint8_t *)weightsGM_); + expertIdsGMTensorU8_.SetGlobalBuffer((__gm__ uint8_t *)expandIdxGM_); + + if constexpr (DynamicQuant) { + uint32_t tokenCastLtOffset = RoundUp(TBUF_TEMP_OFFSET + quantTokenUBSize * maxTokenNumInUB, UB_32B_ALIGN); + tokenCastLt = tBuf.GetWithOffset(axisH_ * maxTokenNumInUB, tokenCastLtOffset); + } + + for (uint32_t batchIndex = 0; batchIndex < batchNum; batchIndex++) { + uint32_t currentTokenNum = sendTokenNum > maxTokenNumInUB ? 
maxTokenNumInUB : sendTokenNum; + if constexpr (DynamicQuant) { + DataCopy(tokenTensorU8_, xGMtU8[startTokenId * axisH_ * sizeof(XType)], + currentTokenNum * axisH_ * sizeof(XType)); + PipeBarrier(); + QuantProcess(currentTokenNum, tokenLt, tokenCastLt); + } else { + DataCopyExtParams tokenCopyParams{static_cast(currentTokenNum), + static_cast(tokenLenInStruct_), 0, + static_cast(tokenGapInStruct_), 0}; + DataCopyPadExtParams tokenPadParams; + DataCopyPad(tokenTensorU8_[tokenOffsetInStruct_], xGMtU8[startTokenId * tokenLenInStruct_], tokenCopyParams, + tokenPadParams); + } + PipeBarrier(); + // Expert进行拷贝 + DataCopyExtParams expCopyParams{static_cast(currentTokenNum), static_cast(realLenInStruct_), + 0, static_cast(infoGapInStruct_), 0}; + DataCopyPadExtParams expPadParams; + DataCopyPad(tokenTensorU8_[expOffsetInStruct_], expertIdsGMTensorU8_[startTokenId * realLenInStruct_], + expCopyParams, expPadParams); + PipeBarrier(); + + // Weights进行拷贝 + DataCopyExtParams weightCopyParams{static_cast(currentTokenNum), + static_cast(realLenInStruct_), 0, + static_cast(infoGapInStruct_), 0}; + DataCopyPadExtParams weightPadParams; + DataCopyPad(tokenTensorU8_[weightOffsetInStruct_], weightGt[startTokenId * realLenInStruct_], weightCopyParams, + weightPadParams); + PipeBarrier(); + + for (uint32_t tokenIndex = 0; tokenIndex < currentTokenNum; ++tokenIndex) { + // 获取token在WinOut的地址 + uint32_t tokenId = startTokenId + tokenIndex; + uint32_t startExpId = tokenId * axisK_; + uint32_t flagOffset = (tokenIndex * tokenStructLen_ + flagOffsetInStruct_) / sizeof(uint64_t); + tokenTempTensorU64_(flagOffset) = SHOULD_SEND_FLAG_VALUE; + uint64_t sendServerInfo = 0; + for (uint32_t i = 0; i < axisK_; i++) { + uint32_t expertId = static_cast(expertIdsI16Tensor_(startExpId + i)); // 读取expId + uint32_t dstServerId = expertId / moeExpertNumInServer_; + sendServerInfo |= (1UL << dstServerId); + } + PipeBarrier(); + GlobalTensor sendServerInfoTemp = + 
tokenAddrFlagStructGlobalU64Tensor_[(FLAG_SIZE * tokenId) / sizeof(uint64_t)]; + sendServerInfoTemp.SetValue(0, sendServerInfo); + DataCacheCleanAndInvalid( + sendServerInfoTemp); + PipeBarrier(); + } + uint32_t tokenWinOutOffset = startTokenId * tokenStructLen_; + DataCopy(sendTokensU8Tensor_[tokenWinOutOffset], tokenTensorU8_, currentTokenNum * tokenStructLen_); + PipeBarrier(); + startTokenId += currentTokenNum; + sendTokenNum -= currentTokenNum; + } +} + +template +__aicore__ inline void MoeDistributeDispatchV2Layered::QuantProcess( + uint32_t sendTokenNum, LocalTensor xTokenLt, LocalTensor tokenCastLt) +{ + constexpr uint32_t maxArrUbOffset = 6 * 1024; + constexpr uint32_t maxArrLen = 3; + constexpr uint32_t maxValOffset = 0; + constexpr uint32_t minValOffset = 1; + constexpr uint32_t resValOffset = 2; + constexpr float quantMax = 127.0f; + const half deqScale = static_cast(1.000000e+00f); + float dynamicScale = 0.0; + PipeBarrier(); + LocalTensor workLt = tBuf.GetWithOffset(maxArrUbOffset / sizeof(float), 0); + LocalTensor maxLt = tBuf.GetWithOffset(maxArrLen, maxArrUbOffset); + + Cast(tokenCastLt, xTokenLt, RoundMode::CAST_NONE, sendTokenNum * axisH_); + for (int32_t i = 0; i < sendTokenNum; ++i) { + PipeBarrier(); + if constexpr (DynamicQuant) { + ReduceMax(maxLt[maxValOffset], tokenCastLt[i * axisH_], workLt, axisH_, false); + SyncFunc(); + PipeBarrier(); + ReduceMin(maxLt[minValOffset], tokenCastLt[i * axisH_], workLt, axisH_, false); + PipeBarrier(); + Abs(maxLt, maxLt, maxArrLen - 1); + PipeBarrier(); + ReduceMax(maxLt[resValOffset], maxLt, workLt, maxArrLen - 1, false); + + SyncFunc(); + float maxVal = maxLt(resValOffset); + dynamicScale = float(quantMax) / float(maxVal); + SyncFunc(); + Muls(tokenCastLt[i * axisH_], tokenCastLt[i * axisH_], dynamicScale, axisH_); + PipeBarrier(); + } + + LocalTensor halfLocalTemp = tokenCastLt[i * axisH_].template ReinterpretCast(); + LocalTensor int32LocalTemp = tokenCastLt[i * axisH_].template ReinterpretCast(); + 
Cast(int32LocalTemp, tokenCastLt[i * axisH_], RoundMode::CAST_RINT, axisH_); + PipeBarrier(); + SetDeqScale(deqScale); + PipeBarrier(); + + Cast(halfLocalTemp, int32LocalTemp, RoundMode::CAST_ROUND, axisH_); + + PipeBarrier(); + LocalTensor xOutTensor; + LocalTensor tokenUnitLt; + tokenUnitLt = xTokenLt.template ReinterpretCast(); + xOutTensor = tokenUnitLt[i * tokenStructLen_ + tokenOffsetInStruct_].template ReinterpretCast(); + Cast(xOutTensor, halfLocalTemp, RoundMode::CAST_TRUNC, axisH_); + + LocalTensor scaleTensor = + tokenUnitLt[i * tokenStructLen_ + scaleOffsetInStruct_].template ReinterpretCast(); + scaleTensor.SetValue(0, float(1.0) / dynamicScale); // int8->float32 + } +} + +template +__aicore__ inline void +MoeDistributeDispatchV2Layered::SendDataToServer(uint32_t destServerId) +{ + uint32_t dstRankId = rankId_ % SERVER_RANK_SIZE + destServerId * SERVER_RANK_SIZE; + uint64_t destServerMask = (1UL << destServerId); + + // 根据BufferID选择对应WindowBuffer -> 根据对应本机的Server选择Dst对应预留区域 + uint64_t dstRdmaAddr = (uint64_t)(hccl_.GetWindowsInAddr(dstRankId) + (halfWinSize_ * bufferId_ * 1UL) + + (serverId_ * SERVER_SIZE_ON_WIN * 1UL)); + uint64_t srcRdmaAddrBase = (uint64_t)(hccl_.GetWindowsOutAddr(rankId_) + (halfWinSize_ * bufferId_ * 1UL)); + LocalTensor sendTokenInfoLocalTensor = + tBuf.GetWithOffset((axisBS_ * FLAG_SIZE) / sizeof(uint64_t), 0); + DataCopy(sendTokenInfoLocalTensor, tokenAddrFlagStructGlobalU64Tensor_, (axisBS_ * FLAG_SIZE) / sizeof(uint64_t)); + PipeBarrier(); + + for (uint32_t tokenIdx = 0; tokenIdx < axisBS_; ++tokenIdx) { + uint64_t destServerInfo = sendTokenInfoLocalTensor(tokenIdx * FLAG_SIZE / sizeof(uint64_t)); + if ((destServerInfo & destServerMask) != 0) { // 当前有需要发送的token立即发送 + uint64_t srcRdmaAddr = (uint64_t)(srcRdmaAddrBase + (tokenStructLen_ * tokenIdx * 1UL)); + AIVRDMAPostSend((GM_ADDR)srcRdmaAddr, (GM_ADDR)dstRdmaAddr, dstRankId, tokenStructLen_, qp_info_); + dstRdmaAddr += tokenStructLen_; + PipeBarrier(); + } + } + + uint64_t 
srcFlagRdmaAddr = (uint64_t)(sendStatusTensor_.GetPhyAddr()); + uint64_t dstFlagRdmaAddr = + (uint64_t)(hccl_.GetWindowsInAddr(dstRankId) + halfWinSize_ * bufferId_ + WIN_SIZE + serverId_ * STATE_OFFSET); + AIVRDMAPostSend((GM_ADDR)srcFlagRdmaAddr, (GM_ADDR)dstFlagRdmaAddr, dstRankId, FLAG_SIZE, qp_info_); + PipeBarrier(); +} + +template +__aicore__ inline uint32_t MoeDistributeDispatchV2Layered::GetExpRank(uint32_t expertId) +{ + return expertId / localMoeExpertNum_; +} + +template +__aicore__ inline void MoeDistributeDispatchV2Layered::SetIpcFlag(int32_t flagVal) +{ + if (aivId_ >= SERVER_RANK_SIZE) { + return; + } + uint32_t destRankIdx = aivId_; + uint32_t localRankId = rankId_ % SERVER_RANK_SIZE; + GlobalTensor globalSet; + globalSet.SetGlobalBuffer((__gm__ uint64_t *)(shareAddrs[destRankIdx] + IPC_FLAG_OFFSET) + + localRankId * B64_PER_BLOCK); + LocalTensor localSet = tBuf.GetWithOffset(B64_PER_BLOCK, 0); + uint64_t setVal = magicVal_; + localSet.SetValue(0, setVal); + SyncFunc(); + DataCopy(globalSet, localSet, B64_PER_BLOCK); + PipeBarrier(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2Layered::WaitIpcFlag(int32_t flagVal) +{ + uint64_t waitVal = magicVal_; + if (aivId_ >= SERVER_RANK_SIZE) { + return; + } + LocalTensor localWait = tBuf.GetWithOffset(B64_PER_BLOCK, 0); + bool isSync = true; + uint32_t destRankIdx = aivId_; + uint32_t localRankId = rankId_ % SERVER_RANK_SIZE; + GlobalTensor flagIpcGt; + flagIpcGt.SetGlobalBuffer((__gm__ uint64_t *)(shareAddrs[localRankId] + IPC_FLAG_OFFSET) + + destRankIdx * B64_PER_BLOCK); + PipeBarrier(); + do { + DataCopy(localWait, flagIpcGt, B64_PER_BLOCK); + SyncFunc(); + // 当有core未达到checkValue的阶段时,继续等待 + uint64_t tempVal = localWait.GetValue(0); + if (tempVal >= waitVal) { + break; + } + } while (isSync); +} + +template +__aicore__ inline uint32_t MoeDistributeDispatchV2Layered::GetArrivedTokenInfo( + uint32_t serverIdx, uint32_t tokenIdx, bool justExpInfo, LocalTensor localUB_U8) +{ + GlobalTensor 
TokenFlagGtU64; + GlobalTensor TokensGtU8; + + TokenFlagGtU64.SetGlobalBuffer((__gm__ uint64_t *)(windowInGM_)); + TokensGtU8.SetGlobalBuffer((__gm__ uint8_t *)(windowInGM_)); + + LocalTensor statusTensor = statusBuf_.Get(); + DataCopy(statusTensor, readStatusTensor_[(serverIdx)*STATE_OFFSET / sizeof(uint64_t)], + FLAG_SIZE / sizeof(uint64_t)); + PipeBarrier(); + uint64_t endFlagValue = statusTensor.GetValue(0); + + uint32_t TokenOffset = serverIdx * SERVER_SIZE_ON_WIN + tokenIdx * tokenStructLen_; + DataCopy(statusTensor, TokenFlagGtU64[(TokenOffset + flagOffsetInStruct_) / sizeof(uint64_t)], + FLAG_SIZE / sizeof(uint64_t)); + PipeBarrier(); + uint64_t tokenFlagValue = statusTensor.GetValue(0); + + uint32_t nextTokenOffset = serverIdx * SERVER_SIZE_ON_WIN + (tokenIdx + 1) * tokenStructLen_; + DataCopy(statusTensor, TokenFlagGtU64[(nextTokenOffset + flagOffsetInStruct_) / sizeof(uint64_t)], + FLAG_SIZE / sizeof(uint64_t)); + PipeBarrier(); + uint64_t nextTokenFlagValue = statusTensor.GetValue(0); + + // 等到发送结束信号,没等到token结束信号,则返回结束等待状态 + if (nextTokenFlagValue == SHOULD_SEND_FLAG_VALUE) { + if (justExpInfo) { + DataCopy(localUB_U8, TokensGtU8[TokenOffset + expOffsetInStruct_], expLenInStruct_); + } else { + DataCopy(localUB_U8, TokensGtU8[TokenOffset], tokenStructLen_); + } + PipeBarrier(); + return ARRIVAL_STATUS; + } + + if (endFlagValue != END_OF_WRITE_FLAG_VALUE) { + // 等待 token 或者 endOfWrite + PipeBarrier(); + return WAIT_STATUS; + } else { // 得到上个token->可以处理 + if (tokenFlagValue == SHOULD_SEND_FLAG_VALUE) { + if (justExpInfo) { + DataCopy(localUB_U8, TokensGtU8[TokenOffset + expOffsetInStruct_], expLenInStruct_); + } else { + DataCopy(localUB_U8, TokensGtU8[TokenOffset], tokenStructLen_); + } + PipeBarrier(); + return ARRIVAL_STATUS; + } else { + PipeBarrier(); + return FINISH_STATUS; + } + } +} + +template +__aicore__ inline uint32_t MoeDistributeDispatchV2Layered::GetSelfServerTokenInfo( + uint32_t tokenIdx, bool justExpInfo, LocalTensor localUB_U8) +{ + if 
(tokenIdx >= axisBS_) { + return FINISH_STATUS; + } + + LocalTensor sendTokenInfoLocalTensor = statusBuf_.Get(); + DataCopy(sendTokenInfoLocalTensor, tokenAddrFlagStructGlobalU64Tensor_[tokenIdx * FLAG_SIZE / sizeof(uint64_t)], + FLAG_SIZE / sizeof(uint64_t)); + PipeBarrier(); + + uint64_t sendFlag = sendTokenInfoLocalTensor(0); + + uint64_t destServerMask = (1UL << serverId_); + if ((sendFlag & destServerMask) == 0) { + return SKIP_STATUS; + } else { + GlobalTensor TokensGtU8; + TokensGtU8.SetGlobalBuffer((__gm__ uint8_t *)(windowOutGM_)); + if (justExpInfo) { + DataCopy(localUB_U8, TokensGtU8[tokenIdx * tokenStructLen_ + expOffsetInStruct_], expLenInStruct_); + } else { + DataCopy(localUB_U8, TokensGtU8[tokenIdx * tokenStructLen_], tokenStructLen_); + } + PipeBarrier(); + return ARRIVAL_STATUS; + } +} + +template +__aicore__ inline void MoeDistributeDispatchV2Layered::Win2Ipc() +{ + uint32_t coresPerServer = (aivNum_ - serverNum - 1) / serverNum; + uint32_t logicAivId = aivId_ - serverNum - 1; + if (logicAivId >= coresPerServer * serverNum) { + return; + } + // 计算本core需要处理的ServerId + uint32_t formServerId = logicAivId / coresPerServer; + + uint32_t expStartId = serverId_ * SERVER_RANK_SIZE * localMoeExpertNum_; + uint32_t expEndId = expStartId + SERVER_RANK_SIZE * localMoeExpertNum_; + // 获取到达的Token,统计专家信息,并且完成Ipc发送 + uint32_t tokenIdx = 0; + uint32_t selfTokenIdx = 0; + + uint32_t tokenStatus = WAIT_STATUS; + bool justExpInfo = (tokenIdx % coresPerServer != logicAivId % coresPerServer); + uint32_t tokenNumPerExpInfoSize = + SERVER_RANK_SIZE * localMoeExpertNum_ * EXP_TOKEN_COUNT_FLAG_CNT * sizeof(int32_t); + + GlobalTensor targetTokenIpcGt; + GlobalTensor targetCntIpcGt; + + LocalTensor tokenNumPerExp = + tBuf.GetWithOffset(SERVER_RANK_SIZE * localMoeExpertNum_ * EXP_TOKEN_COUNT_FLAG_CNT, TBUF_TEMP_OFFSET); + LocalTensor localUB_U8 = tBuf.GetWithOffset( + tokenStructLen_ / sizeof(uint8_t), RoundUp(tokenNumPerExpInfoSize + TBUF_TEMP_OFFSET, IPC_BUFF_ALIGN)); + 
LocalTensor localUB_32 = tBuf.GetWithOffset( + tokenStructLen_ / sizeof(int32_t), RoundUp(tokenNumPerExpInfoSize + TBUF_TEMP_OFFSET, IPC_BUFF_ALIGN)); + + Duplicate(tokenNumPerExp, 0, SERVER_RANK_SIZE * localMoeExpertNum_ * EXP_TOKEN_COUNT_FLAG_CNT); + PipeBarrier(); + while (tokenStatus != FINISH_STATUS) { + if (formServerId == serverId_) { + tokenStatus = GetSelfServerTokenInfo(selfTokenIdx, justExpInfo, localUB_U8); + if (tokenStatus == SKIP_STATUS || tokenStatus == ARRIVAL_STATUS) { + selfTokenIdx++; + } + } else { + tokenStatus = GetArrivedTokenInfo(formServerId, tokenIdx, justExpInfo, localUB_U8); + } + + if (tokenStatus != ARRIVAL_STATUS) { + continue; + } + LocalTensor expInfoTensor; + if (justExpInfo) { + expInfoTensor = localUB_32; + } else { + expInfoTensor = localUB_32[expOffsetInStruct_ / sizeof(int32_t)]; + } + + for (int32_t expIndex = 0; expIndex < axisK_; ++expIndex) { + uint32_t targetExpId = (uint32_t)(expInfoTensor(expIndex)); + if (targetExpId < expStartId || targetExpId >= expEndId) { + continue; + } + + uint32_t targetRankId = GetExpRank(targetExpId); + uint32_t localExpIdx = targetExpId % (localMoeExpertNum_ * SERVER_RANK_SIZE); + uint32_t targetTokenIdx = (uint32_t)(tokenNumPerExp(localExpIdx * EXP_TOKEN_COUNT_FLAG_CNT)); + tokenNumPerExp(localExpIdx * EXP_TOKEN_COUNT_FLAG_CNT) += 1; + if (justExpInfo) { + continue; + } + + // 本卡需要发送 + uint32_t targetExpOffset = + (targetExpId % localMoeExpertNum_) * worldSize_ * RANK_SIZE_ON_IPC; // 第几个Exp段 + uint32_t targetServerOffset = formServerId * SERVER_RANK_SIZE * RANK_SIZE_ON_IPC; // 第几个Server段 + uint32_t targetRankOffset = (rankId_ % SERVER_RANK_SIZE) * RANK_SIZE_ON_IPC; // 第几个Rank段 + uint32_t targetTokenOffset = tokenStructLen_ * targetTokenIdx; // 第几个Token位 + uint32_t targetOffset = + targetExpOffset + targetServerOffset + targetRankOffset + targetTokenOffset; // 总偏移 + targetTokenIpcGt.SetGlobalBuffer( + (__gm__ uint8_t *)(shareAddrs[targetRankId % SERVER_RANK_SIZE] + IPC_DATA_OFFSET + 
targetOffset)); + PipeBarrier(); + DataCopy(targetTokenIpcGt, localUB_U8, tokenStructLen_); + PipeBarrier(); + } + tokenIdx += 1; + justExpInfo = (tokenIdx % coresPerServer != logicAivId % coresPerServer); + } + // 数据发送结束,填写tokenNum到对端Ipc,每轮填写coresPerServer个,总共要填写 SERVER_RANK_SIZE * localMoeExpertNum_个 + uint32_t batchNum = (SERVER_RANK_SIZE * localMoeExpertNum_ + coresPerServer - 1) / coresPerServer; + for (uint32_t batch = 0; batch < batchNum; batch++) { + uint32_t targetExpId = expStartId + batch * coresPerServer + logicAivId % coresPerServer; + uint32_t targetRankId = GetExpRank(targetExpId); + if (targetExpId >= expEndId) { + return; + } + uint32_t localExpOffset = targetExpId % (localMoeExpertNum_ * SERVER_RANK_SIZE) * EXP_TOKEN_COUNT_FLAG_CNT; + uint32_t targetCntOffset = ((targetExpId % localMoeExpertNum_) * worldSize_ + formServerId * SERVER_RANK_SIZE + + (rankId_ % SERVER_RANK_SIZE)) * + EXP_TOKEN_COUNT_FLAG_CNT; + targetCntIpcGt.SetGlobalBuffer( + (__gm__ int32_t *)(shareAddrs[targetRankId % SERVER_RANK_SIZE] + IPC_TOKEN_CNT_OFFSET)); + PipeBarrier(); + DataCopy(targetCntIpcGt[targetCntOffset], tokenNumPerExp[localExpOffset], EXP_TOKEN_COUNT_FLAG_CNT); + PipeBarrier(); + } + PipeBarrier(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2Layered::Ipc2Out() +{ + uint32_t coresPerExp = aivNum_ / localMoeExpertNum_; + if (aivId_ >= coresPerExp * localMoeExpertNum_) { + return; + } + uint32_t coresPerServer = aivNum_ / serverNum; + uint32_t localRankId = rankId_ % SERVER_RANK_SIZE; + GlobalTensor flagIpcGt; + flagIpcGt.SetGlobalBuffer((__gm__ int32_t *)(shareAddrs[rankId_ % SERVER_RANK_SIZE])); + // PipeBarrier(); + uint32_t curExpIdx = aivId_ / coresPerExp; // 当前处理的专家在本卡上的Idx + uint32_t localAivId = aivId_ % coresPerExp; // 处理本专家的同一批Core中,本Core的Idx + // 每个exp对应ranksize行 + uint32_t srCntPerExp = serverNum * SERVER_RANK_SIZE; + // 平均每个核处理多少行 + uint32_t srCntPerCore = srCntPerExp / coresPerExp; + // 平分后还剩多少行 + uint32_t srCntRemain = srCntPerExp % 
coresPerExp; + // 前面的核共分到了多少剩余 + uint32_t srCntPreRemain = (localAivId < srCntRemain) ? localAivId : srCntRemain; + // 当前核分到多少行 + uint32_t srCntCurCore = (localAivId < srCntRemain) ? (srCntPerCore + 1) : srCntPerCore; + + GlobalTensor tokenCntIpcGt; + tokenCntIpcGt.SetGlobalBuffer((__gm__ int32_t *)(shareAddrs[rankId_ % SERVER_RANK_SIZE] + IPC_TOKEN_CNT_OFFSET)); + + // tBuf 内存分配 + // 4k ~ 6k 保存按expert统计的token个数信息 + LocalTensor tokenCntByExpUB = tBuf.GetWithOffset(2 * 1024 / sizeof(int64_t), 4 * 1024); + // 6k ~ 8k 保存token个数统计信息 + LocalTensor tokenCntUB = tBuf.GetWithOffset(2 * 1024 / sizeof(int32_t), 6 * 1024); + // 2k ~ 4k 保存权重信息 + LocalTensor weightLt = tBuf.GetWithOffset(2 * 1024 / sizeof(float), 2 * 1024); + + DataCopyExtParams copyExpertIdsParams{1, + static_cast(serverNum * SERVER_RANK_SIZE * localMoeExpertNum_ * + EXP_TOKEN_COUNT_FLAG_CNT * sizeof(int32_t)), + 0, 0, 0}; + DataCopyPadExtParams padParams; + PipeBarrier(); + DataCopyPad(tokenCntUB, tokenCntIpcGt, copyExpertIdsParams, padParams); + + SyncFunc(); + int32_t cntSum = 0; + const int tempSize = serverNum * SERVER_RANK_SIZE * localMoeExpertNum_; + int log2WorldSize = ScalarGetSFFValue<1>(worldSize_); +#pragma unroll 8 + for (uint32_t i = 0; i < tempSize; ++i) { + cntSum += tokenCntUB(i << 3); + tokenCntUB(i) = cntSum; + } + + for (uint32_t i = 0; i < localMoeExpertNum_; ++i) { + if (expertTokenNumsType_ == 1) { + int32_t preValue = (i == 0) ? 
0 : tokenCntUB(i * worldSize_ - 1); + tokenCntByExpUB(i) = static_cast(tokenCntUB(i * worldSize_ + worldSize_ - 1) - preValue); + } else { + tokenCntByExpUB(i) = static_cast(tokenCntUB(i * worldSize_ + worldSize_ - 1)); + } + } + + uint32_t srPreCnt = curExpIdx * srCntPerExp + localAivId * srCntPerCore + srCntPreRemain; + PipeBarrier(); + GlobalTensor srcIpcGt; + srcIpcGt.SetGlobalBuffer((__gm__ uint8_t *)(shareAddrs[rankId_ % SERVER_RANK_SIZE] + IPC_DATA_OFFSET)); + + LocalTensor localUB = tBuf.GetWithOffset(tokenUbSize_ / sizeof(uint8_t), TBUF_TEMP_OFFSET); + LocalTensor localUBfloat = tBuf.GetWithOffset(tokenUbSize_ / sizeof(float), TBUF_TEMP_OFFSET); + LocalTensor localUBint32 = tBuf.GetWithOffset(tokenUbSize_ / sizeof(int32_t), TBUF_TEMP_OFFSET); + + int32_t sumTokenCnt = (0 == srPreCnt) ? 0 : tokenCntUB(srPreCnt - 1); + for (uint32_t idx = 0; idx < srCntCurCore; ++idx) { + // 循环本Core需要处理的Rank数 + uint32_t srIdx = srPreCnt + idx; + int32_t curSrTokenCnt = tokenCntUB(srIdx) - (srIdx == 0 ? 
0 : tokenCntUB(srIdx - 1)); + if (curSrTokenCnt == 0) { + continue; + // 目标Rank没Token发来则跳过 + } + uint32_t tokenCntInUB = tokenUbSize_ / tokenStructLen_; + // 单次能搬移的token数据量 + uint32_t batchCnt = (curSrTokenCnt + tokenCntInUB - 1) / tokenCntInUB; + // 循环搬运次数 + // 分批逻辑待修改,应该是先收集所有待处理Rank的Token,再写out + for (uint32_t batchIdx = 0; batchIdx < batchCnt; ++batchIdx) { + uint32_t tokenCntInBatch = tokenCntInUB; + if (batchIdx == batchCnt - 1) { + tokenCntInBatch = curSrTokenCnt - (batchCnt - 1) * tokenCntInUB; + } + DataCopyExtParams copyTokenParams{static_cast(1), + static_cast(tokenCntInBatch * tokenStructLen_), 0, 0, 0}; + DataCopyPadExtParams padParams; + uint32_t srcIpcOffset = srIdx * RANK_SIZE_ON_IPC + batchIdx * tokenCntInUB * tokenStructLen_; + DataCopyPad(localUB, srcIpcGt[srcIpcOffset], copyTokenParams, padParams); + SyncFunc(); + DataCopyExtParams writeTokenParams{static_cast(tokenCntInBatch), + static_cast(sizeof(ExpandXOutType) * axisH_), + static_cast(tokenGapInStruct_), 0, 0}; + LocalTensor outUB = localUB.ReinterpretCast(); + DataCopyPad(expandXOutGMTensor_[(sumTokenCnt + batchIdx * tokenCntInUB) * axisH_], + outUB[tokenOffsetInStruct_ / sizeof(ExpandXOutType)], writeTokenParams); + PipeBarrier(); + + for (uint32_t tokenIdx = 0; tokenIdx < tokenCntInBatch; tokenIdx++) { + for (uint32_t expIdx = 0; expIdx < axisK_; expIdx++) { + uint32_t expOffset = (tokenIdx * tokenStructLen_ + expOffsetInStruct_) / sizeof(int32_t) + expIdx; + if (curExpIdx + rankId_ * localMoeExpertNum_ == localUBint32(expOffset)) { + uint32_t weightOffset = expOffset + alignK_; + weightLt(tokenIdx) = localUBfloat(weightOffset); + break; + } + } + LocalTensor pintfLt = + localUBfloat[(tokenIdx * tokenStructLen_ + weightOffsetInStruct_) / sizeof(float)]; + } + // weight output + PipeBarrier(); + DataCopyExtParams weightTokenParams{static_cast(1), + static_cast(tokenCntInBatch * sizeof(float)), 0, 0, 0}; + DataCopyPad(weightsOutGt[(sumTokenCnt + batchIdx * tokenCntInUB)], weightLt, 
weightTokenParams); + PipeBarrier(); + // dynamic scales to output + if constexpr (DynamicQuant) { + DataCopyExtParams quantTokenParams{ + static_cast(tokenCntInBatch), static_cast(sizeof(float)), + static_cast((tokenStructLen_ - UB_32B_ALIGN) / UB_32B_ALIGN), 0, 0}; + + LocalTensor quantTempUB = localUB[scaleOffsetInStruct_].ReinterpretCast(); + DataCopyPad(dynamicScalesOutGMTensor_[(sumTokenCnt + batchIdx * tokenCntInUB)], quantTempUB, + quantTokenParams); + } + SyncFunc(); + } + sumTokenCnt += curSrTokenCnt; + } + if (aivId_ == 0) { + // 搬运token统计信息到output + GlobalTensor tokenNumsGlobal; + tokenNumsGlobal.SetGlobalBuffer((__gm__ int32_t *)(epRecvCountsGM_)); + DataCopyExtParams countsParams{ + 1, static_cast(localMoeExpertNum_ * serverNum * SERVER_RANK_SIZE * sizeof(int32_t)), 0, 0, 0}; + SyncFunc(); + DataCopyPad(tokenNumsGlobal, tokenCntUB, countsParams); + + // 搬运按expert的token信息到output + GlobalTensor expertTokenNumsGlobal; + expertTokenNumsGlobal.SetGlobalBuffer((__gm__ int64_t *)(expertTokenNumsOutGM_)); + DataCopyExtParams writeCountsParams{1, static_cast(localMoeExpertNum_ * sizeof(int64_t)), 0, 0, 0}; + SyncFunc(); + DataCopyPad(expertTokenNumsGlobal, tokenCntByExpUB, writeCountsParams); + } +} + +template +__aicore__ inline void MoeDistributeDispatchV2Layered::CleanUp() +{ + if (aivId_ == 0) { + bufferChosenGlobal_(0) = bufferId_ ^ 1; + DataCacheCleanAndInvalid( + bufferChosenGlobal_); + } + + uint32_t tokenEndFlagCleanSize = MAX_BS_NUM * FLAG_SIZE; + uint32_t writeEndFlagCleanSize = serverNum * STATE_OFFSET; + uint32_t maxCleanSize = + tokenEndFlagCleanSize > writeEndFlagCleanSize ? 
tokenEndFlagCleanSize : writeEndFlagCleanSize; + LocalTensor cleanTempLt_ = tBuf.GetWithOffset(maxCleanSize / sizeof(int32_t), TBUF_TEMP_OFFSET); + Duplicate(cleanTempLt_, 0, maxCleanSize / sizeof(int32_t)); + PipeBarrier(); + if (aivId_ == serverNum - 1) { + GlobalTensor readStatusTensorU32; + readStatusTensorU32.SetGlobalBuffer((__gm__ int32_t *)(windowInGM_ + WIN_SIZE)); + DataCopy(readStatusTensorU32, cleanTempLt_, writeEndFlagCleanSize / sizeof(uint32_t)); + } + + GlobalTensor tokenEndFlagCleanTensor; + tokenEndFlagCleanTensor.SetGlobalBuffer((__gm__ int32_t *)(windowInGM_ + aivId_ * SERVER_SIZE_ON_WIN)); + DataCopyExtParams cleanTokenEndFlagParams{uint16_t(MAX_BS_NUM), uint32_t(flagLenInStruct_), 0, + uint32_t(tokenStructLen_ - flagLenInStruct_), 0}; + SyncFunc(); + DataCopyPad(tokenEndFlagCleanTensor[flagOffsetInStruct_ / sizeof(int32_t)], cleanTempLt_, cleanTokenEndFlagParams); + PipeBarrier(); +} + +template +__aicore__ inline void MoeDistributeDispatchV2Layered::Process() +{ + if ASCEND_IS_AIV { // 全aiv处理 + ReorderTokens(); + PipeBarrier(); + SyncAll(); + if (aivId_ < serverNum) { + if (aivId_ != serverId_) { + SendDataToServer(aivId_); + } + CreateInnerReduceInfo(aivId_); + } else if (aivId_ == serverNum) { + CreateOuterReduceInfo(); + } else { + Win2Ipc(); + } + PipeBarrier(); + SyncAll(); + SetIpcFlag(IPC_FLAG_STEP_1); + WaitIpcFlag(IPC_FLAG_STEP_1); + PipeBarrier(); + SyncAll(); + Ipc2Out(); + if (aivId_ < serverNum) { + PipeBarrier(); + CleanUp(); + } + + PipeBarrier(); + SyncAll(); + hccl_.Finalize(); + } +} +} // namespace MoeDistributeDispatchA2Impl +#endif // MOE_DISTRIBUTE_DISPATCH_A2_LAYERED_H diff --git a/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2_tiling.h b/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2_tiling.h new file mode 100644 index 000000000..75ba73d4f --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/moe_distribute_dispatch_v2_tiling.h @@ -0,0 +1,57 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. 
All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! + * \file moe_distribute_dispatch_v2_tiling.h + * \brief + */ + +#ifndef ASCENDC_MOE_DISTRIBUTE_DISPATCH_V2_TILING_H +#define ASCENDC_MOE_DISTRIBUTE_DISPATCH_V2_TILING_H + +// a2 +struct MoeDistributeDispatchV2Info { + uint32_t epWorldSize; // epWorldSize + uint32_t tpWorldSize; // tpWorldSize + uint32_t epRankId; // epRankId + uint32_t tpRankId; // tpRankId + uint32_t expertSharedType; // expert type + uint32_t sharedExpertRankNum; // shared expert number + uint32_t moeExpertNum; // moe expert number + uint32_t quantMode; // quant mode + uint32_t globalBs; // globalBs = BS * worldSize + uint32_t bs; // bs + uint32_t k; // k + uint32_t h; // h + uint32_t aivNum; // aivNum + bool isQuant; // whether quant or not + bool isTokenMask; // input active mask 1dims or not + bool isExpertMask; // input active mask 2dims or not + bool reserved1; // reserved + bool reserved2; // reserved + bool reserved3; // reserved + uint64_t totalUbSize; // epWorldSize + uint32_t expertTokenNumsType; // expert token nums type, support 0: cumsum mode, 1: count mode + int32_t zeroComputeExpertNum; // sum of zero、copy and const expert nums +}; + +struct MoeDistributeDispatchV2TilingData { + Mc2InitTiling mc2InitTiling; + Mc2CcTiling mc2CcTiling; + MoeDistributeDispatchV2Info moeDistributeDispatchV2Info; +}; + +#endif diff --git a/csrc/deepep/ops2/op_kernel/sync_collectives.h 
b/csrc/deepep/ops2/op_kernel/sync_collectives.h new file mode 100644 index 000000000..11b75cbb4 --- /dev/null +++ b/csrc/deepep/ops2/op_kernel/sync_collectives.h @@ -0,0 +1,433 @@ +#ifndef SYNC_COLLECTIVES_H +#define SYNC_COLLECTIVES_H + +#include "comm_args.h" + +using namespace AscendC; +using namespace Moe; + +// Synchronization flag occupies length +constexpr int64_t FLAG_UNIT_INT_NUM = 4; +// Memory size occupied by each synchronization unit (Bytes) +constexpr int64_t SYNC_UNIT_SIZE = FLAG_UNIT_INT_NUM * sizeof(int64_t); +// High-order offset when using magic as a comparison value +constexpr int64_t MAGIC_OFFSET = 32; +constexpr int64_t MAGIC_MASK = ~((1LL << MAGIC_OFFSET) - 1); + +class SyncCollectives +{ +public: + __aicore__ inline SyncCollectives() {} + + __aicore__ inline void Init(int rank, int rankSize, GM_ADDR *shareAddrs, TBuf &tBuf) + { + this->rank = rank; + this->rankSize = rankSize; + this->shareAddrs = shareAddrs; + this->blockIdx = GetBlockIdx(); + this->blockNum = GetBlockNum(); + // Length of a single indicator segment + segmentCount = GetBlockNum() * FLAG_UNIT_INT_NUM; + // Initialize the intra-card/inter-card synchronization address corresponding to the current core. + localSyncAddr = (__gm__ int64_t *)(shareAddrs[rank]); + basicSyncAddr = (__gm__ int64_t *)(shareAddrs[rank]) + GetBlockIdx() * FLAG_UNIT_INT_NUM; + blockOuterSyncAddr = (__gm__ int64_t *)(shareAddrs[rank]) + segmentCount + GetBlockIdx() * FLAG_UNIT_INT_NUM; + this->tBuf = tBuf; + } + + __aicore__ inline void SetSyncFlag(int32_t magic, int32_t value, int32_t eventID) + { + int64_t v = MergeMagicWithValue(magic, value); + SetFlag(localSyncAddr + eventID * FLAG_UNIT_INT_NUM, v); + } + + /** + * @brief Set the flag for the specified eventID of the designated card, with the value being a combination of magic + * and value. + * @param magic The operator batch, which will be combined into the high 32 bits of the flag value to be set. 
+ * @param value The specific value to be set, which will be the low 32 bits of the flag value to be set. + * @param eventID Physically, it is an offset from the shared memory base address (requires scaling, not an absolute + * value). + * @param rank This rank is the rankId corresponding to the peerMems array in the CommArgs structure, not a global + * or local id. (Local is not applicable in the 91093 scenario, and global is not applicable in the 910B + * multi-machine scenario.) + */ + __aicore__ inline void SetSyncFlag(int32_t magic, int32_t value, int32_t eventID, int32_t rank) + { + int64_t v = MergeMagicWithValue(magic, value); + SetFlag((__gm__ int64_t *)(shareAddrs[rank]) + eventID * FLAG_UNIT_INT_NUM, v); + } + + __aicore__ inline int32_t CalEventIdByMulBlockNum(int32_t blockMultiplier, int32_t targetCoreId) + { + return (blockMultiplier * blockNum) + targetCoreId; + } + + /** + * @brief Wait for the flag of the specified eventID on the specified card to become a value + * composed of the combination of magic and value. + * @param magic The operator batch, which will be combined into the high 32 bits of the flag + * value to be wait. + * @param value The specific value to be wait, which will be the low 32 bits of the flag + * value to be wait. + * @param eventID Physically, it is an offset from the shared memory base address (requires + * scaling, not an absolute value). + * @param rank This rank is the rankId corresponding to the peerMems array in the CommArgs + * structure, not a global or local id. (Local is not applicable in the 91093 + * scenario, and global is not applicable in the 910B multi-machine scenario.) 
+ */ + __aicore__ inline void WaitSyncFlag(int32_t magic, int32_t value, int32_t eventID, int32_t rank) + { + int64_t v = MergeMagicWithValue(magic, value); + WaitOneRankPartFlag((__gm__ int64_t *)(shareAddrs[rank]) + eventID * FLAG_UNIT_INT_NUM, 1, v); + } + + __aicore__ inline void WaitSyncFlag(int32_t magic, int32_t value, int32_t eventID) + { + int64_t v = MergeMagicWithValue(magic, value); + WaitOneRankPartFlag((__gm__ int64_t *)(shareAddrs[this->rank]) + eventID * FLAG_UNIT_INT_NUM, 1, v); + } + + /** + * @brief Wait for the flags starting from the specified eventID on the specified card to become + * a value composed of the combination of magic and value.
+ * Note: [eventID, eventID + flagNum) + */ + __aicore__ inline void WaitSyncFlag(int32_t magic, int32_t value, int32_t eventID, int32_t rank, int64_t flagNum) + { + int64_t v = MergeMagicWithValue(magic, value); + WaitOneRankPartFlag((__gm__ int64_t *)(shareAddrs[rank]) + eventID * FLAG_UNIT_INT_NUM, flagNum, v); + } + + // Set inner-card synchronization flag (memory A) + __aicore__ inline void SetInnerFlag(int32_t magic, int32_t eventID) + { + int64_t value = MergeMagicWithValue(magic, eventID); + SetFlag(basicSyncAddr, value); + } + + __aicore__ inline void SetInnerFlag(int32_t magic, int32_t eventID, int64_t setRank, int64_t setBlock) + { + int64_t value = MergeMagicWithValue(magic, eventID); + SetFlag((__gm__ int64_t *)(shareAddrs[setRank]) + setBlock * FLAG_UNIT_INT_NUM, value); + } + + // Wait for a single inner-card synchronization flag (memory A) + __aicore__ inline void WaitInnerFlag(int32_t magic, int32_t eventID, int64_t waitRank, int64_t waitBlock) + { + int64_t value = MergeMagicWithValue(magic, eventID); + WaitOneRankPartFlag((__gm__ int64_t *)(shareAddrs[waitRank]) + waitBlock * FLAG_UNIT_INT_NUM, 1, value); + } + + // Wait for all inner-card synchronization flags within the entire rank (memory A) + __aicore__ inline void WaitRankInnerFlag(int32_t magic, int32_t eventID, int64_t waitRank) + { + int64_t value = MergeMagicWithValue(magic, eventID); + WaitOneRankAllFlag((__gm__ int64_t *)(shareAddrs[waitRank]), value); + } + + // Check all inner-card synchronization flags within the entire rank (memory A) + __aicore__ inline bool CheckRankInnerFlag(int32_t magic, int32_t eventID, int64_t waitRank) + { + int64_t value = MergeMagicWithValue(magic, eventID); + return CheckOneRankAllFlag((__gm__ int64_t *)(shareAddrs[waitRank]), value); + } + + // Set inter-card synchronization flag (memory B) + __aicore__ inline void SetOuterFlag(int32_t magic, int32_t eventID) + { + int64_t value = MergeMagicWithValue(magic, eventID); + SetFlag(blockOuterSyncAddr, 
value); + } + + __aicore__ inline void SetOuterFlag(int32_t magic, int32_t eventID, int64_t setRank, int64_t setBlock) + { + __gm__ int64_t *flagAddr = GetOuterFlagAddr(setRank, setBlock); + int64_t value = MergeMagicWithValue(magic, eventID); + SetFlag(flagAddr, value); + } + + // Wait for a single inter-card synchronization flag (memory B) + __aicore__ inline void WaitOuterFlag(int32_t magic, int32_t eventID, int64_t waitRank, int64_t waitBlock) + { + int64_t value = MergeMagicWithValue(magic, eventID); + __gm__ int64_t *flagAddr = GetOuterFlagAddr(waitRank, waitBlock); + WaitOneRankPartFlag(flagAddr, 1, value); + } + + // Wait for all inter-card synchronization flags within the entire rank (memory B) + __aicore__ inline void WaitOneRankOuterFlag(int32_t magic, int32_t eventID, int64_t rank) + { + int64_t value = MergeMagicWithValue(magic, eventID); + __gm__ int64_t *flagAddr; + flagAddr = GetOuterFlagAddr(rank, 0); + WaitOneRankPartFlag(flagAddr, blockNum, value); + } + + // Wait for flagNum inter-card synchronization flags starting from startBlock for all ranks (memory B) + __aicore__ inline void WaitAllRankPartOuterFlag(int32_t magic, int32_t eventID, int64_t startBlock, int64_t flagNum) + { + int64_t value = MergeMagicWithValue(magic, eventID); + __gm__ int64_t *flagAddr; + int waitRank; + for (auto r = 0; r < rankSize; ++r) { + waitRank = (rank + r) % rankSize; // Offset reading of rank flags to prevent performance impact from + // concurrent copying by multiple cores + flagAddr = GetOuterFlagAddr(waitRank, startBlock); + WaitOneRankPartFlag(flagAddr, flagNum, value); + } + } + + // Check flagNum inter-card synchronization flags starting from startBlock for all ranks (memory B) + __aicore__ inline bool CheckAllRankPartOuterFlag(int32_t magic, int32_t eventID, int64_t startBlock, + int64_t flagNum) + { + int64_t value = MergeMagicWithValue(magic, eventID); + __gm__ int64_t *flagAddr; + int waitRank; + for (auto r = 0; r < rankSize; ++r) { + waitRank = (rank + 
r) % rankSize; // Offset reading of rank flags to prevent performance impact from + // concurrent copying by multiple cores + flagAddr = GetOuterFlagAddr(waitRank, startBlock); + if (!CheckOneRankPartFlag(flagAddr, flagNum, value)) { + return false; + } + } + return true; + } + + // Wait for all inter-card synchronization flags for all ranks, full rank synchronization (memory B) + __aicore__ inline void WaitAllRankOuterFlag(int32_t magic, int32_t eventID) + { + WaitAllRankPartOuterFlag(magic, eventID, 0, blockNum); + } + + // Check all inter-card synchronization flags for all ranks, full rank synchronization (memory B) + __aicore__ inline bool CheckAllRankOuterFlag(int32_t magic, int32_t eventID) + { + return CheckAllRankPartOuterFlag(magic, eventID, 0, blockNum); + } + + // Low-level interface, set synchronization flag + __aicore__ inline void SetFlag(__gm__ int64_t *setAddr, int64_t setValue) + { + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + GlobalTensor globalSet; + globalSet.SetGlobalBuffer(setAddr, FLAG_UNIT_INT_NUM); + LocalTensor localSet = tBuf.GetWithOffset(1, 0); + localSet.SetValue(0, setValue); + + // Copy global synchronization flag to local + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); // Wait for SetValue to complete + DataCopy(globalSet, localSet, FLAG_UNIT_INT_NUM); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); // Wait for UB->GM to complete + } + + // Low-level interface, wait for synchronization flag + __aicore__ inline void WaitFlag(__gm__ int64_t *waitAddr, int64_t waitValue) + { + WaitOneRankPartFlag(waitAddr, 1, waitValue); + } + + // Read a flag, return an immediate number + __aicore__ inline int64_t GetFlag(__gm__ int64_t *waitAddr) + { + GlobalTensor globalWait; + globalWait.SetGlobalBuffer(waitAddr, FLAG_UNIT_INT_NUM); + LocalTensor localWait = tBuf.GetWithOffset(1, 0); + // Copy global to local + DataCopy(localWait, 
globalWait, FLAG_UNIT_INT_NUM); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); // Wait for GM->UB + + int64_t res = localWait.GetValue(0); + return res; + } + + // Get multiple consecutive synchronization flags within a single card + __aicore__ inline void WaitOneRankPartOuterFlag(int32_t magic, int32_t eventID, int64_t waitRank, + int64_t startBlock, int64_t flagNum) + { + int64_t value = MergeMagicWithValue(magic, eventID); + __gm__ int64_t *flagAddr; + flagAddr = GetOuterFlagAddr(waitRank, startBlock); + WaitOneRankPartFlag(flagAddr, flagNum, value); + } + + // Get synchronization flag within a single card (memory A) + __aicore__ inline int64_t GetInnerFlag(int64_t waitRank, int64_t waitBlock) + { + return GetFlag((__gm__ int64_t *)(shareAddrs[waitRank]) + waitBlock * FLAG_UNIT_INT_NUM); + } + + __aicore__ inline int64_t GetOuterFlag(int64_t waitRank, int64_t waitBlock) + { + return GetFlag((__gm__ int64_t *)(shareAddrs[waitRank]) + segmentCount + waitBlock * FLAG_UNIT_INT_NUM); + } + + // In the rank Chunk Flag area, return success if the destRank chunk Flag value is 0, otherwise fail + __aicore__ inline int64_t GetChunkFlag(int64_t rank, int64_t destRank, int64_t magic, int64_t timeout) + { + int64_t value = MergeMagicWithValue(magic, 0); + int64_t status = GetChunkFlagValue( + (__gm__ int64_t *)(shareAddrs[rank]) + IPC_CHUNK_FLAG + destRank * FLAG_UNIT_INT_NUM, value, timeout); + return status; + } + + // Set the destRank chunk Flag value in the rank Chunk Flag area to value + __aicore__ inline void SetChunkFlag(int64_t rank, int64_t destRank, int64_t magic, int64_t eventId) + { + int64_t value = MergeMagicWithValue(magic, eventId); + SetFlag((__gm__ int64_t *)(shareAddrs[rank]) + IPC_CHUNK_FLAG + destRank * FLAG_UNIT_INT_NUM, value); + } + + __aicore__ inline int64_t GetChunkRecvLen(int64_t rank, int64_t destRank, int64_t magic, int64_t timeout) + { + int64_t len = + GetChunkFlagValue((__gm__ int64_t *)(shareAddrs[rank]) + IPC_CHUNK_FLAG + 
destRank * FLAG_UNIT_INT_NUM, 0, + timeout, true, magic); + return len; + } + +private: + __aicore__ inline int64_t MergeMagicWithValue(int32_t magic, int32_t value) + { + // Merge magic as the high bits and eventID as the low bits into a value for comparison + return (static_cast(static_cast(magic)) << MAGIC_OFFSET) | static_cast(value); + } + + __aicore__ inline __gm__ int64_t *GetInnerFlagAddr(int64_t flagRank, int64_t flagBlock) + { + return (__gm__ int64_t *)(shareAddrs[flagRank]) + flagBlock * FLAG_UNIT_INT_NUM; + } + + __aicore__ inline __gm__ int64_t *GetOuterFlagAddr(int64_t flagRank, int64_t flagBlock) + { + return (__gm__ int64_t *)(shareAddrs[flagRank]) + segmentCount + flagBlock * FLAG_UNIT_INT_NUM; + } + + // Wait for a part of synchronization flags within a rank + __aicore__ inline void WaitOneRankPartFlag(__gm__ int64_t *waitAddr, int64_t flagNum, int64_t checkValue) + { + GlobalTensor globalWait; + globalWait.SetGlobalBuffer(waitAddr, flagNum * FLAG_UNIT_INT_NUM); + LocalTensor localWait = tBuf.GetWithOffset(flagNum * FLAG_UNIT_INT_NUM, 0); + bool isSync = true; + int64_t checkedFlagNum = 0; + do { + // Copy global synchronization flags to local + DataCopy(localWait, globalWait[checkedFlagNum * FLAG_UNIT_INT_NUM], + (flagNum - checkedFlagNum) * FLAG_UNIT_INT_NUM); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); // Wait for GM->UB + + // Check if the synchronization flags are equal to checkValue + isSync = true; + int64_t remainToCheck = flagNum - checkedFlagNum; + for (auto i = 0; i < remainToCheck; ++i) { + // Continue waiting if any core has not reached the checkValue phase + int64_t v = localWait.GetValue(i * FLAG_UNIT_INT_NUM); + if ((v & MAGIC_MASK) != (checkValue & MAGIC_MASK) || v < checkValue) { + isSync = false; + checkedFlagNum += i; + break; + } + } + } while (!isSync); + } + + // Wait for all synchronization flags within a rank + __aicore__ inline void WaitOneRankAllFlag(__gm__ int64_t *waitAddr, int64_t checkValue) + { + 
WaitOneRankPartFlag(waitAddr, blockNum, checkValue); + } + + // Check partial synchronization flags within a rank, copy only once + __aicore__ inline bool CheckOneRankPartFlag(__gm__ int64_t *waitAddr, int64_t flagNum, int64_t checkValue) + { + GlobalTensor globalWait; + globalWait.SetGlobalBuffer(waitAddr, flagNum * FLAG_UNIT_INT_NUM); + LocalTensor localWait = tBuf.GetWithOffset(flagNum * FLAG_UNIT_INT_NUM, 0); + // Copy global synchronization flags to local + DataCopy(localWait, globalWait, flagNum * FLAG_UNIT_INT_NUM); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); // Wait for GM->UB + // Check if the synchronization flags are equal to checkValue + bool isSync = true; + for (auto i = 0; i < flagNum; ++i) { + // Continue waiting if any core has not reached the checkValue phase + int64_t v = localWait.GetValue(i * FLAG_UNIT_INT_NUM); + if ((v & MAGIC_MASK) != (checkValue & MAGIC_MASK) || v < checkValue) { + isSync = false; + break; + } + } + return isSync; + } + + __aicore__ inline int64_t GetChunkFlagValue(__gm__ int64_t *waitAddr, int64_t checkValue, int64_t timeout, + bool checkNonZero = false, int64_t magic = 0) + { + GlobalTensor globalWait; + globalWait.SetGlobalBuffer(waitAddr, FLAG_UNIT_INT_NUM); + LocalTensor localWait = tBuf.GetWithOffset(FLAG_UNIT_INT_NUM, 0); + bool isSync = true; + + int64_t waitTimes = 0; + int64_t v = 0; + + do { + // Copy global sync flag to local + DataCopy(localWait, globalWait[0], FLAG_UNIT_INT_NUM); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); // Wait for GM->UB + + isSync = true; + v = localWait.GetValue(0); + if (checkNonZero) { + // Non-zero check mode + if (((v & MAGIC_MASK) == (static_cast(magic) << MAGIC_OFFSET)) && (v & 0xFFFFFFFF)) { + return v & 0xFFFFFFFF; // Return lower 32 bits when non-zero + } + } else { + // Exact value check mode + if (v == checkValue) { + return WAIT_SUCCESS; + } + } + + isSync = false; + waitTimes++; + + if (timeout > INT64_MAX / MAX_WAIT_ROUND_UNIT || 
waitTimes >= (timeout * MAX_WAIT_ROUND_UNIT)) { + isSync = true; + return v; // Return the read flag value + } + } while (!isSync); + + return checkNonZero ? 0 : v; + } + + // Check all sync flags within a rank, copy only once + __aicore__ inline bool CheckOneRankAllFlag(__gm__ int64_t *waitAddr, int64_t checkValue) + { + return CheckOneRankPartFlag(waitAddr, blockNum, checkValue); + } + int rank; + int rankSize; + int blockIdx; + int blockNum; + GM_ADDR *shareAddrs; + int64_t segmentCount; // Length of a single sync flag segment (count in int64_t) + __gm__ int64_t *localSyncAddr; + __gm__ int64_t *basicSyncAddr; // Intra-card sync flag address for the current block + __gm__ int64_t *blockOuterSyncAddr; // Inter-card sync flag address for the current block + TBuf tBuf; +}; + +#endif // SYNC_COLLECTIVES_H diff --git a/csrc/deepep/ops2/scripts/help.info b/csrc/deepep/ops2/scripts/help.info new file mode 100644 index 000000000..de0069dc0 --- /dev/null +++ b/csrc/deepep/ops2/scripts/help.info @@ -0,0 +1 @@ + --install-path Install operator package to specific dir path diff --git a/csrc/deepep/ops2/scripts/install.sh b/csrc/deepep/ops2/scripts/install.sh new file mode 100755 index 000000000..e302e094d --- /dev/null +++ b/csrc/deepep/ops2/scripts/install.sh @@ -0,0 +1,317 @@ +#!/bin/bash + +vendor_name=customize +targetdir=/usr/local/Ascend/opp +target_custom=0 + +sourcedir=$PWD/packages +vendordir=vendors/$vendor_name + +QUIET="y" + +while true +do + case $1 in + --quiet) + QUIET="y" + shift + ;; + --install-path=*) + INSTALL_PATH=$(echo $1 | cut -d"=" -f2-) + INSTALL_PATH=${INSTALL_PATH%*/} + shift + ;; + --*) + shift + ;; + *) + break + ;; + esac +done + +log() { + cur_date=`date +"%Y-%m-%d %H:%M:%S"` + echo "[ops_custom] [$cur_date] "$1 +} + +if [ -n "${INSTALL_PATH}" ]; then + if [[ ! "${INSTALL_PATH}" = /* ]]; then + log "[ERROR] use absolute path for --install-path argument" + exit 1 + fi + if [ ! 
-d ${INSTALL_PATH} ]; then + mkdir ${INSTALL_PATH} >> /dev/null 2>&1 + if [ $? -ne 0 ]; then + log "[ERROR] create ${INSTALL_PATH} failed" + exit 1 + fi + fi + targetdir=${INSTALL_PATH} +elif [ -n "${ASCEND_CUSTOM_OPP_PATH}" ]; then + if [ ! -d ${ASCEND_CUSTOM_OPP_PATH} ]; then + mkdir -p ${ASCEND_CUSTOM_OPP_PATH} >> /dev/null 2>&1 + if [ $? -ne 0 ]; then + log "[ERROR] create ${ASCEND_CUSTOM_OPP_PATH} failed" + fi + fi + targetdir=${ASCEND_CUSTOM_OPP_PATH} +else + if [ "x${ASCEND_OPP_PATH}" == "x" ]; then + log "[ERROR] env ASCEND_OPP_PATH no exist" + exit 1 + fi + targetdir="${ASCEND_OPP_PATH}" +fi + +if [ ! -d $targetdir ];then + log "[ERROR] $targetdir no exist" + exit 1 +fi + +if [ ! -x $targetdir ] || [ ! -w $targetdir ] || [ ! -r $targetdir ];then + log "[WARNING] The directory $targetdir does not have sufficient permissions. \ + Please check and modify the folder permissions (e.g., using chmod), \ + or use the --install-path option to specify an installation path and \ + change the environment variable ASCEND_CUSTOM_OPP_PATH to the specified path." +fi + +upgrade() +{ + if [ ! -d ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 files" + return 0 + fi + + if [ ! -d ${targetdir}/$vendordir/$1 ];then + log "[INFO] create ${targetdir}/$vendordir/$1." + mkdir -p ${targetdir}/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/$vendordir/$1 failed" + return 1 + fi + else + has_same_file=-1 + for file_a in ${sourcedir}/$vendordir/$1/*; do + file_b=${file_a##*/}; + if [ "ls ${targetdir}/$vendordir/$1" = "" ]; then + log "[INFO] ${targetdir}/$vendordir/$1 is empty !!" + return 1 + fi + grep -q $file_b <<<`ls ${targetdir}/$vendordir/$1`; + if [[ $? 
-eq 0 ]]; then + echo -n "${file_b} " + has_same_file=0 + fi + done + if [ 0 -eq $has_same_file ]; then + if test $QUIET = "n"; then + echo "[INFO]: has old version in ${targetdir}/$vendordir/$1, \ + you want to Overlay Installation , please enter:[o]; \ + or replace directory installation , please enter: [r]; \ + or not install , please enter:[n]." + + while true + do + read orn + if [ "$orn" = n ]; then + return 0 + elif [ "$orn" = m ]; then + break; + elif [ "$orn" = r ]; then + [ -n "${targetdir}/$vendordir/$1/" ] && rm -rf "${targetdir}/$vendordir/$1"/* + break; + else + log "[ERROR] input error, please input again!" + fi + done + fi + fi + log "[INFO] replace or merge old ops $1 files .g....." + fi + + log "copy new ops $1 files ......" + if [ -d ${targetdir}/$vendordir/$1/ ]; then + chmod -R +w "$targetdir/$vendordir/$1/" >/dev/null 2>&1 + fi + cp -rf ${sourcedir}/$vendordir/$1/* $targetdir/$vendordir/$1/ + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 files failed" + return 1 + fi + + return 0 +} +upgrade_proto() +{ + if [ ! -f ${sourcedir}/$vendordir/custom.proto ]; then + log "[INFO] no need to upgrade custom.proto files" + return 0 + fi + if [ ! -d ${targetdir}/$vendordir/framework/caffe ];then + log "[INFO] create ${targetdir}/$vendordir/framework/caffe." + mkdir -p ${targetdir}/$vendordir/framework/caffe + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/$vendordir/framework/caffe failed" + return 1 + fi + else + if [ -f ${targetdir}/$vendordir/framework/caffe/custom.proto ]; then + # 有老版本,判断是否要覆盖式安装 + if test $QUIET = "n"; then + echo "[INFO] ${targetdir}/$vendordir/framework/caffe has old version"\ + "custom.proto file. Do you want to replace? [y/n] " + + while true + do + read yn + if [ "$yn" = n ]; then + return 0 + elif [ "$yn" = y ]; then + break; + else + log "[ERROR] input error, please input again!" + fi + done + fi + fi + log "[INFO] replace old caffe.proto files ......" 
+ fi + chmod -R +w "$targetdir/$vendordir/framework/caffe/" >/dev/null 2>&1 + cp -rf ${sourcedir}/$vendordir/custom.proto ${targetdir}/$vendordir/framework/caffe/ + if [ $? -ne 0 ];then + log "[ERROR] copy new custom.proto failed" + return 1 + fi + log "[INFO] copy custom.proto success" + + return 0 +} + +upgrade_file() +{ + if [ ! -e ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 file" + return 0 + fi + + log "copy new $1 files ......" + cp -f ${sourcedir}/$vendordir/$1 $targetdir/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 file failed" + return 1 + fi + + return 0 +} + +delete_optiling_file() +{ + if [ ! -d ${targetdir}/vendors ];then + log "[INFO] $1 not exist, no need to uninstall" + return 0 + fi + sys_info=$(uname -m) + if [ ! -d ${sourcedir}/$vendordir/$1/ai_core/tbe/op_tiling/lib/linux/${sys_info} ];then + rm -rf ${sourcedir}/$vendordir/$1/ai_core/tbe/op_tiling/liboptiling.so + fi + return 0 +} + +log "[INFO] copy uninstall sh success" + +if [ ! -d ${targetdir}/vendors ];then + log "[INFO] create ${targetdir}/vendors." + mkdir -p ${targetdir}/vendors + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/vendors failed" + exit 1 + fi +fi +chmod u+w ${targetdir}/vendors + +log "[INFO] upgrade framework" +upgrade framework +if [ $? -ne 0 ];then + exit 1 +fi + +log "[INFO] upgrade op proto" +upgrade op_proto +if [ $? -ne 0 ];then + exit 1 +fi + +log "[INFO] upgrade op impl" +delete_optiling_file op_impl +upgrade op_impl +if [ $? -ne 0 ];then + exit 1 +fi + +log "[INFO] upgrade op api" +upgrade op_api +if [ $? -ne 0 ];then + exit 1 +fi + +log "[INFO] upgrade version.info" +upgrade_file version.info +if [ $? -ne 0 ];then + exit 1 +fi + +upgrade_proto +if [ $? 
-ne 0 ];then + exit 1 +fi + +# set the set_env.bash +if [ -n "${INSTALL_PATH}" ] && [ -d ${INSTALL_PATH} ]; then + _ASCEND_CUSTOM_OPP_PATH=${targetdir}/${vendordir} + bin_path="${_ASCEND_CUSTOM_OPP_PATH}/bin" + set_env_variable="#!/bin/bash\nexport ASCEND_CUSTOM_OPP_PATH=${_ASCEND_CUSTOM_OPP_PATH}:\${ASCEND_CUSTOM_OPP_PATH}\nexport LD_LIBRARY_PATH=${_ASCEND_CUSTOM_OPP_PATH}/op_api/lib/:\${LD_LIBRARY_PATH}" + if [ ! -d ${bin_path} ]; then + mkdir -p ${bin_path} >> /dev/null 2>&1 + if [ $? -ne 0 ]; then + log "[ERROR] create ${bin_path} failed" + exit 1 + fi + fi + echo -e ${set_env_variable} > ${bin_path}/set_env.bash + if [ $? -ne 0 ]; then + log "[ERROR] write ASCEND_CUSTOM_OPP_PATH to set_env.bash failed" + exit 1 + else + log "[INFO] using requirements: when custom module install finished or before you run the custom module, \ + execute the command [ source ${bin_path}/set_env.bash ] to set the environment path" + fi +else + _ASCEND_CUSTOM_OPP_PATH=${targetdir}/${vendordir} + config_file=${targetdir}/vendors/config.ini + if [ ! -f ${config_file} ]; then + touch ${config_file} + chmod 640 ${config_file} + echo "load_priority=$vendor_name" > ${config_file} + if [ $? 
-ne 0 ];then + log "[ERROR] echo load_priority failed" + exit 1 + fi + else + found_vendors="$(grep -w "load_priority" "$config_file" | cut --only-delimited -d"=" -f2-)" + found_vendor=$(echo $found_vendors | sed "s/\<$vendor_name\>//g" | tr ',' ' ') + vendor=$(echo $found_vendor | tr -s ' ' ',') + if [ "$vendor" != "" ]; then + sed -i "/load_priority=$found_vendors/s@load_priority=$found_vendors@load_priority=$vendor_name,$vendor@g" "$config_file" + fi + fi + log "[INFO] using requirements: when custom module install finished or before you run the custom module, \ + execute the command [ export LD_LIBRARY_PATH=${_ASCEND_CUSTOM_OPP_PATH}/op_api/lib/:\${LD_LIBRARY_PATH} ] to set the environment path" +fi + +if [ -d ${targetdir}/$vendordir/op_impl/cpu/aicpu_kernel/impl/ ]; then + chmod -R 440 ${targetdir}/$vendordir/op_impl/cpu/aicpu_kernel/impl/* >/dev/null 2>&1 +fi + +echo "SUCCESS" +exit 0 diff --git a/csrc/deepep/ops2/scripts/upgrade.sh b/csrc/deepep/ops2/scripts/upgrade.sh new file mode 100755 index 000000000..38a591397 --- /dev/null +++ b/csrc/deepep/ops2/scripts/upgrade.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +vendor_name=customize +targetdir=/usr/local/Ascend/opp +target_custom=0 + +sourcedir=$PWD/packages +vendordir=vendors/$vendor_name + +log() { + cur_date=`date +"%Y-%m-%d %H:%M:%S"` + echo "[ops_custom] [$cur_date] "$1 +} + +if [[ "x${ASCEND_OPP_PATH}" == "x" ]];then + log "[ERROR] env ASCEND_OPP_PATH no exist" + exit 1 +fi + +targetdir=${ASCEND_OPP_PATH} + +if [ ! -d $targetdir ];then + log "[ERROR] $targetdir no exist" + exit 1 +fi + +if [ ! -x $targetdir ] || [ ! -w $targetdir ] || [ ! -r $targetdir ];then + log "[WARNING] The directory $targetdir does not have sufficient permissions. \ + Please check and modify the folder permissions (e.g., using chmod), \ + or use the --install-path option to specify an installation path and \ + change the environment variable ASCEND_CUSTOM_OPP_PATH to the specified path." +fi + +upgrade() +{ + if [ ! 
-d ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 files" + return 0 + fi + + if [ ! -d ${targetdir}/$vendordir/$1 ];then + log "[INFO] create ${targetdir}/$vendordir/$1." + mkdir -p ${targetdir}/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/$vendordir/$1 failed" + return 1 + fi + else + vendor_installed_dir=$(ls "$targetdir/vendors" 2> /dev/null) + for i in $vendor_installed_dir;do + vendor_installed_file=$(ls "$vendor_installed_dir/$vendor_name/$i" 2> /dev/null) + if [ "$i" = "$vendor_name" ] && [ "$vendor_installed_file" != "" ]; then + echo "[INFO]: $vendor_name custom opp package has been installed on the path $vendor_installed_dir, \ + you want to Overlay Installation , please enter:[o]; \ + or replace directory installation , please enter: [r]; \ + or not install , please enter:[n]." + fi + while true + do + read mrn + if [ "$mrn" = m ]; then + break + elif [ "$mrn" = r ]; then + [ -n "$vendor_installed_file" ] && rm -rf "$vendor_installed_file" + break + elif [ "$mrn" = n ]; then + return 0 + else + log "[WARNING]: Input error, please input m or r or n to choose!" + fi + done + done + log "[INFO] replace old ops $1 files ......" + fi + + log "copy new ops $1 files ......" + cp -rf ${sourcedir}/$vendordir/$1/* $targetdir/$vendordir/$1/ + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 files failed" + return 1 + fi + + return 0 +} + +upgrade_file() +{ + if [ ! -e ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 file" + return 0 + fi + + log "copy new $1 files ......" + cp -f ${sourcedir}/$vendordir/$1 $targetdir/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 file failed" + return 1 + fi + + return 0 +} + +log "[INFO] copy uninstall sh success" + +log "[INFO] upgrade framework" +upgrade framework +if [ $? -ne 0 ];then + exit 1 +fi + +log "[INFO] upgrade op proto" +upgrade op_proto +if [ $? 
-ne 0 ];then + exit 1 +fi + +log "[INFO] upgrade op impl" +upgrade op_impl +if [ $? -ne 0 ];then + exit 1 +fi + +log "[INFO] upgrade op api" +upgrade op_api +if [ $? -ne 0 ];then + exit 1 +fi + +log "[INFO] upgrade version.info" +upgrade_file version.info +if [ $? -ne 0 ];then + exit 1 +fi + +config_file=${targetdir}/vendors/config.ini +found_vendors="$(grep -w "load_priority" "$config_file" | cut --only-delimited -d"=" -f2-)" +found_vendor=$(echo $found_vendors | sed "s/\<$vendor_name\>//g" | tr ',' ' ') +vendor=$(echo $found_vendor | tr -s ' ' ',') +if [ "$vendor" != "" ]; then + sed -i "/load_priority=$found_vendors/s@load_priority=$found_vendors@load_priority=$vendor_name,$vendor@g" "$config_file" +fi + +echo "SUCCESS" +exit 0 diff --git a/csrc/deepep/ops2/utils/op_host/error_log.h b/csrc/deepep/ops2/utils/op_host/error_log.h new file mode 100644 index 000000000..d809a9226 --- /dev/null +++ b/csrc/deepep/ops2/utils/op_host/error_log.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + * Description: create log implementation file + * Author: Han Jiahui + * Create: 2025-05-21 + * Note: + * History: 2025-05-21 create log implementation file + */ +#ifndef OPS_BUILT_IN_OP_TILING_ERROR_LOG_H_ +#define OPS_BUILT_IN_OP_TILING_ERROR_LOG_H_ + +#include +#include "toolchain/slog.h" + +#define OP_LOGI(opname, ...) +#define OP_LOGW(opname, ...) \ + printf("[WARN]" __VA_ARGS__); \ + printf("\n") +#define OP_LOGE_WITHOUT_REPORT(opname, ...) \ + printf("[ERRORx]" __VA_ARGS__); \ + printf("\n") +#define OP_LOGE(opname, ...) \ + printf("[ERROR]" __VA_ARGS__); \ + printf("\n") +#define OP_LOGD(opname, ...) + +namespace optiling { + +#define VECTOR_INNER_ERR_REPORT_TILIING(op_name, err_msg, ...) 
\ + do { \ + OP_LOGE_WITHOUT_REPORT(op_name, err_msg, ##__VA_ARGS__); \ + } while (0) + +#define OP_TILING_CHECK(cond, log_func, expr) \ + do { \ + if (cond) { \ + log_func; \ + expr; \ + } \ + } while (0) +} // namespace optiling + +#endif // OPS_BUILT_IN_OP_TILING_ERROR_LOG_H_ diff --git a/python/deep_ep/deep_ep/buffer.py b/python/deep_ep/deep_ep/buffer.py index 59813b5a4..fd7c592e6 100644 --- a/python/deep_ep/deep_ep/buffer.py +++ b/python/deep_ep/deep_ep/buffer.py @@ -431,6 +431,7 @@ def low_latency_dispatch( topk_idx: torch.Tensor, num_max_dispatch_tokens_per_rank: int, num_experts: int, + topk_weights: Optional[torch.Tensor] = None, cumulative_local_expert_recv_stats: Optional[torch.Tensor] = None, use_fp8: bool = True, round_scale: bool = False, @@ -487,11 +488,13 @@ def low_latency_dispatch( packed_recv_count, packed_recv_src_info, packed_recv_layout_range, + expand_scales, event, hook, ) = self.runtime.low_latency_dispatch( x, topk_ids, + topk_weights, cumulative_local_expert_recv_stats, num_max_dispatch_tokens_per_rank, num_experts, @@ -508,6 +511,7 @@ def low_latency_dispatch( x.size(1), num_experts, packed_recv_count, + expand_scales, ) tensors_to_record = ( x, @@ -572,6 +576,7 @@ def low_latency_combine( hidden, num_experts, packed_recv_count, + expand_scales, ) = handle combined_x, event, hook = self.runtime.low_latency_combine( x, @@ -586,6 +591,7 @@ def low_latency_combine( async_finish, return_recv_hook, out, + expand_scales, ) tensors_to_record = ( x, diff --git a/tests/python/deepep/test_low_latency.py b/tests/python/deepep/test_low_latency.py index ca2a48ed3..69c6f8558 100644 --- a/tests/python/deepep/test_low_latency.py +++ b/tests/python/deepep/test_low_latency.py @@ -72,6 +72,7 @@ def test( use_fp8=dispatch_use_fp8, round_scale=False, use_ue8m0=False, + topk_weights=topk_weights, cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats, async_finish=not return_recv_hook, return_recv_hook=return_recv_hook, @@ -133,6 +134,16 @@ def 
test( ) # Check combine correctness + ( + src_info, + layout_range, + num_max_dispatch_tokens_per_rank, + hidden, + num_experts, + packed_recv_count, + expand_scales, + ) = handle + out = torch.empty((num_tokens, hidden), dtype=torch.bfloat16, device="npu") combined_x, event, hook = buffer.low_latency_combine( simulated_gemm_x, @@ -151,9 +162,14 @@ def test( combined_x, ) assert torch.isnan(combined_x).sum().item() == 0 - assert diff < 1e-5, f"Error: {diff=}, {zero_copy=}" + if dispatch_use_fp8: + assert diff < 1e-4, f"Error: {diff=}" + else: + assert diff < 1e-5, f"Error: {diff=}" hash_value ^= hash_tensor(combined_x) + print(f"rank {rank} PASSED") + # noinspection PyShadowingNames def test_func(zero_copy: bool, return_recv_hook: bool): recv_x, recv_count, handle, event, hook = buffer.low_latency_dispatch( @@ -161,6 +177,7 @@ def test_func(zero_copy: bool, return_recv_hook: bool): topk_idx, num_tokens, num_experts, + topk_weights=topk_weights, cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats, use_fp8=dispatch_use_fp8, async_finish=False,