Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
set -e

BUILD_DEEPEP_MODULE="ON"
BUILD_DEEPEP_OPS="ON"
BUILD_KERNELS_MODULE="ON"
BUILD_MEMORY_SAVER_MODULE="ON"

Expand All @@ -20,6 +21,11 @@ while getopts ":a:hd" opt; do
case "$OPTARG" in
deepep )
BUILD_DEEPEP_MODULE="ON"
BUILD_DEEPEP_OPS="ON"
;;
deepep2 )
BUILD_DEEPEP_MODULE="ON"
BUILD_DEEPEP_OPS="OFF"
;;
kernels )
BUILD_KERNELS_MODULE="ON"
Expand Down Expand Up @@ -120,7 +126,11 @@ function build_deepep_kernels()
if [[ "$ONLY_BUILD_DEEPEP_ADAPTER_MODULE" == "ON" ]]; then return 0; fi
if [[ "$BUILD_DEEPEP_MODULE" != "ON" ]]; then return 0; fi

KERNEL_DIR="csrc/deepep/ops"
if [[ "$BUILD_DEEPEP_OPS" == "ON" ]]; then
KERNEL_DIR="csrc/deepep/ops"
else
KERNEL_DIR="csrc/deepep/ops2"
fi
CUSTOM_OPP_DIR="${CURRENT_DIR}/python/deep_ep/deep_ep"

cd "$KERNEL_DIR" || exit
Expand All @@ -137,6 +147,7 @@ function build_deepep_kernels()
echo "find run package: $custom_opp_file"
chmod +x "$custom_opp_file"
fi
rm -rf "$CUSTOM_OPP_DIR"/vendors
./build_out/custom_opp_*.run --install-path=$CUSTOM_OPP_DIR
cd -
}
Expand Down
10 changes: 10 additions & 0 deletions csrc/deepep/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
# this is the cmakelist file for deepep build
# deepep will be built as separated wheel package

if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set(DEEPEP_ARCH "x86_64")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
set(DEEPEP_ARCH "aarch64")
else()
message(FATAL_ERROR "Unsupported host processor: ${CMAKE_SYSTEM_PROCESSOR}")
endif()

set(PROJECT_BUILD_PATH ${PROJECT_BINARY_DIR})
set(TARGET_INSTALL_DIR ${CMAKE_INSTALL_PREFIX})
set(ASCEND_HOME_PATH ${ASCEND_HOME_PATH})
Expand All @@ -27,6 +35,7 @@ target_include_directories( deep_ep_cpp PRIVATE
${TORCH_NPU_DIR}/include/third_party/acl/inc/acl
${TORCH_NPU_DIR}/include/third_party/acl/inc
${ASCEND_HOME_PATH}/include
${ASCEND_HOME_PATH}/${DEEPEP_ARCH}-linux/include/experiment/platform
)
target_link_directories(deep_ep_cpp PRIVATE
${TORCH_DIR}/lib
Expand All @@ -38,6 +47,7 @@ target_link_libraries(deep_ep_cpp PRIVATE
ascendcl
hccl
torch_npu
opapi
)

message(STATUS "TARGET_INSTALL_DIR = ${TARGET_INSTALL_DIR}")
Expand Down
79 changes: 56 additions & 23 deletions csrc/deepep/deep_ep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ Buffer::Buffer(int64_t rank, int64_t num_ranks, int64_t num_nvl_bytes, int64_t n
}

this->shared_expert_rank_num = get_value_from_env("MOE_SHARED_EXPERT_RANK_NUM", 0);

soc_version = op::GetCurrentPlatformInfo().GetSocVersion();
}

Buffer::~Buffer() noexcept(false) {}
Expand Down Expand Up @@ -440,9 +442,10 @@ Buffer::intranode_combine(const torch::Tensor &x, const torch::Tensor &topk_idx,
return {combined_x, recv_topk_weights, event};
}

std::tuple<at::Tensor, std::optional<at::Tensor>, at::Tensor, at::Tensor, at::Tensor, std::optional<EventHandle>,
std::optional<std::function<void()>>>
std::tuple<at::Tensor, std::optional<at::Tensor>, at::Tensor, at::Tensor, at::Tensor, at::Tensor,
std::optional<EventHandle>, std::optional<std::function<void()>>>
Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx,
const std::optional<torch::Tensor> &topk_weights,
const std::optional<at::Tensor> &cumulative_local_expert_recv_stats,
int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts, bool use_fp8,
bool round_scale, bool use_ue8m0, bool async, bool return_recv_hook)
Expand Down Expand Up @@ -493,25 +496,54 @@ Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx,
at::empty({num_max_tokens, hidden}, new_x.options().dtype(use_fp8 ? at::kChar : at::kBFloat16));
auto packed_recv_x_scales = at::empty({num_max_tokens}, at::dtype(at::kFloat).device(device));
auto expandIdx = at::empty({max_size}, at::dtype(at::kInt).device(device));
auto ep_recv_count = at::empty({num_local_experts * num_ranks}, at::dtype(at::kInt).device(device));
// NOTE(review): original comment was mojibake (mis-encoded Chinese); it appears
// to say the buffer sizes below differ per communication layout — confirm.
int32_t server_num = num_ranks / LOCAL_RANK_SIZE;
at::Tensor expand_scales = at::empty({1}, at::dtype(at::kFloat).device(device));
; // only needed for the A2 layered path (reallocated in that branch below)
at::Tensor ep_recv_count =
at::empty({num_local_experts * num_ranks}, at::dtype(at::kInt).device(device)); // A2 non-layered / A3
at::Tensor expert_scales =
at::ones({num_tokens, num_topk}, at::dtype(at::kFloat).device(device)); // A2 non-layered / A3
auto tp_recv_count = at::empty({1}, at::dtype(at::kInt).device(device));
auto packed_recv_count = at::empty({num_local_experts}, at::dtype(at::kLong).device(device));
auto expandScales = at::empty({1}, at::dtype(at::kFloat).device(device));
at::Tensor scales;
at::Tensor activateMask;
auto expert_scales = at::empty({1}, at::dtype(at::kFloat).device(device));
at::Tensor activate_mask;

if (soc_version == op::SocVersion::ASCEND910B) {
const char *hcclIntraPcieEnable = getenv("HCCL_INTRA_PCIE_ENABLE");
const char *hcclIntraRoceEnable = getenv("HCCL_INTRA_ROCE_ENABLE");
if (hcclIntraPcieEnable != nullptr && hcclIntraRoceEnable != nullptr && strcmp(hcclIntraPcieEnable, "1") == 0 &&
strcmp(hcclIntraRoceEnable, "0") == 0) { // A2 layered
if (topk_weights.has_value()) {
if (!this->is_padding) {
expert_scales = topk_weights.value();
} else {
std::vector<at::Tensor> weight_blocks;
if (topk_weights->size(0) != 0) {
weight_blocks.emplace_back(topk_weights.value());
}
for (int i = 0; i < this->padding_cnt; i++) {
at::Tensor tmp_weight =
torch::arange(0, num_topk, topk_weights->options()).reshape({1, num_topk});
weight_blocks.emplace_back(tmp_weight);
}
expert_scales = torch::cat(weight_blocks, 0);
}
}
int64_t recv_count_tensor_size = num_experts + 2 * global_bs * num_topk * server_num;
ep_recv_count = at::empty({recv_count_tensor_size}, at::dtype(at::kInt).device(device));
expand_scales = at::empty({num_max_tokens}, at::dtype(at::kFloat).device(device));
activate_mask = (new_topk_idx >= 0).to(torch::kBool);
}
}

int64_t quant_mode = use_fp8 ? 2 : 0;
int64_t tp_size = 1;
int64_t tp_rank = 0;
int64_t expert_shard_type = 0;
int outType = get_value_from_env("MOE_EXPERT_TOKEN_NUMS_TYPE", 1);
int64_t expert_token_nums_type = outType;

std::string comm_log = "0";
std::vector<char> comm_log_buf(comm_log.begin(), comm_log.end());
comm_log_buf.push_back('\0');
char *comm_log_ptr = comm_log_buf.data();

// get ep & tp name
char hcom_ep_name[HCOMM_NAME_LEN];
if (!moe_all_to_all_group_name.empty()) {
Expand All @@ -520,10 +552,11 @@ Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx,
HCCL_CHECK(HcclGetCommName(ep_comm, hcom_ep_name));
}
char hcom_tp_name[HCOMM_NAME_LEN] = {0};
char comm_alg[] = "fullmesh";

EXEC_NPU_CMD(aclnnMoeDistributeDispatchV2, new_x, new_topk_idx,
scales, // smooth scales,
activateMask, // activateMask
activate_mask, // activateMask
expert_scales, // expert_scales
hcom_ep_name, // ep
num_ranks, // rankSize
Expand All @@ -538,17 +571,17 @@ Buffer::low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx,
quant_mode,
global_bs, // global_bs
expert_token_nums_type, // expert_token_nums_type
comm_log_ptr, packed_recv_x,
comm_alg, packed_recv_x,
packed_recv_x_scales, // dynamicScalesOut
expandIdx,
packed_recv_count, // expertTokenNumsOut
ep_recv_count, tp_recv_count, expandScales);
ep_recv_count, tp_recv_count, expand_scales);

// Wait streams
std::optional<EventHandle> event;

// Return values
return {packed_recv_x, packed_recv_x_scales, packed_recv_count, expandIdx, ep_recv_count,
return {packed_recv_x, packed_recv_x_scales, packed_recv_count, expandIdx, ep_recv_count, expand_scales,
event, std::function<void()>([] {})};
}

Expand All @@ -561,7 +594,7 @@ std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<v
const at::Tensor &x, const at::Tensor &topk_idx, const at::Tensor &topk_weights, const at::Tensor &src_info,
const at::Tensor &layout_range, int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts,
const at::Tensor &packed_recv_count, bool zero_copy, bool async, bool return_recv_hook,
const std::optional<at::Tensor> &out)
const std::optional<at::Tensor> &out, const at::Tensor &expand_scales)
{
at::Tensor new_idx = topk_idx;
at::Tensor new_scales = topk_weights;
Expand Down Expand Up @@ -598,8 +631,11 @@ std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<v
at::Tensor ep_send_counts = layout_range;
at::Tensor expert_scales = new_scales;
at::Tensor tp_send_counts = at::empty({1}, at::dtype(at::kInt).device(device));
at::Tensor x_active_mask, activation_scale, weight_scale, group_list, expand_scales;

at::Tensor activation_scale, weight_scale, group_list;
at::Tensor x_active_mask;
if (soc_version == op::SocVersion::ASCEND910B) {
x_active_mask = (new_topk_idx >= 0).to(torch::kBool);
}
int64_t tp_world_size = 1;
int64_t tp_rankId = 0;
int64_t expert_shared_type = 0;
Expand All @@ -613,16 +649,13 @@ std::tuple<at::Tensor, std::optional<EventHandle>, std::optional<std::function<v
at::Tensor shared_expert_x{nullptr};
at::Tensor combined_x = at::empty({num_combined_tokens, hidden}, x.options());
std::optional<EventHandle> event;
std::string comm_log = "0";
std::vector<char> comm_log_buf(comm_log.begin(), comm_log.end());
comm_log_buf.push_back('\0');
char *comm_log_ptr = comm_log_buf.data();
char comm_alg[] = "fullmesh";

EXEC_NPU_CMD(aclnnMoeDistributeCombineV2, expand_x, expert_ids, expand_idx, ep_send_counts, expert_scales,
tp_send_counts, x_active_mask, activation_scale, weight_scale, group_list, expand_scales,
shared_expert_x, hcom_ep_name, num_ranks, rank, num_experts, hcom_tp_name, tp_world_size, tp_rankId,
expert_shared_type, shared_expert_num, shared_expert_rank_num, global_bs, out_dtype, comm_quant_mode,
group_list_type, comm_log_ptr, combined_x);
group_list_type, comm_alg, combined_x);
if (this->is_padding) {
if (this->padding_cnt == PADDING_SIZE) {
combined_x = this->ori_x;
Expand Down
9 changes: 6 additions & 3 deletions csrc/deepep/deep_ep.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <optional>
#include "hccl/hccl.h"
#include "hccl/hccl_types.h"
#include "aclnn/opdev/platform.h"

#include "config.hpp"
#include "event.hpp"
Expand All @@ -16,6 +17,7 @@ namespace deep_ep {
struct Buffer {
int64_t rank, rdma_rank;
int64_t num_ranks;
op::SocVersion soc_version;

int64_t num_nvl_bytes;
int64_t num_rdma_bytes;
Expand Down Expand Up @@ -73,9 +75,10 @@ struct Buffer {
const std::optional<torch::Tensor> &topk_weights, const torch::Tensor &src_idx,
const torch::Tensor &send_head, const std::optional<at::Tensor> &combine_send_cost_stats);

std::tuple<at::Tensor, std::optional<at::Tensor>, at::Tensor, at::Tensor, at::Tensor, std::optional<EventHandle>,
std::optional<std::function<void()>>>
std::tuple<at::Tensor, std::optional<at::Tensor>, at::Tensor, at::Tensor, at::Tensor, at::Tensor,
std::optional<EventHandle>, std::optional<std::function<void()>>>
low_latency_dispatch(const at::Tensor &x, const at::Tensor &topk_idx,
const std::optional<torch::Tensor> &topk_weights,
const std::optional<at::Tensor> &cumulative_local_expert_recv_stats,
int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts, bool use_fp8, bool round_scale,
bool use_ue8m0, bool async, bool return_recv_hook);
Expand All @@ -86,7 +89,7 @@ struct Buffer {
const at::Tensor &x, const at::Tensor &topk_idx, const at::Tensor &topk_weights, const at::Tensor &src_info,
const at::Tensor &layout_range, int64_t num_max_dispatch_tokens_per_rank, int64_t num_experts,
const at::Tensor &packed_recv_count, bool zero_copy, bool async, bool return_recv_hook,
const std::optional<at::Tensor> &out);
const std::optional<at::Tensor> &out, const at::Tensor &expand_scales);

std::vector<at::Tensor> fused_deep_moe(const at::Tensor &x, const at::Tensor &expertIds,
const at::Tensor &gmm1PermutedWeight,
Expand Down
67 changes: 67 additions & 0 deletions csrc/deepep/ops2/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Standalone CMake project that builds the DeepEP "ops2" custom-operator
# package (op_host / op_kernel / framework) and wraps the install tree into a
# self-extracting custom_opp_*.run archive via CPack's External generator.
# Invoked by build.sh (build_deepep_kernels) when BUILD_DEEPEP_OPS is OFF.
cmake_minimum_required(VERSION 3.16.0)
# NOTE(review): project() declares no VERSION, so ${CMAKE_PROJECT_VERSION}
# used for CPACK_PACKAGE_VERSION below expands to empty — confirm intended.
project(opp)

# Restrict builds to the two host architectures the CANN toolkit ships for.
# NOTE(review): CANN_HOST_ARCH is not referenced later in this file —
# presumably consumed by the included cmake modules below; confirm.
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set(CANN_HOST_ARCH "x86_64")
elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
set(CANN_HOST_ARCH "aarch64")
else()
message(FATAL_ERROR "Unsupported host processor: ${CMAKE_SYSTEM_PROCESSOR}")
endif()

# Project-local helper modules; these define vendor_name, get_system_info(),
# and the op-compilation machinery used by the subdirectories.
include(cmake/config.cmake)
include(cmake/func.cmake)
include(cmake/intf.cmake)

# Alias of the C++ compiler path for the op build scripts.
set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER})

# Each component directory is optional; build whatever is present.
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework)
add_subdirectory(framework)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host)
add_subdirectory(op_host)
endif()
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel)
add_subdirectory(op_kernel)
endif()
# Testcases only when explicitly enabled (ENABLE_TEST comes from the preset).
if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases)
add_subdirectory(testcases)
endif()

# Copy the installer scripts into the build tree and stamp the configured
# vendor name into install.sh/upgrade.sh (they ship with vendor_name=customize).
add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/scripts
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/scripts/ ${CMAKE_BINARY_DIR}/scripts/
COMMAND sed -i "s/vendor_name=customize/vendor_name=${vendor_name}/g" ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh
VERBATIM
)
add_custom_target(modify_vendor ALL DEPENDS ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh)

# Fills SYSTEM_INFO (e.g. OS/arch string) for the .run file name below;
# presumably defined in cmake/func.cmake — confirm.
get_system_info(SYSTEM_INFO)

# gen version.info
# Always-run target: records the CANN package version into version.info.
add_custom_target(gen_version_info ALL
COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh ${ASCEND_CANN_PACKAGE_PATH} ${CMAKE_CURRENT_BINARY_DIR}
)

# Packaging path: stage installer scripts + metadata, then let CPack's
# External generator call makeself.cmake to produce the .run archive.
# Skipped entirely when building the ops as a shared library instead.
if(NOT ASCEND_PACK_SHARED_LIBRARY)
install(DIRECTORY ${CMAKE_BINARY_DIR}/scripts/ DESTINATION . FILE_PERMISSIONS OWNER_EXECUTE OWNER_READ GROUP_READ)

# custom.proto is OPTIONAL: only installed if the vendor provides one.
install(FILES ${CMAKE_SOURCE_DIR}/custom.proto DESTINATION packages OPTIONAL)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.info
DESTINATION packages/vendors/${vendor_name}/)

# CPack config
set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME})
set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION})
set(CPACK_PACKAGE_DESCRIPTION "CPack opp project")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project")
set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX})

# build.sh globs for build_out/custom_opp_*.run — this name must stay in sync.
set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}_${CMAKE_SYSTEM_PROCESSOR}.run")
# External generator: staging is enabled so makeself.cmake sees a complete
# install tree, then emits the self-extracting archive as the "package".
set(CPACK_GENERATOR External)
set(CPACK_CMAKE_GENERATOR "Unix Makefiles")
set(CPACK_EXTERNAL_ENABLE_STAGING TRUE)
set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake)
set(CPACK_EXTERNAL_BUILT_PACKAGES ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME})
include(CPack)
endif()
59 changes: 59 additions & 0 deletions csrc/deepep/ops2/CMakePresets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{
"version": 1,
"cmakeMinimumRequired": {
"major": 3,
"minor": 19,
"patch": 0
},
"configurePresets": [
{
"name": "default",
"displayName": "Default Config",
"description": "Default build using Unix Makefiles generator for native compilation",
"generator": "Unix Makefiles",
"binaryDir": "${sourceDir}/build_out",
"cacheVariables": {
"CMAKE_BUILD_TYPE": {
"type": "STRING",
"value": "Release"
},
"ENABLE_SOURCE_PACKAGE": {
"type": "BOOL",
"value": "True"
},
"ENABLE_BINARY_PACKAGE": {
"type": "BOOL",
"value": "True"
},
"ASCEND_COMPUTE_UNIT": {
"type": "STRING",
"value": "ascend910b"
},
"ENABLE_TEST": {
"type": "BOOL",
"value": "True"
},
"vendor_name": {
"type": "STRING",
"value": "hwcomputing"
},
"ASCEND_CANN_PACKAGE_PATH": {
"type": "PATH",
"value": "/usr/local/Ascend/ascend-toolkit/latest"
},
"ASCEND_PYTHON_EXECUTABLE": {
"type": "STRING",
"value": "python3"
},
"CMAKE_INSTALL_PREFIX": {
"type": "PATH",
"value": "${sourceDir}/build_out"
},
"ASCEND_PACK_SHARED_LIBRARY": {
"type": "BOOL",
"value": "False"
}
}
}
]
}
Loading