Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions src/infiniop/ops/lp_norm/cuda/kernel.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,10 @@ __device__ void blockLPNormKernel(
local_max = max(local_max, fabsf((float)input[tid + ind * stride]));
}
__shared__ float global_max;
#if CUDART_VERSION >= 12090
#if CUDART_VERSION >= 12090 && !defined(ENABLE_METAX_API)
float max_block = BlockReduce(temp_storage).Reduce(local_max, ::cuda::maximum());
#elif defined(ENABLE_HYGON_API)
float max_block = BlockReduce(temp_storage).Reduce(
local_max, [](const float &a, const float &b) { return (a > b) ? a : b; }, BLOCK_SIZE);
#elif defined(ENABLE_HYGON_API) || defined(ENABLE_METAX_API)
float max_block = BlockReduce(temp_storage).Reduce(local_max, [](const float &a, const float &b) { return (a > b) ? a : b; }, BLOCK_SIZE);
#else
float max_block = BlockReduce(temp_storage).Reduce(local_max, cub::Max());
#endif
Expand Down Expand Up @@ -76,11 +75,10 @@ __device__ void blockLPNormStridesKernel(
local_max = max(local_max, fabsf((float)input[ind_i + ind]));
}
__shared__ float global_max;
#if CUDART_VERSION >= 12090
#if CUDART_VERSION >= 12090 && !defined(ENABLE_METAX_API)
float max_block = BlockReduce(temp_storage).Reduce(local_max, ::cuda::maximum());
#elif defined(ENABLE_HYGON_API)
float max_block = BlockReduce(temp_storage).Reduce(
local_max, [](const float &a, const float &b) { return (a > b) ? a : b; }, BLOCK_SIZE);
#elif defined(ENABLE_HYGON_API) || defined(ENABLE_METAX_API)
float max_block = BlockReduce(temp_storage).Reduce(local_max, [](const float &a, const float &b) { return (a > b) ? a : b; }, BLOCK_SIZE);
#else
float max_block = BlockReduce(temp_storage).Reduce(local_max, cub::Max());
#endif
Expand Down
8 changes: 8 additions & 0 deletions src/infiniop/ops/lp_norm/metax/lp_norm_metax.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef __LP_NORM_METAX_H__
#define __LP_NORM_METAX_H__

#include "../lp_norm.h"

DESCRIPTOR(metax)

#endif // __LP_NORM_METAX_H__
167 changes: 167 additions & 0 deletions src/infiniop/ops/lp_norm/metax/lp_norm_metax.maca
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "lp_norm_metax.h"

template <typename Tdata, unsigned int BLOCK_SIZE>
INFINIOP_METAX_KERNEL blockLPNorm(
Tdata *y, const Tdata *x,
float p, size_t dimsize,
ptrdiff_t stride, float eps) {
blockLPNormKernel<Tdata, BLOCK_SIZE>(x, y, p, dimsize, stride, eps);
}

template <typename Tdata, unsigned int BLOCK_SIZE>
INFINIOP_METAX_KERNEL blockLPNormStrides(
Tdata *y, const Tdata *x,
const ptrdiff_t *output_strides,
const ptrdiff_t *input_strides,
const size_t *shape, int ndim,
float p, size_t dimsize, float eps) {
blockLPNormStridesKernel<Tdata, BLOCK_SIZE>(
x, y, output_strides, input_strides, shape, ndim, p, dimsize, eps);
}

template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
INFINIOP_METAX_KERNEL warpLPNorm(
Tdata *y, const Tdata *x,
float p, size_t othersize, size_t dimsize,
ptrdiff_t stride, float eps) {
warpLPNormKernel<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>(x, y, p, othersize, dimsize, stride, eps);
}

template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
INFINIOP_METAX_KERNEL warpLPNormStrides(
Tdata *y, const Tdata *x,
const ptrdiff_t *output_strides,
const ptrdiff_t *input_strides,
const size_t *shape, int ndim,
float p, size_t othersize, size_t dimsize,
float eps) {
warpLPNormStridesKernel<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>(
x, y, output_strides, input_strides, shape, ndim, p, othersize, dimsize, eps);
}

namespace op::lp_norm::metax {

struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
delete _opaque;
}

infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
int axis,
int p,
float eps) {
auto info = LPNormInfo::createLPNormInfo(y_desc, x_desc, axis, p, eps);
CHECK_RESULT(info);
size_t workspace_size = y_desc->ndim() * (sizeof(ptrdiff_t) * 2 + sizeof(size_t));
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
info.take(), workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}

template <unsigned int BLOCK_SIZE, typename Tdata>
infiniStatus_t launchKernel(
const LPNormInfo &info, Tdata *y, const Tdata *x,
hcStream_t stream, void *workspace) {
size_t dimsize = info.dimsize;
size_t othersize = info.othersize;
float p_f = static_cast<float>(info.p);
float eps = info.eps;
int num_blocks = static_cast<int>(info.othersize);
ptrdiff_t stride = info.stride;
int ndim = static_cast<int>(info.ndim);

char *workspace_ptr = reinterpret_cast<char *>(workspace);
ptrdiff_t *input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr);
ptrdiff_t *output_strides_cuda = input_strides_cuda + ndim;
size_t ptrdiff_array_size = 2 * ndim * sizeof(ptrdiff_t);
size_t *shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + ptrdiff_array_size);

CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(shape_cuda, info.input_shape.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream));

if (info.continuous) {
if (dimsize > 1024) {
blockLPNorm<Tdata, BLOCK_SIZE>
<<<num_blocks, BLOCK_SIZE, 0, stream>>>(y, x, p_f, dimsize, stride, eps);
} else {
constexpr unsigned int BLOCK_SIZE_x = 32;
constexpr unsigned int BLOCK_SIZE_y = 32;
int num_block_x = (num_blocks + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y;
dim3 block_dim(BLOCK_SIZE_x, BLOCK_SIZE_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLPNorm<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>
<<<grid_dim, block_dim, 0, stream>>>(y, x, p_f, othersize, dimsize, stride, eps);
}
} else {
if (info.axis == ndim - 1) {
if (dimsize > 1024) {
blockLPNormStrides<Tdata, BLOCK_SIZE>
<<<num_blocks, BLOCK_SIZE, 0, stream>>>(
y, x, output_strides_cuda, input_strides_cuda, shape_cuda, ndim,
p_f, dimsize, eps);
} else {
constexpr unsigned int BLOCK_SIZE_x = 32;
constexpr unsigned int BLOCK_SIZE_y = 32;
int num_block_x = (num_blocks + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y;
dim3 block_dim(BLOCK_SIZE_x, BLOCK_SIZE_y, 1);
dim3 grid_dim(num_block_x, 1, 1);
warpLPNormStrides<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>
<<<grid_dim, block_dim, 0, stream>>>(
y, x, output_strides_cuda, input_strides_cuda, shape_cuda, ndim,
p_f, othersize, dimsize, eps);
}
} else {
return INFINI_STATUS_BAD_PARAM;
}
}
return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream_) const {
hcStream_t stream = (hcStream_t)stream_;

#define CALCULATE_LP_NORM(BLOCK_SIZE, TDATA) \
launchKernel<BLOCK_SIZE, TDATA>(_info, (TDATA *)y, (const TDATA *)x, stream, workspace)

#define CALCULATE_LP_NORM_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_LP_NORM(BLOCK_SIZE, half); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_LP_NORM(BLOCK_SIZE, float); \
else if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_LP_NORM(BLOCK_SIZE, __nv_bfloat16); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}

if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) {
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) {
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}

#undef CALCULATE_LP_NORM_WITH_BLOCK_SIZE
#undef CALCULATE_LP_NORM

return INFINI_STATUS_SUCCESS;
}

} // namespace op::lp_norm::metax
15 changes: 15 additions & 0 deletions src/infiniop/ops/lp_norm/operator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_HYGON_API)
#include "nvidia/lp_norm_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/lp_norm_metax.h"
#endif

__INFINI_C infiniStatus_t infiniopCreateLPNormDescriptor(
infiniopHandle_t handle,
Expand Down Expand Up @@ -42,6 +45,9 @@ __INFINI_C infiniStatus_t infiniopCreateLPNormDescriptor(
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif

default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
Expand Down Expand Up @@ -72,6 +78,9 @@ __INFINI_C infiniStatus_t infiniopGetLPNormWorkspaceSize(infiniopLPNormDescripto
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif

default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
Expand Down Expand Up @@ -115,6 +124,9 @@ __INFINI_C infiniStatus_t infiniopLPNorm(
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif

default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
Expand Down Expand Up @@ -148,6 +160,9 @@ infiniopDestroyLPNormDescriptor(infiniopLPNormDescriptor_t desc) {
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif

default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
Expand Down
Loading