Commit e68c19b

CANN: Add support for CONV_TRANSPOSE_1D when kernel size > 255 (ggml-org#17934)
* CONV_TRANSPOSE_1D kernel_size > 255
* remove condition check
* fix the bug of type conversion
* remove trailing whitespaces
* fix: return true in the switch case
1 parent c54bba8 commit e68c19b
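
Editorial note on the approach: aclnnConvolution limits the effective kernel extent of a transposed convolution (per the TODO removed in ggml-cann.cpp below, (weightL - 1) * dilationW - padLeft must not exceed 255), so the new implementation splits the weight into chunks of at most 255 taps, runs one convolution per chunk, and accumulates the shifted partial results. A minimal standalone sketch of the partition arithmetic, using a hypothetical 1337-tap kernel, the case the removed TODO cites since (1337 - 1) * 1 - 0 = 1336 > 255; all names here are illustrative only:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t kernel_size     = 1337;  // illustrative; any size works
    const int64_t max_kernel_size = 255;   // per-call limit the commit works around
    // ceiling division, as in ggml_cann_conv_transpose_1d below
    const int64_t part_num = (kernel_size + max_kernel_size - 1) / max_kernel_size;
    for (int64_t k = 0; k < part_num; k++) {
        const int64_t slice_start = max_kernel_size * k;
        const int64_t slice_end   = (k == part_num - 1) ? kernel_size
                                                        : max_kernel_size * (k + 1);
        printf("part %lld: taps [%lld, %lld)\n",
               (long long) k, (long long) slice_start, (long long) slice_end);
    }
    return 0;  // prints 6 parts: five of 255 taps and one of 62
}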

File tree

3 files changed: +139 -15 lines

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 137 additions & 13 deletions
@@ -2990,32 +2990,156 @@ void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
     GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src.get(), 3, false, acl_dst.get());
 }
 
-void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
     ggml_tensor * src0 = dst->src[0];
     ggml_tensor * src1 = dst->src[1];
 
     // stride
-    int64_t s0 = ((const int32_t *) (dst->op_params))[0];
+    int64_t s0 = ((const int32_t*)(dst->op_params))[0];
 
-    acl_tensor_ptr acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
+    acl_tensor_ptr acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
     acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
-    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
+
+    // get base information of input and kernel
+    int64_t input_len = *(src1->ne);
+    int64_t dst_len = *(dst->ne);
+    int64_t kernel_size = *(src0->ne);
+
+    // set the max kernel size for each conv
+    int64_t max_kernel_size = 255;
+
+    // compute the partition of kernel
+    int64_t part_num = 1;
+    part_num = (kernel_size + max_kernel_size - 1) / max_kernel_size;
 
     int64_t strideVal[1];
-    strideVal[0] = s0;
-    acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
-    int64_t paddingVal[] = { 0 };
-    acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
-    int64_t dilationVal[] = { 1 };
-    acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
-    int8_t cubeMathType = 0;
+    strideVal[0] = s0;
+    acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
+    int64_t paddingVal[] = {0};
+    acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
+    int64_t dilationVal[] = {1};
+    acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
+    bool transposed = true;
+    int64_t groups = 1;
+    int8_t cubeMathType = 0;
 
 #ifdef ASCEND_310P
     cubeMathType = 1;
 #endif
 
-    GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), acl_weight.get(), nullptr, stride.get(), padding.get(),
-                            dilation.get(), true, padding.get(), 1, acl_dst.get(), cubeMathType);
+    auto weight_type = ggml_cann_type_mapping(src0->type);
+    auto dst_type = ggml_cann_type_mapping(dst->type);
+
+    // slice the kernel to make each conv available
+    int64_t slice_dim = -1;
+    int64_t slice_start = 0;
+    int64_t slice_end = max_kernel_size;
+    int64_t slice_step = 1;
+    int64_t interval = max_kernel_size;
+
+    int64_t left_pad_len = dilationVal[0] * (max_kernel_size - 1) + 1 - 2 * paddingVal[0];
+    int64_t right_pad_len = 0;
+
+    acl_scalar_ptr alpha = nullptr;
+    float alphaValue = 1.0;
+    alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
+
+    // set zero to destination
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
+
+    for(int k = 0; k < part_num; k++){
+
+        // create part kernel tensor and slice from big kernel
+        slice_start = max_kernel_size * k;
+        if(k == part_num - 1){
+            slice_end = kernel_size;
+            interval = kernel_size - max_kernel_size * k;
+        }else{
+            slice_end = max_kernel_size * (k+1);
+        }
+
+        int64_t part_ne[4];
+        for(int i = 0; i < 4; i++) {
+            part_ne[i] = *(src0->ne + i);
+        }
+        part_ne[0] = interval;
+
+        size_t part_nb[4];
+        part_nb[0] = sizeof(weight_type);
+        for (int i = 1; i < 4; i++) {
+            part_nb[i] = part_nb[i - 1] * part_ne[i - 1];
+        }
+
+        ggml_cann_pool_alloc part_kernel_allocator;
+        part_kernel_allocator.alloc(ctx.pool(), part_nb[3]);
+        void* part_kernel_buf = part_kernel_allocator.get();
+
+        acl_tensor_ptr part_kernel = ggml_cann_create_tensor(part_kernel_buf, weight_type,
+                                         ggml_element_size(src0), part_ne, part_nb, 3, ACL_FORMAT_NCL);
+
+        GGML_CANN_CALL_ACLNN_OP(ctx, Slice, acl_weight.get(), slice_dim, slice_start, slice_end, slice_step, part_kernel.get());
+
+        // create the part conv result tensor
+        int64_t part_dst_ne[4];
+        for(int i = 0; i < 4; i++){
+            part_dst_ne[i] = *(dst->ne + i);
+        }
+        part_dst_ne[0] = (input_len - 1) * strideVal[0] - 2 * paddingVal[0] + dilationVal[0] * (part_ne[0] - 1) + 1;
+
+        size_t part_dst_nb[4];
+        part_dst_nb[0] = sizeof(weight_type);
+        for (int i = 1; i < 4; i++) {
+            part_dst_nb[i] = part_dst_nb[i - 1] * part_dst_ne[i - 1];
+        }
+        ggml_cann_pool_alloc part_dst_allocator;
+        part_dst_allocator.alloc(ctx.pool(), part_dst_nb[3]);
+        void* part_dst_buf = part_dst_allocator.get();
+
+        acl_tensor_ptr acl_part_dst = ggml_cann_create_tensor(part_dst_buf, dst_type, ggml_element_size(dst),
+                                          part_dst_ne, part_dst_nb, 3, ACL_FORMAT_NCL);
+        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_part_dst.get());
+
+        // compute part conv transpose 1d
+        GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), part_kernel.get(), nullptr, stride.get(),
+                                padding.get(), dilation.get(), transposed, padding.get(), groups, acl_part_dst.get(), cubeMathType);
+
+        // compute the position of part result in final result
+        int64_t global_start = slice_start;
+        int64_t global_end = std::min((input_len - 1) * strideVal[0] + slice_end, dst_len);
+
+        left_pad_len = global_start;
+        right_pad_len = dst_len - global_end;
+
+        std::vector<int64_t> padDataVal = {left_pad_len,right_pad_len};
+        acl_int_array_ptr padData = ggml_cann_create_int_array(padDataVal.data(), 2);
+
+        acl_scalar_ptr pad_value = nullptr;
+        float pad_valueVal = 0.0;
+        pad_value = ggml_cann_create_scalar(&pad_valueVal, aclDataType::ACL_FLOAT);
+
+        int64_t conv_result_ne[4];
+        for(int i = 0; i < 4; i++){
+            conv_result_ne[i] = *(dst->ne + i);
+        }
+
+        size_t conv_result_nb[4];
+        conv_result_nb[0] = sizeof(weight_type);
+        for (int i = 1; i < 4; i++) {
+            conv_result_nb[i] = conv_result_nb[i - 1] * conv_result_ne[i - 1];
+        }
+
+        ggml_cann_pool_alloc conv_result_allocator;
+        conv_result_allocator.alloc(ctx.pool(), conv_result_nb[3]);
+        void* conv_result_buf = conv_result_allocator.get();
+
+        acl_tensor_ptr conv_result = ggml_cann_create_tensor(conv_result_buf, dst_type, ggml_element_size(dst),
+                                         conv_result_ne, conv_result_nb, 3, ACL_FORMAT_NCL);
+
+        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, conv_result.get());
+        GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_part_dst.get(), padData.get(), pad_value.get(), conv_result.get());
+        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), conv_result.get(), alpha.get());
+    }
 }
 
 void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
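
For intuition, here is a CPU-side sketch (editorial, not part of the commit) of the identity the loop above exploits: a transposed 1-D convolution with a long kernel equals the sum of transposed convolutions with kernel chunks, each partial result left-padded by its chunk's starting tap, which is what the Slice / ConstantPadNd / InplaceAdd sequence computes on the device. All names below are hypothetical; padding and dilation are fixed at 0 and 1 as in this backend path.

#include <algorithm>
#include <cstdint>
#include <vector>

// Direct transposed 1-D convolution: y[i*s0 + j] += x[i] * w[j].
// Output length is (input_len - 1) * s0 + kernel_size.
static std::vector<float> conv_transpose_1d(const std::vector<float> & x,
                                            const std::vector<float> & w,
                                            int64_t s0) {
    std::vector<float> y((x.size() - 1) * s0 + w.size(), 0.0f);
    for (size_t i = 0; i < x.size(); i++) {
        for (size_t j = 0; j < w.size(); j++) {
            y[i * s0 + j] += x[i] * w[j];
        }
    }
    return y;
}

// Same result computed from kernel chunks of at most max_k taps: every
// partial output lands at offset `start` in the final result, mirroring
// left_pad_len = global_start in the CANN implementation above.
static std::vector<float> conv_transpose_1d_split(const std::vector<float> & x,
                                                  const std::vector<float> & w,
                                                  int64_t s0, int64_t max_k) {
    std::vector<float> y((x.size() - 1) * s0 + w.size(), 0.0f);
    for (int64_t start = 0; start < (int64_t) w.size(); start += max_k) {
        const int64_t end = std::min<int64_t>(start + max_k, (int64_t) w.size());
        const std::vector<float> part(w.begin() + start, w.begin() + end);
        const std::vector<float> py = conv_transpose_1d(x, part, s0);
        for (size_t t = 0; t < py.size(); t++) {
            y[start + t] += py[t];  // shift by the chunk's first tap, accumulate
        }
    }
    return y;
}

Under these assumptions the two functions agree on every input; for example, with a 1337-tap w and max_k = 255, conv_transpose_1d_split reproduces conv_transpose_1d exactly.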

ggml/src/ggml-cann/aclnn_ops.h

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@
 #include <aclnnop/aclnn_sign.h>
 #include <aclnnop/aclnn_silu.h>
 #include <aclnnop/aclnn_sin.h>
+#include <aclnnop/aclnn_slice.h>
 #include <aclnnop/aclnn_sqrt.h>
 #include <aclnnop/aclnn_tanh.h>

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 1 addition & 2 deletions
@@ -2424,8 +2424,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
             }
         }
         case GGML_OP_CONV_TRANSPOSE_1D:
-            // TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
-            return (op->src[0]->ne[0] - 1) <= 255;
+            return true;
         case GGML_OP_SCALE:
             float bias;
             memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float));
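
As an editorial aside, the guard deleted above can be read as a standalone predicate; a sketch under the assumption, taken from the removed TODO, that dilation is 1 and padding is 0 in this path:

#include <cstdint>

// The old acceptance test for CONV_TRANSPOSE_1D on CANN: with dilationW = 1
// and padLeft = 0 it reduces to kernel_size - 1 <= 255, so a 1337-tap kernel
// ((1337 - 1) * 1 - 0 = 1336) was rejected. The chunked implementation in
// aclnn_ops.cpp removes the need for this check, hence `return true`.
static bool old_conv_transpose_1d_supported(int64_t kernel_size,
                                            int64_t dilationW = 1,
                                            int64_t padLeft   = 0) {
    return (kernel_size - 1) * dilationW - padLeft <= 255;
}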
