@@ -2990,32 +2990,156 @@ void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
29902990 GGML_CANN_CALL_ACLNN_OP (ctx, ArgMax, acl_src.get (), 3 , false , acl_dst.get ());
29912991}
29922992
2993- void ggml_cann_conv_transpose_1d (ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2993+ void ggml_cann_conv_transpose_1d (ggml_backend_cann_context& ctx, ggml_tensor* dst){
29942994 ggml_tensor * src0 = dst->src [0 ];
29952995 ggml_tensor * src1 = dst->src [1 ];
29962996
29972997 // stride
2998- int64_t s0 = ((const int32_t *) (dst->op_params ))[0 ];
2998+ int64_t s0 = ((const int32_t *) (dst->op_params ))[0 ];
29992999
3000- acl_tensor_ptr acl_input = ggml_cann_create_tensor (src1, src1->ne , src1->nb , 3 , ACL_FORMAT_NCL);
3000+ acl_tensor_ptr acl_input = ggml_cann_create_tensor (src1, src1->ne , src1->nb , 3 , ACL_FORMAT_NCL);
30013001 acl_tensor_ptr acl_weight = ggml_cann_create_tensor (src0, src0->ne , src0->nb , 3 , ACL_FORMAT_NCL);
3002- acl_tensor_ptr acl_dst = ggml_cann_create_tensor (dst, dst->ne , dst->nb , 3 , ACL_FORMAT_NCL);
3002+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor (dst, dst->ne , dst->nb , 3 , ACL_FORMAT_NCL);
3003+
3004+ // get base information of input and kernel
3005+ int64_t input_len = *(src1->ne );
3006+ int64_t dst_len = *(dst->ne );
3007+ int64_t kernel_size = *(src0->ne );
3008+
3009+ // set the max kernel size for each conv
3010+ int64_t max_kernel_size = 255 ;
3011+
3012+ // compute the partition of kernel
3013+ int64_t part_num = 1 ;
3014+ part_num = (kernel_size + max_kernel_size - 1 ) / max_kernel_size;
30033015
30043016 int64_t strideVal[1 ];
3005- strideVal[0 ] = s0;
3006- acl_int_array_ptr stride = ggml_cann_create_int_array (strideVal, 1 );
3007- int64_t paddingVal[] = { 0 };
3008- acl_int_array_ptr padding = ggml_cann_create_int_array (paddingVal, 1 );
3009- int64_t dilationVal[] = { 1 };
3010- acl_int_array_ptr dilation = ggml_cann_create_int_array (dilationVal, 1 );
3011- int8_t cubeMathType = 0 ;
3017+ strideVal[0 ] = s0;
3018+ acl_int_array_ptr stride = ggml_cann_create_int_array (strideVal, 1 );
3019+ int64_t paddingVal[] = {0 };
3020+ acl_int_array_ptr padding = ggml_cann_create_int_array (paddingVal, 1 );
3021+ int64_t dilationVal[] = {1 };
3022+ acl_int_array_ptr dilation = ggml_cann_create_int_array (dilationVal, 1 );
3023+ bool transposed = true ;
3024+ int64_t groups = 1 ;
3025+ int8_t cubeMathType = 0 ;
30123026
30133027#ifdef ASCEND_310P
30143028 cubeMathType = 1 ;
30153029#endif
30163030
3017- GGML_CANN_CALL_ACLNN_OP (ctx, Convolution, acl_input.get (), acl_weight.get (), nullptr , stride.get (), padding.get (),
3018- dilation.get (), true , padding.get (), 1 , acl_dst.get (), cubeMathType);
3031+ auto weight_type = ggml_cann_type_mapping (src0->type );
3032+ auto dst_type = ggml_cann_type_mapping (dst->type );
3033+
3034+ // slice the kernel to make each conv available
3035+ int64_t slice_dim = -1 ;
3036+ int64_t slice_start = 0 ;
3037+ int64_t slice_end = max_kernel_size;
3038+ int64_t slice_step = 1 ;
3039+ int64_t interval = max_kernel_size;
3040+
3041+ int64_t left_pad_len = dilationVal[0 ] * (max_kernel_size - 1 ) + 1 - 2 * paddingVal[0 ];
3042+ int64_t right_pad_len = 0 ;
3043+
3044+ acl_scalar_ptr alpha = nullptr ;
3045+ float alphaValue = 1.0 ;
3046+ alpha = ggml_cann_create_scalar (&alphaValue, aclDataType::ACL_FLOAT);
3047+
3048+ // set zero to destination
3049+ GGML_CANN_CALL_ACLNN_OP (ctx, InplaceZero, acl_dst.get ());
3050+
3051+ for (int k = 0 ; k < part_num; k++){
3052+
3053+ // create part kernel tensor and slice from big kernel
3054+ slice_start = max_kernel_size * k;
3055+ if (k == part_num - 1 ){
3056+ slice_end = kernel_size;
3057+ interval = kernel_size - max_kernel_size * k;
3058+ }else {
3059+ slice_end = max_kernel_size * (k+1 );
3060+ }
3061+
3062+ int64_t part_ne[4 ];
3063+ for (int i = 0 ; i < 4 ; i++) {
3064+ part_ne[i] = *(src0->ne + i);
3065+ }
3066+ part_ne[0 ] = interval;
3067+
3068+ size_t part_nb[4 ];
3069+ part_nb[0 ] = sizeof (weight_type);
3070+ for (int i = 1 ; i < 4 ; i++) {
3071+ part_nb[i] = part_nb[i - 1 ] * part_ne[i - 1 ];
3072+ }
3073+
3074+ ggml_cann_pool_alloc part_kernel_allocator;
3075+ part_kernel_allocator.alloc (ctx.pool (), part_nb[3 ]);
3076+ void * part_kernel_buf = part_kernel_allocator.get ();
3077+
3078+ acl_tensor_ptr part_kernel = ggml_cann_create_tensor (part_kernel_buf, weight_type,
3079+ ggml_element_size (src0), part_ne, part_nb, 3 , ACL_FORMAT_NCL);
3080+
3081+ GGML_CANN_CALL_ACLNN_OP (ctx, Slice, acl_weight.get (), slice_dim, slice_start, slice_end, slice_step, part_kernel.get ());
3082+
3083+ // create the part conv result tensor
3084+ int64_t part_dst_ne[4 ];
3085+ for (int i = 0 ; i < 4 ; i++){
3086+ part_dst_ne[i] = *(dst->ne + i);
3087+ }
3088+ part_dst_ne[0 ] = (input_len - 1 ) * strideVal[0 ] - 2 * paddingVal[0 ] + dilationVal[0 ] * (part_ne[0 ] - 1 ) + 1 ;
3089+
3090+ size_t part_dst_nb[4 ];
3091+ part_dst_nb[0 ] = sizeof (weight_type);
3092+ for (int i = 1 ; i < 4 ; i++) {
3093+ part_dst_nb[i] = part_dst_nb[i - 1 ] * part_dst_ne[i - 1 ];
3094+ }
3095+ ggml_cann_pool_alloc part_dst_allocator;
3096+ part_dst_allocator.alloc (ctx.pool (), part_dst_nb[3 ]);
3097+ void * part_dst_buf = part_dst_allocator.get ();
3098+
3099+ acl_tensor_ptr acl_part_dst = ggml_cann_create_tensor (part_dst_buf, dst_type, ggml_element_size (dst),
3100+ part_dst_ne, part_dst_nb, 3 , ACL_FORMAT_NCL);
3101+ GGML_CANN_CALL_ACLNN_OP (ctx, InplaceZero, acl_part_dst.get ());
3102+
3103+ // compute part conv transpose 1d
3104+ GGML_CANN_CALL_ACLNN_OP (ctx, Convolution, acl_input.get (), part_kernel.get (), nullptr , stride.get (),
3105+ padding.get (), dilation.get (), transposed, padding.get (), groups, acl_part_dst.get (), cubeMathType);
3106+
3107+ // compute the position of part result in final result
3108+ int64_t global_start = slice_start;
3109+ int64_t global_end = std::min ((input_len - 1 ) * strideVal[0 ] + slice_end, dst_len);
3110+
3111+ left_pad_len = global_start;
3112+ right_pad_len = dst_len - global_end;
3113+
3114+ std::vector<int64_t > padDataVal = {left_pad_len,right_pad_len};
3115+ acl_int_array_ptr padData = ggml_cann_create_int_array (padDataVal.data (), 2 );
3116+
3117+ acl_scalar_ptr pad_value = nullptr ;
3118+ float pad_valueVal = 0.0 ;
3119+ pad_value = ggml_cann_create_scalar (&pad_valueVal, aclDataType::ACL_FLOAT);
3120+
3121+ int64_t conv_result_ne[4 ];
3122+ for (int i = 0 ; i < 4 ; i++){
3123+ conv_result_ne[i] = *(dst->ne + i);
3124+ }
3125+
3126+ size_t conv_result_nb[4 ];
3127+ conv_result_nb[0 ] = sizeof (weight_type);
3128+ for (int i = 1 ; i < 4 ; i++) {
3129+ conv_result_nb[i] = conv_result_nb[i - 1 ] * conv_result_ne[i - 1 ];
3130+ }
3131+
3132+ ggml_cann_pool_alloc conv_result_allocator;
3133+ conv_result_allocator.alloc (ctx.pool (), conv_result_nb[3 ]);
3134+ void * conv_result_buf = conv_result_allocator.get ();
3135+
3136+ acl_tensor_ptr conv_result = ggml_cann_create_tensor (conv_result_buf, dst_type, ggml_element_size (dst),
3137+ conv_result_ne, conv_result_nb, 3 , ACL_FORMAT_NCL);
3138+
3139+ GGML_CANN_CALL_ACLNN_OP (ctx, InplaceZero, conv_result.get ());
3140+ GGML_CANN_CALL_ACLNN_OP (ctx, ConstantPadNd, acl_part_dst.get (), padData.get (), pad_value.get (), conv_result.get ());
3141+ GGML_CANN_CALL_ACLNN_OP (ctx, InplaceAdd, acl_dst.get (), conv_result.get (), alpha.get ());
3142+ }
30193143}
30203144
30213145void ggml_cann_elu (ggml_backend_cann_context & ctx, ggml_tensor * dst) {
0 commit comments