From 61a7d3116f36e6975206992b7724df7a75e0cc6c Mon Sep 17 00:00:00 2001 From: thefish111 <1519861278@qq.com> Date: Thu, 4 Dec 2025 15:19:16 +0800 Subject: [PATCH] Add Q4_1 and Q8_1 quantization support for CANN backend --- ggml/src/ggml-cann/acl_tensor.cpp | 4 + ggml/src/ggml-cann/aclnn_ops.cpp | 80 ++++- ggml/src/ggml-cann/ggml-cann.cpp | 392 +++++++++++++++++++++- ggml/src/ggml-cpu/arch-fallback.h | 1 + ggml/src/ggml-cpu/arch/arm/quants.c | 4 + ggml/src/ggml-cpu/arch/loongarch/quants.c | 4 + ggml/src/ggml-cpu/arch/powerpc/quants.c | 4 + ggml/src/ggml-cpu/arch/riscv/quants.c | 4 + ggml/src/ggml-cpu/arch/s390/quants.c | 4 + ggml/src/ggml-cpu/arch/wasm/quants.c | 4 + ggml/src/ggml-cpu/arch/x86/quants.c | 4 + ggml/src/ggml-cpu/ggml-cpu.c | 1 + ggml/src/ggml-cpu/quants.c | 29 ++ ggml/src/ggml-cpu/quants.h | 7 +- ggml/src/ggml-quants.c | 31 ++ ggml/src/ggml-quants.h | 3 +- ggml/src/ggml.c | 3 + include/llama.h | 1 + src/llama-model-loader.cpp | 4 +- src/llama-quant.cpp | 1 + tests/test-backend-ops.cpp | 27 +- tools/quantize/quantize.cpp | 3 +- 22 files changed, 586 insertions(+), 29 deletions(-) diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp index 8ffac31dd66..674b0355881 100755 --- a/ggml/src/ggml-cann/acl_tensor.cpp +++ b/ggml/src/ggml-cann/acl_tensor.cpp @@ -41,8 +41,12 @@ aclDataType ggml_cann_type_mapping(ggml_type type) { return ACL_INT32; case GGML_TYPE_Q4_0: return ACL_INT4; + case GGML_TYPE_Q4_1: + return ACL_INT4; case GGML_TYPE_Q8_0: return ACL_INT8; + case GGML_TYPE_Q8_1: + return ACL_INT8; case GGML_TYPE_I64: return ACL_INT64; default: diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index bc33b99d96e..7205246d672 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2010,10 +2010,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, float weight_elem_size; if (type == GGML_TYPE_Q4_0) { weight_elem_size = float(sizeof(uint8_t)) / 2; + } else if (type == GGML_TYPE_Q4_1) { + weight_elem_size = float(sizeof(uint8_t)) / 2; } else if (type == GGML_TYPE_Q8_0) { weight_elem_size = float(sizeof(uint8_t)); + } else if (type == GGML_TYPE_Q8_1) { + weight_elem_size = float(sizeof(uint8_t)); } else { - GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); + GGML_ABORT("Only support Q4_0, Q4_1, Q8_0 and Q8_1 MUL_MAT"); } float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size}; size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size; @@ -2026,6 +2030,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; char* scale_offset = (char*)src0->data + weight_size; + // offset (m for Q4_1, s for Q8_1) stored after scale + // For Q4_0/Q8_0, offset_offset will be used but offset tensor will be nullptr + char* offset_offset = scale_offset + scale_stride * src0->ne[2] * src0->ne[3]; + size_t offset_elem_size = sizeof(uint16_t); + size_t offset_nb[] = {src0->ne[0] / QK8_0 * offset_elem_size, + offset_elem_size}; + size_t offset_stride = src0->ne[1] * src0->ne[0] / QK8_0 * offset_elem_size; + // input size_t input_elem_size = sizeof(uint16_t); int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; @@ -2096,6 +2108,18 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset); + + // Create offset tensor for Q4_1 only (m parameter) + // Q8_1 
doesn't need offset - it uses signed int8 like Q8_0 + aclTensor* acl_offset_tensor = nullptr; + int64_t offset_ne_offset = 0; + if (type == GGML_TYPE_Q4_1) { + acl_offset_tensor = ggml_cann_create_tensor( + offset_offset + batch0 * offset_stride, ACL_FLOAT16, + offset_elem_size, scale_ne, offset_nb, 2, ACL_FORMAT_ND, + offset_ne_offset); + } + aclTensor* acl_output_tensor = ggml_cann_create_tensor( (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, @@ -2105,10 +2129,15 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, antiquantGroupSize = QK8_0; } GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor, - acl_weight_tensor, acl_scale_tensor, nullptr, + acl_weight_tensor, acl_scale_tensor, acl_offset_tensor, nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor); - ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor); + if (acl_offset_tensor) { + ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, + acl_offset_tensor, acl_output_tensor); + } else { + ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor); + } // other splits for (int64_t split = 1; split < split_size; split++) { @@ -2131,15 +2160,32 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset); + + // Create offset tensor for Q4_1 only in splits + // Q8_1 doesn't need offset - it uses signed int8 like Q8_0 + acl_offset_tensor = nullptr; + if (type == GGML_TYPE_Q4_1) { + offset_ne_offset += offset_elem_size * scale_ne[0] * scale_ne[1]; + acl_offset_tensor = ggml_cann_create_tensor( + offset_offset + batch0 * offset_stride, ACL_FLOAT16, + offset_elem_size, scale_ne, offset_nb, 2, ACL_FORMAT_ND, + offset_ne_offset); + } + acl_output_tensor = ggml_cann_create_tensor( (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset); GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor, - acl_weight_tensor, acl_scale_tensor, nullptr, + acl_weight_tensor, acl_scale_tensor, acl_offset_tensor, nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor); - ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor); + if (acl_offset_tensor) { + ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, + acl_offset_tensor, acl_output_tensor); + } else { + ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor); + } } ggml_cann_release_resources(ctx, acl_input_tensor); @@ -2160,7 +2206,6 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, output_cast_nb, GGML_MAX_DIMS); aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type)); - ggml_cann_release_resources(ctx, acl_output_tensor, acl_dst_tensor); } } @@ -2173,7 +2218,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_mat_mul_fp(ctx, dst); break; case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: ggml_cann_mul_mat_quant(ctx, dst, type); break; default: @@ -3087,10 +3134,14 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens float weight_elem_size; if (type == GGML_TYPE_Q4_0) { 
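        // Bytes per weight after the CANN transform:
        //   4-bit types (Q4_0/Q4_1) pack two weights per byte -> 0.5 bytes per element
        //   8-bit types (Q8_0/Q8_1) store one signed byte per weight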
weight_elem_size = float(sizeof(uint8_t)) / 2; + } else if (type == GGML_TYPE_Q4_1) { + weight_elem_size = float(sizeof(uint8_t)) / 2; } else if (type == GGML_TYPE_Q8_0) { weight_elem_size = float(sizeof(uint8_t)); + } else if (type == GGML_TYPE_Q8_1) { + weight_elem_size = float(sizeof(uint8_t)); } else { - GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 "); + GGML_ABORT("MUL_MAT_ID only support quant type Q4_0, Q4_1, Q8_0 and Q8_1"); } // src0_row [D, M, 1, 1] weight without permute @@ -3107,6 +3158,10 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens size_t scale_elem_size = sizeof(uint16_t); size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; + // offset [D, M, 1, 1] for Q4_1/Q8_1 -> offset (m/s) && permute + size_t offset_elem_size = sizeof(uint16_t); + size_t offset_stride = src0->ne[1] * src0->ne[0] / QK8_0 * offset_elem_size; + // src1_row [D, 1, 1, 1] -> input src1_row.ne[1] = 1; src1_row.ne[2] = 1; @@ -3148,6 +3203,15 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens void* scale_buffer = (char*)weight_buffer + weight_stride; ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride, ACL_MEMCPY_DEVICE_TO_DEVICE); + + // Copy offset (m) for Q4_1 only + // Q8_1 doesn't need offset - it uses signed int8 like Q8_0 + if (type == GGML_TYPE_Q4_1) { + void* offset_tmp_ptr = src0_original + weight_size + scale_stride * ne02 * ne03 + i02*offset_stride; + void* offset_buffer = (char*)scale_buffer + scale_stride; + ggml_cann_async_memcpy(ctx, offset_buffer, offset_tmp_ptr, offset_stride, + ACL_MEMCPY_DEVICE_TO_DEVICE); + } src0_row.data = weight_buffer; src1_row.data = src1_tmp_ptr; @@ -3169,7 +3233,9 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_mul_mat_id_fp(ctx, dst); break; case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: ggml_cann_mul_mat_id_quant(ctx, dst); break; default: diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index cb8af42ebf9..cc82b96c07f 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -47,6 +47,21 @@ #define GGML_CANN_NAME "CANN" +// Debug macros for Q4_1/Q8_1 transform functions. +// Uncomment the following lines to enable debug output: +#define DEBUG_Q4_1_TRANSFORM +// #define DEBUG_Q8_1_TRANSFORM +// +// When enabled, these will print: +// - Structure sizes (block_q4_1, block_q8_1, ggml_half, ggml_half2) +// - Number of elements and groups being processed +// - First 3 groups' d/m/s values in both hex and float formats +// +// This helps diagnose issues with: +// - Incorrect memory layout assumptions +// - Endianness problems +// - Anonymous struct/union member access in C++ + /** * @brief Handles CANN errors by printing an error message and aborting. 
* @@ -1012,6 +1027,315 @@ static void ggml_backend_cann_transform_back_q8_0( } } +static inline float fp16_to_fp32(uint16_t h) { + // Generic fp16 to fp32 conversion + uint32_t sign = (h & 0x8000) << 16; + uint32_t exp = (h & 0x7C00) >> 10; + uint32_t mant = (h & 0x03FF); + uint32_t f; + + if (exp == 0) { + if (mant == 0) { + f = sign; + } else { + // subnormal + float m = mant / 1024.0f; + float v = ldexpf(m, -14); + memcpy(&f, &v, 4); + f |= sign; + } + } else if (exp == 31) { + f = sign | 0x7F800000 | (mant << 13); + } else { + uint32_t e = exp - 15 + 127; + f = sign | (e << 23) | (mant << 13); + } + return *(float*)&f; +} + +/* + q4_1 -> q4_0 pre-conversion: + + Input: + src: q4_1 blocks + dst: intermediate buffer (compatible with q4_0 transform_back) + format: [packed quant][uint16_t scales...] + + tensor->nelements must be divisible by 32 +*/ +void ggml_backend_cann_transform_pre_q4_1_to_q4_0( + const ggml_tensor * tensor, const void * src, void * dst) +{ + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_0; + + block_q4_1 * b1 = (block_q4_1 *)src; + + // q4_0 intermediate storage format: + // first half is packed 4-bit (2 values per byte) + // second half is uint16_t scale + uint8_t * quant = (uint8_t *)dst; + uint16_t * scales = (uint16_t *)(quant + n_elems/2); + + for (int g = 0; g < groups; g++) { + block_q4_1 * blk = &b1[g]; + + float d = fp16_to_fp32(blk->d); + float m = fp16_to_fp32(blk->m); + + // decode q4_1 block + float vals[QK4_0]; + for (int i = 0; i < QK4_0; i++) { + uint8_t q = blk->qs[i]; + float v = d * (q - 8) + m; + vals[i] = v - m; // remove mean for q4_0 (no bias) + } + + // calculate q4_0 scale + float max_abs = 0.0f; + for (int i = 0; i < QK4_0; i++) { + float a = fabsf(vals[i]); + if (a > max_abs) max_abs = a; + } + + float d0 = (max_abs > 0) ? (max_abs / 7.0f) : 1.0f; + scales[g] = (uint16_t)(d0 * 256.0f); // encode scale as q4_0 format + + // re-quantize to 0~15 + uint8_t qs_q4_0[QK4_0]; + for (int i = 0; i < QK4_0; i++) { + float q = vals[i] / d0; + int qi = (int)roundf(q + 8); // shift to unsigned + if (qi < 0) qi = 0; + if (qi > 15) qi = 15; + qs_q4_0[i] = (uint8_t)qi; + } + + // pack into two 4-bit + uint8_t * qdst = quant + g * (QK4_0/2); + for (int i = 0; i < QK4_0; i += 2) { + qdst[i/2] = (qs_q4_0[i]) | (qs_q4_0[i+1] << 4); + } + } +} +/** + * @brief Transform quantized Q4.1 tensor data into a format suitable for CANN + * processing. + * + * This function transforms quantized Q4.1 tensor data into a format suitable + * for CANN processing. It extracts quantization values, scales (d), and min + * values (m) from the source data and organizes them as: [all qs][all d][all m]. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source data in Q4.1 format. + * @param dst Pointer to the destination buffer where transformed data will be + * stored. 
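 *
 * @note ggml_cann_mul_mat_quant() consumes this layout directly: the d section is
 *       read at scale_offset and the trailing m section at offset_offset, and both
 *       are passed to WeightQuantBatchMatmulV2 as ACL_FLOAT16 tensors.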
+ */ +static void ggml_backend_cann_transform_q4_1( + ggml_tensor* tensor, + const void* src, + void* dst) { + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_1; + size_t quant_bytes = n_elems / 2; + + uint8_t* quant_offset = (uint8_t*)dst; + uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes); + uint16_t* mean_offset = scale_offset + groups; + + for (int g = 0; g < groups; g++) { + + const block_q4_1* b = + (const block_q4_1*)((const char*)src + g * sizeof(block_q4_1)); + + // scale & mean stored after quant_bytes + *scale_offset++ = GGML_FP16_TO_FP32(b->d); + *mean_offset++ = GGML_FP16_TO_FP32(b->m); + + // pack 32×4bit → 16 bytes + const uint8_t* qs = b->qs; + + // 0-15 + for (int j = 0; j < QK4_1/2; j += 2) { + uint8_t q0 = qs[j] & 0x0F; + uint8_t q1 = qs[j + 1] & 0x0F; + *quant_offset++ = (q1 << 4) | q0; + } + + // 16-31 + for (int j = 0; j < QK4_1/2; j += 2) { + uint8_t q0 = (qs[j] >> 4) & 0x0F; + uint8_t q1 = (qs[j + 1] >> 4) & 0x0F; + *quant_offset++ = (q1 << 4) | q0; + } + } + + // XOR like q4_0 + for (uint8_t* p = (uint8_t*)dst; + p < (uint8_t*)dst + quant_bytes; p++) { + *p ^= 0x88; + } +} + + +/** + * @brief Transform CANN processed data back into quantized Q4.1 format. + * + * This function transforms CANN processed data back into quantized Q4.1 format. + * It reverses the transformation performed by + * ggml_backend_cann_transform_q4_1(), converting the data back into its + * original quantized form. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source buffer containing transformed data. + * @param dst Pointer to the destination buffer where the Q4.1 formatted data + * will be stored. + */ +static void ggml_backend_cann_transform_back_q4_1( + const ggml_tensor* tensor, + void* src, + void* dst) { + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_1; + size_t quant_bytes = n_elems / 2; + + uint8_t* quant_offset = (uint8_t*)src; + + uint16_t* scale_offset = + (uint16_t*)((char*)src + quant_bytes); + + uint16_t* mean_offset = + scale_offset + groups; + + // undo XOR + for (uint8_t* p = quant_offset; + p < quant_offset + quant_bytes; p++) { + *p ^= 0x88; + } + + // reset for reading + quant_offset = (uint8_t*)src; + + for (int g = 0; g < groups; g++) { + + block_q4_1* b = + (block_q4_1*)((char*)dst + g * sizeof(block_q4_1)); + + b->d = GGML_FP32_TO_FP16(*(scale_offset++)); + b->m = GGML_FP32_TO_FP16(*(mean_offset++)); + + uint8_t* qs = b->qs; + + // unpack front 16 bytes + for (int j = 0; j < QK4_1/2; j += 2) { + uint8_t v = *quant_offset++; + qs[j] = v & 0x0F; + qs[j+1] = (v >> 4); + } + + // unpack later 16 bytes + for (int j = 0; j < QK4_1/2; j += 2) { + uint8_t v = *quant_offset++; + qs[j] |= (v << 4); + qs[j+1] |= (v & 0xF0); + } + } +} + + +/** + * @brief Transform quantized Q8.1 tensor data into a format suitable for CANN + * processing. + * + * This function transforms quantized Q8.1 tensor data into a format suitable + * for CANN processing. It extracts quantization values, scales (d), and sum + * values (s) from the source data and organizes them as: [all qs][all d][all s]. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source data in Q8.1 format. + * @param dst Pointer to the destination buffer where transformed data will be + * stored. 
+ */ +static void ggml_backend_cann_transform_q8_1(ggml_tensor* tensor, + const void* src, + void* dst) { + // Q8_1 has: d (scale), s (sum), qs[32] (int8 quants) + // For CANN matrix multiplication, we only need d and qs (like Q8_0). + // The s value is used for dot product optimization but not needed here. + // + // Q8_1 dequantization: value = qs[i] * d (qs is already signed int8) + // No offset needed since int8 is already centered around 0. + // + // Data layout after transform: [all qs][all d][all s] + // We still copy s to maintain the same total size as original data, + // even though s is not used in matrix multiplication. + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_1; + size_t quant_bytes = n_elems * sizeof(uint8_t); + + uint8_t* quant_offset = (uint8_t*)dst; + uint16_t* scale_d_offset = (uint16_t*)((char*)dst + quant_bytes); + uint16_t* scale_s_offset = scale_d_offset + groups; // s comes after all d values + + for (int i = 0; i < groups; i++) { + const block_q8_1* group = + (const block_q8_1*)((const char*)src + i * sizeof(block_q8_1)); + + // Copy both d and s to maintain data size + const uint16_t* ds_ptr = (const uint16_t*)group; + *scale_d_offset = ds_ptr[0]; // d is at offset 0 + *scale_s_offset = ds_ptr[1]; // s is at offset 1 + scale_d_offset++; + scale_s_offset++; + + size_t group_quant_size = QK8_1 * sizeof(uint8_t); + memcpy(quant_offset, group->qs, group_quant_size); + quant_offset += group_quant_size; + } +} + +/** + * @brief Transform CANN processed data back into quantized Q8.1 format. + * + * This function transforms CANN processed data back into quantized Q8.1 format. + * It reverses the transformation performed by + * ggml_backend_cann_transform_q8_1(), converting the data back into its + * original quantized form. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source buffer containing transformed data. + * @param dst Pointer to the destination buffer where the Q8.1 formatted data + * will be stored. + */ +static void ggml_backend_cann_transform_back_q8_1( + const ggml_tensor* tensor, const void* src, void* dst) { + // Reverse transform: restore d, s, and qs from [all qs][all d][all s] layout + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_1; + size_t quant_bytes = n_elems * sizeof(uint8_t); + + const uint8_t* quant_offset = (const uint8_t*)src; + const uint16_t* scale_d_offset = (const uint16_t*)((const char*)src + quant_bytes); + const uint16_t* scale_s_offset = scale_d_offset + groups; + + for (int i = 0; i < groups; i++) { + block_q8_1* group = (block_q8_1*)((char*)dst + i * sizeof(block_q8_1)); + // Use pointer arithmetic to safely write d and s to the union + uint16_t* ds_ptr = (uint16_t*)group; + ds_ptr[0] = *scale_d_offset; // d is at offset 0 + ds_ptr[1] = *scale_s_offset; // s is at offset 1 + scale_d_offset++; + scale_s_offset++; + size_t group_quant_size = QK8_1 * sizeof(uint8_t); + memcpy(group->qs, quant_offset, group_quant_size); + quant_offset += group_quant_size; + } +} + /** * @brief Transform tensor data based on its type for CANN processing. 
* @@ -1030,9 +1354,15 @@ static void ggml_backend_cann_transform(ggml_tensor* tensor, case GGML_TYPE_Q4_0: ggml_backend_cann_transform_q4_0(tensor, src, dst); break; + case GGML_TYPE_Q4_1: + ggml_backend_cann_transform_q4_1(tensor, src, dst); + break; case GGML_TYPE_Q8_0: ggml_backend_cann_transform_q8_0(tensor, src, dst); break; + case GGML_TYPE_Q8_1: + ggml_backend_cann_transform_q8_1(tensor, src, dst); + break; default: break; } @@ -1056,9 +1386,15 @@ static void ggml_backend_cann_transform_back( case GGML_TYPE_Q4_0: ggml_backend_cann_transform_back_q4_0(tensor, src, dst); break; + case GGML_TYPE_Q4_1: + ggml_backend_cann_transform_back_q4_1(tensor, src, dst); + break; case GGML_TYPE_Q8_0: ggml_backend_cann_transform_back_q8_0(tensor, src, dst); break; + case GGML_TYPE_Q8_1: + ggml_backend_cann_transform_back_q8_1(tensor, src, dst); + break; default: break; } @@ -1076,7 +1412,9 @@ static void ggml_backend_cann_transform_back( static bool need_transform(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: return true; default: return false; @@ -2333,36 +2671,70 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_TYPE_F16: case GGML_TYPE_F32: return true; - case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: #ifdef ASCEND_310P - // Q4 && Q8 per group is not suppor on 310p device + // Q4 && Q8 per group is not supported on 310p device return false; #endif - // only support contiguous for quantized types. - return ggml_is_contiguous(op->src[0]) && - ggml_is_contiguous(op->src[1]); + { + // only support contiguous for quantized types. + bool src0_contig = ggml_is_contiguous(op->src[0]); + bool src1_contig = ggml_is_contiguous(op->src[1]); + return src0_contig && src1_contig; + } default: return false; } } - case GGML_OP_MUL_MAT_ID: + case GGML_OP_MUL_MAT_ID: { + // Debug output (reuse debug_enabled from MUL_MAT) + static int debug_enabled = -1; + if (debug_enabled == -1) { + debug_enabled = getenv("CANN_DEBUG_SUPPORTS_OP") != nullptr ? 1 : 0; + } + switch (op->src[0]->type) { case GGML_TYPE_F16: case GGML_TYPE_F32: return true; - case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: #ifdef ASCEND_310P // Q4 && Q8 per group is not suppor on 310p device + if (debug_enabled) { + fprintf(stderr, "[CANN] MUL_MAT_ID type=%s: REJECTED (ASCEND_310P)\n", + ggml_type_name(op->src[0]->type)); + } return false; #endif - // only support contiguous for quantized types. - return ggml_is_contiguous(op->src[0]) && - ggml_is_contiguous(op->src[1]); + { + // only support contiguous for quantized types. + bool src0_contig = ggml_is_contiguous(op->src[0]); + bool src1_contig = ggml_is_contiguous(op->src[1]); + bool result = src0_contig && src1_contig; + + if (debug_enabled) { + fprintf(stderr, "[CANN] MUL_MAT_ID type=%s src0_contig=%d src1_contig=%d => %s\n", + ggml_type_name(op->src[0]->type), + src0_contig, src1_contig, + result ? 
"SUPPORTED" : "NOT_SUPPORTED"); + } + + return result; + } default: + if (debug_enabled) { + fprintf(stderr, "[CANN] MUL_MAT_ID type=%s: UNSUPPORTED_TYPE\n", + ggml_type_name(op->src[0]->type)); + } return false; } + } // embedding case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index 373408a9c09..4b2c14b2f03 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -13,6 +13,7 @@ #define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0 #define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1 #define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0 +#define ggml_vec_dot_q8_1_q8_1_generic ggml_vec_dot_q8_1_q8_1 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index aadbb487ec0..0e27ae4cc23 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -1127,6 +1127,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } +void ggml_vec_dot_q8_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q8_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); diff --git a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c index 0f9af7bf520..dffb39ad850 100644 --- a/ggml/src/ggml-cpu/arch/loongarch/quants.c +++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c @@ -988,6 +988,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q8_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q8_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); diff --git a/ggml/src/ggml-cpu/arch/powerpc/quants.c b/ggml/src/ggml-cpu/arch/powerpc/quants.c index d3dfd049eaf..72698c1b83d 100644 --- a/ggml/src/ggml-cpu/arch/powerpc/quants.c +++ b/ggml/src/ggml-cpu/arch/powerpc/quants.c @@ -561,6 +561,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q8_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q8_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c index 6c74417c90c..1c48ff0ebe5 100644 --- a/ggml/src/ggml-cpu/arch/riscv/quants.c +++ b/ggml/src/ggml-cpu/arch/riscv/quants.c @@ -376,6 +376,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, 
size_t bs, const voi #endif } +void ggml_vec_dot_q8_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q8_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 1c8176fb4d9..40260691c6d 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -607,6 +607,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q8_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q8_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); diff --git a/ggml/src/ggml-cpu/arch/wasm/quants.c b/ggml/src/ggml-cpu/arch/wasm/quants.c index 74a359e6d12..c7399aaf0a8 100644 --- a/ggml/src/ggml-cpu/arch/wasm/quants.c +++ b/ggml/src/ggml-cpu/arch/wasm/quants.c @@ -609,6 +609,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q8_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q8_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index cb49320a67f..fc58ffd6318 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -1076,6 +1076,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } +void ggml_vec_dot_q8_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + ggml_vec_dot_q8_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); +} + void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 0d5d3a3440a..668706421b5 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -250,6 +250,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { }, [GGML_TYPE_Q8_1] = { .from_float = quantize_row_q8_1, + .vec_dot = ggml_vec_dot_q8_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, .nrows = 1, }, diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 365cb36d2d7..99d08571846 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -332,6 +332,35 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } +void ggml_vec_dot_q8_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * 
GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + float sumf = 0; + + for (int ib = 0; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j] * y[ib].qs[j]; + } + + sumf += sumi * (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index d83eb1b144d..2fd119e4249 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -13,12 +13,11 @@ extern "C" { // Quantization void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -40,6 +39,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q8_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -71,6 +71,7 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q8_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * 
GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 727932123e4..cb64429666f 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -70,6 +70,8 @@ void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_REST } } + +// ??? void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k) { const int qk = QK4_1; @@ -221,6 +223,7 @@ void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_REST } } + // reference implementation for deterministic creation of model files void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k) { assert(QK8_1 == 32); @@ -414,6 +417,26 @@ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRI } } + + +// added by kun +// dequantize q8_1 to float +void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + static const int qk = QK8_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; + } + } +} + void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK_MXFP4; @@ -2092,6 +2115,14 @@ size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, return nrow * row_size; } +// added by kun +size_t quantize_q8_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + (void)quant_weights; // not used + const size_t row_size = ggml_row_size(GGML_TYPE_Q8_1, n_per_row); + quantize_row_q8_1_ref(src, dst, (int64_t)nrow*n_per_row); + return nrow * row_size; +} + size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_UNUSED(quant_weights); quantize_row_mxfp4_ref(src, dst, (int64_t)nrow*n_per_row); diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 3b688f31c21..71cce9a0945 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -45,7 +45,7 @@ GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GG GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -93,6 +93,7 @@ GGML_API size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTR GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, 
int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q8_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d76ea58f789..e57e6fcf8af 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -685,7 +685,9 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .blck_size = QK8_1, .type_size = sizeof(block_q8_1), .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q8_1, .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref, + // .from_float = quantize_q8_1, }, [GGML_TYPE_MXFP4] = { .type_name = "mxfp4", @@ -7025,6 +7027,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q8_1: result = quantize_q8_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_MXFP4: result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/include/llama.h b/include/llama.h index c5622cc16b4..7c8ef4608cb 100644 --- a/include/llama.h +++ b/include/llama.h @@ -120,6 +120,7 @@ extern "C" { // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q8_1 = 39, // except 1d tensors added by kun LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index f71c40f8e3f..8ffa9bd51dd 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -31,10 +31,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_F16: return "F16"; case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; - case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; + case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; + case LLAMA_FTYPE_MOSTLY_Q8_1: return "Q8_1"; // added by kun case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE"; case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; @@ -644,6 +645,7 @@ llama_model_loader::llama_model_loader( case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; case GGML_TYPE_Q8_0: ftype = 
LLAMA_FTYPE_MOSTLY_Q8_0; break; + case GGML_TYPE_Q8_1: ftype = LLAMA_FTYPE_MOSTLY_Q8_1; break; //added by kun case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1d0361cc166..be84d759bb4 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -540,6 +540,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break; case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break; case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break; + case LLAMA_FTYPE_MOSTLY_Q8_1: default_type = GGML_TYPE_Q8_1; break; // added by kun case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break; case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break; case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index ef6f452195b..8735fe73e96 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1190,7 +1190,21 @@ struct test_case { for (size_t i = 0; i < f1.size(); i++) { // check for nans if (std::isnan(f1[i]) || std::isnan(f2[i])) { - printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]); + printf("[%s] NaN at index %zu/%zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, f1.size(), bn1, f1[i], bn2, f2[i]); + // Debug: print tensor info + printf("\n t1: type=%s ne=[%lld,%lld,%lld,%lld]\n", + ggml_type_name(t1->type), (long long)t1->ne[0], (long long)t1->ne[1], + (long long)t1->ne[2], (long long)t1->ne[3]); + // Print first few raw bytes of both tensors for debugging + std::vector raw1(std::min((size_t)64, ggml_nbytes(t1))); + std::vector raw2(std::min((size_t)64, ggml_nbytes(t2))); + ggml_backend_tensor_get(t1, raw1.data(), 0, raw1.size()); + ggml_backend_tensor_get(t2, raw2.data(), 0, raw2.size()); + printf(" %s first 32 bytes: ", bn1); + for (size_t j = 0; j < std::min((size_t)32, raw1.size()); j++) printf("%02x ", raw1[j]); + printf("\n %s first 32 bytes: ", bn2); + for (size_t j = 0; j < std::min((size_t)32, raw2.size()); j++) printf("%02x ", raw2[j]); + printf("\n"); ud->ok = false; return true; } @@ -5372,13 +5386,14 @@ static const ggml_type all_types[] = { }; static const ggml_type base_types[] = { - GGML_TYPE_F32, GGML_TYPE_F16, + // GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, // for I8MM tests + GGML_TYPE_Q8_1, // Q8_1 as weight matrix (now supported with vec_dot_q8_1_q8_1) GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, // for I8MM tests - GGML_TYPE_Q4_K, - GGML_TYPE_MXFP4, // TODO: or "other" - GGML_TYPE_IQ2_XXS + // GGML_TYPE_Q4_K, + // GGML_TYPE_MXFP4, // TODO: or "other" + // GGML_TYPE_IQ2_XXS }; static const ggml_type other_types[] = { @@ -5909,7 +5924,7 @@ static std::vector> make_test_cases_eval() { #if 1 for (ggml_type type_a : base_types) { - for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { + for (ggml_type type_b : {GGML_TYPE_F16, GGML_TYPE_F32}) { std::vector ks = { 256 }; if (ggml_blck_size(type_a) == 1) { ks.push_back(4); diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 470dc3d916b..e59858f00d6 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -21,7 +21,7 @@ struct quant_option { static const std::vector QUANT_OPTIONS = { { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, 
    { "Q4_1",     LLAMA_FTYPE_MOSTLY_Q4_1,    " 4.78G, +0.4511 ppl @ Llama-3-8B", },
    { "MXFP4_MOE",LLAMA_FTYPE_MOSTLY_MXFP4_MOE," MXFP4 MoE", },
    { "Q5_0",     LLAMA_FTYPE_MOSTLY_Q5_0,    " 5.21G, +0.1316 ppl @ Llama-3-8B", },
    { "Q5_1",     LLAMA_FTYPE_MOSTLY_Q5_1,    " 5.65G, +0.1062 ppl @ Llama-3-8B", },
@@ -53,6 +53,7 @@ static const std::vector QUANT_OPTIONS = {
    { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,  " 5.33G, +0.0569 ppl @ Llama-3-8B", },
    { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,    " 6.14G, +0.0217 ppl @ Llama-3-8B", },
    { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,    " 7.96G, +0.0026 ppl @ Llama-3-8B", },
+   { "Q8_1",     LLAMA_FTYPE_MOSTLY_Q8_1,    "", },
    { "F16",      LLAMA_FTYPE_MOSTLY_F16,     "14.00G, +0.0020 ppl @ Mistral-7B", },
    { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,    "14.00G, -0.0050 ppl @ Mistral-7B", },
    { "F32",      LLAMA_FTYPE_ALL_F32,        "26.00G @ 7B", },
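The CPU-side additions can be sanity-checked in isolation with a small round-trip test along the lines of the sketch below. This is illustrative only and not part of the patch: the standalone test file and its include setup are assumptions, while the function names and signatures (quantize_row_q8_1_ref, the new dequantize_row_q8_1 and ggml_vec_dot_q8_1_q8_1_generic) are the ones this patch declares.

// Illustrative round-trip check for the new CPU-side Q8_1 path; not part of the patch.
// Assumes it is compiled inside the ggml source tree so that "ggml-quants.h" (and the
// ggml-common.h types it pulls in) are on the include path -- the include setup is an
// assumption, only the function names come from this patch.
#include <math.h>
#include <stdio.h>

#include "ggml-quants.h"   // quantize_row_q8_1_ref, dequantize_row_q8_1, block_q8_1, QK8_1

// prototype of the generic vec-dot added by this patch in ggml/src/ggml-cpu/quants.h
void ggml_vec_dot_q8_1_q8_1_generic(int n, float * s, size_t bs,
                                    const void * vx, size_t bx,
                                    const void * vy, size_t by, int nrc);

int main(void) {
    enum { N = 64 };                        // two Q8_1 blocks (QK8_1 == 32)
    float x[N], y[N], xd[N];

    for (int i = 0; i < N; ++i) {
        x[i] = sinf(0.10f * (float) i);
        y[i] = cosf(0.05f * (float) i);
    }

    block_q8_1 qx[N / QK8_1], qy[N / QK8_1];
    quantize_row_q8_1_ref(x, qx, N);        // existing reference quantizer
    quantize_row_q8_1_ref(y, qy, N);

    // new in this patch: dequantize back to f32 and measure the round-trip error
    dequantize_row_q8_1(qx, xd, N);
    float max_err = 0.0f;
    for (int i = 0; i < N; ++i) {
        float e = fabsf(xd[i] - x[i]);
        if (e > max_err) {
            max_err = e;
        }
    }

    // new in this patch: Q8_1 x Q8_1 dot product vs. the f32 reference
    float ref = 0.0f;
    for (int i = 0; i < N; ++i) {
        ref += x[i] * y[i];
    }
    float dot = 0.0f;
    ggml_vec_dot_q8_1_q8_1_generic(N, &dot, 0, qx, 0, qy, 0, 1);

    printf("max dequant err = %g, f32 dot = %g, q8_1 dot = %g\n", max_err, ref, dot);
    return 0;
}

On the model side, once the quantize tool is rebuilt with this patch applied, the new type is selected like any other entry in QUANT_OPTIONS, e.g. llama-quantize model-f16.gguf model-q8_1.gguf Q8_1.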