From 8203fa190c83f349e35b1db8351400cdfd81e3bf Mon Sep 17 00:00:00 2001 From: Ryan19929 Date: Thu, 17 Apr 2025 14:32:24 +0800 Subject: [PATCH 1/6] [fix](inverted_index) fix IK handling of full-width characters and inaccurate character classification --- .../analyzer/ik/IKTokenizer.cpp | 9 +- .../analyzer/ik/core/CharacterUtil.cpp | 99 +++++++----- .../analyzer/ik/core/CharacterUtil.h | 3 + .../analyzer/ik/core/LetterSegmenter.cpp | 53 ++++--- .../analyzer/ik/core/LetterSegmenter.h | 6 +- .../analyzer/ik_anayzer_test.cpp | 142 ++++++++++++++++++ .../{ => analyzer}/test_ik_analyzer.out | 0 .../{ => analyzer}/test_ik_analyzer.groovy | 10 +- 8 files changed, 247 insertions(+), 75 deletions(-) rename regression-test/data/inverted_index_p0/{ => analyzer}/test_ik_analyzer.out (100%) rename regression-test/suites/inverted_index_p0/{ => analyzer}/test_ik_analyzer.groovy (95%) diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp index e1f451804e5067..72b906fa4e17b7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp @@ -32,13 +32,10 @@ Token* IKTokenizer::next(Token* token) { } std::string& token_text = tokens_text_[buffer_index_++]; + // full-width to half-width, and lowercase + // TODO(ryan19929): do regularizeString in fillBuffer. + CharacterUtil::regularizeString(token_text, this->lowercase); size_t size = std::min(token_text.size(), static_cast(LUCENE_MAX_WORD_LEN)); - if (this->lowercase) { - if (!token_text.empty() && static_cast(token_text[0]) < 0x80) { - std::transform(token_text.begin(), token_text.end(), token_text.begin(), - [](char c) { return to_lower(c); }); - } - } token->setNoCopy(token_text.data(), 0, size); return token; } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp index 808edc140397e4..3a13526dc2a0fb 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp @@ -20,34 +20,32 @@ namespace doris::segment_v2 { int32_t CharacterUtil::identifyCharType(int32_t rune) { - // Numbers - if (rune >= 0x30 && rune <= 0x39) { + if (rune >= '0' && rune <= '9') { return CHAR_ARABIC; } - - // English - if ((rune >= 0x61 && rune <= 0x7a) || (rune >= 0x41 && rune <= 0x5a)) { + if ((rune >= 'a' && rune <= 'z') || (rune >= 'A' && rune <= 'Z')) { return CHAR_ENGLISH; } - // CJK Unified Chinese Characters - if ((rune >= 0x4E00 && rune <= 0x9FFF) || (rune >= 0x3400 && rune <= 0x4DBF) || - (rune >= 0x20000 && rune <= 0x2A6DF) || (rune >= 0x2A700 && rune <= 0x2B73F) || - (rune >= 0x2B740 && rune <= 0x2B81F) || (rune >= 0x2B820 && rune <= 0x2CEAF) || - (rune >= 0x2CEB0 && rune <= 0x2EBEF) || (rune >= 0x30000 && rune <= 0x3134F)) { + UBlockCode block = ublock_getCode(rune); + + if (block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS || + block == UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS || + block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) { return CHAR_CHINESE; } - - // Japanese and Korean characters - if ((rune >= 0x3040 && rune <= 0x309F) || (rune >= 0x30A0 && rune <= 0x30FF) || - (rune >= 0x31F0 && rune <= 0x31FF) || (rune >= 0xAC00 && rune <= 0xD7AF) || - (rune >= 0x1100 && rune <= 0x11FF)) { + + if (block == UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS || + block == UBLOCK_HANGUL_SYLLABLES || + block == UBLOCK_HANGUL_JAMO || + block == UBLOCK_HANGUL_COMPATIBILITY_JAMO || + block == UBLOCK_HIRAGANA || + block == UBLOCK_KATAKANA || + block == UBLOCK_KATAKANA_PHONETIC_EXTENSIONS) { return CHAR_OTHER_CJK; } - // UTF-16 surrogate pairs and private zone - if ((rune >= 0xD800 && rune <= 0xDBFF) || (rune >= 0xDC00 && rune <= 0xDFFF) || - (rune >= 0xE000 && rune <= 0xF8FF)) { + if (rune > 0xFFFF) { return CHAR_SURROGATE; } @@ -55,31 +53,18 @@ int32_t CharacterUtil::identifyCharType(int32_t rune) { } int32_t CharacterUtil::regularize(int32_t rune, bool use_lowercase) { - // Full-width to half-width + // Full-width space if (rune == 0x3000) { - return 0x0020; // Convert full-width space to half-width - } - - // Full-width numbers - if (rune >= 0xFF10 && rune <= 0xFF19) { - return rune - 0xFEE0; // Convert to half-width numbers + return 0x0020; } - // Full-width letters - if (rune >= 0xFF21 && rune <= 0xFF3A) { + // All full-width characters + if (rune > 0xFF00 && rune < 0xFF5F) { rune = rune - 0xFEE0; - if (use_lowercase) { - rune += 32; // Convert to lowercase - } - return rune; - } - if (rune >= 0xFF41 && rune <= 0xFF5A) { - return rune - 0xFEE0; } - // Convert half-width uppercase letters to lowercase if (use_lowercase && rune >= 0x41 && rune <= 0x5A) { - return rune + 32; + rune += 32; } return rune; @@ -91,6 +76,7 @@ void CharacterUtil::TypedRune::regularize(bool use_lowercase) { void CharacterUtil::regularizeCharInfo(TypedRune& typedRune, bool use_lowercase) { typedRune.rune = regularize(typedRune.rune, use_lowercase); + typedRune.char_type = identifyCharType(typedRune.rune); } CharacterUtil::RuneStrLite CharacterUtil::decodeChar(const char* str, size_t length) { @@ -113,9 +99,8 @@ void CharacterUtil::decodeStringToRunes(const char* str, size_t length, TypedRun } typed_runes.emplace_back(runeStr.rune, byte_pos, runeStr.len, typed_runes.size(), 1); - if (use_lowercase) { - typed_runes.back().regularize(true); - } + typed_runes.back().regularize(use_lowercase); + byte_pos += runeStr.len; } } @@ -166,4 +151,40 @@ size_t CharacterUtil::adjustToCompleteChar(const char* buffer, size_t buffer_len return buffer_length; } + +void CharacterUtil::regularizeString(std::string& input, bool use_lowercase) { + std::string temp; + size_t len = input.size(); + temp.reserve(len); + for (size_t i = 0; i < len; ) { + unsigned char c = input[i]; + if ((c & 0xF0) == 0xE0 && i + 2 < len) { + int rune = ((c & 0x0F) << 12) | + ((input[i + 1] & 0x3F) << 6) | + (input[i + 2] & 0x3F); + if (rune == 0x3000) { + temp += ' '; + } else if (rune >= 0xFF01 && rune <= 0xFF5E) { + char half = static_cast(rune - 0xFEE0); + if (use_lowercase && half >= 'A' && half <= 'Z') { + half += 32; + } + temp += half; + } else { + temp += input[i]; + temp += input[i + 1]; + temp += input[i + 2]; + } + i += 3; + } else { + char ch = input[i]; + if (use_lowercase && ch >= 'A' && ch <= 'Z') { + ch += 32; + } + temp += ch; + i += 1; + } + } + input = std::move(temp); +} } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h index 2f3dac6a520ded..66443d09c2cec3 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "CLucene/_ApiHeader.h" #include "CLucene/analysis/jieba/Unicode.hpp" @@ -81,6 +82,8 @@ class CharacterUtil { static void regularizeCharInfo(TypedRune& type_rune, bool use_lowercase); static size_t adjustToCompleteChar(const char* buffer, size_t buffer_length); + + static void regularizeString(std::string& input, bool use_lowercase = true); }; } // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp index 2b63139913206b..e6593c31a675cf 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp @@ -53,7 +53,6 @@ void LetterSegmenter::reset() { bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) { bool need_lock = false; - const auto& typed_runes = context.getTypedRuneArray(); if (english_start_ == -1) { // The current tokenizer has not yet started processing English characters if (context.getCurrentCharType() == CharacterUtil::CHAR_ENGLISH) { @@ -68,9 +67,7 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) { english_end_ = context.getCursor(); } else { // Encounter non-English characters, output tokens - Lexeme newLexeme(context.getBufferOffset(), typed_runes[english_start_].offset, - english_end_ - english_start_ + 1, Lexeme::Type::English, - english_start_, english_end_); + Lexeme newLexeme = createLexeme(context, english_start_, english_end_, Lexeme::Type::English); context.addLexeme(newLexeme); english_start_ = -1; english_end_ = -1; @@ -78,9 +75,7 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) { } if (context.isBufferConsumed() && (english_start_ != -1 && english_end_ != -1)) { - Lexeme newLexeme(context.getBufferOffset(), typed_runes[english_start_].offset, - english_end_ - english_start_ + 1, Lexeme::Type::English, english_start_, - english_end_); + Lexeme newLexeme = createLexeme(context, english_start_, english_end_, Lexeme::Type::English); context.addLexeme(newLexeme); english_start_ = -1; english_end_ = -1; @@ -96,7 +91,6 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) { bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) { bool need_lock = false; - const auto& typed_runes = context.getTypedRuneArray(); if (arabic_start_ == -1) { // The current tokenizer has not yet started processing numeric characters @@ -115,9 +109,7 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) { // Do not output numbers, but do not mark the end } else { // Encounter non-Arabic characters, output tokens - Lexeme newLexeme(context.getBufferOffset(), typed_runes[arabic_start_].offset, - arabic_end_ - arabic_start_ + 1, Lexeme::Type::Arabic, arabic_start_, - arabic_end_); + Lexeme newLexeme = createLexeme(context, arabic_start_, arabic_end_, Lexeme::Type::Arabic); context.addLexeme(newLexeme); arabic_start_ = -1; arabic_end_ = -1; @@ -125,9 +117,7 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) { } if (context.isBufferConsumed() && (arabic_start_ != -1 && arabic_end_ != -1)) { - Lexeme newLexeme(context.getBufferOffset(), typed_runes[arabic_start_].offset, - arabic_end_ - arabic_start_ + 1, Lexeme::Type::Arabic, arabic_start_, - arabic_end_); + Lexeme newLexeme = createLexeme(context, arabic_start_, arabic_end_, Lexeme::Type::Arabic); context.addLexeme(newLexeme); arabic_start_ = -1; arabic_end_ = -1; @@ -143,7 +133,6 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) { bool LetterSegmenter::processMixLetter(AnalyzeContext& context) { bool need_lock = false; - const auto& typed_runes = context.getTypedRuneArray(); if (start_ == -1) { // The current tokenizer has not yet started processing characters. @@ -164,8 +153,7 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext& context) { end_ = context.getCursor(); } else { // Encounter non-letter characters, output a token - Lexeme newLexeme(context.getBufferOffset(), typed_runes[start_].offset, - end_ - start_ + 1, Lexeme::Type::Letter, start_, end_); + Lexeme newLexeme = createLexeme(context, start_, end_, Lexeme::Type::Letter); context.addLexeme(newLexeme); start_ = -1; end_ = -1; @@ -173,8 +161,7 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext& context) { } if (context.isBufferConsumed() && (start_ != -1 && end_ != -1)) { - Lexeme newLexeme(context.getBufferOffset(), typed_runes[start_].offset, end_ - start_ + 1, - Lexeme::Type::Letter, start_, end_); + Lexeme newLexeme = createLexeme(context, start_, end_, Lexeme::Type::Letter); context.addLexeme(newLexeme); start_ = -1; end_ = -1; @@ -184,11 +171,31 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext& context) { return need_lock; } -bool LetterSegmenter::isLetterConnector(char input) { - return std::binary_search(std::begin(letter_connectors_), std::end(letter_connectors_), input); +bool LetterSegmenter::isLetterConnector(int32_t input) { + if (input < 128) { + return std::binary_search(std::begin(letter_connectors_), std::end(letter_connectors_), + static_cast(input)); + } + return false; +} + +bool LetterSegmenter::isNumConnector(int32_t input) { + if (input < 128) { + return std::binary_search(std::begin(num_connectors_), std::end(num_connectors_), + static_cast(input)); + } + return false; } -bool LetterSegmenter::isNumConnector(char input) { - return std::binary_search(std::begin(num_connectors_), std::end(num_connectors_), input); +Lexeme LetterSegmenter::createLexeme(AnalyzeContext& context, int start, int end, Lexeme::Type type) { + const auto& typed_runes = context.getTypedRuneArray(); + return Lexeme( + context.getBufferOffset(), + typed_runes[start].getBytePosition(), + typed_runes[end].getNextBytePosition() - typed_runes[start].getBytePosition(), + type, + start, + end + ); } } // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h index 87b36e83fbfd07..edd09a5d2c4e61 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h @@ -41,8 +41,10 @@ class LetterSegmenter : public ISegmenter { bool processEnglishLetter(AnalyzeContext& context); bool processArabicLetter(AnalyzeContext& context); bool processMixLetter(AnalyzeContext& context); - bool isLetterConnector(char input); - bool isNumConnector(char input); + bool isLetterConnector(int32_t input); + bool isNumConnector(int32_t input); + + Lexeme createLexeme(AnalyzeContext& context, int start, int end, Lexeme::Type type); int start_ {-1}; int end_ {-1}; diff --git a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp index 18dc16b6925fe8..8b87618ed8a13c 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp @@ -428,6 +428,148 @@ TEST_F(IKTokenizerTest, TestLongTextCompareWithJava) { } } +TEST_F(IKTokenizerTest, TestFullWidthCharacters) { + std::vector datas; + + // test full width numbers + std::string fullWidthNumbersText = "4 3 2"; + tokenize(fullWidthNumbersText, datas, true); + std::vector expectedNumbers = {"4", "3", "2"}; // half width numbers + ASSERT_EQ(datas.size(), expectedNumbers.size()); + for (size_t i = 0; i < datas.size(); i++) { + ASSERT_EQ(datas[i], expectedNumbers[i]); + } + datas.clear(); + + fullWidthNumbersText = "432"; + tokenize(fullWidthNumbersText, datas, false); + expectedNumbers = {"432"}; + ASSERT_EQ(datas.size(), expectedNumbers.size()); + for (size_t i = 0; i < datas.size(); i++) { + ASSERT_EQ(datas[i], expectedNumbers[i]); + } + datas.clear(); + + // test full width currency symbol + std::string currencyText = "¥"; + tokenize(currencyText, datas, false); + ASSERT_EQ(datas.size(), 1); + ASSERT_EQ(datas[0], "¥"); + datas.clear(); + + // test full width symbol in word + std::string mixedText = "High&Low"; + tokenize(mixedText, datas, false); + std::vector expectedMixed = {"high&low", "high", "low"}; + ASSERT_EQ(datas.size(), expectedMixed.size()); + for (size_t i = 0; i < datas.size(); i++) { + ASSERT_EQ(datas[i], expectedMixed[i]); + } + datas.clear(); + + // test special separator + std::string specialSeparatorText = "1・2"; + tokenize(specialSeparatorText, datas, false); + std::vector expectedSeparator = {"1", "・", "2"}; + ASSERT_EQ(datas.size(), expectedSeparator.size()); + for (size_t i = 0; i < datas.size(); i++) { + ASSERT_EQ(datas[i], expectedSeparator[i]); + } + datas.clear(); + + // test special character + std::string specialCharText = "﨑"; + tokenize(specialCharText, datas, false); + ASSERT_EQ(datas.size(), 1); + ASSERT_EQ(datas[0], "﨑"); + datas.clear(); +} + +TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) { + std::vector datas; + + // test emoji + std::string emojiText = "🐼"; + tokenize(emojiText, datas, false); + ASSERT_EQ(datas.size(), 1); + ASSERT_EQ(datas[0], "🐼"); + datas.clear(); + + std::string emojiText2 = "🝢"; + tokenize(emojiText2, datas, false); + ASSERT_EQ(datas.size(), 1); + ASSERT_EQ(datas[0], "🝢"); + datas.clear(); + + // test special latin character + std::string specialLatinText1 = "abcşabc"; + tokenize(specialLatinText1, datas, false); + ASSERT_EQ(datas.size(), 2); + ASSERT_EQ(datas[0], "abc"); + ASSERT_EQ(datas[1], "abc"); + datas.clear(); + + std::string specialLatinText2 = "abcīabc"; + tokenize(specialLatinText2, datas, false); + ASSERT_EQ(datas.size(), 2); + ASSERT_EQ(datas[0], "abc"); + ASSERT_EQ(datas[1], "abc"); + datas.clear(); + + std::string specialLatinText3 = "celebrity…get"; + tokenize(specialLatinText3, datas, false); + std::vector expectedEllipsis = {"celebrity", "get"}; + ASSERT_EQ(datas.size(), expectedEllipsis.size()); + for (size_t i = 0; i < datas.size(); i++) { + ASSERT_EQ(datas[i], expectedEllipsis[i]); + } + datas.clear(); + + // test mixed alphabet word + std::string mixedAlphabetText1 = "Hulyaiрole"; + tokenize(mixedAlphabetText1, datas, false); + ASSERT_EQ(datas.size(), 2); + ASSERT_EQ(datas[0], "hulyai"); + ASSERT_EQ(datas[1], "ole"); + datas.clear(); + + std::string mixedAlphabetText2 = "Nisa Aşgabat"; + tokenize(mixedAlphabetText2, datas, false); + std::vector expectedName = {"nisa", "gabat"}; + ASSERT_EQ(datas.size(), expectedName.size()); + for (size_t i = 0; i < datas.size(); i++) { + ASSERT_EQ(datas[i], expectedName[i]); + } + datas.clear(); + + // test special connector + std::string specialConnectorText = "alـameer"; + tokenize(specialConnectorText, datas, false); + ASSERT_EQ(datas.size(), 2); + ASSERT_EQ(datas[0], "al"); + ASSERT_EQ(datas[1], "ameer"); + datas.clear(); + + // test rare unicode character + std::string rareUnicodeText1 = "𐓚"; + tokenize(rareUnicodeText1, datas, false); + ASSERT_EQ(datas.size(), 1); + ASSERT_EQ(datas[0], "𐓚"); + datas.clear(); + + std::string rareUnicodeText2 = "𑪱"; + tokenize(rareUnicodeText2, datas, false); + ASSERT_EQ(datas.size(), 1); + ASSERT_EQ(datas[0], "𑪱"); + datas.clear(); + + std::string rareUnicodeText3 = "𐴗"; + tokenize(rareUnicodeText3, datas, false); + ASSERT_EQ(datas.size(), 1); + ASSERT_EQ(datas[0], "𐴗"); + datas.clear(); +} + // Test the exception handling capabilities of the IKTokenizer and AnalyzeContext TEST_F(IKTokenizerTest, TestExceptionHandling) { // Common mock reader class for testing exception handling diff --git a/regression-test/data/inverted_index_p0/test_ik_analyzer.out b/regression-test/data/inverted_index_p0/analyzer/test_ik_analyzer.out similarity index 100% rename from regression-test/data/inverted_index_p0/test_ik_analyzer.out rename to regression-test/data/inverted_index_p0/analyzer/test_ik_analyzer.out diff --git a/regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy similarity index 95% rename from regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy rename to regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy index c28aa68920b652..b57d40a4a002c0 100644 --- a/regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy +++ b/regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy @@ -22,7 +22,7 @@ suite("test_ik_analyzer", "p0") { sql "DROP TABLE IF EXISTS ${tableNameSmart}" sql "DROP TABLE IF EXISTS ${tableNameMaxWord}" - // 创建smart模式测试表 + // Create test table for smart mode sql """ CREATE TABLE ${tableNameSmart} ( `id` int(11) NULL COMMENT "", @@ -37,7 +37,7 @@ suite("test_ik_analyzer", "p0") { ); """ - // 创建max_word模式测试表 + // Create test table for max_word mode sql """ CREATE TABLE ${tableNameMaxWord} ( `id` int(11) NULL COMMENT "", @@ -52,7 +52,7 @@ suite("test_ik_analyzer", "p0") { ); """ - // 插入测试数据 + // Insert test data def insertData = { table -> sql """ INSERT INTO ${table} VALUES (1, "我爱北京天安门"); """ sql """ INSERT INTO ${table} VALUES (2, "Apache Doris是一个现代化的MPP数据库"); """ @@ -68,14 +68,14 @@ suite("test_ik_analyzer", "p0") { sql "sync" sql """ set enable_common_expr_pushdown = true; """ - // 测试smart模式 + // Testing ik smart mode println "Testing ik smart mode:" qt_sql """ select * from ${tableNameSmart} where content match_phrase '北京'; """ qt_sql """ select * from ${tableNameSmart} where content match_phrase '计算机科学'; """ qt_sql """ select * from ${tableNameSmart} where content match_phrase '数据库管理系统'; """ qt_sql """ select * from ${tableNameSmart} where content match_phrase '中华人民共和国'; """ - // 测试max_word模式 + // Testing ik max_word mode println "Testing ik max_word mode:" qt_sql """ select * from ${tableNameMaxWord} where content match_phrase '北京'; """ qt_sql """ select * from ${tableNameMaxWord} where content match_phrase '计算机科学'; """ From 39a47f1e289f8bbd039f6a7068d412a76f6e52b8 Mon Sep 17 00:00:00 2001 From: Ryan19929 Date: Thu, 17 Apr 2025 14:38:49 +0800 Subject: [PATCH 2/6] [feature](inverted_index) enhance IK analyzer to support emoji and rare characters --- .../analyzer/ik/core/AnalyzeContext.cpp | 6 +++ .../analyzer/ik/core/AnalyzeContext.h | 10 ++--- .../analyzer/ik/core/IKSegmenter.cpp | 1 + .../analyzer/ik/core/IKSegmenter.h | 1 + .../ik/core/SurrogatePairSegmenter.cpp | 36 +++++++++++++++++ .../analyzer/ik/core/SurrogatePairSegmenter.h | 39 +++++++++++++++++++ 6 files changed, 88 insertions(+), 5 deletions(-) create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp index d648f8e715afcd..59b97b2ee16d45 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp @@ -199,6 +199,9 @@ void AnalyzeContext::lockBuffer(SegmenterType type) { case SegmenterType::LETTER_SEGMENTER: buffer_locker_ |= LETTER_SEGMENTER_FLAG; break; + case SegmenterType::SURROGATE_PAIR_SEGMENTER: + buffer_locker_ |= SURROGATE_PAIR_SEGMENTER_FLAG; + break; } } @@ -213,6 +216,9 @@ void AnalyzeContext::unlockBuffer(SegmenterType type) { case SegmenterType::LETTER_SEGMENTER: buffer_locker_ &= ~LETTER_SEGMENTER_FLAG; break; + case SegmenterType::SURROGATE_PAIR_SEGMENTER: + buffer_locker_ &= ~SURROGATE_PAIR_SEGMENTER_FLAG; + break; } } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h index b6b363d9d1a52d..a89f4973d0d955 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h @@ -41,10 +41,10 @@ class AnalyzeContext { static const size_t BUFF_SIZE = 4096; static const size_t BUFF_EXHAUST_CRITICAL = 100; - static constexpr uint8_t CJK_SEGMENTER_FLAG = 0x01; // 0001 - static constexpr uint8_t CN_QUANTIFIER_FLAG = 0x02; // 0010 - static constexpr uint8_t LETTER_SEGMENTER_FLAG = 0x04; // 0100 - + static constexpr uint8_t CJK_SEGMENTER_FLAG = 0x01; // 0001 + static constexpr uint8_t CN_QUANTIFIER_FLAG = 0x02; // 0010 + static constexpr uint8_t LETTER_SEGMENTER_FLAG = 0x04; // 0100 + static constexpr uint8_t SURROGATE_PAIR_SEGMENTER_FLAG = 0x08; // 1000 // String buffer std::string segment_buff_; // An array storing Unicode code points (runes)Character information array @@ -73,7 +73,7 @@ class AnalyzeContext { void compound(Lexeme& lexeme); public: - enum class SegmenterType { CJK_SEGMENTER, CN_QUANTIFIER, LETTER_SEGMENTER }; + enum class SegmenterType { CJK_SEGMENTER, CN_QUANTIFIER, LETTER_SEGMENTER, SURROGATE_PAIR_SEGMENTER}; const CharacterUtil::TypedRuneArray& getTypedRuneArray() const { return typed_runes_; } explicit AnalyzeContext(IKMemoryPool& pool, std::shared_ptr config); virtual ~AnalyzeContext(); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp index 6c1d049ac708f9..674f22dfd2d6c9 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp @@ -33,6 +33,7 @@ std::vector> IKSegmenter::loadSegmenters() { segmenters.push_back(std::make_unique()); segmenters.push_back(std::make_unique()); segmenters.push_back(std::make_unique()); + segmenters.push_back(std::make_unique()); return segmenters; } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h index 4f94fa435dbfac..33defbbe31a97b 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h @@ -27,6 +27,7 @@ #include "IKArbitrator.h" #include "ISegmenter.h" #include "LetterSegmenter.h" +#include "SurrogatePairSegmenter.h" #include "olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h" namespace doris::segment_v2 { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp new file mode 100644 index 00000000000000..1e787dd5883fa4 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "SurrogatePairSegmenter.h" + +namespace doris::segment_v2 { + +void SurrogatePairSegmenter::analyze(AnalyzeContext& context) { + const auto& current_char_type = context.getCurrentCharType(); + + if (current_char_type == CharacterUtil::CHAR_SURROGATE) { + Lexeme newLexeme(context.getBufferOffset(), context.getCurrentCharOffset(), + context.getCurrentCharLen(), Lexeme::Type::CNChar, context.getCursor(), context.getCursor()); + context.addLexeme(newLexeme); + } + + context.unlockBuffer(SEGMENTER_TYPE); +} + +void SurrogatePairSegmenter::reset() {} + +} // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h new file mode 100644 index 00000000000000..f39eb30cf91e8f --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "ISegmenter.h" +#include "AnalyzeContext.h" +#include "CharacterUtil.h" +#include "Lexeme.h" + +namespace doris::segment_v2 { + +class SurrogatePairSegmenter : public ISegmenter { +public: + static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE = + AnalyzeContext::SegmenterType::SURROGATE_PAIR_SEGMENTER; + + SurrogatePairSegmenter() = default; + ~SurrogatePairSegmenter() override = default; + + void analyze(AnalyzeContext& context) override; + void reset() override; +}; + +} // namespace doris::segment_v2 \ No newline at end of file From cf8bc00b2138fc8c40134774bfe63e1f1a8d07cd Mon Sep 17 00:00:00 2001 From: Ryan19929 Date: Thu, 17 Apr 2025 17:09:46 +0800 Subject: [PATCH 3/6] [refactor](inverted_index) format code and remove unnecessary code --- .../analyzer/ik/core/AnalyzeContext.cpp | 1 - .../analyzer/ik/core/AnalyzeContext.h | 7 +- .../analyzer/ik/core/CN_QuantifierSegmenter.h | 1 - .../analyzer/ik/core/CharacterUtil.cpp | 83 +++---------------- .../analyzer/ik/core/CharacterUtil.h | 5 +- .../analyzer/ik/core/LetterSegmenter.cpp | 31 ++++--- .../analyzer/ik/core/LetterSegmenter.h | 1 - .../ik/core/SurrogatePairSegmenter.cpp | 11 +-- .../analyzer/ik/core/SurrogatePairSegmenter.h | 4 +- .../analyzer/ik_anayzer_test.cpp | 32 +++---- .../data/inverted_index_p0/test_tokenize.out | 12 +++ .../inverted_index_p0/test_tokenize.groovy | 5 ++ 12 files changed, 79 insertions(+), 114 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp index 59b97b2ee16d45..1d5ff916a12bb4 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp @@ -172,7 +172,6 @@ bool AnalyzeContext::moveCursor() { void AnalyzeContext::initCursor() { cursor_ = 0; - typed_runes_[cursor_].regularize(config_->isEnableLowercase()); } bool AnalyzeContext::isBufferConsumed() const { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h index a89f4973d0d955..d9e947a713dfca 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h @@ -73,7 +73,12 @@ class AnalyzeContext { void compound(Lexeme& lexeme); public: - enum class SegmenterType { CJK_SEGMENTER, CN_QUANTIFIER, LETTER_SEGMENTER, SURROGATE_PAIR_SEGMENTER}; + enum class SegmenterType { + CJK_SEGMENTER, + CN_QUANTIFIER, + LETTER_SEGMENTER, + SURROGATE_PAIR_SEGMENTER + }; const CharacterUtil::TypedRuneArray& getTypedRuneArray() const { return typed_runes_; } explicit AnalyzeContext(IKMemoryPool& pool, std::shared_ptr config); virtual ~AnalyzeContext(); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h index 27ccef61a83bdc..a20341d3f3eaaa 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h @@ -29,7 +29,6 @@ class CN_QuantifierSegmenter : public ISegmenter { public: static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE = AnalyzeContext::SegmenterType::CN_QUANTIFIER; - static const std::string SEGMENTER_NAME; static const std::u32string CHINESE_NUMBERS; static const std::unordered_set CHINESE_NUMBER_CHARS; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp index 3a13526dc2a0fb..4b7bc34361e1ba 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp @@ -29,18 +29,14 @@ int32_t CharacterUtil::identifyCharType(int32_t rune) { UBlockCode block = ublock_getCode(rune); - if (block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS || - block == UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS || + if (block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS || block == UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS || block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) { return CHAR_CHINESE; } - - if (block == UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS || - block == UBLOCK_HANGUL_SYLLABLES || - block == UBLOCK_HANGUL_JAMO || - block == UBLOCK_HANGUL_COMPATIBILITY_JAMO || - block == UBLOCK_HIRAGANA || - block == UBLOCK_KATAKANA || + + if (block == UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS || block == UBLOCK_HANGUL_SYLLABLES || + block == UBLOCK_HANGUL_JAMO || block == UBLOCK_HANGUL_COMPATIBILITY_JAMO || + block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA || block == UBLOCK_KATAKANA_PHONETIC_EXTENSIONS) { return CHAR_OTHER_CJK; } @@ -58,7 +54,7 @@ int32_t CharacterUtil::regularize(int32_t rune, bool use_lowercase) { return 0x0020; } - // All full-width characters + // All full-width characters if (rune > 0xFF00 && rune < 0xFF5F) { rune = rune - 0xFEE0; } @@ -71,12 +67,8 @@ int32_t CharacterUtil::regularize(int32_t rune, bool use_lowercase) { } void CharacterUtil::TypedRune::regularize(bool use_lowercase) { - CharacterUtil::regularizeCharInfo(*this, use_lowercase); -} - -void CharacterUtil::regularizeCharInfo(TypedRune& typedRune, bool use_lowercase) { - typedRune.rune = regularize(typedRune.rune, use_lowercase); - typedRune.char_type = identifyCharType(typedRune.rune); + this->rune = CharacterUtil::regularize(this->rune, use_lowercase); + this->char_type = CharacterUtil::identifyCharType(this->rune); } CharacterUtil::RuneStrLite CharacterUtil::decodeChar(const char* str, size_t length) { @@ -100,71 +92,22 @@ void CharacterUtil::decodeStringToRunes(const char* str, size_t length, TypedRun typed_runes.emplace_back(runeStr.rune, byte_pos, runeStr.len, typed_runes.size(), 1); typed_runes.back().regularize(use_lowercase); - - byte_pos += runeStr.len; - } -} - -// TODO: Maybe delete this function -size_t CharacterUtil::adjustToCompleteChar(const char* buffer, size_t buffer_length) { - if (buffer_length == 0) return 0; - - unsigned char last_byte = buffer[buffer_length - 1]; - - if (last_byte < 0x80) { - return buffer_length; - } - if ((last_byte & 0xC0) == 0x80) { - size_t adjustedLen = buffer_length - 1; - while (adjustedLen > 0) { - unsigned char byte = buffer[adjustedLen - 1]; - if ((byte & 0xC0) != 0x80) { - int charLen = 0; - if ((byte & 0xE0) == 0xC0) - charLen = 2; - else if ((byte & 0xF0) == 0xE0) - charLen = 3; - else if ((byte & 0xF8) == 0xF0) - charLen = 4; - if (buffer_length - adjustedLen + 1 < charLen) { - return adjustedLen - 1; - } - return buffer_length; - } - adjustedLen--; - } - return 0; - } - - int charLen = 0; - if ((last_byte & 0xE0) == 0xC0) - charLen = 2; - else if ((last_byte & 0xF0) == 0xE0) - charLen = 3; - else if ((last_byte & 0xF8) == 0xF0) - charLen = 4; - - if (charLen > 1) { - return buffer_length - 1; + byte_pos += runeStr.len; } - - return buffer_length; } void CharacterUtil::regularizeString(std::string& input, bool use_lowercase) { std::string temp; size_t len = input.size(); temp.reserve(len); - for (size_t i = 0; i < len; ) { + for (size_t i = 0; i < len;) { unsigned char c = input[i]; if ((c & 0xF0) == 0xE0 && i + 2 < len) { - int rune = ((c & 0x0F) << 12) | - ((input[i + 1] & 0x3F) << 6) | - (input[i + 2] & 0x3F); - if (rune == 0x3000) { + int rune = ((c & 0x0F) << 12) | ((input[i + 1] & 0x3F) << 6) | (input[i + 2] & 0x3F); + if (rune == 0x3000) { temp += ' '; - } else if (rune >= 0xFF01 && rune <= 0xFF5E) { + } else if (rune >= 0xFF01 && rune <= 0xFF5E) { char half = static_cast(rune - 0xFEE0); if (use_lowercase && half >= 'A' && half <= 'Z') { half += 32; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h index 66443d09c2cec3..0a8ad27b696e62 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h @@ -17,10 +17,11 @@ #pragma once +#include + #include #include #include -#include #include "CLucene/_ApiHeader.h" #include "CLucene/analysis/jieba/Unicode.hpp" @@ -81,8 +82,6 @@ class CharacterUtil { static void regularizeCharInfo(TypedRune& type_rune, bool use_lowercase); - static size_t adjustToCompleteChar(const char* buffer, size_t buffer_length); - static void regularizeString(std::string& input, bool use_lowercase = true); }; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp index e6593c31a675cf..c593a1ec63dde3 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp @@ -67,7 +67,8 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) { english_end_ = context.getCursor(); } else { // Encounter non-English characters, output tokens - Lexeme newLexeme = createLexeme(context, english_start_, english_end_, Lexeme::Type::English); + Lexeme newLexeme = + createLexeme(context, english_start_, english_end_, Lexeme::Type::English); context.addLexeme(newLexeme); english_start_ = -1; english_end_ = -1; @@ -75,7 +76,8 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) { } if (context.isBufferConsumed() && (english_start_ != -1 && english_end_ != -1)) { - Lexeme newLexeme = createLexeme(context, english_start_, english_end_, Lexeme::Type::English); + Lexeme newLexeme = + createLexeme(context, english_start_, english_end_, Lexeme::Type::English); context.addLexeme(newLexeme); english_start_ = -1; english_end_ = -1; @@ -109,7 +111,8 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) { // Do not output numbers, but do not mark the end } else { // Encounter non-Arabic characters, output tokens - Lexeme newLexeme = createLexeme(context, arabic_start_, arabic_end_, Lexeme::Type::Arabic); + Lexeme newLexeme = + createLexeme(context, arabic_start_, arabic_end_, Lexeme::Type::Arabic); context.addLexeme(newLexeme); arabic_start_ = -1; arabic_end_ = -1; @@ -173,29 +176,25 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext& context) { bool LetterSegmenter::isLetterConnector(int32_t input) { if (input < 128) { - return std::binary_search(std::begin(letter_connectors_), std::end(letter_connectors_), - static_cast(input)); + return std::binary_search(std::begin(letter_connectors_), std::end(letter_connectors_), + static_cast(input)); } return false; } bool LetterSegmenter::isNumConnector(int32_t input) { if (input < 128) { - return std::binary_search(std::begin(num_connectors_), std::end(num_connectors_), - static_cast(input)); + return std::binary_search(std::begin(num_connectors_), std::end(num_connectors_), + static_cast(input)); } return false; } -Lexeme LetterSegmenter::createLexeme(AnalyzeContext& context, int start, int end, Lexeme::Type type) { +Lexeme LetterSegmenter::createLexeme(AnalyzeContext& context, int start, int end, + Lexeme::Type type) { const auto& typed_runes = context.getTypedRuneArray(); - return Lexeme( - context.getBufferOffset(), - typed_runes[start].getBytePosition(), - typed_runes[end].getNextBytePosition() - typed_runes[start].getBytePosition(), - type, - start, - end - ); + return Lexeme(context.getBufferOffset(), typed_runes[start].getBytePosition(), + typed_runes[end].getNextBytePosition() - typed_runes[start].getBytePosition(), + type, start, end); } } // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h index edd09a5d2c4e61..70dc6b4988fcf0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h @@ -30,7 +30,6 @@ class LetterSegmenter : public ISegmenter { public: static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE = AnalyzeContext::SegmenterType::LETTER_SEGMENTER; - static const std::string SEGMENTER_NAME; LetterSegmenter(); ~LetterSegmenter() override = default; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp index 1e787dd5883fa4..0aea370a502c0f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp @@ -21,16 +21,17 @@ namespace doris::segment_v2 { void SurrogatePairSegmenter::analyze(AnalyzeContext& context) { const auto& current_char_type = context.getCurrentCharType(); - + if (current_char_type == CharacterUtil::CHAR_SURROGATE) { Lexeme newLexeme(context.getBufferOffset(), context.getCurrentCharOffset(), - context.getCurrentCharLen(), Lexeme::Type::CNChar, context.getCursor(), context.getCursor()); + context.getCurrentCharLen(), Lexeme::Type::CNChar, context.getCursor(), + context.getCursor()); context.addLexeme(newLexeme); - } - + } + context.unlockBuffer(SEGMENTER_TYPE); } void SurrogatePairSegmenter::reset() {} -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h index f39eb30cf91e8f..bad22658b519cf 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h @@ -17,9 +17,9 @@ #pragma once -#include "ISegmenter.h" #include "AnalyzeContext.h" #include "CharacterUtil.h" +#include "ISegmenter.h" #include "Lexeme.h" namespace doris::segment_v2 { @@ -36,4 +36,4 @@ class SurrogatePairSegmenter : public ISegmenter { void reset() override; }; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp index 8b87618ed8a13c..690499830f5704 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp @@ -264,7 +264,11 @@ TEST_F(IKTokenizerTest, TestSpecialCharacters) { // Test with special characters std::string specialText = "😊🚀👍测试特殊符号:@#¥%……&*()"; tokenize(specialText, datas, true); - ASSERT_EQ(datas.size(), 2); + ASSERT_EQ(datas.size(), 5); + std::vector expectedTokens = {"😊", "🚀", "👍", "测试", "特殊符号"}; + for (size_t i = 0; i < datas.size(); i++) { + ASSERT_EQ(datas[i], expectedTokens[i]); + } } TEST_F(IKTokenizerTest, TestBufferBoundaryWithSpace) { @@ -456,7 +460,7 @@ TEST_F(IKTokenizerTest, TestFullWidthCharacters) { ASSERT_EQ(datas.size(), 1); ASSERT_EQ(datas[0], "¥"); datas.clear(); - + // test full width symbol in word std::string mixedText = "High&Low"; tokenize(mixedText, datas, false); @@ -466,7 +470,7 @@ TEST_F(IKTokenizerTest, TestFullWidthCharacters) { ASSERT_EQ(datas[i], expectedMixed[i]); } datas.clear(); - + // test special separator std::string specialSeparatorText = "1・2"; tokenize(specialSeparatorText, datas, false); @@ -476,7 +480,7 @@ TEST_F(IKTokenizerTest, TestFullWidthCharacters) { ASSERT_EQ(datas[i], expectedSeparator[i]); } datas.clear(); - + // test special character std::string specialCharText = "﨑"; tokenize(specialCharText, datas, false); @@ -494,13 +498,13 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) { ASSERT_EQ(datas.size(), 1); ASSERT_EQ(datas[0], "🐼"); datas.clear(); - + std::string emojiText2 = "🝢"; tokenize(emojiText2, datas, false); ASSERT_EQ(datas.size(), 1); ASSERT_EQ(datas[0], "🝢"); datas.clear(); - + // test special latin character std::string specialLatinText1 = "abcşabc"; tokenize(specialLatinText1, datas, false); @@ -508,14 +512,14 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) { ASSERT_EQ(datas[0], "abc"); ASSERT_EQ(datas[1], "abc"); datas.clear(); - + std::string specialLatinText2 = "abcīabc"; tokenize(specialLatinText2, datas, false); ASSERT_EQ(datas.size(), 2); ASSERT_EQ(datas[0], "abc"); ASSERT_EQ(datas[1], "abc"); datas.clear(); - + std::string specialLatinText3 = "celebrity…get"; tokenize(specialLatinText3, datas, false); std::vector expectedEllipsis = {"celebrity", "get"}; @@ -524,7 +528,7 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) { ASSERT_EQ(datas[i], expectedEllipsis[i]); } datas.clear(); - + // test mixed alphabet word std::string mixedAlphabetText1 = "Hulyaiрole"; tokenize(mixedAlphabetText1, datas, false); @@ -532,7 +536,7 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) { ASSERT_EQ(datas[0], "hulyai"); ASSERT_EQ(datas[1], "ole"); datas.clear(); - + std::string mixedAlphabetText2 = "Nisa Aşgabat"; tokenize(mixedAlphabetText2, datas, false); std::vector expectedName = {"nisa", "gabat"}; @@ -541,7 +545,7 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) { ASSERT_EQ(datas[i], expectedName[i]); } datas.clear(); - + // test special connector std::string specialConnectorText = "alـameer"; tokenize(specialConnectorText, datas, false); @@ -549,20 +553,20 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) { ASSERT_EQ(datas[0], "al"); ASSERT_EQ(datas[1], "ameer"); datas.clear(); - + // test rare unicode character std::string rareUnicodeText1 = "𐓚"; tokenize(rareUnicodeText1, datas, false); ASSERT_EQ(datas.size(), 1); ASSERT_EQ(datas[0], "𐓚"); datas.clear(); - + std::string rareUnicodeText2 = "𑪱"; tokenize(rareUnicodeText2, datas, false); ASSERT_EQ(datas.size(), 1); ASSERT_EQ(datas[0], "𑪱"); datas.clear(); - + std::string rareUnicodeText3 = "𐴗"; tokenize(rareUnicodeText3, datas, false); ASSERT_EQ(datas.size(), 1); diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out b/regression-test/data/inverted_index_p0/test_tokenize.out index 32e7968cb8b1de..68f030b7276ce2 100644 --- a/regression-test/data/inverted_index_p0/test_tokenize.out +++ b/regression-test/data/inverted_index_p0/test_tokenize.out @@ -67,3 +67,15 @@ -- !tokenize_sql -- ["中华人民共和国", "中华人民", "中华", "华人", "人民共和国", "人民", "共和国", "共和", "国"] +-- !tokenize_sql -- +["😊", "🚀", "👍", "测试", "特殊符号", "特殊", "符号"] + +-- !tokenize_sql -- +["high&low", "high", "low"] + +-- !tokenize_sql -- +["1", "・", "2"] + +-- !tokenize_sql -- +["abc", "abc"] + diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy index f8066e6ad86cef..d0bdada2e31d7c 100644 --- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy +++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy @@ -123,4 +123,9 @@ suite("test_tokenize"){ qt_tokenize_sql """SELECT TOKENIZE('北京大学计算机科学与技术系', '"parser"="ik","parser_mode"="ik_max_word"');""" qt_tokenize_sql """SELECT TOKENIZE('中华人民共和国', '"parser"="ik","parser_mode"="ik_max_word"');""" + qt_tokenize_sql """SELECT TOKENIZE('😊🚀👍测试特殊符号:@#¥%……&*()', '"parser"="ik","parser_mode"="ik_max_word"');""" + qt_tokenize_sql """SELECT TOKENIZE('High&Low', '"parser"="ik","parser_mode"="ik_max_word"');""" + qt_tokenize_sql """SELECT TOKENIZE('1・2', '"parser"="ik","parser_mode"="ik_max_word"');""" + qt_tokenize_sql """SELECT TOKENIZE('abcşīabc', '"parser"="ik","parser_mode"="ik_max_word"');""" + } From f193366e72662b8fac22ed11337aa6c99f71791e Mon Sep 17 00:00:00 2001 From: Ryan19929 Date: Mon, 21 Apr 2025 22:41:08 +0800 Subject: [PATCH 4/6] consistent with es --- .../inverted_index/analyzer/ik/cfg/Configuration.h | 1 + .../inverted_index/analyzer/ik/core/CharacterUtil.cpp | 11 +++-------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h index a9be1d76220b74..a0c9c894c5ccc4 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h @@ -25,6 +25,7 @@ namespace doris::segment_v2 { class Configuration { private: bool use_smart_; + // TODO(ryan19929): delete config_->lower_case_, because it is always true(java version is same) bool enable_lowercase_; std::string dict_path_; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp index 4b7bc34361e1ba..bfc0f5ab85ab90 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp @@ -49,17 +49,12 @@ int32_t CharacterUtil::identifyCharType(int32_t rune) { } int32_t CharacterUtil::regularize(int32_t rune, bool use_lowercase) { - // Full-width space if (rune == 0x3000) { return 0x0020; - } - - // All full-width characters - if (rune > 0xFF00 && rune < 0xFF5F) { + } else if (rune > 0xFF00 && rune < 0xFF5F) { rune = rune - 0xFEE0; - } - - if (use_lowercase && rune >= 0x41 && rune <= 0x5A) { + } else if (use_lowercase && rune >= 0x41 && rune <= 0x5A) { + // This else-if causes full-width letters unable to be converted to lowercase rune += 32; } From 1ae5a095beac769b187a3c74986996aec411c3aa Mon Sep 17 00:00:00 2001 From: Ryan19929 Date: Sun, 27 Apr 2025 23:20:40 +0800 Subject: [PATCH 5/6] update --- .../inverted_index/analyzer/ik/core/AnalyzeContext.cpp | 4 ++-- .../inverted_index/analyzer/ik/core/CharacterUtil.cpp | 5 ++++- .../inverted_index/analyzer/ik/core/CharacterUtil.h | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp index 1d5ff916a12bb4..3356210f20c2f8 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp @@ -68,7 +68,7 @@ size_t AnalyzeContext::fillBuffer(lucene::util::Reader* reader) { int32_t readCount = 0; if (buffer_offset_ == 0) { readCount = max(0, reader->readCopy(segment_buff_.data(), 0, BUFF_SIZE)); - CharacterUtil::decodeStringToRunes(segment_buff_.c_str(), readCount, typed_runes_, + CharacterUtil::decodeStringToRunes(segment_buff_.data(), readCount, typed_runes_, config_->isEnableLowercase()); } else { size_t offset = available_ - typed_runes_[cursor_].getNextBytePosition(); @@ -82,7 +82,7 @@ size_t AnalyzeContext::fillBuffer(lucene::util::Reader* reader) { } else { readCount = std::max(0, reader->readCopy(segment_buff_.data(), 0, BUFF_SIZE)); } - CharacterUtil::decodeStringToRunes(segment_buff_.c_str(), readCount, typed_runes_, + CharacterUtil::decodeStringToRunes(segment_buff_.data(), readCount, typed_runes_, config_->isEnableLowercase()); } // Ensure readCount is set to 0 in case of diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp index bfc0f5ab85ab90..eccdada581f525 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp @@ -74,7 +74,7 @@ bool CharacterUtil::decodeString(const char* str, size_t length, RuneStrArray& r return cppjieba::DecodeRunesInString(str, length, runes); } -void CharacterUtil::decodeStringToRunes(const char* str, size_t length, TypedRuneArray& typed_runes, +void CharacterUtil::decodeStringToRunes(char* str, size_t length, TypedRuneArray& typed_runes, bool use_lowercase) { typed_runes.clear(); size_t byte_pos = 0; @@ -84,6 +84,9 @@ void CharacterUtil::decodeStringToRunes(const char* str, size_t length, TypedRun if (runeStr.len == 0) { break; } + if (runeStr.len == 1 && use_lowercase && str[byte_pos] >= 'A' && str[byte_pos] <= 'Z') { + str[byte_pos] += 32; + } typed_runes.emplace_back(runeStr.rune, byte_pos, runeStr.len, typed_runes.size(), 1); typed_runes.back().regularize(use_lowercase); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h index 0a8ad27b696e62..c60f8bb30ce2c7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h @@ -72,7 +72,7 @@ class CharacterUtil { static int32_t identifyCharType(int32_t rune); - static void decodeStringToRunes(const char* str, size_t length, TypedRuneArray& typed_runes, + static void decodeStringToRunes(char* str, size_t length, TypedRuneArray& typed_runes, bool use_lowercase); static int32_t regularize(int32_t rune, bool use_lowercase); From fd4f0b47e337e600d31266a3402968e52ccf09d4 Mon Sep 17 00:00:00 2001 From: Ryan19929 Date: Tue, 29 Apr 2025 18:44:12 +0800 Subject: [PATCH 6/6] extent chinese --- .../inverted_index/analyzer/ik/core/CharacterUtil.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp index eccdada581f525..a991967392c78d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp @@ -30,7 +30,13 @@ int32_t CharacterUtil::identifyCharType(int32_t rune) { UBlockCode block = ublock_getCode(rune); if (block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS || block == UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS || - block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) { + block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || + block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B || + block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C || + block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D || + block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E || + block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F || + block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G) { return CHAR_CHINESE; }