From 8203fa190c83f349e35b1db8351400cdfd81e3bf Mon Sep 17 00:00:00 2001
From: Ryan19929 <black99129@gmail.com>
Date: Thu, 17 Apr 2025 14:32:24 +0800
Subject: [PATCH 1/6] [fix](inverted_index) fix IK handling of full-width
 characters and inaccurate character classification

---
 .../analyzer/ik/IKTokenizer.cpp               |   9 +-
 .../analyzer/ik/core/CharacterUtil.cpp        |  99 +++++++-----
 .../analyzer/ik/core/CharacterUtil.h          |   3 +
 .../analyzer/ik/core/LetterSegmenter.cpp      |  53 ++++---
 .../analyzer/ik/core/LetterSegmenter.h        |   6 +-
 .../analyzer/ik_anayzer_test.cpp              | 142 ++++++++++++++++++
 .../{ => analyzer}/test_ik_analyzer.out       |   0
 .../{ => analyzer}/test_ik_analyzer.groovy    |  10 +-
 8 files changed, 247 insertions(+), 75 deletions(-)
 rename regression-test/data/inverted_index_p0/{ => analyzer}/test_ik_analyzer.out (100%)
 rename regression-test/suites/inverted_index_p0/{ => analyzer}/test_ik_analyzer.groovy (95%)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
index e1f451804e5067..72b906fa4e17b7 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
@@ -32,13 +32,10 @@ Token* IKTokenizer::next(Token* token) {
     }
 
     std::string& token_text = tokens_text_[buffer_index_++];
+    // full-width to half-width, and lowercase
+    // TODO(ryan19929): do regularizeString in fillBuffer.
+    CharacterUtil::regularizeString(token_text, this->lowercase);
     size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
-    if (this->lowercase) {
-        if (!token_text.empty() && static_cast<uint8_t>(token_text[0]) < 0x80) {
-            std::transform(token_text.begin(), token_text.end(), token_text.begin(),
-                           [](char c) { return to_lower(c); });
-        }
-    }
     token->setNoCopy(token_text.data(), 0, size);
     return token;
 }
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
index 808edc140397e4..3a13526dc2a0fb 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
@@ -20,34 +20,32 @@
 namespace doris::segment_v2 {
 
 int32_t CharacterUtil::identifyCharType(int32_t rune) {
-    // Numbers
-    if (rune >= 0x30 && rune <= 0x39) {
+    if (rune >= '0' && rune <= '9') {
         return CHAR_ARABIC;
     }
-
-    // English
-    if ((rune >= 0x61 && rune <= 0x7a) || (rune >= 0x41 && rune <= 0x5a)) {
+    if ((rune >= 'a' && rune <= 'z') || (rune >= 'A' && rune <= 'Z')) {
         return CHAR_ENGLISH;
     }
 
-    // CJK Unified Chinese Characters
-    if ((rune >= 0x4E00 && rune <= 0x9FFF) || (rune >= 0x3400 && rune <= 0x4DBF) ||
-        (rune >= 0x20000 && rune <= 0x2A6DF) || (rune >= 0x2A700 && rune <= 0x2B73F) ||
-        (rune >= 0x2B740 && rune <= 0x2B81F) || (rune >= 0x2B820 && rune <= 0x2CEAF) ||
-        (rune >= 0x2CEB0 && rune <= 0x2EBEF) || (rune >= 0x30000 && rune <= 0x3134F)) {
+    UBlockCode block = ublock_getCode(rune);
+
+    if (block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS || 
+        block == UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS || 
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
         return CHAR_CHINESE;
     }
-
-    // Japanese and Korean characters
-    if ((rune >= 0x3040 && rune <= 0x309F) || (rune >= 0x30A0 && rune <= 0x30FF) ||
-        (rune >= 0x31F0 && rune <= 0x31FF) || (rune >= 0xAC00 && rune <= 0xD7AF) ||
-        (rune >= 0x1100 && rune <= 0x11FF)) {
+    
+    if (block == UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS ||
+        block == UBLOCK_HANGUL_SYLLABLES ||
+        block == UBLOCK_HANGUL_JAMO ||
+        block == UBLOCK_HANGUL_COMPATIBILITY_JAMO ||
+        block == UBLOCK_HIRAGANA ||
+        block == UBLOCK_KATAKANA ||
+        block == UBLOCK_KATAKANA_PHONETIC_EXTENSIONS) {
         return CHAR_OTHER_CJK;
     }
 
-    // UTF-16 surrogate pairs and private zone
-    if ((rune >= 0xD800 && rune <= 0xDBFF) || (rune >= 0xDC00 && rune <= 0xDFFF) ||
-        (rune >= 0xE000 && rune <= 0xF8FF)) {
+    if (rune > 0xFFFF) {
         return CHAR_SURROGATE;
     }
 
@@ -55,31 +53,18 @@ int32_t CharacterUtil::identifyCharType(int32_t rune) {
 }
 
 int32_t CharacterUtil::regularize(int32_t rune, bool use_lowercase) {
-    // Full-width to half-width
+    // Full-width space
     if (rune == 0x3000) {
-        return 0x0020; // Convert full-width space to half-width
-    }
-
-    // Full-width numbers
-    if (rune >= 0xFF10 && rune <= 0xFF19) {
-        return rune - 0xFEE0; // Convert to half-width numbers
+        return 0x0020;
     }
 
-    // Full-width letters
-    if (rune >= 0xFF21 && rune <= 0xFF3A) {
+    // Full-width ASCII variants (U+FF01..U+FF5E) map onto U+0021..U+007E
+    if (rune > 0xFF00 && rune < 0xFF5F) {
         rune = rune - 0xFEE0;
-        if (use_lowercase) {
-            rune += 32; // Convert to lowercase
-        }
-        return rune;
-    }
-    if (rune >= 0xFF41 && rune <= 0xFF5A) {
-        return rune - 0xFEE0;
     }
 
-    // Convert half-width uppercase letters to lowercase
     if (use_lowercase && rune >= 0x41 && rune <= 0x5A) {
-        return rune + 32;
+        rune += 32;
     }
 
     return rune;
@@ -91,6 +76,7 @@ void CharacterUtil::TypedRune::regularize(bool use_lowercase) {
 
 void CharacterUtil::regularizeCharInfo(TypedRune& typedRune, bool use_lowercase) {
     typedRune.rune = regularize(typedRune.rune, use_lowercase);
+    typedRune.char_type = identifyCharType(typedRune.rune); // classify the rune as normalized above
 }
 
 CharacterUtil::RuneStrLite CharacterUtil::decodeChar(const char* str, size_t length) {
@@ -113,9 +99,8 @@ void CharacterUtil::decodeStringToRunes(const char* str, size_t length, TypedRun
         }
         typed_runes.emplace_back(runeStr.rune, byte_pos, runeStr.len, typed_runes.size(), 1);
 
-        if (use_lowercase) {
-            typed_runes.back().regularize(true);
-        }
+        typed_runes.back().regularize(use_lowercase);
+        
         byte_pos += runeStr.len;
     }
 }
@@ -166,4 +151,40 @@ size_t CharacterUtil::adjustToCompleteChar(const char* buffer, size_t buffer_len
 
     return buffer_length;
 }
+
+// Normalizes UTF-8 text in place: U+3000 becomes ' ', full-width ASCII variants
+// (U+FF01..U+FF5E) become half-width ASCII, and ASCII 'A'..'Z' is lowercased
+// when use_lowercase is true. Every other byte is copied through unchanged.
+void CharacterUtil::regularizeString(std::string& input, bool use_lowercase) {
+    const auto fold = [use_lowercase](char ch) {
+        return (use_lowercase && ch >= 'A' && ch <= 'Z') ? static_cast<char>(ch + 32) : ch;
+    };
+    const auto is_continuation = [](char b) { return (b & 0xC0) == 0x80; };
+    const size_t len = input.size();
+    std::string temp;
+    temp.reserve(len);
+    for (size_t i = 0; i < len;) {
+        const auto lead = static_cast<unsigned char>(input[i]);
+        // Decode only a well-formed 3-byte sequence. A stray 0xE? lead byte is
+        // copied on its own so the two bytes after it are still processed and
+        // cannot be misread as part of a full-width code point.
+        if ((lead & 0xF0) == 0xE0 && i + 2 < len && is_continuation(input[i + 1]) &&
+            is_continuation(input[i + 2])) {
+            const int rune = ((lead & 0x0F) << 12) | ((input[i + 1] & 0x3F) << 6) |
+                             (input[i + 2] & 0x3F);
+            if (rune == 0x3000) {
+                temp += ' ';
+            } else if (rune >= 0xFF01 && rune <= 0xFF5E) {
+                temp += fold(static_cast<char>(rune - 0xFEE0));
+            } else {
+                temp.append(input, i, 3);
+            }
+            i += 3;
+        } else {
+            temp += fold(input[i]);
+            ++i;
+        }
+    }
+    input = std::move(temp);
+}
 } // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
index 2f3dac6a520ded..66443d09c2cec3 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
@@ -20,6 +20,7 @@
 #include <functional>
 #include <memory>
 #include <vector>
+#include <unicode/uchar.h> 
 
 #include "CLucene/_ApiHeader.h"
 #include "CLucene/analysis/jieba/Unicode.hpp"
@@ -81,6 +82,8 @@ class CharacterUtil {
     static void regularizeCharInfo(TypedRune& type_rune, bool use_lowercase);
 
     static size_t adjustToCompleteChar(const char* buffer, size_t buffer_length);
+
+    static void regularizeString(std::string& input, bool use_lowercase = true);
 };
 
 } // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
index 2b63139913206b..e6593c31a675cf 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
@@ -53,7 +53,6 @@ void LetterSegmenter::reset() {
 bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) {
     bool need_lock = false;
 
-    const auto& typed_runes = context.getTypedRuneArray();
     if (english_start_ == -1) {
         // The current tokenizer has not yet started processing English characters
         if (context.getCurrentCharType() == CharacterUtil::CHAR_ENGLISH) {
@@ -68,9 +67,7 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) {
             english_end_ = context.getCursor();
         } else {
             // Encounter non-English characters, output tokens
-            Lexeme newLexeme(context.getBufferOffset(), typed_runes[english_start_].offset,
-                             english_end_ - english_start_ + 1, Lexeme::Type::English,
-                             english_start_, english_end_);
+            Lexeme newLexeme = createLexeme(context, english_start_, english_end_, Lexeme::Type::English);
             context.addLexeme(newLexeme);
             english_start_ = -1;
             english_end_ = -1;
@@ -78,9 +75,7 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) {
     }
 
     if (context.isBufferConsumed() && (english_start_ != -1 && english_end_ != -1)) {
-        Lexeme newLexeme(context.getBufferOffset(), typed_runes[english_start_].offset,
-                         english_end_ - english_start_ + 1, Lexeme::Type::English, english_start_,
-                         english_end_);
+        Lexeme newLexeme = createLexeme(context, english_start_, english_end_, Lexeme::Type::English);
         context.addLexeme(newLexeme);
         english_start_ = -1;
         english_end_ = -1;
@@ -96,7 +91,6 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) {
 
 bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) {
     bool need_lock = false;
-    const auto& typed_runes = context.getTypedRuneArray();
 
     if (arabic_start_ == -1) {
         // The current tokenizer has not yet started processing numeric characters
@@ -115,9 +109,7 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) {
             // Do not output numbers, but do not mark the end
         } else {
             // Encounter non-Arabic characters, output tokens
-            Lexeme newLexeme(context.getBufferOffset(), typed_runes[arabic_start_].offset,
-                             arabic_end_ - arabic_start_ + 1, Lexeme::Type::Arabic, arabic_start_,
-                             arabic_end_);
+            Lexeme newLexeme = createLexeme(context, arabic_start_, arabic_end_, Lexeme::Type::Arabic);
             context.addLexeme(newLexeme);
             arabic_start_ = -1;
             arabic_end_ = -1;
@@ -125,9 +117,7 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) {
     }
 
     if (context.isBufferConsumed() && (arabic_start_ != -1 && arabic_end_ != -1)) {
-        Lexeme newLexeme(context.getBufferOffset(), typed_runes[arabic_start_].offset,
-                         arabic_end_ - arabic_start_ + 1, Lexeme::Type::Arabic, arabic_start_,
-                         arabic_end_);
+        Lexeme newLexeme = createLexeme(context, arabic_start_, arabic_end_, Lexeme::Type::Arabic);
         context.addLexeme(newLexeme);
         arabic_start_ = -1;
         arabic_end_ = -1;
@@ -143,7 +133,6 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) {
 
 bool LetterSegmenter::processMixLetter(AnalyzeContext& context) {
     bool need_lock = false;
-    const auto& typed_runes = context.getTypedRuneArray();
 
     if (start_ == -1) {
         // The current tokenizer has not yet started processing characters.
@@ -164,8 +153,7 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext& context) {
             end_ = context.getCursor();
         } else {
             // Encounter non-letter characters, output a token
-            Lexeme newLexeme(context.getBufferOffset(), typed_runes[start_].offset,
-                             end_ - start_ + 1, Lexeme::Type::Letter, start_, end_);
+            Lexeme newLexeme = createLexeme(context, start_, end_, Lexeme::Type::Letter);
             context.addLexeme(newLexeme);
             start_ = -1;
             end_ = -1;
@@ -173,8 +161,7 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext& context) {
     }
 
     if (context.isBufferConsumed() && (start_ != -1 && end_ != -1)) {
-        Lexeme newLexeme(context.getBufferOffset(), typed_runes[start_].offset, end_ - start_ + 1,
-                         Lexeme::Type::Letter, start_, end_);
+        Lexeme newLexeme = createLexeme(context, start_, end_, Lexeme::Type::Letter);
         context.addLexeme(newLexeme);
         start_ = -1;
         end_ = -1;
@@ -184,11 +171,31 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext& context) {
     return need_lock;
 }
 
-bool LetterSegmenter::isLetterConnector(char input) {
-    return std::binary_search(std::begin(letter_connectors_), std::end(letter_connectors_), input);
+// Reports whether `input` is listed in letter_connectors_; values of 128 and
+// above are rejected without a lookup.
+bool LetterSegmenter::isLetterConnector(int32_t input) {
+    const auto first = std::begin(letter_connectors_);
+    const auto last = std::end(letter_connectors_);
+    return input < 128 && std::binary_search(first, last, static_cast<char>(input));
+}
+
+// Reports whether `input` is listed in num_connectors_; values of 128 and
+// above are rejected without a lookup.
+bool LetterSegmenter::isNumConnector(int32_t input) {
+    const auto first = std::begin(num_connectors_);
+    const auto last = std::end(num_connectors_);
+    return input < 128 && std::binary_search(first, last, static_cast<char>(input));
 }
 
-bool LetterSegmenter::isNumConnector(char input) {
-    return std::binary_search(std::begin(num_connectors_), std::end(num_connectors_), input);
+// Builds a lexeme of the given type spanning runes [start, end], both inclusive.
+Lexeme LetterSegmenter::createLexeme(AnalyzeContext& context, int start, int end, Lexeme::Type type) {
+    const auto& runes = context.getTypedRuneArray();
+    const auto& first_rune = runes[start];
+    const auto& last_rune = runes[end];
+    // Length = position after the last rune minus the first rune's position,
+    // as reported by getNextBytePosition() and getBytePosition().
+    const auto begin_pos = first_rune.getBytePosition();
+    const auto length = last_rune.getNextBytePosition() - begin_pos;
+    return Lexeme(context.getBufferOffset(), begin_pos, length, type, start, end);
 }
 } // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
index 87b36e83fbfd07..edd09a5d2c4e61 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
@@ -41,8 +41,10 @@ class LetterSegmenter : public ISegmenter {
     bool processEnglishLetter(AnalyzeContext& context);
     bool processArabicLetter(AnalyzeContext& context);
     bool processMixLetter(AnalyzeContext& context);
-    bool isLetterConnector(char input);
-    bool isNumConnector(char input);
+    bool isLetterConnector(int32_t input);
+    bool isNumConnector(int32_t input);
+
+    Lexeme createLexeme(AnalyzeContext& context, int start, int end, Lexeme::Type type);
 
     int start_ {-1};
     int end_ {-1};
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
index 18dc16b6925fe8..8b87618ed8a13c 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
@@ -428,6 +428,148 @@ TEST_F(IKTokenizerTest, TestLongTextCompareWithJava) {
     }
 }
 
+TEST_F(IKTokenizerTest, TestFullWidthCharacters) {
+    std::vector<std::string> datas;
+    // Each case tokenizes one input and compares the tokens with an expected list.
+    // Full-width digits separated by spaces: each is expected as its own half-width token.
+    std::string fullWidthNumbersText = "４ ３ ２";
+    tokenize(fullWidthNumbersText, datas, true);
+    std::vector<std::string> expectedNumbers = {"4", "3", "2"}; // half width numbers
+    ASSERT_EQ(datas.size(), expectedNumbers.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedNumbers[i]);
+    }
+    datas.clear();
+
+    fullWidthNumbersText = "４３２";
+    tokenize(fullWidthNumbersText, datas, false);
+    expectedNumbers = {"432"};
+    ASSERT_EQ(datas.size(), expectedNumbers.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedNumbers[i]);
+    }
+    datas.clear();
+
+    // Full-width currency symbol: expected back unchanged as a single token.
+    std::string currencyText = "￥";
+    tokenize(currencyText, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "￥");
+    datas.clear();
+    
+    // Full-width ampersand inside a word: expected as half-width '&' in the joined, lowercased token.
+    std::string mixedText = "High＆Low";
+    tokenize(mixedText, datas, false);
+    std::vector<std::string> expectedMixed = {"high&low", "high", "low"};
+    ASSERT_EQ(datas.size(), expectedMixed.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedMixed[i]);
+    }
+    datas.clear();
+    
+    // Separator character between two digits: expected as a token of its own, unchanged.
+    std::string specialSeparatorText = "1･2";
+    tokenize(specialSeparatorText, datas, false);
+    std::vector<std::string> expectedSeparator = {"1", "･", "2"};
+    ASSERT_EQ(datas.size(), expectedSeparator.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedSeparator[i]);
+    }
+    datas.clear();
+    
+    // A lone uncommon ideograph: expected back unchanged as a single token.
+    std::string specialCharText = "﨑";
+    tokenize(specialCharText, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "﨑");
+    datas.clear();
+}
+
+TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) {
+    std::vector<std::string> datas;
+    // Each case tokenizes one input and compares the tokens with the expected values.
+    // Emoji and symbols: each single-character input is expected back as exactly one token.
+    std::string emojiText = "🐼";
+    tokenize(emojiText, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "🐼");
+    datas.clear();
+    
+    std::string emojiText2 = "🝢";
+    tokenize(emojiText2, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "🝢");
+    datas.clear();
+    
+    // Non-ASCII Latin letters and the ellipsis: expected to split the ASCII runs and not appear in any token.
+    std::string specialLatinText1 = "abcşabc";
+    tokenize(specialLatinText1, datas, false);
+    ASSERT_EQ(datas.size(), 2);
+    ASSERT_EQ(datas[0], "abc");
+    ASSERT_EQ(datas[1], "abc");
+    datas.clear();
+    
+    std::string specialLatinText2 = "abcīabc";
+    tokenize(specialLatinText2, datas, false);
+    ASSERT_EQ(datas.size(), 2);
+    ASSERT_EQ(datas[0], "abc");
+    ASSERT_EQ(datas[1], "abc");
+    datas.clear();
+    
+    std::string specialLatinText3 = "celebrity…get";
+    tokenize(specialLatinText3, datas, false);
+    std::vector<std::string> expectedEllipsis = {"celebrity", "get"};
+    ASSERT_EQ(datas.size(), expectedEllipsis.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedEllipsis[i]);
+    }
+    datas.clear();
+    
+    // Words mixing ASCII with other letters: expected tokens are lowercase ASCII runs only.
+    std::string mixedAlphabetText1 = "Hulyaiрole";
+    tokenize(mixedAlphabetText1, datas, false);
+    ASSERT_EQ(datas.size(), 2);
+    ASSERT_EQ(datas[0], "hulyai");
+    ASSERT_EQ(datas[1], "ole");
+    datas.clear();
+    
+    std::string mixedAlphabetText2 = "Nisa Aşgabat";
+    tokenize(mixedAlphabetText2, datas, false);
+    std::vector<std::string> expectedName = {"nisa", "gabat"};
+    ASSERT_EQ(datas.size(), expectedName.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedName[i]);
+    }
+    datas.clear();
+    
+    // The non-ASCII character between "al" and "ameer" is expected to split them into two tokens.
+    std::string specialConnectorText = "alـameer";
+    tokenize(specialConnectorText, datas, false);
+    ASSERT_EQ(datas.size(), 2);
+    ASSERT_EQ(datas[0], "al");
+    ASSERT_EQ(datas[1], "ameer");
+    datas.clear();
+    
+    // Rare characters: each single-character input is expected back unchanged as one token.
+    std::string rareUnicodeText1 = "𐓚";
+    tokenize(rareUnicodeText1, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "𐓚");
+    datas.clear();
+    
+    std::string rareUnicodeText2 = "𑪱";
+    tokenize(rareUnicodeText2, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "𑪱");
+    datas.clear();
+    
+    std::string rareUnicodeText3 = "𐴗";
+    tokenize(rareUnicodeText3, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "𐴗");
+    datas.clear();
+}
+
 // Test the exception handling capabilities of the IKTokenizer and AnalyzeContext
 TEST_F(IKTokenizerTest, TestExceptionHandling) {
     // Common mock reader class for testing exception handling
diff --git a/regression-test/data/inverted_index_p0/test_ik_analyzer.out b/regression-test/data/inverted_index_p0/analyzer/test_ik_analyzer.out
similarity index 100%
rename from regression-test/data/inverted_index_p0/test_ik_analyzer.out
rename to regression-test/data/inverted_index_p0/analyzer/test_ik_analyzer.out
diff --git a/regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy
similarity index 95%
rename from regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy
rename to regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy
index c28aa68920b652..b57d40a4a002c0 100644
--- a/regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy
+++ b/regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy
@@ -22,7 +22,7 @@ suite("test_ik_analyzer", "p0") {
     sql "DROP TABLE IF EXISTS ${tableNameSmart}"
     sql "DROP TABLE IF EXISTS ${tableNameMaxWord}"
 
-    // 创建smart模式测试表
+    // Create test table for smart mode
     sql """
       CREATE TABLE ${tableNameSmart} (
       `id` int(11) NULL COMMENT "",
@@ -37,7 +37,7 @@ suite("test_ik_analyzer", "p0") {
       );
     """
 
-    // 创建max_word模式测试表
+    // Create test table for max_word mode
     sql """
       CREATE TABLE ${tableNameMaxWord} (
       `id` int(11) NULL COMMENT "",
@@ -52,7 +52,7 @@ suite("test_ik_analyzer", "p0") {
       );
     """
 
-    // 插入测试数据
+    // Insert test data
     def insertData = { table ->
         sql """ INSERT INTO ${table} VALUES (1, "我爱北京天安门"); """
         sql """ INSERT INTO ${table} VALUES (2, "Apache Doris是一个现代化的MPP数据库"); """
@@ -68,14 +68,14 @@ suite("test_ik_analyzer", "p0") {
         sql "sync"
         sql """ set enable_common_expr_pushdown = true; """
 
-        // 测试smart模式
+        // Testing ik smart mode
         println "Testing ik smart mode:"
         qt_sql """ select * from ${tableNameSmart} where content match_phrase '北京'; """
         qt_sql """ select * from ${tableNameSmart} where content match_phrase '计算机科学'; """
         qt_sql """ select * from ${tableNameSmart} where content match_phrase '数据库管理系统'; """
         qt_sql """ select * from ${tableNameSmart} where content match_phrase '中华人民共和国'; """
 
-        // 测试max_word模式
+        // Testing ik max_word mode
         println "Testing ik max_word mode:"
         qt_sql """ select * from ${tableNameMaxWord} where content match_phrase '北京'; """
         qt_sql """ select * from ${tableNameMaxWord} where content match_phrase '计算机科学'; """

From 39a47f1e289f8bbd039f6a7068d412a76f6e52b8 Mon Sep 17 00:00:00 2001
From: Ryan19929 <black99129@gmail.com>
Date: Thu, 17 Apr 2025 14:38:49 +0800
Subject: [PATCH 2/6] [feature](inverted_index) enhance IK analyzer to support
 emoji and rare characters

---
 .../analyzer/ik/core/AnalyzeContext.cpp       |  6 +++
 .../analyzer/ik/core/AnalyzeContext.h         | 10 ++---
 .../analyzer/ik/core/IKSegmenter.cpp          |  1 +
 .../analyzer/ik/core/IKSegmenter.h            |  1 +
 .../ik/core/SurrogatePairSegmenter.cpp        | 36 +++++++++++++++++
 .../analyzer/ik/core/SurrogatePairSegmenter.h | 39 +++++++++++++++++++
 6 files changed, 88 insertions(+), 5 deletions(-)
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
index d648f8e715afcd..59b97b2ee16d45 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
@@ -199,6 +199,9 @@ void AnalyzeContext::lockBuffer(SegmenterType type) {
     case SegmenterType::LETTER_SEGMENTER:
         buffer_locker_ |= LETTER_SEGMENTER_FLAG;
         break;
+    case SegmenterType::SURROGATE_PAIR_SEGMENTER:
+        buffer_locker_ |= SURROGATE_PAIR_SEGMENTER_FLAG;
+        break;
     }
 }
 
@@ -213,6 +216,9 @@ void AnalyzeContext::unlockBuffer(SegmenterType type) {
     case SegmenterType::LETTER_SEGMENTER:
         buffer_locker_ &= ~LETTER_SEGMENTER_FLAG;
         break;
+    case SegmenterType::SURROGATE_PAIR_SEGMENTER:
+        buffer_locker_ &= ~SURROGATE_PAIR_SEGMENTER_FLAG;
+        break;
     }
 }
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
index b6b363d9d1a52d..a89f4973d0d955 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
@@ -41,10 +41,10 @@ class AnalyzeContext {
     static const size_t BUFF_SIZE = 4096;
     static const size_t BUFF_EXHAUST_CRITICAL = 100;
 
-    static constexpr uint8_t CJK_SEGMENTER_FLAG = 0x01;    // 0001
-    static constexpr uint8_t CN_QUANTIFIER_FLAG = 0x02;    // 0010
-    static constexpr uint8_t LETTER_SEGMENTER_FLAG = 0x04; // 0100
-
+    static constexpr uint8_t CJK_SEGMENTER_FLAG = 0x01;            // 0001
+    static constexpr uint8_t CN_QUANTIFIER_FLAG = 0x02;            // 0010
+    static constexpr uint8_t LETTER_SEGMENTER_FLAG = 0x04;         // 0100
+    static constexpr uint8_t SURROGATE_PAIR_SEGMENTER_FLAG = 0x08; // 1000
     // String buffer
     std::string segment_buff_;
     // An array storing Unicode code points (runes)Character information array
@@ -73,7 +73,7 @@ class AnalyzeContext {
     void compound(Lexeme& lexeme);
 
 public:
-    enum class SegmenterType { CJK_SEGMENTER, CN_QUANTIFIER, LETTER_SEGMENTER };
+    enum class SegmenterType { CJK_SEGMENTER, CN_QUANTIFIER, LETTER_SEGMENTER, SURROGATE_PAIR_SEGMENTER};
     const CharacterUtil::TypedRuneArray& getTypedRuneArray() const { return typed_runes_; }
     explicit AnalyzeContext(IKMemoryPool<Cell>& pool, std::shared_ptr<Configuration> config);
     virtual ~AnalyzeContext();
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
index 6c1d049ac708f9..674f22dfd2d6c9 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
@@ -33,6 +33,7 @@ std::vector<std::unique_ptr<ISegmenter>> IKSegmenter::loadSegmenters() {
     segmenters.push_back(std::make_unique<LetterSegmenter>());
     segmenters.push_back(std::make_unique<CN_QuantifierSegmenter>());
     segmenters.push_back(std::make_unique<CJKSegmenter>());
+    segmenters.push_back(std::make_unique<SurrogatePairSegmenter>());
     return segmenters;
 }
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
index 4f94fa435dbfac..33defbbe31a97b 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
@@ -27,6 +27,7 @@
 #include "IKArbitrator.h"
 #include "ISegmenter.h"
 #include "LetterSegmenter.h"
+#include "SurrogatePairSegmenter.h"
 #include "olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h"
 namespace doris::segment_v2 {
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
new file mode 100644
index 00000000000000..1e787dd5883fa4
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "SurrogatePairSegmenter.h"
+
+namespace doris::segment_v2 {
+
+// Adds a one-rune lexeme for the current character when it is classified as
+// CHAR_SURROGATE, then always releases this segmenter's buffer lock flag.
+void SurrogatePairSegmenter::analyze(AnalyzeContext& context) {
+    if (context.getCurrentCharType() == CharacterUtil::CHAR_SURROGATE) {
+        const auto cursor = context.getCursor();
+        Lexeme newLexeme(context.getBufferOffset(), context.getCurrentCharOffset(),
+                         context.getCurrentCharLen(), Lexeme::Type::CNChar, cursor, cursor);
+        context.addLexeme(newLexeme);
+    }
+    context.unlockBuffer(SEGMENTER_TYPE);
+}
+
+void SurrogatePairSegmenter::reset() {} // intentionally empty: the class declares no data members
+
+} // namespace doris::segment_v2 
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
new file mode 100644
index 00000000000000..f39eb30cf91e8f
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "ISegmenter.h"
+#include "AnalyzeContext.h"
+#include "CharacterUtil.h"
+#include "Lexeme.h"
+
+namespace doris::segment_v2 {
+
+class SurrogatePairSegmenter : public ISegmenter {
+public:
+    static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE =
+            AnalyzeContext::SegmenterType::SURROGATE_PAIR_SEGMENTER;
+
+    SurrogatePairSegmenter() = default;
+    ~SurrogatePairSegmenter() override = default;
+
+    void analyze(AnalyzeContext& context) override;
+    void reset() override;
+};
+
+} // namespace doris::segment_v2 
\ No newline at end of file

From cf8bc00b2138fc8c40134774bfe63e1f1a8d07cd Mon Sep 17 00:00:00 2001
From: Ryan19929 <huijie.wu@qq.com>
Date: Thu, 17 Apr 2025 17:09:46 +0800
Subject: [PATCH 3/6] [refactor](inverted_index) format code and remove
 unnecessary code

---
 .../analyzer/ik/core/AnalyzeContext.cpp       |  1 -
 .../analyzer/ik/core/AnalyzeContext.h         |  7 +-
 .../analyzer/ik/core/CN_QuantifierSegmenter.h |  1 -
 .../analyzer/ik/core/CharacterUtil.cpp        | 83 +++----------------
 .../analyzer/ik/core/CharacterUtil.h          |  5 +-
 .../analyzer/ik/core/LetterSegmenter.cpp      | 31 ++++---
 .../analyzer/ik/core/LetterSegmenter.h        |  1 -
 .../ik/core/SurrogatePairSegmenter.cpp        | 11 +--
 .../analyzer/ik/core/SurrogatePairSegmenter.h |  4 +-
 .../analyzer/ik_anayzer_test.cpp              | 32 +++----
 .../data/inverted_index_p0/test_tokenize.out  | 12 +++
 .../inverted_index_p0/test_tokenize.groovy    |  5 ++
 12 files changed, 79 insertions(+), 114 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
index 59b97b2ee16d45..1d5ff916a12bb4 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
@@ -172,7 +172,6 @@ bool AnalyzeContext::moveCursor() {
 
 void AnalyzeContext::initCursor() {
     cursor_ = 0;
-    typed_runes_[cursor_].regularize(config_->isEnableLowercase());
 }
 
 bool AnalyzeContext::isBufferConsumed() const {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
index a89f4973d0d955..d9e947a713dfca 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
@@ -73,7 +73,12 @@ class AnalyzeContext {
     void compound(Lexeme& lexeme);
 
 public:
-    enum class SegmenterType { CJK_SEGMENTER, CN_QUANTIFIER, LETTER_SEGMENTER, SURROGATE_PAIR_SEGMENTER};
+    enum class SegmenterType {
+        CJK_SEGMENTER,
+        CN_QUANTIFIER,
+        LETTER_SEGMENTER,
+        SURROGATE_PAIR_SEGMENTER
+    };
     const CharacterUtil::TypedRuneArray& getTypedRuneArray() const { return typed_runes_; }
     explicit AnalyzeContext(IKMemoryPool<Cell>& pool, std::shared_ptr<Configuration> config);
     virtual ~AnalyzeContext();
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
index 27ccef61a83bdc..a20341d3f3eaaa 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
@@ -29,7 +29,6 @@ class CN_QuantifierSegmenter : public ISegmenter {
 public:
     static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE =
             AnalyzeContext::SegmenterType::CN_QUANTIFIER;
-    static const std::string SEGMENTER_NAME;
     static const std::u32string CHINESE_NUMBERS;
     static const std::unordered_set<char32_t> CHINESE_NUMBER_CHARS;
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
index 3a13526dc2a0fb..4b7bc34361e1ba 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
@@ -29,18 +29,14 @@ int32_t CharacterUtil::identifyCharType(int32_t rune) {
 
     UBlockCode block = ublock_getCode(rune);
 
-    if (block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS || 
-        block == UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS || 
+    if (block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS || block == UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS ||
         block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
         return CHAR_CHINESE;
     }
-    
-    if (block == UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS ||
-        block == UBLOCK_HANGUL_SYLLABLES ||
-        block == UBLOCK_HANGUL_JAMO ||
-        block == UBLOCK_HANGUL_COMPATIBILITY_JAMO ||
-        block == UBLOCK_HIRAGANA ||
-        block == UBLOCK_KATAKANA ||
+
+    if (block == UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS || block == UBLOCK_HANGUL_SYLLABLES ||
+        block == UBLOCK_HANGUL_JAMO || block == UBLOCK_HANGUL_COMPATIBILITY_JAMO ||
+        block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA ||
         block == UBLOCK_KATAKANA_PHONETIC_EXTENSIONS) {
         return CHAR_OTHER_CJK;
     }
@@ -58,7 +54,7 @@ int32_t CharacterUtil::regularize(int32_t rune, bool use_lowercase) {
         return 0x0020;
     }
 
-    // All full-width characters 
+    // All full-width characters
     if (rune > 0xFF00 && rune < 0xFF5F) {
         rune = rune - 0xFEE0;
     }
@@ -71,12 +67,8 @@ int32_t CharacterUtil::regularize(int32_t rune, bool use_lowercase) {
 }
 
 void CharacterUtil::TypedRune::regularize(bool use_lowercase) {
-    CharacterUtil::regularizeCharInfo(*this, use_lowercase);
-}
-
-void CharacterUtil::regularizeCharInfo(TypedRune& typedRune, bool use_lowercase) {
-    typedRune.rune = regularize(typedRune.rune, use_lowercase);
-    typedRune.char_type = identifyCharType(typedRune.rune);
+    this->rune = CharacterUtil::regularize(this->rune, use_lowercase);
+    this->char_type = CharacterUtil::identifyCharType(this->rune);
 }
 
 CharacterUtil::RuneStrLite CharacterUtil::decodeChar(const char* str, size_t length) {
@@ -100,71 +92,22 @@ void CharacterUtil::decodeStringToRunes(const char* str, size_t length, TypedRun
         typed_runes.emplace_back(runeStr.rune, byte_pos, runeStr.len, typed_runes.size(), 1);
 
         typed_runes.back().regularize(use_lowercase);
-        
-        byte_pos += runeStr.len;
-    }
-}
-
-// TODO: Maybe delete this function
-size_t CharacterUtil::adjustToCompleteChar(const char* buffer, size_t buffer_length) {
-    if (buffer_length == 0) return 0;
-
-    unsigned char last_byte = buffer[buffer_length - 1];
-
-    if (last_byte < 0x80) {
-        return buffer_length;
-    }
 
-    if ((last_byte & 0xC0) == 0x80) {
-        size_t adjustedLen = buffer_length - 1;
-        while (adjustedLen > 0) {
-            unsigned char byte = buffer[adjustedLen - 1];
-            if ((byte & 0xC0) != 0x80) {
-                int charLen = 0;
-                if ((byte & 0xE0) == 0xC0)
-                    charLen = 2;
-                else if ((byte & 0xF0) == 0xE0)
-                    charLen = 3;
-                else if ((byte & 0xF8) == 0xF0)
-                    charLen = 4;
-                if (buffer_length - adjustedLen + 1 < charLen) {
-                    return adjustedLen - 1;
-                }
-                return buffer_length;
-            }
-            adjustedLen--;
-        }
-        return 0;
-    }
-
-    int charLen = 0;
-    if ((last_byte & 0xE0) == 0xC0)
-        charLen = 2;
-    else if ((last_byte & 0xF0) == 0xE0)
-        charLen = 3;
-    else if ((last_byte & 0xF8) == 0xF0)
-        charLen = 4;
-
-    if (charLen > 1) {
-        return buffer_length - 1;
+        byte_pos += runeStr.len;
     }
-
-    return buffer_length;
 }
 
 void CharacterUtil::regularizeString(std::string& input, bool use_lowercase) {
     std::string temp;
     size_t len = input.size();
     temp.reserve(len);
-    for (size_t i = 0; i < len; ) {
+    for (size_t i = 0; i < len;) {
         unsigned char c = input[i];
         if ((c & 0xF0) == 0xE0 && i + 2 < len) {
-            int rune = ((c & 0x0F) << 12) | 
-                       ((input[i + 1] & 0x3F) << 6) | 
-                       (input[i + 2] & 0x3F);
-            if (rune == 0x3000) { 
+            int rune = ((c & 0x0F) << 12) | ((input[i + 1] & 0x3F) << 6) | (input[i + 2] & 0x3F);
+            if (rune == 0x3000) {
                 temp += ' ';
-            } else if (rune >= 0xFF01 && rune <= 0xFF5E) { 
+            } else if (rune >= 0xFF01 && rune <= 0xFF5E) {
                 char half = static_cast<char>(rune - 0xFEE0);
                 if (use_lowercase && half >= 'A' && half <= 'Z') {
                     half += 32;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
index 66443d09c2cec3..0a8ad27b696e62 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
@@ -17,10 +17,11 @@
 
 #pragma once
 
+#include <unicode/uchar.h>
+
 #include <functional>
 #include <memory>
 #include <vector>
-#include <unicode/uchar.h> 
 
 #include "CLucene/_ApiHeader.h"
 #include "CLucene/analysis/jieba/Unicode.hpp"
@@ -81,8 +82,6 @@ class CharacterUtil {
 
     static void regularizeCharInfo(TypedRune& type_rune, bool use_lowercase);
 
-    static size_t adjustToCompleteChar(const char* buffer, size_t buffer_length);
-
     static void regularizeString(std::string& input, bool use_lowercase = true);
 };
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
index e6593c31a675cf..c593a1ec63dde3 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
@@ -67,7 +67,8 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) {
             english_end_ = context.getCursor();
         } else {
             // Encounter non-English characters, output tokens
-            Lexeme newLexeme = createLexeme(context, english_start_, english_end_, Lexeme::Type::English);
+            Lexeme newLexeme =
+                    createLexeme(context, english_start_, english_end_, Lexeme::Type::English);
             context.addLexeme(newLexeme);
             english_start_ = -1;
             english_end_ = -1;
@@ -75,7 +76,8 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) {
     }
 
     if (context.isBufferConsumed() && (english_start_ != -1 && english_end_ != -1)) {
-        Lexeme newLexeme = createLexeme(context, english_start_, english_end_, Lexeme::Type::English);
+        Lexeme newLexeme =
+                createLexeme(context, english_start_, english_end_, Lexeme::Type::English);
         context.addLexeme(newLexeme);
         english_start_ = -1;
         english_end_ = -1;
@@ -109,7 +111,8 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) {
             // Do not output numbers, but do not mark the end
         } else {
             // Encounter non-Arabic characters, output tokens
-            Lexeme newLexeme = createLexeme(context, arabic_start_, arabic_end_, Lexeme::Type::Arabic);
+            Lexeme newLexeme =
+                    createLexeme(context, arabic_start_, arabic_end_, Lexeme::Type::Arabic);
             context.addLexeme(newLexeme);
             arabic_start_ = -1;
             arabic_end_ = -1;
@@ -173,29 +176,25 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext& context) {
 
 bool LetterSegmenter::isLetterConnector(int32_t input) {
     if (input < 128) {
-        return std::binary_search(std::begin(letter_connectors_), std::end(letter_connectors_), 
-                                 static_cast<char>(input));
+        return std::binary_search(std::begin(letter_connectors_), std::end(letter_connectors_),
+                                  static_cast<char>(input));
     }
     return false;
 }
 
 bool LetterSegmenter::isNumConnector(int32_t input) {
     if (input < 128) {
-        return std::binary_search(std::begin(num_connectors_), std::end(num_connectors_), 
-                                 static_cast<char>(input));
+        return std::binary_search(std::begin(num_connectors_), std::end(num_connectors_),
+                                  static_cast<char>(input));
     }
     return false;
 }
 
-Lexeme LetterSegmenter::createLexeme(AnalyzeContext& context, int start, int end, Lexeme::Type type) {
+Lexeme LetterSegmenter::createLexeme(AnalyzeContext& context, int start, int end,
+                                     Lexeme::Type type) {
     const auto& typed_runes = context.getTypedRuneArray();
-    return Lexeme(
-        context.getBufferOffset(),
-        typed_runes[start].getBytePosition(),
-        typed_runes[end].getNextBytePosition() - typed_runes[start].getBytePosition(),
-        type,
-        start,
-        end
-    );
+    return Lexeme(context.getBufferOffset(), typed_runes[start].getBytePosition(),
+                  typed_runes[end].getNextBytePosition() - typed_runes[start].getBytePosition(),
+                  type, start, end);
 }
 } // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
index edd09a5d2c4e61..70dc6b4988fcf0 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
@@ -30,7 +30,6 @@ class LetterSegmenter : public ISegmenter {
 public:
     static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE =
             AnalyzeContext::SegmenterType::LETTER_SEGMENTER;
-    static const std::string SEGMENTER_NAME;
     LetterSegmenter();
     ~LetterSegmenter() override = default;
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
index 1e787dd5883fa4..0aea370a502c0f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
@@ -21,16 +21,17 @@ namespace doris::segment_v2 {
 
 void SurrogatePairSegmenter::analyze(AnalyzeContext& context) {
     const auto& current_char_type = context.getCurrentCharType();
-    
+
     if (current_char_type == CharacterUtil::CHAR_SURROGATE) {
         Lexeme newLexeme(context.getBufferOffset(), context.getCurrentCharOffset(),
-                         context.getCurrentCharLen(), Lexeme::Type::CNChar, context.getCursor(), context.getCursor());
+                         context.getCurrentCharLen(), Lexeme::Type::CNChar, context.getCursor(),
+                         context.getCursor());
         context.addLexeme(newLexeme);
-    } 
-       
+    }
+
     context.unlockBuffer(SEGMENTER_TYPE);
 }
 
 void SurrogatePairSegmenter::reset() {}
 
-} // namespace doris::segment_v2 
\ No newline at end of file
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
index f39eb30cf91e8f..bad22658b519cf 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
@@ -17,9 +17,9 @@
 
 #pragma once
 
-#include "ISegmenter.h"
 #include "AnalyzeContext.h"
 #include "CharacterUtil.h"
+#include "ISegmenter.h"
 #include "Lexeme.h"
 
 namespace doris::segment_v2 {
@@ -36,4 +36,4 @@ class SurrogatePairSegmenter : public ISegmenter {
     void reset() override;
 };
 
-} // namespace doris::segment_v2 
\ No newline at end of file
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
index 8b87618ed8a13c..690499830f5704 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
@@ -264,7 +264,11 @@ TEST_F(IKTokenizerTest, TestSpecialCharacters) {
     // Test with special characters
     std::string specialText = "😊🚀👍测试特殊符号：@#¥%……&*（）";
     tokenize(specialText, datas, true);
-    ASSERT_EQ(datas.size(), 2);
+    ASSERT_EQ(datas.size(), 5);
+    std::vector<std::string> expectedTokens = {"😊", "🚀", "👍", "测试", "特殊符号"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedTokens[i]);
+    }
 }
 
 TEST_F(IKTokenizerTest, TestBufferBoundaryWithSpace) {
@@ -456,7 +460,7 @@ TEST_F(IKTokenizerTest, TestFullWidthCharacters) {
     ASSERT_EQ(datas.size(), 1);
     ASSERT_EQ(datas[0], "￥");
     datas.clear();
-    
+
     // test full width symbol in word
     std::string mixedText = "High＆Low";
     tokenize(mixedText, datas, false);
@@ -466,7 +470,7 @@ TEST_F(IKTokenizerTest, TestFullWidthCharacters) {
         ASSERT_EQ(datas[i], expectedMixed[i]);
     }
     datas.clear();
-    
+
     // test special separator
     std::string specialSeparatorText = "1･2";
     tokenize(specialSeparatorText, datas, false);
@@ -476,7 +480,7 @@ TEST_F(IKTokenizerTest, TestFullWidthCharacters) {
         ASSERT_EQ(datas[i], expectedSeparator[i]);
     }
     datas.clear();
-    
+
     // test special character
     std::string specialCharText = "﨑";
     tokenize(specialCharText, datas, false);
@@ -494,13 +498,13 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) {
     ASSERT_EQ(datas.size(), 1);
     ASSERT_EQ(datas[0], "🐼");
     datas.clear();
-    
+
     std::string emojiText2 = "🝢";
     tokenize(emojiText2, datas, false);
     ASSERT_EQ(datas.size(), 1);
     ASSERT_EQ(datas[0], "🝢");
     datas.clear();
-    
+
     // test special latin character
     std::string specialLatinText1 = "abcşabc";
     tokenize(specialLatinText1, datas, false);
@@ -508,14 +512,14 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) {
     ASSERT_EQ(datas[0], "abc");
     ASSERT_EQ(datas[1], "abc");
     datas.clear();
-    
+
     std::string specialLatinText2 = "abcīabc";
     tokenize(specialLatinText2, datas, false);
     ASSERT_EQ(datas.size(), 2);
     ASSERT_EQ(datas[0], "abc");
     ASSERT_EQ(datas[1], "abc");
     datas.clear();
-    
+
     std::string specialLatinText3 = "celebrity…get";
     tokenize(specialLatinText3, datas, false);
     std::vector<std::string> expectedEllipsis = {"celebrity", "get"};
@@ -524,7 +528,7 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) {
         ASSERT_EQ(datas[i], expectedEllipsis[i]);
     }
     datas.clear();
-    
+
     // test mixed alphabet word
     std::string mixedAlphabetText1 = "Hulyaiрole";
     tokenize(mixedAlphabetText1, datas, false);
@@ -532,7 +536,7 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) {
     ASSERT_EQ(datas[0], "hulyai");
     ASSERT_EQ(datas[1], "ole");
     datas.clear();
-    
+
     std::string mixedAlphabetText2 = "Nisa Aşgabat";
     tokenize(mixedAlphabetText2, datas, false);
     std::vector<std::string> expectedName = {"nisa", "gabat"};
@@ -541,7 +545,7 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) {
         ASSERT_EQ(datas[i], expectedName[i]);
     }
     datas.clear();
-    
+
     // test special connector
     std::string specialConnectorText = "alـameer";
     tokenize(specialConnectorText, datas, false);
@@ -549,20 +553,20 @@ TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) {
     ASSERT_EQ(datas[0], "al");
     ASSERT_EQ(datas[1], "ameer");
     datas.clear();
-    
+
     // test rare unicode character
     std::string rareUnicodeText1 = "𐓚";
     tokenize(rareUnicodeText1, datas, false);
     ASSERT_EQ(datas.size(), 1);
     ASSERT_EQ(datas[0], "𐓚");
     datas.clear();
-    
+
     std::string rareUnicodeText2 = "𑪱";
     tokenize(rareUnicodeText2, datas, false);
     ASSERT_EQ(datas.size(), 1);
     ASSERT_EQ(datas[0], "𑪱");
     datas.clear();
-    
+
     std::string rareUnicodeText3 = "𐴗";
     tokenize(rareUnicodeText3, datas, false);
     ASSERT_EQ(datas.size(), 1);
diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out b/regression-test/data/inverted_index_p0/test_tokenize.out
index 32e7968cb8b1de..68f030b7276ce2 100644
--- a/regression-test/data/inverted_index_p0/test_tokenize.out
+++ b/regression-test/data/inverted_index_p0/test_tokenize.out
@@ -67,3 +67,15 @@
 -- !tokenize_sql --
 ["中华人民共和国", "中华人民", "中华", "华人", "人民共和国", "人民", "共和国", "共和", "国"]
 
+-- !tokenize_sql --
+["😊", "🚀", "👍", "测试", "特殊符号", "特殊", "符号"]
+
+-- !tokenize_sql --
+["high&low", "high", "low"]
+
+-- !tokenize_sql --
+["1", "･", "2"]
+
+-- !tokenize_sql --
+["abc", "abc"]
+
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index f8066e6ad86cef..d0bdada2e31d7c 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -123,4 +123,9 @@ suite("test_tokenize"){
     qt_tokenize_sql """SELECT TOKENIZE('北京大学计算机科学与技术系', '"parser"="ik","parser_mode"="ik_max_word"');"""
     qt_tokenize_sql """SELECT TOKENIZE('中华人民共和国', '"parser"="ik","parser_mode"="ik_max_word"');"""
 
+    qt_tokenize_sql """SELECT TOKENIZE('😊🚀👍测试特殊符号：@#¥%……&*（）', '"parser"="ik","parser_mode"="ik_max_word"');"""
+    qt_tokenize_sql """SELECT TOKENIZE('High＆Low', '"parser"="ik","parser_mode"="ik_max_word"');"""
+    qt_tokenize_sql """SELECT TOKENIZE('1･2', '"parser"="ik","parser_mode"="ik_max_word"');"""
+    qt_tokenize_sql """SELECT TOKENIZE('abcşīabc', '"parser"="ik","parser_mode"="ik_max_word"');"""
+
 }

From f193366e72662b8fac22ed11337aa6c99f71791e Mon Sep 17 00:00:00 2001
From: Ryan19929 <black99129@gmail.com>
Date: Mon, 21 Apr 2025 22:41:08 +0800
Subject: [PATCH 4/6] consistent with es

---
 .../inverted_index/analyzer/ik/cfg/Configuration.h    |  1 +
 .../inverted_index/analyzer/ik/core/CharacterUtil.cpp | 11 +++--------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
index a9be1d76220b74..a0c9c894c5ccc4 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
@@ -25,6 +25,7 @@ namespace doris::segment_v2 {
 class Configuration {
 private:
     bool use_smart_;
+    // TODO(ryan19929): remove enable_lowercase_; it is always true (same as the Java version)
     bool enable_lowercase_;
     std::string dict_path_;
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
index 4b7bc34361e1ba..bfc0f5ab85ab90 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
@@ -49,17 +49,12 @@ int32_t CharacterUtil::identifyCharType(int32_t rune) {
 }
 
 int32_t CharacterUtil::regularize(int32_t rune, bool use_lowercase) {
-    // Full-width space
     if (rune == 0x3000) {
         return 0x0020;
-    }
-
-    // All full-width characters
-    if (rune > 0xFF00 && rune < 0xFF5F) {
+    } else if (rune > 0xFF00 && rune < 0xFF5F) {
         rune = rune - 0xFEE0;
-    }
-
-    if (use_lowercase && rune >= 0x41 && rune <= 0x5A) {
+    } else if (use_lowercase && rune >= 0x41 && rune <= 0x5A) {
+        // This else-if prevents full-width letters from being converted to lowercase
         rune += 32;
     }
 

From 1ae5a095beac769b187a3c74986996aec411c3aa Mon Sep 17 00:00:00 2001
From: Ryan19929 <black99129@gmail.com>
Date: Sun, 27 Apr 2025 23:20:40 +0800
Subject: [PATCH 5/6] update

---
 .../inverted_index/analyzer/ik/core/AnalyzeContext.cpp       | 4 ++--
 .../inverted_index/analyzer/ik/core/CharacterUtil.cpp        | 5 ++++-
 .../inverted_index/analyzer/ik/core/CharacterUtil.h          | 2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
index 1d5ff916a12bb4..3356210f20c2f8 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
@@ -68,7 +68,7 @@ size_t AnalyzeContext::fillBuffer(lucene::util::Reader* reader) {
         int32_t readCount = 0;
         if (buffer_offset_ == 0) {
             readCount = max(0, reader->readCopy(segment_buff_.data(), 0, BUFF_SIZE));
-            CharacterUtil::decodeStringToRunes(segment_buff_.c_str(), readCount, typed_runes_,
+            CharacterUtil::decodeStringToRunes(segment_buff_.data(), readCount, typed_runes_,
                                                config_->isEnableLowercase());
         } else {
             size_t offset = available_ - typed_runes_[cursor_].getNextBytePosition();
@@ -82,7 +82,7 @@ size_t AnalyzeContext::fillBuffer(lucene::util::Reader* reader) {
             } else {
                 readCount = std::max(0, reader->readCopy(segment_buff_.data(), 0, BUFF_SIZE));
             }
-            CharacterUtil::decodeStringToRunes(segment_buff_.c_str(), readCount, typed_runes_,
+            CharacterUtil::decodeStringToRunes(segment_buff_.data(), readCount, typed_runes_,
                                                config_->isEnableLowercase());
         }
         // Ensure readCount is set to 0 in case of
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
index bfc0f5ab85ab90..eccdada581f525 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
@@ -74,7 +74,7 @@ bool CharacterUtil::decodeString(const char* str, size_t length, RuneStrArray& r
     return cppjieba::DecodeRunesInString(str, length, runes);
 }
 
-void CharacterUtil::decodeStringToRunes(const char* str, size_t length, TypedRuneArray& typed_runes,
+void CharacterUtil::decodeStringToRunes(char* str, size_t length, TypedRuneArray& typed_runes,
                                         bool use_lowercase) {
     typed_runes.clear();
     size_t byte_pos = 0;
@@ -84,6 +84,9 @@ void CharacterUtil::decodeStringToRunes(const char* str, size_t length, TypedRun
         if (runeStr.len == 0) {
             break;
         }
+        if (runeStr.len == 1 && use_lowercase && str[byte_pos] >= 'A' && str[byte_pos] <= 'Z') {
+            str[byte_pos] += 32;
+        }
         typed_runes.emplace_back(runeStr.rune, byte_pos, runeStr.len, typed_runes.size(), 1);
 
         typed_runes.back().regularize(use_lowercase);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
index 0a8ad27b696e62..c60f8bb30ce2c7 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
@@ -72,7 +72,7 @@ class CharacterUtil {
 
     static int32_t identifyCharType(int32_t rune);
 
-    static void decodeStringToRunes(const char* str, size_t length, TypedRuneArray& typed_runes,
+    static void decodeStringToRunes(char* str, size_t length, TypedRuneArray& typed_runes,
                                     bool use_lowercase);
 
     static int32_t regularize(int32_t rune, bool use_lowercase);

From fd4f0b47e337e600d31266a3402968e52ccf09d4 Mon Sep 17 00:00:00 2001
From: Ryan19929 <black99129@gmail.com>
Date: Tue, 29 Apr 2025 18:44:12 +0800
Subject: [PATCH 6/6] extend chinese

---
 .../inverted_index/analyzer/ik/core/CharacterUtil.cpp     | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
index eccdada581f525..a991967392c78d 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
@@ -30,7 +30,13 @@ int32_t CharacterUtil::identifyCharType(int32_t rune) {
     UBlockCode block = ublock_getCode(rune);
 
     if (block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS || block == UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS ||
-        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G) {
         return CHAR_CHINESE;
     }