This is an automated email from the ASF dual-hosted git repository.
yangsiyu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 782f722b5e7 [fix](inverted index) fix pinyin filter bug (#60080)
782f722b5e7 is described below
commit 782f722b5e7f180614d50aa1e553e62baa555f63
Author: Ryan19929 <[email protected]>
AuthorDate: Thu Jan 29 13:56:44 2026 +0800
[fix](inverted index) fix pinyin filter bug (#60080)
---
.../inverted_index/token_filter/pinyin_filter.cpp | 40 +++---
.../token_filter/pinyin_filter_test.cpp | 102 ++++++++++++++-
.../analyzer/test_custom_analyzer.out | 15 +++
.../analyzer/test_custom_analyzer.groovy | 142 ++++++++++++++++++++-
4 files changed, 273 insertions(+), 26 deletions(-)
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
index 13bf212b21c..21d74dacce1 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
+++
b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
@@ -142,11 +142,17 @@ bool PinyinFilter::readTerm(Token* token) {
if (!processed_original_ && has_current_token_) {
bool should_add_original = config_->keepOriginal;
- // For emoji/symbol fallback: check if ANY content was generated
(candidates OR pending letters)
- // If nothing was generated, this is likely an emoji/symbol that
should be preserved
- if (!should_add_original && candidate_.empty() &&
first_letters_.empty() &&
- full_pinyin_letters_.empty()) {
- // No candidates and no pending letters, this is emoji/symbol
+    // For emoji/symbol fallback: check if ANY content WILL ACTUALLY BE
OUTPUT
+ // Not just whether buffers have content, but whether they will be
processed
+ // This handles cases like: keep_first_letter=false but first_letters_
has content
+ bool will_output_first_letter = config_->keepFirstLetter &&
!first_letters_.empty();
+ bool will_output_full_pinyin =
+ config_->keepJoinedFullPinyin && !full_pinyin_letters_.empty();
+ bool has_candidates = !candidate_.empty();
+
+ if (!should_add_original && !has_candidates &&
!will_output_first_letter &&
+ !will_output_full_pinyin) {
+ // No content will be output, trigger fallback to preserve
original token
should_add_original = true;
}
@@ -239,24 +245,6 @@ bool PinyinFilter::processCurrentToken() {
PinyinUtil::instance().convert(source_codepoints,
PinyinFormat::TONELESS_PINYIN_FORMAT);
auto chinese_list = ChineseUtil::segmentChinese(source_codepoints);
- // Early return optimization: if no Chinese characters found
- if (pinyin_list.empty() && chinese_list.empty()) {
- // Check if there are non-ASCII Unicode characters (like emoji) to
preserve
- bool has_unicode_symbols = false;
- for (const auto& cp : source_codepoints) {
- if (cp >= 128) { // Non-ASCII character
- has_unicode_symbols = true;
- break;
- }
- }
-
- // If no Unicode symbols, return false and let other filters handle it
- if (!has_unicode_symbols) {
- return false;
- }
- // Otherwise, continue processing to preserve Unicode symbols
- }
-
// Process each character and generate candidates
position_ = 0;
std::string first_letters_buffer;
@@ -306,8 +294,12 @@ bool PinyinFilter::processCurrentToken() {
if (config_->keepNoneChineseInJoinedFullPinyin) {
full_pinyin_buffer += static_cast<char>(codepoint);
}
+ } else if (is_ascii) {
+ // For non-alphanumeric ASCII characters (like spaces,
punctuation),
+ // do nothing and continue to keep the buffer intact.
+ continue;
} else {
- // Process accumulated ASCII buffer when we hit non-ASCII
+ // Process accumulated ASCII buffer when we hit non-ASCII
(Chinese) characters
if (!ascii_buffer.empty()) {
processAsciiBuffer(ascii_buffer, ascii_buffer_start_pos,
static_cast<int>(i));
ascii_buffer.clear();
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
index 8662cace8ce..93a992337a1 100644
---
a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
+++
b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
@@ -761,4 +761,104 @@ TEST_F(PinyinFilterTest, TestTokenFilter_NonChineseCJK) {
EXPECT_EQ(tokens2[0], "한글") << "Korean hangul should be preserved as-is";
}
-} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
+TEST_F(PinyinFilterTest, TestBugFix_SpaceHandlingWithKeywordTokenizer) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_joined_full_pinyin"] = "true";
+ config["keep_none_chinese"] = "true";
+ config["keep_none_chinese_in_joined_full_pinyin"] = "true";
+ config["none_chinese_pinyin_tokenize"] = "false";
+ config["keep_original"] = "false";
+ config["keep_first_letter"] = "false";
+ config["keep_full_pinyin"] = "false";
+ config["lowercase"] = "false";
+ config["trim_whitespace"] = "false";
+ config["ignore_pinyin_offset"] = "true";
+
+ // Test case 1: Pure English with space
+ // Before fix: ["ALF", "Characters"] - space triggered buffer processing
+ // After fix: ["ALFCharacters"] - space is skipped, buffer continues
accumulating
+ auto tokens1 = tokenizeWithFilter("ALF Characters", "keyword", config);
+ EXPECT_EQ(tokens1.size(), 1) << "Should produce one token (space should
not split)";
+ EXPECT_EQ(tokens1[0], "ALFCharacters") << "Space should be ignored in
joined output";
+
+ // Test case 2: English with multiple spaces
+ auto tokens2 = tokenizeWithFilter("Hello World", "keyword", config);
+ EXPECT_EQ(tokens2.size(), 1) << "Multiple spaces should not split tokens";
+ EXPECT_EQ(tokens2[0], "HelloWorld") << "All spaces should be ignored";
+
+ // Test case 3: Mixed with punctuation
+ auto tokens3 = tokenizeWithFilter("Test-Case_123", "keyword", config);
+ EXPECT_EQ(tokens3.size(), 1) << "Punctuation should not split tokens";
+ EXPECT_EQ(tokens3[0], "TestCase123") << "Non-alphanumeric ASCII chars
should be ignored";
+}
+
+// Test Bug #1: Space handling with Chinese-English mixed content
+TEST_F(PinyinFilterTest, TestBugFix_SpaceHandlingWithMixedContent) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_joined_full_pinyin"] = "true";
+ config["keep_none_chinese"] = "true";
+ config["keep_none_chinese_in_joined_full_pinyin"] = "true";
+ config["none_chinese_pinyin_tokenize"] = "false";
+ config["keep_original"] = "false";
+ config["keep_first_letter"] = "false";
+ config["keep_full_pinyin"] = "false";
+ config["lowercase"] = "true";
+ config["ignore_pinyin_offset"] = "true";
+
+ // Chinese-English mixed with spaces
+ // The space should be ignored, English letters should be preserved in
joined output
+ auto tokens = tokenizeWithFilter("ALF 刘德华", "keyword", config);
+ EXPECT_GT(tokens.size(), 0) << "Should produce tokens";
+
+ // Check that English and pinyin are joined together
+ bool found_joined = false;
+ for (const auto& token : tokens) {
+ if (token.find("alf") != std::string::npos && token.find("liu") !=
std::string::npos) {
+ found_joined = true;
+ EXPECT_EQ(token, "alfliudehua") << "English and pinyin should be
joined, space ignored";
+ break;
+ }
+ }
+ EXPECT_TRUE(found_joined) << "Should find joined English+Pinyin token";
+}
+
+// Test Bug #2: Fallback mechanism for pure English text
+// When keep_none_chinese=false and input is pure English, should preserve
original token (ES behavior)
+TEST_F(PinyinFilterTest, TestBugFix_PureEnglishFallback) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_none_chinese"] = "false"; // Don't generate separate English
tokens
+ config["keep_original"] = "false";
+ config["keep_first_letter"] = "false";
+ config["keep_full_pinyin"] = "false";
+ config["keep_joined_full_pinyin"] = "true";
+ config["ignore_pinyin_offset"] = "true";
+ config["lowercase"] = "false"; // Preserve original case for testing
+ config["trim_whitespace"] = "false"; // Preserve original whitespace
+ // CRITICAL: Must set these to false to trigger fallback correctly
+ config["keep_none_chinese_in_first_letter"] = "false";
+ config["keep_none_chinese_in_joined_full_pinyin"] = "false";
+
+ // Test case 1: Pure English text (no Chinese to convert)
+ // Before fix: [] - token was dropped because:
+ // 1. processCurrentToken() returned false (early return removed in fix
#1)
+ // 2. Fallback checked first_letters_.empty() instead of will_output
(fixed in this commit)
+ // After fix: ["Lanky Kong"] - original token preserved via improved
fallback mechanism
+ // The fallback now checks if ANY content WILL BE OUTPUT, not just if
buffers have content
+ auto tokens1 = tokenizeWithFilter("Lanky Kong", "keyword", config);
+ EXPECT_EQ(tokens1.size(), 1) << "Pure English should be preserved via
fallback";
+ EXPECT_EQ(tokens1[0], "Lanky Kong") << "Original token should be returned";
+
+ // Test case 2: Another pure English example
+ // ES behavior: fallback preserves original text INCLUDING spaces
+ // (trim_whitespace only removes leading/trailing, not middle spaces)
+ auto tokens2 = tokenizeWithFilter("ALF Characters", "keyword", config);
+ EXPECT_EQ(tokens2.size(), 1) << "Pure English with space should be
preserved";
+ EXPECT_EQ(tokens2[0], "ALF Characters") << "Original token preserved as-is
(ES behavior)";
+
+ // Test case 3: Pure numbers
+ auto tokens3 = tokenizeWithFilter("12345", "keyword", config);
+ EXPECT_EQ(tokens3.size(), 1) << "Pure numbers should be preserved";
+ EXPECT_EQ(tokens3[0], "12345");
+}
+
+} // namespace doris::segment_v2::inverted_index
diff --git
a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
index 44852a591e3..102b797498c 100644
--- a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
+++ b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
@@ -290,6 +290,21 @@
-- !sql_ignore_offset_false_mixed --
[{\n "token": "liu"\n }, {\n "token": "lad"\n }, {\n
"token": "a"\n }, {\n "token": "de"\n }]
+-- !sql_bug1_mixed_tokenizer --
+[{\n "token": "ALFliudehua"\n }]
+
+-- !sql_bug1_mixed_filter --
+[{\n "token": "ALFliudehua"\n }]
+
+-- !sql_bug2_pure_english --
+[{\n "token": "Lanky Kong"\n }]
+
+-- !sql_bug2_pure_numbers --
+[{\n "token": "12345"\n }]
+
+-- !sql_bug2_chinese --
+[{\n "token": "liudehua"\n }]
+
-- !sql_table_ignore_offset_1 --
1 刘德华
diff --git
a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
index 478fade7fc5..ab559717bc3 100644
---
a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
+++
b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
@@ -94,7 +94,7 @@ suite("test_custom_analyzer", "p0") {
"type" = "pinyin",
"keep_first_letter" = "true",
"keep_full_pinyin" = "false",
- "keep_joined_full_pinyin " = "false",
+ "keep_joined_full_pinyin" = "false",
"keep_original" = "false",
"lowercase" = "true"
);
@@ -639,6 +639,146 @@ suite("test_custom_analyzer", "p0") {
qt_sql_ignore_offset_true_mixed """ select tokenize('刘a德',
'"analyzer"="pinyin_analyzer_ignore_true"'); """
qt_sql_ignore_offset_false_mixed """ select tokenize('刘a德',
'"analyzer"="pinyin_analyzer_ignore_false"'); """
+ // ==================== Bug Fix Tests ====================
+ // Test Bug #1: Space handling consistency between pinyin tokenizer and
pinyin filter
+ // When using pinyin filter with keyword tokenizer, spaces should be
ignored (not trigger buffer processing)
+ // This matches ES behavior where spaces don't split the ASCII buffer
+
+ // Drop existing objects first to ensure clean state
+ try {
+ sql """ DROP INVERTED INDEX ANALYZER pinyin_analyzer_space_test """
+ } catch (Exception e) { /* ignore if not exists */ }
+ try {
+ sql """ DROP INVERTED INDEX ANALYZER pinyin_filter_analyzer_space_test
"""
+ } catch (Exception e) { /* ignore if not exists */ }
+ try {
+ sql """ DROP INVERTED INDEX TOKENIZER pinyin_tokenizer_space_test """
+ } catch (Exception e) { /* ignore if not exists */ }
+ try {
+ sql """ DROP INVERTED INDEX TOKEN_FILTER pinyin_filter_space_test """
+ } catch (Exception e) { /* ignore if not exists */ }
+
+ // Create pinyin tokenizer for comparison (spaces should be ignored in
joined output)
+ // Key settings: keep_none_chinese=false (don't output English separately)
+ // keep_none_chinese_in_joined_full_pinyin=true (include
English in joined output)
+ sql """
+ CREATE INVERTED INDEX TOKENIZER pinyin_tokenizer_space_test
+ PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "false",
+ "keep_separate_first_letter" = "false",
+ "keep_full_pinyin" = "false",
+ "keep_joined_full_pinyin" = "true",
+ "keep_none_chinese" = "false",
+ "keep_none_chinese_in_joined_full_pinyin" = "true",
+ "none_chinese_pinyin_tokenize" = "false",
+ "keep_original" = "false",
+ "lowercase" = "false",
+ "trim_whitespace" = "false",
+ "ignore_pinyin_offset" = "true"
+ );
+ """
+
+ // Create pinyin filter with keyword tokenizer for comparison
+ // Same settings as tokenizer to ensure consistent behavior
+ sql """
+ CREATE INVERTED INDEX TOKEN_FILTER pinyin_filter_space_test
+ PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "false",
+ "keep_separate_first_letter" = "false",
+ "keep_full_pinyin" = "false",
+ "keep_joined_full_pinyin" = "true",
+ "keep_none_chinese" = "false",
+ "keep_none_chinese_in_joined_full_pinyin" = "true",
+ "none_chinese_pinyin_tokenize" = "false",
+ "keep_original" = "false",
+ "lowercase" = "false",
+ "trim_whitespace" = "false",
+ "ignore_pinyin_offset" = "true"
+ );
+ """
+
+ // Wait for tokenizer and filter to be ready before creating analyzers
+ sql """ select sleep(15) """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER pinyin_analyzer_space_test
+ PROPERTIES (
+ "tokenizer" = "pinyin_tokenizer_space_test"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER pinyin_filter_analyzer_space_test
+ PROPERTIES (
+ "tokenizer" = "keyword",
+ "token_filter" = "pinyin_filter_space_test"
+ );
+ """
+
+ // Wait for analyzers to be ready
+ sql """ select sleep(15) """
+
+ // Bug #1 Test: Mixed Chinese and English with spaces
+ // Input: "ALF 刘德华" - space should be ignored, English and pinyin should
be joined
+ // Key point: Space between "ALF" and "刘德华" should NOT split the ASCII
buffer
+ // Expected output: ["ALFliudehua"] - English and pinyin joined together
+ qt_sql_bug1_mixed_tokenizer """ select tokenize('ALF 刘德华',
'"analyzer"="pinyin_analyzer_space_test"'); """
+ qt_sql_bug1_mixed_filter """ select tokenize('ALF 刘德华',
'"analyzer"="pinyin_filter_analyzer_space_test"'); """
+
+ // Test Bug #2: Pure English fallback
+ // When keep_none_chinese=false and input is pure English, should preserve
original token (ES behavior)
+
+ // Drop existing objects first
+ try {
+ sql """ DROP INVERTED INDEX ANALYZER pinyin_analyzer_fallback_test """
+ } catch (Exception e) { /* ignore if not exists */ }
+ try {
+ sql """ DROP INVERTED INDEX TOKEN_FILTER pinyin_filter_fallback_test
"""
+ } catch (Exception e) { /* ignore if not exists */ }
+
+ sql """
+ CREATE INVERTED INDEX TOKEN_FILTER pinyin_filter_fallback_test
+ PROPERTIES (
+ "type" = "pinyin",
+ "keep_none_chinese" = "false",
+ "keep_original" = "false",
+ "keep_first_letter" = "false",
+ "keep_full_pinyin" = "false",
+ "keep_joined_full_pinyin" = "true",
+ "ignore_pinyin_offset" = "true",
+ "keep_none_chinese_in_first_letter" = "false",
+ "keep_none_chinese_in_joined_full_pinyin" = "false",
+ "lowercase" = "false"
+ );
+ """
+
+ // Wait for filter to be ready before creating analyzer
+ sql """ select sleep(15) """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER pinyin_analyzer_fallback_test
+ PROPERTIES (
+ "tokenizer" = "keyword",
+ "token_filter" = "pinyin_filter_fallback_test"
+ );
+ """
+
+ // Wait for analyzer to be ready
+ sql """ select sleep(15) """
+
+ // Bug #2 Test: Pure English should be preserved via fallback mechanism
+ // Before fix: [] (token was dropped)
+ // After fix: original token preserved
+ qt_sql_bug2_pure_english """ select tokenize('Lanky Kong',
'"analyzer"="pinyin_analyzer_fallback_test"'); """
+ qt_sql_bug2_pure_numbers """ select tokenize('12345',
'"analyzer"="pinyin_analyzer_fallback_test"'); """
+
+ // Bug #2 Test: Chinese should still work normally (output joined pinyin)
+ qt_sql_bug2_chinese """ select tokenize('刘德华',
'"analyzer"="pinyin_analyzer_fallback_test"'); """
+
+ // ==================== End Bug Fix Tests ====================
+
// Test table creation and queries with ignore_pinyin_offset
def indexTbName7 = "test_custom_analyzer_pinyin_offset"
sql "DROP TABLE IF EXISTS ${indexTbName7}"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]