This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new de95e76c752 [pick][feature](inverted index) add ignore_above property 
to prevent long string from indexing #28585 #28819 (#29002)
de95e76c752 is described below

commit de95e76c7526c4fe1492093375b0047f468a91d3
Author: qiye <jianliang5...@gmail.com>
AuthorDate: Mon Dec 25 21:55:56 2023 +0800

    [pick][feature](inverted index) add ignore_above property to prevent long 
string from indexing #28585 #28819 (#29002)
---
 be/src/olap/inverted_index_parser.cpp              |  9 ++++++
 be/src/olap/inverted_index_parser.h                |  7 +++++
 .../rowset/segment_v2/inverted_index_writer.cpp    | 34 +++++++++++++++++++---
 docs/en/docs/data-table/index/inverted-index.md    |  3 ++
 docs/zh-CN/docs/data-table/index/inverted-index.md |  3 ++
 .../apache/doris/analysis/InvertedIndexUtil.java   | 13 +++++++++
 6 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/be/src/olap/inverted_index_parser.cpp 
b/be/src/olap/inverted_index_parser.cpp
index 85e2f523dde..17cddc042f0 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -117,6 +117,15 @@ CharFilterMap get_parser_char_filter_map_from_properties(
     return char_filter_map;
 }
 
+// Returns the "ignore_above" index property as a string, or the default
+// INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE when the property is not set.
+// The value is returned unparsed; numeric conversion is left to the caller.
+std::string get_parser_ignore_above_value_from_properties(
+        const std::map<std::string, std::string>& properties) {
+    // Look the key up once and reuse the iterator instead of find() + at().
+    const auto it = properties.find(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
+    if (it != properties.end()) {
+        return it->second;
+    }
+    return INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE;
+}
+
 std::string get_parser_lowercase_from_properties(
         const std::map<std::string, std::string>& properties) {
     if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != 
properties.end()) {
diff --git a/be/src/olap/inverted_index_parser.h 
b/be/src/olap/inverted_index_parser.h
index a265c6289a7..c786773be97 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -69,6 +69,9 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = 
"char_filter_type";
 const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = 
"char_filter_pattern";
 const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = 
"char_filter_replacement";
 
+const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above";
+const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256";
+
 const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";
 
 std::string inverted_index_parser_type_to_string(InvertedIndexParserType 
parser_type);
@@ -84,6 +87,10 @@ std::string get_parser_phrase_support_string_from_properties(
 CharFilterMap get_parser_char_filter_map_from_properties(
         const std::map<std::string, std::string>& properties);
 
+// Returns the "ignore_above" property value, or the default value when the key is absent.
+std::string get_parser_ignore_above_value_from_properties(
+        const std::map<std::string, std::string>& properties);
+
 std::string get_parser_lowercase_from_properties(
         const std::map<std::string, std::string>& properties);
 } // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 18f7f7ccc5e..c9a9cf4795a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -294,9 +294,22 @@ public:
                         "field or index writer is null in inverted index 
writer");
             }
             auto* v = (Slice*)values;
+            auto ignore_above_value =
+                    
get_parser_ignore_above_value_from_properties(_index_meta->properties());
+            auto ignore_above = std::stoi(ignore_above_value);
             for (int i = 0; i < count; ++i) {
-                new_fulltext_field(v->get_data(), v->get_size());
-                RETURN_IF_ERROR(add_document());
+                // only ignore_above UNTOKENIZED strings
+                if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
+                    v->get_size() > ignore_above) {
+                    VLOG_DEBUG << "fulltext index value length can be at most "
+                               << ignore_above_value << ", but got "
+                               << "value length:" << v->get_size() << ", 
ignore this value";
+                    new_fulltext_field(empty_value.c_str(), 0);
+                    RETURN_IF_ERROR(add_null_document());
+                } else {
+                    new_fulltext_field(v->get_data(), v->get_size());
+                    RETURN_IF_ERROR(add_document());
+                }
                 ++v;
                 _rid++;
             }
@@ -319,6 +332,9 @@ public:
                 return Status::InternalError(
                         "field or index writer is null in inverted index 
writer");
             }
+            auto ignore_above_value =
+                    
get_parser_ignore_above_value_from_properties(_index_meta->properties());
+            auto ignore_above = std::stoi(ignore_above_value);
             for (int i = 0; i < count; ++i) {
                 // offsets[i+1] is now row element count
                 std::vector<std::string> strings;
@@ -335,9 +351,19 @@ public:
                 }
 
                 auto value = join(strings, " ");
-                new_fulltext_field(value.c_str(), value.length());
+                // only ignore_above UNTOKENIZED strings
+                if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
+                    value.length() > ignore_above) {
+                    VLOG_DEBUG << "fulltext index value length can be at most "
+                               << ignore_above_value << ", but got "
+                               << "value length:" << value.length() << ", 
ignore this value";
+                    new_fulltext_field(empty_value.c_str(), 0);
+                    RETURN_IF_ERROR(add_null_document());
+                } else {
+                    new_fulltext_field(value.c_str(), value.length());
+                    RETURN_IF_ERROR(add_document());
+                }
                 _rid++;
-                _index_writer->addDocument(_doc.get());
             }
         } else if constexpr (field_is_numeric_type(field_type)) {
             for (int i = 0; i < count; ++i) {
diff --git a/docs/en/docs/data-table/index/inverted-index.md 
b/docs/en/docs/data-table/index/inverted-index.md
index 789316bfaf3..75a8f6a3b1b 100644
--- a/docs/en/docs/data-table/index/inverted-index.md
+++ b/docs/en/docs/data-table/index/inverted-index.md
@@ -89,6 +89,9 @@ The features for inverted index is as follows:
         - char_replace: replace each char in the pattern with a char in the 
replacement
           - char_filter_pattern: character array to be replaced
           - char_filter_replacement: replaced character array, can be left 
unset, defaults to a space character
+    - ignore_above: Maximum length, in bytes, of an untokenized string that will be indexed.
+      - Strings longer than the ignore_above setting will not be indexed. For 
arrays of strings, ignore_above will be applied for each array element 
separately and string elements longer than ignore_above will not be indexed.
+      - default value is 256 bytes.
     - lower_case: Whether to convert tokens to lowercase, thereby achieving 
case-insensitive matching.
       - true: Convert to lowercase
       - false: Do not convert to lowercase 
diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md 
b/docs/zh-CN/docs/data-table/index/inverted-index.md
index 3dcfd7895c9..2f4c3f85bfe 100644
--- a/docs/zh-CN/docs/data-table/index/inverted-index.md
+++ b/docs/zh-CN/docs/data-table/index/inverted-index.md
@@ -87,6 +87,9 @@ Doris倒排索引的功能简要介绍如下:
         - char_replace 将pattern中每个char替换为一个replacement中的char
           - char_filter_pattern:需要被替换掉的字符数组
           - char_filter_replacement:替换后的字符数组,可以不用配置,默认为一个空格字符
+    - ignore_above:控制字符串是否建索引。
+      - 长度超过 ignore_above 设置的字符串不会被索引。对于字符串数组,ignore_above 将分别应用于每个数组元素,长度超过 
ignore_above 的字符串元素将不被索引。
+      - 默认为 256 字节
     - lower_case: 是否将分词进行小写转换,从而在匹配的时候实现忽略大小写
       - true: 转换小写
       - false:不转换小写
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index 9e0ea206001..172af8c07dc 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -43,6 +43,8 @@ public class InvertedIndexUtil {
 
     public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = 
"char_replace";
 
+    public static String INVERTED_INDEX_PARSER_IGNORE_ABOVE = "ignore_above";
+
     public static String INVERTED_INDEX_PARSER_LOWERCASE = "lower_case";
 
     public static String getInvertedIndexParser(Map<String, String> 
properties) {
@@ -100,6 +102,17 @@ public class InvertedIndexUtil {
             if (parser == null && !properties.isEmpty()) {
                 throw new AnalysisException("invalid index properties, please 
check the properties");
             }
+            String ignoreAbove = 
properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE);
+            if (ignoreAbove != null) {
+                try {
+                    int ignoreAboveValue = Integer.parseInt(ignoreAbove);
+                    if (ignoreAboveValue <= 0) {
+                        throw new AnalysisException("invalid index properties, 
ignore_above must be positive");
+                    }
+                } catch (NumberFormatException e) {
+                    throw new AnalysisException("invalid index properties, 
ignore_above must be integer");
+                }
+            }
             String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE);
             if (lowerCase != null) {
                 if (!"true".equals(lowerCase) && !"false".equals(lowerCase)) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to