This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new de95e76c752 [pick][feature](inverted index) add ignore_above property to prevent long string from indexing #28585 #28819 (#29002) de95e76c752 is described below commit de95e76c7526c4fe1492093375b0047f468a91d3 Author: qiye <jianliang5...@gmail.com> AuthorDate: Mon Dec 25 21:55:56 2023 +0800 [pick][feature](inverted index) add ignore_above property to prevent long string from indexing #28585 #28819 (#29002) --- be/src/olap/inverted_index_parser.cpp | 9 ++++++ be/src/olap/inverted_index_parser.h | 7 +++++ .../rowset/segment_v2/inverted_index_writer.cpp | 34 +++++++++++++++++++--- docs/en/docs/data-table/index/inverted-index.md | 3 ++ docs/zh-CN/docs/data-table/index/inverted-index.md | 3 ++ .../apache/doris/analysis/InvertedIndexUtil.java | 13 +++++++++ 6 files changed, 65 insertions(+), 4 deletions(-) diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index 85e2f523dde..17cddc042f0 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -117,6 +117,15 @@ CharFilterMap get_parser_char_filter_map_from_properties( return char_filter_map; } +std::string get_parser_ignore_above_value_from_properties( + const std::map<std::string, std::string>& properties) { + if (properties.find(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY) != properties.end()) { + return properties.at(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY); + } else { + return INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE; + } +} + std::string get_parser_lowercase_from_properties( const std::map<std::string, std::string>& properties) { if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != properties.end()) { diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index a265c6289a7..c786773be97 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -69,6 +69,9 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type"; const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern"; const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement"; +const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above"; +const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256"; + const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case"; std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type); @@ -84,6 +87,10 @@ std::string get_parser_phrase_support_string_from_properties( CharFilterMap get_parser_char_filter_map_from_properties( const std::map<std::string, std::string>& properties); +// get parser ignore_above value from properties +std::string get_parser_ignore_above_value_from_properties( + const std::map<std::string, std::string>& properties); + std::string get_parser_lowercase_from_properties( const std::map<std::string, std::string>& properties); } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 18f7f7ccc5e..c9a9cf4795a 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -294,9 +294,22 @@ public: "field or index writer is null in inverted index writer"); } auto* v = (Slice*)values; + auto ignore_above_value = + get_parser_ignore_above_value_from_properties(_index_meta->properties()); + auto ignore_above = std::stoi(ignore_above_value); for (int i = 0; i < count; ++i) { - new_fulltext_field(v->get_data(), v->get_size()); - RETURN_IF_ERROR(add_document()); + // only ignore_above UNTOKENIZED strings + if (_parser_type == InvertedIndexParserType::PARSER_NONE && + v->get_size() > ignore_above) { + VLOG_DEBUG << "fulltext index value length can be at most " + << ignore_above_value << ", but got " + << "value length:" << v->get_size() << ", ignore this value"; + new_fulltext_field(empty_value.c_str(), 0); + RETURN_IF_ERROR(add_null_document()); + } else { + new_fulltext_field(v->get_data(), v->get_size()); + RETURN_IF_ERROR(add_document()); + } ++v; _rid++; } @@ -319,6 +332,9 @@ public: return Status::InternalError( "field or index writer is null in inverted index writer"); } + auto ignore_above_value = + get_parser_ignore_above_value_from_properties(_index_meta->properties()); + auto ignore_above = std::stoi(ignore_above_value); for (int i = 0; i < count; ++i) { // offsets[i+1] is now row element count std::vector<std::string> strings; @@ -335,9 +351,19 @@ public: } auto value = join(strings, " "); - new_fulltext_field(value.c_str(), value.length()); + // only ignore_above UNTOKENIZED strings + if (_parser_type == InvertedIndexParserType::PARSER_NONE && + value.length() > ignore_above) { + VLOG_DEBUG << "fulltext index value length can be at most " + << ignore_above_value << ", but got " + << "value length:" << value.length() << ", ignore this value"; + new_fulltext_field(empty_value.c_str(), 0); + RETURN_IF_ERROR(add_null_document()); + } else { + new_fulltext_field(value.c_str(), value.length()); + RETURN_IF_ERROR(add_document()); + } _rid++; - _index_writer->addDocument(_doc.get()); } } else if constexpr (field_is_numeric_type(field_type)) { for (int i = 0; i < count; ++i) { diff --git a/docs/en/docs/data-table/index/inverted-index.md b/docs/en/docs/data-table/index/inverted-index.md index 789316bfaf3..75a8f6a3b1b 100644 --- a/docs/en/docs/data-table/index/inverted-index.md +++ b/docs/en/docs/data-table/index/inverted-index.md @@ -89,6 +89,9 @@ The features for inverted index is as follows: - char_replace: replace each char in the pattern with a char in the replacement - char_filter_pattern: character array to be replaced - char_filter_replacement: replaced character array, can be left unset, defaults to a space character + - ignore_above: Controls whether strings are indexed. + - Strings longer than the ignore_above setting will not be indexed. For arrays of strings, ignore_above will be applied for each array element separately and string elements longer than ignore_above will not be indexed. + - default value is 256 bytes. - lower_case: Whether to convert tokens to lowercase, thereby achieving case-insensitive matching. - true: Convert to lowercase - false: Do not convert to lowercase diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md b/docs/zh-CN/docs/data-table/index/inverted-index.md index 3dcfd7895c9..2f4c3f85bfe 100644 --- a/docs/zh-CN/docs/data-table/index/inverted-index.md +++ b/docs/zh-CN/docs/data-table/index/inverted-index.md @@ -87,6 +87,9 @@ Doris倒排索引的功能简要介绍如下: - char_replace 将pattern中每个char替换为一个replacement中的char - char_filter_pattern:需要被替换掉的字符数组 - char_filter_replacement:替换后的字符数组,可以不用配置,默认为一个空格字符 + - ignore_above:控制字符串是否建索引。 + - 长度超过 ignore_above 设置的字符串不会被索引。对于字符串数组,ignore_above 将分别应用于每个数组元素,长度超过 ignore_above 的字符串元素将不被索引。 + - 默认为 256 字节 - lower_case: 是否将分词进行小写转换,从而在匹配的时候实现忽略大小写 - true: 转换小写 - false:不转换小写 diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index 9e0ea206001..172af8c07dc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -43,6 +43,8 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace"; + public static String INVERTED_INDEX_PARSER_IGNORE_ABOVE = "ignore_above"; + public static String INVERTED_INDEX_PARSER_LOWERCASE = "lower_case"; public static String getInvertedIndexParser(Map<String, String> properties) { @@ -100,6 +102,17 @@ public class InvertedIndexUtil { if (parser == null && !properties.isEmpty()) { throw new AnalysisException("invalid index properties, please check the properties"); } + String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE); + if (ignoreAbove != null) { + try { + int ignoreAboveValue = Integer.parseInt(ignoreAbove); + if (ignoreAboveValue <= 0) { + throw new AnalysisException("invalid index properties, ignore_above must be positive"); + } + } catch (NumberFormatException e) { + throw new AnalysisException("invalid index properties, ignore_above must be integer"); + } + } String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE); if (lowerCase != null) { if (!"true".equals(lowerCase) && !"false".equals(lowerCase)) { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org