This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new 52971aeeff9 [opt](invert index) use lowercase by default (#32232) 52971aeeff9 is described below commit 52971aeeff93dfcbbdf68bf0a499dc2b2c757081 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Mon Mar 18 15:59:04 2024 +0800 [opt](invert index) use lowercase by default (#32232) --- be/src/clucene | 2 +- be/src/olap/compaction.cpp | 26 ++++++++++++++++++++-- be/src/olap/inverted_index_parser.cpp | 11 +-------- be/src/olap/inverted_index_parser.h | 19 +++++++++++++--- be/src/olap/match_predicate.cpp | 2 +- .../segment_v2/inverted_index_compaction.cpp | 6 ++--- .../rowset/segment_v2/inverted_index_compaction.h | 2 +- .../rowset/segment_v2/inverted_index_writer.cpp | 4 ++-- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 3 ++- be/src/olap/tablet_schema.cpp | 6 +++++ .../data/inverted_index_p0/test_lowercase.out | 6 +++++ 11 files changed, 63 insertions(+), 24 deletions(-) diff --git a/be/src/clucene b/be/src/clucene index e9c7f1f9a4a..fe7ecdb2d62 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit e9c7f1f9a4a324d418eab978fa7ccbcf0878f60c +Subproject commit fe7ecdb2d6214e69caf68eba744d3b5221716119 diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index fabe83f0186..40bf05ef08f 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -461,9 +461,11 @@ Status Compaction::do_compaction_impl(int64_t permits) { // src index files // format: rowsetId_segmentId std::vector<std::string> src_index_files(src_segment_num); + std::vector<RowsetId> src_rowset_ids; for (const auto& m : src_seg_to_id_map) { std::pair<RowsetId, uint32_t> p = m.first; src_index_files[m.second] = p.first.to_string() + "_" + std::to_string(p.second); + src_rowset_ids.push_back(p.first); } // dest index files @@ -530,14 +532,34 @@ Status Compaction::do_compaction_impl(int64_t permits) { ctx.skip_inverted_index.cbegin(), ctx.skip_inverted_index.cend(), [&src_segment_num, &dest_segment_num, &index_writer_path, &src_index_files, &dest_index_files, &fs, &tablet_path, &trans_vec, &dest_segment_num_rows, - &status, this](int32_t column_uniq_id) { + &status, &src_rowset_ids, this](int32_t column_uniq_id) { + // if index properties are different, index compaction maybe needs to be skipped. + bool maybe_skip = false; + std::optional<std::map<std::string, std::string>> first_properties; + for (const auto& rowset_id : src_rowset_ids) { + auto rowset_ptr = _tablet->get_rowset(rowset_id); + const auto* tablet_index = + rowset_ptr->tablet_schema()->get_inverted_index(column_uniq_id); + const auto& properties = tablet_index->properties(); + if (!first_properties.has_value()) { + first_properties = properties; + } else { + if (properties != first_properties.value()) { + LOG(WARNING) << "if index properties are different, index " + "compaction needs to be skipped."; + maybe_skip = true; + break; + } + } + } + auto index_id = _cur_tablet_schema->get_inverted_index(column_uniq_id)->index_id(); try { auto st = compact_column(index_id, src_segment_num, dest_segment_num, src_index_files, dest_index_files, fs, index_writer_path, tablet_path, trans_vec, - dest_segment_num_rows); + dest_segment_num_rows, maybe_skip); if (!st.ok()) { LOG(WARNING) << "failed to do index compaction" << ". tablet=" << _tablet->full_name() diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index 17cddc042f0..3f8d4f9c1be 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -79,7 +79,7 @@ std::string get_parser_phrase_support_string_from_properties( if (properties.find(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY) != properties.end()) { return properties.at(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY); } else { - return INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO; + return INVERTED_INDEX_PARSER_FALSE; } } @@ -126,13 +126,4 @@ std::string get_parser_ignore_above_value_from_properties( } } -std::string get_parser_lowercase_from_properties( - const std::map<std::string, std::string>& properties) { - if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != properties.end()) { - return properties.at(INVERTED_INDEX_PARSER_LOWERCASE_KEY); - } else { - return ""; - } -} - } // namespace doris diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index c786773be97..1a16d9ad97d 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -49,6 +49,9 @@ struct InvertedIndexCtx { using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>; +const std::string INVERTED_INDEX_PARSER_TRUE = "true"; +const std::string INVERTED_INDEX_PARSER_FALSE = "false"; + const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode"; const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained"; const std::string INVERTED_INDEX_PARSER_COARSE_GRANULARITY = "coarse_grained"; @@ -62,8 +65,6 @@ const std::string INVERTED_INDEX_PARSER_ENGLISH = "english"; const std::string INVERTED_INDEX_PARSER_CHINESE = "chinese"; const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY = "support_phrase"; -const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES = "true"; -const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO = "false"; const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type"; const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern"; @@ -91,6 +92,18 @@ CharFilterMap get_parser_char_filter_map_from_properties( std::string get_parser_ignore_above_value_from_properties( const std::map<std::string, std::string>& properties); +template <bool ReturnTrue = false> std::string get_parser_lowercase_from_properties( - const std::map<std::string, std::string>& properties); + const std::map<std::string, std::string>& properties) { + if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != properties.end()) { + return properties.at(INVERTED_INDEX_PARSER_LOWERCASE_KEY); + } else { + if constexpr (ReturnTrue) { + return INVERTED_INDEX_PARSER_TRUE; + } else { + return ""; + } + } +} + } // namespace doris diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp index 8ffd6d99936..c4095299ab8 100644 --- a/be/src/olap/match_predicate.cpp +++ b/be/src/olap/match_predicate.cpp @@ -126,7 +126,7 @@ bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const { if ((_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX) && iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == - INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { + INVERTED_INDEX_PARSER_FALSE) { return true; } return false; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp index b04edd6eb83..ff076e84397 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp @@ -29,7 +29,7 @@ Status compact_column(int32_t index_id, int src_segment_num, int dest_segment_nu std::vector<std::string> dest_index_files, const io::FileSystemSPtr& fs, std::string index_writer_path, std::string tablet_path, std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec, - std::vector<uint32_t> dest_segment_num_rows) { + std::vector<uint32_t> dest_segment_num_rows, bool maybe_skip) { DBUG_EXECUTE_IF("index_compaction_compact_column_throw_error", { if (index_id % 2 == 0) { _CLTHROWA(CL_ERR_IO, "debug point: test throw error in index compaction"); @@ -68,8 +68,8 @@ Status compact_column(int32_t index_id, int src_segment_num, int dest_segment_nu } DCHECK_EQ(src_index_dirs.size(), trans_vec.size()); - index_writer->indexCompaction(src_index_dirs, dest_index_dirs, trans_vec, - dest_segment_num_rows); + index_writer->indexCompaction(src_index_dirs, dest_index_dirs, trans_vec, dest_segment_num_rows, + maybe_skip); index_writer->close(); _CLDELETE(index_writer); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.h b/be/src/olap/rowset/segment_v2/inverted_index_compaction.h index f615192b199..7d6ad1c2d48 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.h @@ -30,6 +30,6 @@ Status compact_column(int32_t index_id, int src_segment_num, int dest_segment_nu std::vector<std::string> dest_index_files, const io::FileSystemSPtr& fs, std::string index_writer_path, std::string tablet_path, std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec, - std::vector<uint32_t> dest_segment_num_rows); + std::vector<uint32_t> dest_segment_num_rows, bool maybe_skip); } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index c99de50d7ac..35e6dc2f24a 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -205,7 +205,7 @@ public: // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer _analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>(); } - auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties()); + auto lowercase = get_parser_lowercase_from_properties<true>(_index_meta->properties()); if (lowercase == "true") { _analyzer->set_lowercase(true); } else if (lowercase == "false") { @@ -234,7 +234,7 @@ public: } _field = new lucene::document::Field(_field_name.c_str(), field_config); if (get_parser_phrase_support_string_from_properties(_index_meta->properties()) == - INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES) { + INVERTED_INDEX_PARSER_TRUE) { _field->setOmitTermFreqAndPositions(false); } else { _field->setOmitTermFreqAndPositions(true); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index ed4a55986f8..984d118ddf5 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1232,9 +1232,10 @@ Status SegmentIterator::_init_inverted_index_iterators() { for (auto cid : _schema->column_ids()) { int32_t unique_id = _opts.tablet_schema->column(cid).unique_id(); if (_inverted_index_iterators[cid] == nullptr) { + // Use segment’s own index_meta, for compatibility with future indexing needs to default to lowercase. RETURN_IF_ERROR(_segment->new_inverted_index_iterator( _opts.tablet_schema->column(cid), - _opts.tablet_schema->get_inverted_index(unique_id), _opts, + _segment->_tablet_schema->get_inverted_index(unique_id), _opts, &_inverted_index_iterators[cid])); } } diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index 8c810b528b9..e6e3dde7778 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -616,6 +616,12 @@ void TabletIndex::to_schema_pb(TabletIndexPB* index) const { for (const auto& kv : _properties) { (*index->mutable_properties())[kv.first] = kv.second; } + + // lowercase by default + if (!_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) { + (*index->mutable_properties())[INVERTED_INDEX_PARSER_LOWERCASE_KEY] = + INVERTED_INDEX_PARSER_TRUE; + } } void TabletSchema::append_column(TabletColumn column, bool is_dropped_column) { diff --git a/regression-test/data/inverted_index_p0/test_lowercase.out b/regression-test/data/inverted_index_p0/test_lowercase.out index 03c2f57468f..2ca46501026 100644 --- a/regression-test/data/inverted_index_p0/test_lowercase.out +++ b/regression-test/data/inverted_index_p0/test_lowercase.out @@ -31,11 +31,17 @@ -- !sql -- 1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 -- !sql -- +1 hello 我来到北京清华大学 2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 -- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 3 Hello 人民可以得到更多实惠 -- !sql -- --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org