This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 52971aeeff9 [opt](invert index) use lowercase by default (#32232)
52971aeeff9 is described below

commit 52971aeeff93dfcbbdf68bf0a499dc2b2c757081
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Mon Mar 18 15:59:04 2024 +0800

    [opt](invert index) use lowercase by default (#32232)
---
 be/src/clucene                                     |  2 +-
 be/src/olap/compaction.cpp                         | 26 ++++++++++++++++++++--
 be/src/olap/inverted_index_parser.cpp              | 11 +--------
 be/src/olap/inverted_index_parser.h                | 19 +++++++++++++---
 be/src/olap/match_predicate.cpp                    |  2 +-
 .../segment_v2/inverted_index_compaction.cpp       |  6 ++---
 .../rowset/segment_v2/inverted_index_compaction.h  |  2 +-
 .../rowset/segment_v2/inverted_index_writer.cpp    |  4 ++--
 be/src/olap/rowset/segment_v2/segment_iterator.cpp |  3 ++-
 be/src/olap/tablet_schema.cpp                      |  6 +++++
 .../data/inverted_index_p0/test_lowercase.out      |  6 +++++
 11 files changed, 63 insertions(+), 24 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index e9c7f1f9a4a..fe7ecdb2d62 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit e9c7f1f9a4a324d418eab978fa7ccbcf0878f60c
+Subproject commit fe7ecdb2d6214e69caf68eba744d3b5221716119
diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp
index fabe83f0186..40bf05ef08f 100644
--- a/be/src/olap/compaction.cpp
+++ b/be/src/olap/compaction.cpp
@@ -461,9 +461,11 @@ Status Compaction::do_compaction_impl(int64_t permits) {
             // src index files
             // format: rowsetId_segmentId
             std::vector<std::string> src_index_files(src_segment_num);
+            std::vector<RowsetId> src_rowset_ids;
             for (const auto& m : src_seg_to_id_map) {
                 std::pair<RowsetId, uint32_t> p = m.first;
                 src_index_files[m.second] = p.first.to_string() + "_" + 
std::to_string(p.second);
+                src_rowset_ids.push_back(p.first);
             }
 
             // dest index files
@@ -530,14 +532,34 @@ Status Compaction::do_compaction_impl(int64_t permits) {
                     ctx.skip_inverted_index.cbegin(), 
ctx.skip_inverted_index.cend(),
                     [&src_segment_num, &dest_segment_num, &index_writer_path, 
&src_index_files,
                      &dest_index_files, &fs, &tablet_path, &trans_vec, 
&dest_segment_num_rows,
-                     &status, this](int32_t column_uniq_id) {
+                     &status, &src_rowset_ids, this](int32_t column_uniq_id) {
+                        // If index properties differ, index compaction 
may need to be skipped.
+                        bool maybe_skip = false;
+                        std::optional<std::map<std::string, std::string>> 
first_properties;
+                        for (const auto& rowset_id : src_rowset_ids) {
+                            auto rowset_ptr = _tablet->get_rowset(rowset_id);
+                            const auto* tablet_index =
+                                    
rowset_ptr->tablet_schema()->get_inverted_index(column_uniq_id);
+                            const auto& properties = 
tablet_index->properties();
+                            if (!first_properties.has_value()) {
+                                first_properties = properties;
+                            } else {
+                                if (properties != first_properties.value()) {
+                                    LOG(WARNING) << "if index properties are 
different, index "
+                                                    "compaction needs to be 
skipped.";
+                                    maybe_skip = true;
+                                    break;
+                                }
+                            }
+                        }
+
                         auto index_id =
                                 
_cur_tablet_schema->get_inverted_index(column_uniq_id)->index_id();
                         try {
                             auto st = compact_column(index_id, 
src_segment_num, dest_segment_num,
                                                      src_index_files, 
dest_index_files, fs,
                                                      index_writer_path, 
tablet_path, trans_vec,
-                                                     dest_segment_num_rows);
+                                                     dest_segment_num_rows, 
maybe_skip);
                             if (!st.ok()) {
                                 LOG(WARNING) << "failed to do index compaction"
                                              << ". tablet=" << 
_tablet->full_name()
diff --git a/be/src/olap/inverted_index_parser.cpp 
b/be/src/olap/inverted_index_parser.cpp
index 17cddc042f0..3f8d4f9c1be 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -79,7 +79,7 @@ std::string get_parser_phrase_support_string_from_properties(
     if (properties.find(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY) != 
properties.end()) {
         return properties.at(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY);
     } else {
-        return INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO;
+        return INVERTED_INDEX_PARSER_FALSE;
     }
 }
 
@@ -126,13 +126,4 @@ std::string get_parser_ignore_above_value_from_properties(
     }
 }
 
-std::string get_parser_lowercase_from_properties(
-        const std::map<std::string, std::string>& properties) {
-    if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != 
properties.end()) {
-        return properties.at(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
-    } else {
-        return "";
-    }
-}
-
 } // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h 
b/be/src/olap/inverted_index_parser.h
index c786773be97..1a16d9ad97d 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -49,6 +49,9 @@ struct InvertedIndexCtx {
 
 using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>;
 
+const std::string INVERTED_INDEX_PARSER_TRUE = "true";
+const std::string INVERTED_INDEX_PARSER_FALSE = "false";
+
 const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode";
 const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained";
 const std::string INVERTED_INDEX_PARSER_COARSE_GRANULARITY = "coarse_grained";
@@ -62,8 +65,6 @@ const std::string INVERTED_INDEX_PARSER_ENGLISH = "english";
 const std::string INVERTED_INDEX_PARSER_CHINESE = "chinese";
 
 const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY = "support_phrase";
-const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES = "true";
-const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO = "false";
 
 const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type";
 const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = 
"char_filter_pattern";
@@ -91,6 +92,18 @@ CharFilterMap get_parser_char_filter_map_from_properties(
 std::string get_parser_ignore_above_value_from_properties(
         const std::map<std::string, std::string>& properties);
 
+template <bool ReturnTrue = false>
 std::string get_parser_lowercase_from_properties(
-        const std::map<std::string, std::string>& properties);
+        const std::map<std::string, std::string>& properties) {
+    if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != 
properties.end()) {
+        return properties.at(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
+    } else {
+        if constexpr (ReturnTrue) {
+            return INVERTED_INDEX_PARSER_TRUE;
+        } else {
+            return "";
+        }
+    }
+}
+
 } // namespace doris
diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp
index 8ffd6d99936..c4095299ab8 100644
--- a/be/src/olap/match_predicate.cpp
+++ b/be/src/olap/match_predicate.cpp
@@ -126,7 +126,7 @@ bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* 
iterator) const {
     if ((_match_type == MatchType::MATCH_PHRASE || _match_type == 
MatchType::MATCH_PHRASE_PREFIX) &&
         iterator->get_inverted_index_reader_type() == 
InvertedIndexReaderType::FULLTEXT &&
         
get_parser_phrase_support_string_from_properties(iterator->get_index_properties())
 ==
-                INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) {
+                INVERTED_INDEX_PARSER_FALSE) {
         return true;
     }
     return false;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp
index b04edd6eb83..ff076e84397 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp
@@ -29,7 +29,7 @@ Status compact_column(int32_t index_id, int src_segment_num, 
int dest_segment_nu
                       std::vector<std::string> dest_index_files, const 
io::FileSystemSPtr& fs,
                       std::string index_writer_path, std::string tablet_path,
                       std::vector<std::vector<std::pair<uint32_t, uint32_t>>> 
trans_vec,
-                      std::vector<uint32_t> dest_segment_num_rows) {
+                      std::vector<uint32_t> dest_segment_num_rows, bool 
maybe_skip) {
     DBUG_EXECUTE_IF("index_compaction_compact_column_throw_error", {
         if (index_id % 2 == 0) {
             _CLTHROWA(CL_ERR_IO, "debug point: test throw error in index 
compaction");
@@ -68,8 +68,8 @@ Status compact_column(int32_t index_id, int src_segment_num, 
int dest_segment_nu
     }
 
     DCHECK_EQ(src_index_dirs.size(), trans_vec.size());
-    index_writer->indexCompaction(src_index_dirs, dest_index_dirs, trans_vec,
-                                  dest_segment_num_rows);
+    index_writer->indexCompaction(src_index_dirs, dest_index_dirs, trans_vec, 
dest_segment_num_rows,
+                                  maybe_skip);
 
     index_writer->close();
     _CLDELETE(index_writer);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.h 
b/be/src/olap/rowset/segment_v2/inverted_index_compaction.h
index f615192b199..7d6ad1c2d48 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.h
@@ -30,6 +30,6 @@ Status compact_column(int32_t index_id, int src_segment_num, 
int dest_segment_nu
                       std::vector<std::string> dest_index_files, const 
io::FileSystemSPtr& fs,
                       std::string index_writer_path, std::string tablet_path,
                       std::vector<std::vector<std::pair<uint32_t, uint32_t>>> 
trans_vec,
-                      std::vector<uint32_t> dest_segment_num_rows);
+                      std::vector<uint32_t> dest_segment_num_rows, bool 
maybe_skip);
 } // namespace segment_v2
 } // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index c99de50d7ac..35e6dc2f24a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -205,7 +205,7 @@ public:
                 // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
                 _analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
             }
-            auto lowercase = 
get_parser_lowercase_from_properties(_index_meta->properties());
+            auto lowercase = 
get_parser_lowercase_from_properties<true>(_index_meta->properties());
             if (lowercase == "true") {
                 _analyzer->set_lowercase(true);
             } else if (lowercase == "false") {
@@ -234,7 +234,7 @@ public:
         }
         _field = new lucene::document::Field(_field_name.c_str(), 
field_config);
         if 
(get_parser_phrase_support_string_from_properties(_index_meta->properties()) ==
-            INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES) {
+            INVERTED_INDEX_PARSER_TRUE) {
             _field->setOmitTermFreqAndPositions(false);
         } else {
             _field->setOmitTermFreqAndPositions(true);
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp 
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index ed4a55986f8..984d118ddf5 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1232,9 +1232,10 @@ Status SegmentIterator::_init_inverted_index_iterators() 
{
     for (auto cid : _schema->column_ids()) {
         int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
         if (_inverted_index_iterators[cid] == nullptr) {
+            // Use the segment's own index_meta for compatibility, because 
future indexes will default to lowercase.
             RETURN_IF_ERROR(_segment->new_inverted_index_iterator(
                     _opts.tablet_schema->column(cid),
-                    _opts.tablet_schema->get_inverted_index(unique_id), _opts,
+                    _segment->_tablet_schema->get_inverted_index(unique_id), 
_opts,
                     &_inverted_index_iterators[cid]));
         }
     }
diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp
index 8c810b528b9..e6e3dde7778 100644
--- a/be/src/olap/tablet_schema.cpp
+++ b/be/src/olap/tablet_schema.cpp
@@ -616,6 +616,12 @@ void TabletIndex::to_schema_pb(TabletIndexPB* index) const 
{
     for (const auto& kv : _properties) {
         (*index->mutable_properties())[kv.first] = kv.second;
     }
+
+    // lowercase by default
+    if (!_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) {
+        (*index->mutable_properties())[INVERTED_INDEX_PARSER_LOWERCASE_KEY] =
+                INVERTED_INDEX_PARSER_TRUE;
+    }
 }
 
 void TabletSchema::append_column(TabletColumn column, bool is_dropped_column) {
diff --git a/regression-test/data/inverted_index_p0/test_lowercase.out 
b/regression-test/data/inverted_index_p0/test_lowercase.out
index 03c2f57468f..2ca46501026 100644
--- a/regression-test/data/inverted_index_p0/test_lowercase.out
+++ b/regression-test/data/inverted_index_p0/test_lowercase.out
@@ -31,11 +31,17 @@
 
 -- !sql --
 1      hello 我来到北京清华大学
+2      HELLO 我爱你中国
+3      Hello 人民可以得到更多实惠
 
 -- !sql --
+1      hello 我来到北京清华大学
 2      HELLO 我爱你中国
+3      Hello 人民可以得到更多实惠
 
 -- !sql --
+1      hello 我来到北京清华大学
+2      HELLO 我爱你中国
 3      Hello 人民可以得到更多实惠
 
 -- !sql --


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to