This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new fbcf63e1f5b [cherry-pick] (branch-2.1)fix variant index  (#36577)
fbcf63e1f5b is described below

commit fbcf63e1f5b97b20936c87e250986ddc33b20554
Author: Sun Chenyang <csun5...@gmail.com>
AuthorDate: Thu Jun 20 17:57:26 2024 +0800

    [cherry-pick] (branch-2.1)fix variant index  (#36577)
    
    pick from master #36163
---
 .../olap/rowset/segment_v2/inverted_index_writer.h |   2 +-
 be/src/olap/rowset/segment_v2/segment_writer.cpp   |   4 +-
 .../rowset/segment_v2/vertical_segment_writer.cpp  |   4 +-
 be/src/olap/tablet_schema.cpp                      |  12 ++-
 be/src/olap/tablet_schema.h                        |   5 +-
 be/src/olap/task/index_builder.cpp                 |   2 +-
 be/src/vec/common/schema_util.cpp                  |   2 +-
 .../test_variant_index_format_v1.out               |  10 ++
 .../test_variant_index_format_v1.groovy            | 105 +++++++++++++++++++++
 9 files changed, 135 insertions(+), 11 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.h 
b/be/src/olap/rowset/segment_v2/inverted_index_writer.h
index 06bc960bc33..3b4e5ba2709 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.h
@@ -75,7 +75,7 @@ public:
 
     // check if the column is valid for inverted index, some columns
     // are generated from variant, but not all of them are supported
-    static bool check_column_valid(const TabletColumn& column) {
+    static bool check_support_inverted_index(const TabletColumn& column) {
         // bellow types are not supported in inverted index for extracted 
columns
         static std::set<FieldType> invalid_types = {
                 FieldType::OLAP_FIELD_TYPE_DOUBLE,
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp 
b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index 33f4e863824..7665aec1372 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -218,9 +218,7 @@ Status SegmentWriter::init(const std::vector<uint32_t>& 
col_ids, bool has_key) {
         }
         // indexes for this column
         opts.indexes = 
std::move(_tablet_schema->get_indexes_for_column(column));
-        if (!InvertedIndexColumnWriter::check_column_valid(column)) {
-            // skip inverted index if invalid
-            opts.indexes.clear();
+        if (!InvertedIndexColumnWriter::check_support_inverted_index(column)) {
             opts.need_zone_map = false;
             opts.need_bloom_filter = false;
             opts.need_bitmap_index = false;
diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp 
b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
index 394f5bae184..15b3688585c 100644
--- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
@@ -171,9 +171,7 @@ Status 
VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletCo
     }
     // indexes for this column
     opts.indexes = _tablet_schema->get_indexes_for_column(column);
-    if (!InvertedIndexColumnWriter::check_column_valid(column)) {
-        // skip inverted index if invalid
-        opts.indexes.clear();
+    if (!InvertedIndexColumnWriter::check_support_inverted_index(column)) {
         opts.need_zone_map = false;
         opts.need_bloom_filter = false;
         opts.need_bitmap_index = false;
diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp
index 0418f4c6334..290e5a6bc25 100644
--- a/be/src/olap/tablet_schema.cpp
+++ b/be/src/olap/tablet_schema.cpp
@@ -1275,6 +1275,10 @@ const TabletColumn& TabletSchema::column(const 
std::string& field_name) const {
 std::vector<const TabletIndex*> TabletSchema::get_indexes_for_column(
         const TabletColumn& col) const {
     std::vector<const TabletIndex*> indexes_for_column;
+    // Some columns (Float, Double, JSONB ...) from the variant do not support 
index, but they are listed in TabletIndex.
+    if 
(!segment_v2::InvertedIndexColumnWriter::check_support_inverted_index(col)) {
+        return indexes_for_column;
+    }
     int32_t col_unique_id = col.is_extracted_column() ? col.parent_unique_id() 
: col.unique_id();
     const std::string& suffix_path =
             col.has_path_info() ? 
escape_for_path_name(col.path_info_ptr()->get_path()) : "";
@@ -1346,7 +1350,13 @@ const TabletIndex* 
TabletSchema::get_inverted_index(int32_t col_unique_id,
     return nullptr;
 }
 
-const TabletIndex* TabletSchema::get_inverted_index(const TabletColumn& col) 
const {
+const TabletIndex* TabletSchema::get_inverted_index(const TabletColumn& col,
+                                                    bool check_valid) const {
+    // With check_valid set to true by default
+    // Some columns(Float, Double, JSONB ...) from the variant do not support 
inverted index
+    if (check_valid && 
!segment_v2::InvertedIndexColumnWriter::check_support_inverted_index(col)) {
+        return nullptr;
+    }
     // TODO use more efficient impl
     // Use parent id if unique not assigned, this could happend when accessing 
subcolumns of variants
     int32_t col_unique_id = col.is_extracted_column() ? col.parent_unique_id() 
: col.unique_id();
diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h
index c6fc428195d..b8f26a1f601 100644
--- a/be/src/olap/tablet_schema.h
+++ b/be/src/olap/tablet_schema.h
@@ -353,7 +353,10 @@ public:
     bool has_inverted_index_with_index_id(int64_t index_id, const std::string& 
suffix_path) const;
     const TabletIndex* get_inverted_index_with_index_id(int64_t index_id,
                                                         const std::string& 
suffix_name) const;
-    const TabletIndex* get_inverted_index(const TabletColumn& col) const;
+    // check_valid: check if this column supports inverted index
+    // Some columns (Float, Double, JSONB ...) from the variant do not support 
index, but they are listed in TabletIndex.
+    // If the index of such a column were returned, its index file would not be found.
+    const TabletIndex* get_inverted_index(const TabletColumn& col, bool 
check_valid = true) const;
     const TabletIndex* get_inverted_index(int32_t col_unique_id,
                                           const std::string& suffix_path) 
const;
     bool has_ngram_bf_index(int32_t col_unique_id) const;
diff --git a/be/src/olap/task/index_builder.cpp 
b/be/src/olap/task/index_builder.cpp
index 447d57d520c..fd7d6b5edf2 100644
--- a/be/src/olap/task/index_builder.cpp
+++ b/be/src/olap/task/index_builder.cpp
@@ -330,7 +330,7 @@ Status 
IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta
                     continue;
                 }
                 auto column = output_rowset_schema->column(column_idx);
-                if (!InvertedIndexColumnWriter::check_column_valid(column)) {
+                if 
(!InvertedIndexColumnWriter::check_support_inverted_index(column)) {
                     continue;
                 }
                 
DCHECK(output_rowset_schema->has_inverted_index_with_index_id(index_id, ""));
diff --git a/be/src/vec/common/schema_util.cpp 
b/be/src/vec/common/schema_util.cpp
index 49e901e0846..e8fd23f7569 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -403,7 +403,7 @@ void inherit_root_attributes(TabletSchemaSPtr& schema) {
         if (it == variants_index_meta.end()) {
             continue;
         }
-        auto index_meta = schema->get_inverted_index(col);
+        auto index_meta = schema->get_inverted_index(col, false);
         // add index meta
         TabletIndex index_info = it->second;
         
index_info.set_escaped_escaped_index_suffix_path(col.path_info_ptr()->get_path());
diff --git 
a/regression-test/data/inverted_index_p0/test_variant_index_format_v1.out 
b/regression-test/data/inverted_index_p0/test_variant_index_format_v1.out
new file mode 100644
index 00000000000..e2a83121baf
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_variant_index_format_v1.out
@@ -0,0 +1,10 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql --
+\N
+\N
+\N
+4748
+
+-- !sql --
+4
+
diff --git 
a/regression-test/suites/inverted_index_p0/test_variant_index_format_v1.groovy 
b/regression-test/suites/inverted_index_p0/test_variant_index_format_v1.groovy
new file mode 100644
index 00000000000..0cd5b938696
--- /dev/null
+++ 
b/regression-test/suites/inverted_index_p0/test_variant_index_format_v1.groovy
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_variant_index_format_v1", "p0") {
+
+    def calc_file_crc_on_tablet = { ip, port, tablet ->
+        return curl("GET", 
String.format("http://%s:%s/api/calc_crc?tablet_id=%s", ip, port, tablet))
+    }
+    def set_be_config = { key, value ->
+        String backend_id;
+        def backendId_to_backendIP = [:]
+        def backendId_to_backendHttpPort = [:]
+        getBackendIpHttpPort(backendId_to_backendIP, 
backendId_to_backendHttpPort);
+
+        backend_id = backendId_to_backendIP.keySet()[0]
+        def (code, out, err) = 
update_be_config(backendId_to_backendIP.get(backend_id), 
backendId_to_backendHttpPort.get(backend_id), key, value)
+        logger.info("update config: code=" + code + ", out=" + out + ", err=" 
+ err)
+    }
+
+    def load_json_data = {table_name, file_name ->
+        // load the json data
+        streamLoad {
+            table "${table_name}"
+
+            // set http request header params
+            set 'read_json_by_line', 'true' 
+            set 'format', 'json' 
+            set 'max_filter_ratio', '0.1'
+            file file_name // import json file
+            time 10000 // limit inflight 10s
+
+            // If a check callback is declared, the default check conditions are 
ignored,
+            // so you must check every condition yourself.
+
+            check { result, exception, startTime, endTime ->
+                if (exception != null) {
+                        throw exception
+                }
+                logger.info("Stream load ${file_name} result: 
${result}".toString())
+                def json = parseJson(result)
+                assertEquals("success", json.Status.toLowerCase())
+                // assertEquals(json.NumberTotalRows, json.NumberLoadedRows + 
json.NumberUnselectedRows)
+                assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
+            }
+        }
+    }
+
+    def table_name = "github_events"
+    sql """DROP TABLE IF EXISTS ${table_name}"""
+    sql """
+        CREATE TABLE IF NOT EXISTS ${table_name} (
+            k bigint,
+            v variant,
+            INDEX idx_var(v) USING INVERTED PROPERTIES("parser" = "english") 
COMMENT ''
+        )
+        DUPLICATE KEY(`k`)
+        DISTRIBUTED BY HASH(k) BUCKETS 1
+        properties("replication_num" = "1", "disable_auto_compaction" = 
"true", "inverted_index_storage_format" = "V1");
+    """
+
+    set_be_config.call("memory_limitation_per_thread_for_schema_change_bytes", 
"6294967296")
+    load_json_data.call(table_name, """${getS3Url() + 
'/regression/gharchive.m/2015-01-01-0.json'}""")
+    load_json_data.call(table_name, """${getS3Url() + 
'/regression/gharchive.m/2015-01-01-1.json'}""")
+    load_json_data.call(table_name, """${getS3Url() + 
'/regression/gharchive.m/2015-01-01-2.json'}""")
+    load_json_data.call(table_name, """${getS3Url() + 
'/regression/gharchive.m/2015-01-01-3.json'}""") 
+    load_json_data.call(table_name, """${getS3Url() + 
'/regression/gharchive.m/2022-11-07-16.json'}""")
+    load_json_data.call(table_name, """${getS3Url() + 
'/regression/gharchive.m/2022-11-07-10.json'}""")
+    load_json_data.call(table_name, """${getS3Url() + 
'/regression/gharchive.m/2022-11-07-22.json'}""")
+    load_json_data.call(table_name, """${getS3Url() + 
'/regression/gharchive.m/2022-11-07-23.json'}""")
+    def backendId_to_backendIP = [:]
+    def backendId_to_backendHttpPort = [:]
+    getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort);
+
+    tablets = sql_return_maparray """ show tablets from ${table_name}; """
+    String tablet_id = tablets[0].TabletId
+    String backend_id = tablets[0].BackendId
+    String ip = backendId_to_backendIP.get(backend_id)
+    String port = backendId_to_backendHttpPort.get(backend_id)
+    def (code_0, out_0, err_0) = calc_file_crc_on_tablet(ip, port, tablet_id)
+    logger.info("Run calc_file_crc_on_tablet: code=" + code_0 + ", out=" + 
out_0 + ", err=" + err_0)
+    assertTrue(code_0 == 0)
+    assertTrue(out_0.contains("crc_value"))
+    assertTrue(out_0.contains("used_time_ms"))
+    assertEquals("0", parseJson(out_0.trim()).start_version)
+    assertEquals("9", parseJson(out_0.trim()).end_version)
+    assertEquals("9", parseJson(out_0.trim()).rowset_count)
+
+    qt_sql """select cast(v["payload"]["pull_request"]["additions"] as int)  
from github_events where cast(v["repo"]["name"] as string) = 
'xpressengine/xe-core' order by 1;"""
+    qt_sql """select count() from github_events where  cast(v["repo"]["name"] 
as string) = 'xpressengine/xe-core'"""
+    set_be_config.call("memory_limitation_per_thread_for_schema_change_bytes", 
"2147483648")
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to