This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push:
new 00b4042dab1 [enhance](variant)Add ut writer var (#49624)
00b4042dab1 is described below
commit 00b4042dab1103f96c9546b2fc101423be0a7739
Author: Sun Chenyang <[email protected]>
AuthorDate: Fri Mar 28 16:02:40 2025 +0800
[enhance](variant)Add ut writer var (#49624)
---
.../rowset/segment_v2/hierarchical_data_reader.cpp | 2 +-
.../segment_v2/variant_column_writer_impl.cpp | 17 +-
.../rowset/segment_v2/variant_column_writer_impl.h | 12 +-
be/src/vec/columns/column_object.cpp | 16 +-
be/src/vec/columns/column_object.h | 2 +
be/src/vec/common/schema_util.cpp | 42 +-
be/src/vec/common/schema_util.h | 16 +-
be/src/vec/olap/olap_data_convertor.cpp | 8 +-
be/src/vec/olap/olap_data_convertor.h | 6 +
be/test/common/schema_util_test.cpp | 121 ----
.../compaction/util/index_compaction_utils.cpp | 2 +-
.../variant_column_writer_reader_test.cpp | 663 +++++++++++++++++++++
.../olap/rowset/variant_with_compaction_test.cpp | 0
be/test/testutil/schema_utils.h | 48 ++
be/test/testutil/variant_util.h | 137 +++++
be/test/vec/common/schema_util_rowset_test.cpp | 265 ++++++++
be/test/vec/common/schema_util_test.cpp | 357 +++++++++++
17 files changed, 1532 insertions(+), 182 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
index 2367a38821b..185a6d82422 100644
--- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
@@ -382,7 +382,7 @@ Status HierarchicalDataReader::_process_sparse_column(vectorized::ColumnObject&
// from "" which is empty path and root
if (container_variant.is_null_root()) {
// root was created with nrows with Nothing type, resize it to fit the size of sparse column
- container_variant.get_root()->resize(sparse_data_offsets.size());
+ container_variant.get_subcolumn({})->resize(sparse_data_offsets.size());
// bool added = container_variant.add_sub_column({}, sparse_data_offsets.size());
// if (!added) {
// return Status::InternalError("Failed to add subcolumn for sparse column");
diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
index 34fe6e085ec..ae1f2a8de61 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
@@ -403,9 +403,9 @@ Status VariantColumnWriterImpl::_process_sparse_column(
return status;
}
VLOG_DEBUG << "dump sparse "
- << vectorized::schema_util::dump_column(
- vectorized::ColumnObject::get_sparse_column_type(),
- ptr->get_sparse_column());
+ << vectorized::Block::dump_column(
+ ptr->get_sparse_column(),
+ vectorized::ColumnObject::get_sparse_column_type());
RETURN_IF_ERROR(
_sparse_column_writer->append(column->get_nullmap(),
column->get_data(), num_rows));
++column_id;
@@ -498,10 +498,11 @@ bool VariantColumnWriterImpl::is_finalized() const {
Status VariantColumnWriterImpl::append_data(const uint8_t** ptr, size_t num_rows) {
DCHECK(!is_finalized());
- const auto& src = *reinterpret_cast<const vectorized::ColumnObject*>(*ptr);
+ const auto* column = reinterpret_cast<const vectorized::VariantColumnData*>(*ptr);
+ const auto& src = *reinterpret_cast<const vectorized::ColumnObject*>(column->column_data);
auto* dst_ptr = assert_cast<vectorized::ColumnObject*>(_column.get());
// TODO: if direct write we could avoid copy
- dst_ptr->insert_range_from(src, 0, num_rows);
+ dst_ptr->insert_range_from(src, column->row_pos, num_rows);
return Status::OK();
}
@@ -629,9 +630,11 @@ Status VariantSubcolumnWriter::init() {
}
Status VariantSubcolumnWriter::append_data(const uint8_t** ptr, size_t num_rows) {
- const auto& src = *reinterpret_cast<const vectorized::ColumnObject*>(*ptr);
+ const auto* column = reinterpret_cast<const vectorized::VariantColumnData*>(*ptr);
+ const auto& src = *reinterpret_cast<const vectorized::ColumnObject*>(column->column_data);
auto* dst_ptr = assert_cast<vectorized::ColumnObject*>(_column.get());
- dst_ptr->insert_range_from(src, 0, num_rows);
+ // TODO: if direct write we could avoid copy
+ dst_ptr->insert_range_from(src, column->row_pos, num_rows);
return Status::OK();
}
diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h
index d9974dd6f2e..9f67cf04505 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h
@@ -36,8 +36,13 @@ class ColumnWriter;
class ScalarColumnWriter;
struct VariantStatistics {
- // If reached the size of this, we should stop writing statistics for sparse data
- constexpr static size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10000;
+ // #ifdef BE_TEST
+ // static constexpr size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10;
+ // #else
+ // // If reached the size of this, we should stop writing statistics for sparse data
+ // static constexpr size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10000;
+ // #endif
+ static constexpr size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10000;
std::map<std::string, size_t> subcolumns_non_null_size;
std::map<std::string, size_t> sparse_column_non_null_size;
@@ -96,5 +101,8 @@ private:
// hold the references of subcolumns indexes
std::vector<std::unique_ptr<TabletIndex>> _subcolumns_indexes;
};
+
+void _init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const TabletColumn& column,
+ CompressionTypePB compression_type);
} // namespace segment_v2
} // namespace doris
\ No newline at end of file
diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp
index 896fad0795d..448ff7df4c1 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -640,6 +640,17 @@ bool ColumnObject::Subcolumn::is_finalized() const {
(data.empty() || (data.size() == 1));
}
+void ColumnObject::Subcolumn::resize(size_t n) {
+ if (n == num_rows) {
+ return;
+ }
+ if (n > num_rows) {
+ insert_many_defaults(n - num_rows);
+ } else {
+ pop_back(num_rows - n);
+ }
+}
+
template <typename Func>
MutableColumnPtr ColumnObject::apply_for_columns(Func&& func) const {
if (!is_finalized()) {
@@ -1107,7 +1118,8 @@ void ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std:
row -= num_of_defaults_in_prefix;
for (size_t i = 0; i < data.size(); ++i) {
const auto& part = data[i];
- if (row < part->size()) {
+ size_t current_column_size = part->size();
+ if (row < current_column_size) {
// no need null in sparse column
if (!assert_cast<const ColumnNullable&,
TypeCheckOnRelease::DISABLE>(*part).is_null_at(
row)) {
@@ -1129,7 +1141,7 @@ void ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std:
return;
}
- row -= part->size();
+ row -= current_column_size;
}
throw doris::Exception(ErrorCode::OUT_OF_BOUND,
diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h
index 10078c0ede5..f7c42843f7b 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -204,6 +204,8 @@ public:
bool is_empty_nested(size_t row) const;
+ void resize(size_t n);
+
private:
class LeastCommonType {
public:
diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp
index 2f8a4de72e7..6298daffd8f 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -534,7 +534,8 @@ Status _parse_variant_columns(Block& block, const std::vector<int>& variant_pos,
}
if (scalar_root_column->is_column_string()) {
- variant_column = ColumnObject::create(var.max_subcolumns_count());
+ // now, subcolumns have not been set, so we set it to 0
+ variant_column = ColumnObject::create(0);
parse_json_to_variant(*variant_column.get(),
assert_cast<const
ColumnString&>(*scalar_root_column), config);
} else {
@@ -577,43 +578,6 @@ vectorized::ColumnObject::Subcolumns get_sorted_subcolumns(
return sorted;
}
-// ---------------------------
-
-std::string dump_column(DataTypePtr type, const ColumnPtr& col) {
- Block tmp;
- tmp.insert(ColumnWithTypeAndName {col, type, col->get_name()});
- return tmp.dump_data(0, tmp.rows());
-}
-
-// ---------------------------
-Status extract(ColumnPtr source, const PathInData& path, MutableColumnPtr& dst) {
- auto type_string = std::make_shared<DataTypeString>();
- std::string jsonpath = path.to_jsonpath();
- bool is_nullable = source->is_nullable();
- auto json_type = is_nullable ? make_nullable(std::make_shared<DataTypeJsonb>())
- : std::make_shared<DataTypeJsonb>();
- ColumnsWithTypeAndName arguments {
- {source, json_type, ""},
- {type_string->create_column_const(1, Field(String(jsonpath.data(), jsonpath.size()))),
- type_string, ""}};
- auto function =
- SimpleFunctionFactory::instance().get_function("jsonb_extract", arguments, json_type);
- if (!function) {
- return Status::InternalError("Not found function jsonb_extract");
- }
- Block tmp_block {arguments};
- vectorized::ColumnNumbers argnum;
- argnum.emplace_back(0);
- argnum.emplace_back(1);
- uint32_t result_column = cast_set<uint32_t>(tmp_block.columns());
- tmp_block.insert({nullptr, json_type, ""});
- RETURN_IF_ERROR(function->execute(nullptr, tmp_block, argnum, result_column, source->size()));
- dst = tmp_block.get_by_position(result_column)
- .column->convert_to_full_column_if_const()
- ->assume_mutable();
- return Status::OK();
-}
-
bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema*
old_schema,
int32_t new_col_idx, int32_t old_col_idx) {
const auto& column_new = new_schema->column(new_col_idx);
@@ -645,8 +609,6 @@ TabletColumn create_sparse_column(const TabletColumn&
variant) {
return res;
}
-using PathToNoneNullValues = std::unordered_map<std::string, size_t>;
-
Status collect_path_stats(const RowsetSharedPtr& rs,
std::unordered_map<int32_t, PathToNoneNullValues>&
uid_to_path_stats) {
SegmentCacheHandle segment_cache;
diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h
index 4c1ee876254..8281cdec7b6 100644
--- a/be/src/vec/common/schema_util.h
+++ b/be/src/vec/common/schema_util.h
@@ -54,6 +54,8 @@ struct ColumnWithTypeAndName;
const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__";
namespace doris::vectorized::schema_util {
+using PathToNoneNullValues = std::unordered_map<std::string, size_t>;
+
/// Returns number of dimensions in Array type. 0 if type is not array.
size_t get_number_of_dimensions(const IDataType& type);
@@ -122,17 +124,21 @@ void inherit_column_attributes(const TabletColumn&
source, TabletColumn& target,
vectorized::ColumnObject::Subcolumns get_sorted_subcolumns(
const vectorized::ColumnObject::Subcolumns& subcolumns);
-// Extract json data from source with path
-Status extract(ColumnPtr source, const PathInData& path, MutableColumnPtr&
dst);
-
-std::string dump_column(DataTypePtr type, const ColumnPtr& col);
-
bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema*
old_schema,
int32_t new_col_idx, int32_t old_col_idx);
// create ColumnMap<String, String>
TabletColumn create_sparse_column(const TabletColumn& variant);
+// get the subpaths and sparse paths for the variant column
+void get_subpaths(const TabletColumn& variant,
+ const std::unordered_map<int32_t, PathToNoneNullValues>&
path_stats,
+ std::unordered_map<int32_t, TabletSchema::PathsSetInfo>&
uid_to_paths_set_info);
+
+// collect path stats from the rowset
+Status collect_path_stats(const RowsetSharedPtr& rs,
+ std::unordered_map<int32_t, PathToNoneNullValues>&
uid_to_path_stats);
+
// Build the temporary schema for compaction, this will reduce the memory
usage of compacting variant columns
Status get_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets,
TabletSchemaSPtr& target);
diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp
index c2887c41934..c286127d54e 100644
--- a/be/src/vec/olap/olap_data_convertor.cpp
+++ b/be/src/vec/olap/olap_data_convertor.cpp
@@ -1205,6 +1205,8 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVariant::convert_to_olap()
}
// Do nothing, the column writer will finally do finalize and write subcolumns one by one
// since we are not sure the final column(type and columns) until the end of the last block
+ // need to return the position of the column data
+ _variant_column_data = std::make_unique<VariantColumnData>(_value_ptr, _row_pos);
return Status::OK();
}
@@ -1212,9 +1214,9 @@ const void* OlapBlockDataConvertor::OlapColumnDataConvertorVariant::get_data() c
if (!_value_ptr) {
return _root_data_convertor->get_data();
}
- // return the ptr of original column, see VariantColumnWriterImpl::append_data
- // which will cast to ColumnObject
- return _value_ptr;
+ // return the ptr of VariantColumnData, see VariantColumnWriterImpl::append_data
+ // which will cast to VariantColumnData
+ return _variant_column_data.get();
}
const void* OlapBlockDataConvertor::OlapColumnDataConvertorVariant::get_data_at(
size_t offset) const {
diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h
index c2a8e6ace47..e0f1184021e 100644
--- a/be/src/vec/olap/olap_data_convertor.h
+++ b/be/src/vec/olap/olap_data_convertor.h
@@ -72,6 +72,11 @@ public:
virtual ~IOlapColumnDataAccessor() = default;
};
+struct VariantColumnData {
+ const void* column_data;
+ size_t row_pos;
+};
+
class OlapBlockDataConvertor {
public:
OlapBlockDataConvertor() = default;
@@ -536,6 +541,7 @@ private:
private:
const void* _value_ptr;
std::unique_ptr<OlapColumnDataConvertorVarChar> _root_data_convertor;
+ std::unique_ptr<VariantColumnData> _variant_column_data;
};
private:
diff --git a/be/test/common/schema_util_test.cpp b/be/test/common/schema_util_test.cpp
deleted file mode 100644
index fb8b23c10cb..00000000000
--- a/be/test/common/schema_util_test.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "vec/common/schema_util.h"
-
-#include <gtest/gtest.h>
-
-namespace doris {
-
-class SchemaUtilTest : public testing::Test {};
-
-void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index,
int64_t index_id,
- const std::string& index_name, int32_t col_unique_id,
- const std::string& column_type, const std::string&
column_name,
- const IndexType& index_type) {
- column_pb->set_unique_id(col_unique_id);
- column_pb->set_name(column_name);
- column_pb->set_type(column_type);
- column_pb->set_is_nullable(true);
- column_pb->set_is_bf_column(true);
- tablet_index->set_index_id(index_id);
- tablet_index->set_index_name(index_name);
- tablet_index->set_index_type(index_type);
- tablet_index->add_col_unique_id(col_unique_id);
-}
-
-void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type,
int32_t col_unique_id,
- std::string_view path, std::vector<TabletColumn>*
subcolumns) {
- TabletColumn subcol;
- subcol.set_type(type);
- subcol.set_is_nullable(true);
- subcol.set_unique_id(-1);
- subcol.set_parent_unique_id(col_unique_id);
- vectorized::PathInData col_path(path);
- subcol.set_path_info(col_path);
- subcol.set_name(col_path.get_path());
-
- if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) {
- TabletColumn array_item_col;
- // double not support inverted index
- array_item_col.set_type(FieldType::OLAP_FIELD_TYPE_DOUBLE);
- array_item_col.set_is_nullable(true);
- array_item_col.set_unique_id(-1);
- array_item_col.set_parent_unique_id(col_unique_id);
-
- subcol.add_sub_column(array_item_col);
- }
-
- schema->append_column(subcol);
- subcolumns->emplace_back(std::move(subcol));
-}
-
-TEST_F(SchemaUtilTest, inherit_column_attributes) {
- TabletSchemaPB schema_pb;
- schema_pb.set_keys_type(KeysType::DUP_KEYS);
- schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
-
- construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000,
"key_index", 0, "INT",
- "key", IndexType::INVERTED);
- construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001,
"v1_index", 1, "VARIANT",
- "v1", IndexType::INVERTED);
- construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003,
"v3_index", 3, "VARIANT",
- "v3", IndexType::INVERTED);
-
- TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>();
- tablet_schema->init_from_pb(schema_pb);
- std::vector<TabletColumn> subcolumns;
-
- construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_STRING, 1,
"v1.b", &subcolumns);
- construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_INT, 1,
"v1.c", &subcolumns);
-
- construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_ARRAY, 3,
"v3.d", &subcolumns);
- construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_FLOAT, 3,
"v3.a", &subcolumns);
-
- vectorized::schema_util::inherit_column_attributes(tablet_schema);
- for (const auto& col : subcolumns) {
- switch (col._parent_col_unique_id) {
- case 1:
- EXPECT_TRUE(tablet_schema->inverted_index(col) != nullptr);
- break;
- case 3:
- EXPECT_TRUE(tablet_schema->inverted_index(col) == nullptr);
- break;
- default:
- EXPECT_TRUE(false);
- }
- }
- EXPECT_EQ(tablet_schema->inverted_indexes().size(), 7);
-
- for (const auto& col : tablet_schema->_cols) {
- if (!col->is_extracted_column()) {
- continue;
- }
- switch (col->_parent_col_unique_id) {
- case 1:
- EXPECT_TRUE(col->is_bf_column());
- break;
- case 3:
- EXPECT_TRUE(!col->is_bf_column());
- break;
- default:
- EXPECT_TRUE(false);
- }
- }
-}
-
-} // namespace doris
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp
index 02353fc5441..4f4a601e63c 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp
@@ -416,7 +416,7 @@ class IndexCompactionUtils {
// only base compaction can handle delete predicate
BaseCompaction compaction(*engine_ref, tablet);
compaction._input_rowsets = std::move(rowsets);
- compaction.build_basic_info();
+ RETURN_IF_ERROR(compaction.build_basic_info());
std::vector<RowsetReaderSharedPtr> input_rs_readers;
create_input_rowsets_readers(compaction, input_rs_readers);
diff --git a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
new file mode 100644
index 00000000000..df8428ac877
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
@@ -0,0 +1,663 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gtest/gtest.h"
+#include "olap/rowset/segment_v2/column_reader.h"
+#include "olap/rowset/segment_v2/hierarchical_data_reader.h"
+#include "olap/rowset/segment_v2/variant_column_writer_impl.h"
+#include "olap/storage_engine.h"
+#include "testutil/schema_utils.h"
+#include "testutil/variant_util.h"
+
+using namespace doris::vectorized;
+
+namespace doris {
+
+constexpr static uint32_t MAX_PATH_LEN = 1024;
+constexpr static std::string_view dest_dir =
"/ut_dir/variant_column_writer_test";
+constexpr static std::string_view tmp_dir = "./ut_dir/tmp";
+
+class VariantColumnWriterReaderTest : public testing::Test {
+public:
+ void SetUp() override {
+ // absolute dir
+ char buffer[MAX_PATH_LEN];
+ EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr);
+ _current_dir = std::string(buffer);
+ _absolute_dir = _current_dir + std::string(dest_dir);
+ EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
+ EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok());
+
+ // tmp dir
+ EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+ EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok());
+ std::vector<StorePath> paths;
+ paths.emplace_back(std::string(tmp_dir), 1024000000);
+ auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths);
+ Status st = tmp_file_dirs->init();
+ EXPECT_TRUE(st.ok()) << st.to_json();
+ ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs));
+
+ // storage engine
+ doris::EngineOptions options;
+ auto engine = std::make_unique<StorageEngine>(options);
+ _engine_ref = engine.get();
+ _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir);
+ static_cast<void>(_data_dir->update_capacity());
+ ExecEnv::GetInstance()->set_storage_engine(std::move(engine));
+ }
+
+ void TearDown() override {
+ EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
+ EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+ _engine_ref = nullptr;
+ ExecEnv::GetInstance()->set_storage_engine(nullptr);
+ }
+
+ VariantColumnWriterReaderTest() = default;
+ ~VariantColumnWriterReaderTest() override = default;
+
+private:
+ TabletSchemaSPtr _tablet_schema = nullptr;
+ StorageEngine* _engine_ref = nullptr;
+ std::unique_ptr<DataDir> _data_dir = nullptr;
+ TabletSharedPtr _tablet = nullptr;
+ std::string _absolute_dir;
+ std::string _current_dir;
+};
+
+void check_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) {
+ EXPECT_TRUE(column_meta.has_column_path_info());
+ auto path = std::make_shared<vectorized::PathInData>();
+ path->from_protobuf(column_meta.column_path_info());
+ EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1);
+ EXPECT_EQ(column_meta.none_null_size(),
path_with_size[path->copy_pop_front().get_path()]);
+}
+
+void check_sparse_column_meta(const ColumnMetaPB& column_meta, auto&
path_with_size) {
+ EXPECT_TRUE(column_meta.has_column_path_info());
+ auto path = std::make_shared<vectorized::PathInData>();
+ path->from_protobuf(column_meta.column_path_info());
+ EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1);
+ for (const auto& [path, size] :
+ column_meta.variant_statistics().sparse_column_non_null_size()) {
+ EXPECT_EQ(size, path_with_size[path]);
+ }
+ EXPECT_EQ(path->copy_pop_front().get_path(), "__DORIS_VARIANT_SPARSE__");
+}
+
+TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) {
+ // 1. create tablet_schema
+ TabletSchemaPB schema_pb;
+ schema_pb.set_keys_type(KeysType::DUP_KEYS);
+ SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1");
+ _tablet_schema = std::make_shared<TabletSchema>();
+ _tablet_schema->init_from_pb(schema_pb);
+
+ // 2. create tablet
+ TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema));
+ tablet_meta->_tablet_id = 10000;
+ _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta,
_data_dir.get());
+
+ EXPECT_TRUE(_tablet->init().ok());
+ EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+ EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+
+ // 3. create file_writer
+ io::FileWriterPtr file_writer;
+ auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0);
+ auto st = io::global_local_filesystem()->create_file(file_path,
&file_writer);
+ EXPECT_TRUE(st.ok()) << st.msg();
+
+ // 4. create column_writer
+ SegmentFooterPB footer;
+ ColumnWriterOptions opts;
+ opts.meta = footer.add_columns();
+ opts.compression_type = CompressionTypePB::LZ4;
+ opts.file_writer = file_writer.get();
+ opts.footer = &footer;
+ RowsetWriterContext rowset_ctx;
+ rowset_ctx.write_type = DataWriteType::TYPE_DIRECT;
+ opts.rowset_ctx = &rowset_ctx;
+ opts.rowset_ctx->tablet_schema = _tablet_schema;
+ TabletColumn column = _tablet_schema->column(0);
+ _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4);
+
+ std::unique_ptr<ColumnWriter> writer;
+ EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(),
&writer).ok());
+ EXPECT_TRUE(writer->init().ok());
+ EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr);
+
+ // 5. write data
+ auto olap_data_convertor =
std::make_unique<vectorized::OlapBlockDataConvertor>();
+ auto block = _tablet_schema->create_block();
+ auto column_object =
(*std::move(block.get_by_position(0).column)).mutate();
+ std::unordered_map<int, std::string> inserted_jsonstr;
+ auto path_with_size =
+ VariantUtil::fill_object_column_with_test_data(column_object,
1000, &inserted_jsonstr);
+ olap_data_convertor->add_column_data_convertor(column);
+ olap_data_convertor->set_source_content(&block, 0, 1000);
+ auto [result, accessor] = olap_data_convertor->convert_column_data(0);
+ EXPECT_TRUE(result.ok());
+ EXPECT_TRUE(accessor != nullptr);
+ EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(),
1000).ok());
+ st = writer->finish();
+ EXPECT_TRUE(st.ok()) << st.msg();
+ st = writer->write_data();
+ EXPECT_TRUE(st.ok()) << st.msg();
+ st = writer->write_ordinal_index();
+ EXPECT_TRUE(st.ok()) << st.msg();
+ st = writer->write_zone_map();
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(file_writer->close().ok());
+ footer.set_num_rows(1000);
+
+ // 6. check footer
+ EXPECT_EQ(footer.columns_size(), 5);
+ auto column_meta = footer.columns(0);
+ EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT);
+
+ for (int i = 1; i < footer.columns_size() - 1; ++i) {
+ auto column_meta = footer.columns(i);
+ check_column_meta(column_meta, path_with_size);
+ }
+ check_sparse_column_meta(footer.columns(footer.columns_size() - 1),
path_with_size);
+
+ // 7. check variant reader
+ io::FileReaderSPtr file_reader;
+ st = io::global_local_filesystem()->open_file(file_path, &file_reader);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ ColumnReaderOptions read_opts;
+ std::unique_ptr<ColumnReader> column_reader;
+ st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader,
&column_reader);
+ EXPECT_TRUE(st.ok()) << st.msg();
+
+ auto variant_column_reader =
assert_cast<VariantColumnReader*>(column_reader.get());
+ EXPECT_TRUE(variant_column_reader != nullptr);
+
+ auto subcolumn_reader =
variant_column_reader->get_reader_by_path(PathInData("key0"));
+ EXPECT_TRUE(subcolumn_reader != nullptr);
+ subcolumn_reader =
variant_column_reader->get_reader_by_path(PathInData("key1"));
+ EXPECT_TRUE(subcolumn_reader != nullptr);
+ subcolumn_reader =
variant_column_reader->get_reader_by_path(PathInData("key2"));
+ EXPECT_TRUE(subcolumn_reader != nullptr);
+ EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key3")));
+ EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key4")));
+ EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key5")));
+ EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key6")));
+ EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key7")));
+ EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key8")));
+ EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key9")));
+ auto size = variant_column_reader->get_metadata_size();
+ EXPECT_GT(size, 0);
+
+ // 8. check statistics
+ auto statistics = variant_column_reader->get_stats();
+ for (const auto& [path, size] : statistics->subcolumns_non_null_size) {
+ EXPECT_EQ(path_with_size[path], size);
+ }
+ for (const auto& [path, size] : statistics->sparse_column_non_null_size) {
+ EXPECT_EQ(path_with_size[path], size);
+ }
+
+ // 9. check hier reader
+ ColumnIterator* it;
+ TabletColumn parent_column = _tablet_schema->column(0);
+ StorageReadOptions storage_read_opts;
+ storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY;
+ st = variant_column_reader->new_iterator(&it, parent_column,
&storage_read_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr);
+ ColumnIteratorOptions column_iter_opts;
+ OlapReaderStatistics stats;
+ column_iter_opts.stats = &stats;
+ column_iter_opts.file_reader = file_reader.get();
+ st = it->init(column_iter_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+
+ MutableColumnPtr new_column_object = ColumnObject::create(3);
+ size_t nrows = 1000;
+ st = it->seek_to_ordinal(0);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ st = it->next_batch(&nrows, new_column_object);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(stats.bytes_read > 0);
+
+ for (int i = 0; i < 1000; ++i) {
+ std::string value;
+ st = assert_cast<ColumnObject*>(new_column_object.get())
+ ->serialize_one_row_to_string(i, &value);
+
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_EQ(value, inserted_jsonstr[i]);
+ }
+
+ std::vector<rowid_t> row_ids;
+ for (int i = 0; i < 1000; ++i) {
+ if (i % 7 == 0) {
+ row_ids.push_back(i);
+ }
+ }
+ new_column_object = ColumnObject::create(3);
+ st = it->read_by_rowids(row_ids.data(), row_ids.size(), new_column_object);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ for (int i = 0; i < row_ids.size(); ++i) {
+ std::string value;
+ st = assert_cast<ColumnObject*>(new_column_object.get())
+ ->serialize_one_row_to_string(i, &value);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_EQ(value, inserted_jsonstr[row_ids[i]]);
+ }
+
+ auto read_to_column_object = [&]() {
+ new_column_object = ColumnObject::create(3);
+ nrows = 1000;
+ st = it->seek_to_ordinal(0);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ st = it->next_batch(&nrows, new_column_object);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(stats.bytes_read > 0);
+ EXPECT_EQ(nrows, 1000);
+ };
+
+ // 10. check sparse extract reader
+ for (int i = 3; i < 10; ++i) {
+ std::string key = ".key" + std::to_string(i);
+ TabletColumn subcolumn_in_sparse;
+ subcolumn_in_sparse.set_name(parent_column.name_lower_case() + key);
+ subcolumn_in_sparse.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+ subcolumn_in_sparse.set_parent_unique_id(parent_column.unique_id());
+ subcolumn_in_sparse.set_path_info(PathInData(parent_column.name_lower_case() + key));
+ subcolumn_in_sparse.set_variant_max_subcolumns_count(
+ parent_column.variant_max_subcolumns_count());
+ subcolumn_in_sparse.set_is_nullable(true);
+
+ st = variant_column_reader->new_iterator(&it, subcolumn_in_sparse,
&storage_read_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr);
+ st = it->init(column_iter_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+
+ read_to_column_object();
+
+ for (int row = 0; row < 1000; ++row) {
+ std::string value;
+ st = assert_cast<ColumnObject*>(new_column_object.get())
+ ->serialize_one_row_to_string(row, &value);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ if (inserted_jsonstr[row].find(key) != std::string::npos) {
+ if (i % 2 == 0) {
+ EXPECT_EQ(value, "88");
+ } else {
+ EXPECT_EQ(value, "str99");
+ }
+ }
+ }
+ }
+
+ // 11. check leaf reader
+ auto check_leaf_reader = [&]() {
+ for (int i = 0; i < 3; ++i) {
+ std::string key = ".key" + std::to_string(i);
+ TabletColumn subcolumn;
+ subcolumn.set_name(parent_column.name_lower_case() + key);
+ subcolumn.set_type((FieldType)(int)footer.columns(i + 1).type());
+ subcolumn.set_parent_unique_id(parent_column.unique_id());
+ subcolumn.set_path_info(PathInData(parent_column.name_lower_case()
+ key));
+ subcolumn.set_variant_max_subcolumns_count(
+ parent_column.variant_max_subcolumns_count());
+ subcolumn.set_is_nullable(true);
+
+ st = variant_column_reader->new_iterator(&it, subcolumn,
&storage_read_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(assert_cast<FileColumnIterator*>(it) != nullptr);
+ st = it->init(column_iter_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+
+ auto column_type =
DataTypeFactory::instance().create_data_type(subcolumn, false);
+ auto read_column = column_type->create_column();
+ nrows = 1000;
+ st = it->seek_to_ordinal(0);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ st = it->next_batch(&nrows, read_column);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(stats.bytes_read > 0);
+
+ for (int row = 0; row < 1000; ++row) {
+ const std::string& value =
column_type->to_string(*read_column, row);
+ if (inserted_jsonstr[row].find(key) != std::string::npos) {
+ if (i % 2 == 0) {
+ EXPECT_EQ(value, "88");
+ } else {
+ EXPECT_EQ(value, "str99");
+ }
+ }
+ }
+ }
+ };
+ check_leaf_reader();
+
+ // 12. check empty
+ TabletColumn subcolumn;
+ subcolumn.set_name(parent_column.name_lower_case() + ".key10");
+ subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+ subcolumn.set_parent_unique_id(parent_column.unique_id());
+ subcolumn.set_path_info(PathInData(parent_column.name_lower_case() +
".key10"));
+ subcolumn.set_is_nullable(true);
+ st = variant_column_reader->new_iterator(&it, subcolumn,
&storage_read_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr);
+
+ // 13. check statistics size == limit
+ auto& variant_stats = variant_column_reader->_statistics;
+ EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() <
+ VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE);
+ auto limit = VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE -
+ variant_stats->sparse_column_non_null_size.size();
+ for (int i = 0; i < limit; ++i) {
+ std::string key = parent_column.name_lower_case() + ".key10" +
std::to_string(i);
+ variant_stats->sparse_column_non_null_size[key] = 10000;
+ }
+ EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() ==
+ VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE);
+
+ st = variant_column_reader->new_iterator(&it, subcolumn,
&storage_read_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr);
+ st = it->init(column_iter_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+
+ auto check_empty_column = [&]() {
+ for (int row = 0; row < 1000; ++row) {
+ std::string value;
+ st = assert_cast<ColumnObject*>(new_column_object.get())
+ ->serialize_one_row_to_string(row, &value);
+
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_EQ(value, "{}");
+ }
+ };
+
+ read_to_column_object();
+ check_empty_column();
+
+ // construct tablet schema for compaction
+ storage_read_opts.io_ctx.reader_type = ReaderType::READER_BASE_COMPACTION;
+ storage_read_opts.tablet_schema = _tablet_schema;
+ std::unordered_map<int32_t, TabletSchema::PathsSetInfo>
uid_to_paths_set_info;
+ TabletSchema::PathsSetInfo paths_set_info;
+ paths_set_info.sub_path_set.insert("key0");
+ paths_set_info.sub_path_set.insert("key3");
+ paths_set_info.sub_path_set.insert("key4");
+ paths_set_info.sparse_path_set.insert("key1");
+ paths_set_info.sparse_path_set.insert("key2");
+ paths_set_info.sparse_path_set.insert("key5");
+ paths_set_info.sparse_path_set.insert("key6");
+ paths_set_info.sparse_path_set.insert("key7");
+ paths_set_info.sparse_path_set.insert("key8");
+ paths_set_info.sparse_path_set.insert("key9");
+ uid_to_paths_set_info[parent_column.unique_id()] = paths_set_info;
+ _tablet_schema->set_path_set_info(uid_to_paths_set_info);
+
+ // 14. check compaction subcolumn reader
+ check_leaf_reader();
+
+ // 15. check compaction root reader
+ st = variant_column_reader->new_iterator(&it, parent_column,
&storage_read_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(assert_cast<VariantRootColumnIterator*>(it) != nullptr);
+ st = it->init(column_iter_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+
+ // 16. check compaction sparse column
+ TabletColumn sparse_column =
schema_util::create_sparse_column(parent_column);
+ st = variant_column_reader->new_iterator(&it, sparse_column,
&storage_read_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(assert_cast<SparseColumnMergeReader*>(it) != nullptr);
+ st = it->init(column_iter_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ auto column_type =
DataTypeFactory::instance().create_data_type(sparse_column, false);
+ auto read_column = column_type->create_column();
+ nrows = 1000;
+ st = it->seek_to_ordinal(0);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ st = it->next_batch(&nrows, read_column);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(stats.bytes_read > 0);
+
+ for (int row = 0; row < 1000; ++row) {
+ const std::string& value = column_type->to_string(*read_column, row);
+ EXPECT_TRUE(value.find("key0") == std::string::npos)
+ << "row: " << row << ", value: " << value;
+ EXPECT_TRUE(value.find("key3") == std::string::npos)
+ << "row: " << row << ", value: " << value;
+ EXPECT_TRUE(value.find("key4") == std::string::npos)
+ << "row: " << row << ", value: " << value;
+ }
+
+ // 17. check limit = 10000
+ subcolumn.set_name(parent_column.name_lower_case() + ".key10");
+ subcolumn.set_path_info(PathInData(parent_column.name_lower_case() +
".key10"));
+ st = variant_column_reader->new_iterator(&it, subcolumn,
&storage_read_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr);
+
+ for (int i = 0; i < limit; ++i) {
+ std::string key = parent_column.name_lower_case() + ".key10" +
std::to_string(i);
+ variant_stats->sparse_column_non_null_size.erase(key);
+ }
+
+ // 18. check compaction sparse extract column
+ subcolumn.set_name(parent_column.name_lower_case() + ".key3");
+ subcolumn.set_path_info(PathInData(parent_column.name_lower_case() +
".key3"));
+ st = variant_column_reader->new_iterator(&it, subcolumn,
&storage_read_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr);
+
+ // 19. check compaction default column
+ subcolumn.set_name(parent_column.name_lower_case() + ".key10");
+ subcolumn.set_path_info(PathInData(parent_column.name_lower_case() +
".key10"));
+ st = variant_column_reader->new_iterator(&it, subcolumn,
&storage_read_opts);
+ EXPECT_TRUE(st.ok()) << st.msg();
+ EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr);
+
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+}
+
+// Writes 1000 rows of nested JSON through a VariantColumnWriter (max 10 subcolumns), then checks
+// the segment footer, the reader statistics, the root reader and the per-key readers.
+TEST_F(VariantColumnWriterReaderTest, test_write_data_advanced) {
+    // 1. create tablet_schema
+    TabletSchemaPB schema_pb;
+    schema_pb.set_keys_type(KeysType::DUP_KEYS);
+    SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1", 10);
+    _tablet_schema = std::make_shared<TabletSchema>();
+    _tablet_schema->init_from_pb(schema_pb);
+
+    // 2. create tablet
+    TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema));
+    tablet_meta->_tablet_id = 10000;
+    _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get());
+    EXPECT_TRUE(_tablet->init().ok());
+    // recreate the tablet directory so the segment file is written into an empty directory
+    EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+    EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+
+    // 3. create file_writer
+    io::FileWriterPtr file_writer;
+    auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0);
+    auto st = io::global_local_filesystem()->create_file(file_path, &file_writer);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    // 4. create column_writer
+    SegmentFooterPB footer;
+    ColumnWriterOptions opts;
+    opts.meta = footer.add_columns();
+    opts.compression_type = CompressionTypePB::LZ4;
+    opts.file_writer = file_writer.get();
+    opts.footer = &footer;
+    RowsetWriterContext rowset_ctx;
+    rowset_ctx.write_type = DataWriteType::TYPE_DIRECT;
+    opts.rowset_ctx = &rowset_ctx;
+    opts.rowset_ctx->tablet_schema = _tablet_schema;
+    TabletColumn column = _tablet_schema->column(0);
+    _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4);
+
+    std::unique_ptr<ColumnWriter> writer;
+    EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok());
+    EXPECT_TRUE(writer->init().ok());
+    EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr);
+
+    // 5. write data
+    auto olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>();
+    auto block = _tablet_schema->create_block();
+    auto column_object = (*std::move(block.get_by_position(0).column)).mutate();
+    std::unordered_map<int, std::string> inserted_jsonstr;
+    auto path_with_size = VariantUtil::fill_object_column_with_nested_test_data(
+            column_object, 1000, &inserted_jsonstr);
+    olap_data_convertor->add_column_data_convertor(column);
+    olap_data_convertor->set_source_content(&block, 0, 1000);
+    auto [result, accessor] = olap_data_convertor->convert_column_data(0);
+    EXPECT_TRUE(result.ok());
+    EXPECT_TRUE(accessor != nullptr);
+    EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 1000).ok());
+    st = writer->finish();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_data();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_ordinal_index();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = writer->write_zone_map();
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(file_writer->close().ok());
+    footer.set_num_rows(1000);
+
+    // 6. check footer: column 0 is the variant root, the last column is checked as the sparse
+    //    column and everything in between as a materialized subcolumn
+    EXPECT_EQ(footer.columns_size(), 12);
+    auto column_meta = footer.columns(0);
+    EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT);
+
+    for (int i = 1; i < footer.columns_size() - 1; ++i) {
+        // distinct name: the original re-declared `column_meta` here and shadowed the root meta
+        auto subcolumn_meta = footer.columns(i);
+        check_column_meta(subcolumn_meta, path_with_size);
+    }
+    check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size);
+
+    // 7. check variant reader
+    io::FileReaderSPtr file_reader;
+    st = io::global_local_filesystem()->open_file(file_path, &file_reader);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    ColumnReaderOptions read_opts;
+    std::unique_ptr<ColumnReader> column_reader;
+    st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, &column_reader);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    auto variant_column_reader = assert_cast<VariantColumnReader*>(column_reader.get());
+    EXPECT_TRUE(variant_column_reader != nullptr);
+
+    // 8. check statistics
+    auto statistics = variant_column_reader->get_stats();
+    for (const auto& [path, size] : statistics->subcolumns_non_null_size) {
+        std::cout << "path: " << path << ", size: " << size << std::endl;
+        EXPECT_EQ(path_with_size[path], size);
+    }
+    for (const auto& [path, size] : statistics->sparse_column_non_null_size) {
+        std::cout << "sparse path: " << path << ", size: " << size << std::endl;
+        EXPECT_EQ(path_with_size[path], size);
+    }
+
+    // 9. check root
+    TabletColumn parent_column = _tablet_schema->column(0);
+    StorageReadOptions storage_read_opts;
+    storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY;
+    ColumnIteratorOptions column_iter_opts;
+    OlapReaderStatistics stats;
+    column_iter_opts.stats = &stats;
+    column_iter_opts.file_reader = file_reader.get();
+
+    // new_iterator() returns the iterator through a raw out-pointer and the test is its only
+    // holder. Keep it in a unique_ptr (declared after `stats` and the readers, so it is destroyed
+    // before them) instead of overwriting a raw pointer and leaking every iterator.
+    std::unique_ptr<ColumnIterator> it;
+    {
+        ColumnIterator* raw_it = nullptr;
+        st = variant_column_reader->new_iterator(&raw_it, parent_column, &storage_read_opts);
+        it.reset(raw_it);
+    }
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it.get()) != nullptr);
+    st = it->init(column_iter_opts);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    MutableColumnPtr new_column_object = ColumnObject::create(3);
+    size_t nrows = 1000;
+    st = it->seek_to_ordinal(0);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    st = it->next_batch(&nrows, new_column_object);
+    EXPECT_TRUE(st.ok()) << st.msg();
+    EXPECT_TRUE(stats.bytes_read > 0);
+
+    // every row read back through the root reader must serialize to the inserted JSON text
+    for (int i = 0; i < 1000; ++i) {
+        std::string value;
+        st = assert_cast<ColumnObject*>(new_column_object.get())
+                     ->serialize_one_row_to_string(i, &value);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        EXPECT_EQ(value, inserted_jsonstr[i]);
+    }
+
+    // Re-reads all 1000 rows from the current iterator into a fresh ColumnObject.
+    auto read_to_column_object = [&]() {
+        new_column_object = ColumnObject::create(10);
+        nrows = 1000;
+        st = it->seek_to_ordinal(0);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        st = it->next_batch(&nrows, new_column_object);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        EXPECT_TRUE(stats.bytes_read > 0);
+        EXPECT_EQ(nrows, 1000);
+    };
+
+    // Reads sub-path ".key<key_num>" and compares the number of rows holding the plain value
+    // and the number holding the nested object against the counts recorded at generation time.
+    auto check_key_stats = [&](const std::string& key_num) {
+        std::string key = ".key" + key_num;
+        TabletColumn subcolumn_in_nested;
+        subcolumn_in_nested.set_name(parent_column.name_lower_case() + key);
+        subcolumn_in_nested.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+        subcolumn_in_nested.set_parent_unique_id(parent_column.unique_id());
+        subcolumn_in_nested.set_path_info(PathInData(parent_column.name_lower_case() + key));
+        subcolumn_in_nested.set_variant_max_subcolumns_count(
+                parent_column.variant_max_subcolumns_count());
+        subcolumn_in_nested.set_is_nullable(true);
+
+        ColumnIterator* raw_it = nullptr;
+        st = variant_column_reader->new_iterator(&raw_it, subcolumn_in_nested,
+                                                 &storage_read_opts);
+        // taking ownership here also releases the iterator created for the previous key
+        it.reset(raw_it);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it.get()) != nullptr);
+        st = it->init(column_iter_opts);
+        EXPECT_TRUE(st.ok()) << st.msg();
+        read_to_column_object();
+
+        size_t key_count = 0;
+        size_t key_nested_count = 0;
+        for (int row = 0; row < 1000; ++row) {
+            std::string value;
+            st = assert_cast<ColumnObject*>(new_column_object.get())
+                         ->serialize_one_row_to_string(row, &value);
+            EXPECT_TRUE(st.ok()) << st.msg();
+            if (value.find("nested" + key_num) != std::string::npos) {
+                key_nested_count++;
+            } else if (value.find("88") != std::string::npos) {
+                key_count++;
+            }
+        }
+        EXPECT_EQ(key_count, path_with_size["key" + key_num]);
+        EXPECT_EQ(key_nested_count, path_with_size["key" + key_num + ".nested" + key_num]);
+    };
+
+    for (int i = 3; i < 10; ++i) {
+        check_key_stats(std::to_string(i));
+    }
+
+    EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+}
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/test/olap/rowset/variant_with_compaction_test.cpp
b/be/test/olap/rowset/variant_with_compaction_test.cpp
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/be/test/testutil/schema_utils.h b/be/test/testutil/schema_utils.h
new file mode 100644
index 00000000000..fdb8354f975
--- /dev/null
+++ b/be/test/testutil/schema_utils.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "vec/common/schema_util.h"
+
+namespace doris {
+
+// Helpers that fill protobuf schema messages for unit tests.
+class SchemaUtils {
+public:
+    // Fills `column_pb` as a non-nullable column with the given id, type and name.
+    // `variant_max_subcolumns_count` is written only when `column_type` is "VARIANT".
+    static void construct_column(ColumnPB* column_pb, int32_t col_unique_id,
+                                 const std::string& column_type, const std::string& column_name,
+                                 int variant_max_subcolumns_count = 3, bool is_key = false) {
+        column_pb->set_name(column_name);
+        column_pb->set_type(column_type);
+        column_pb->set_unique_id(col_unique_id);
+        column_pb->set_is_nullable(false);
+        column_pb->set_is_key(is_key);
+
+        const bool is_variant = (column_type == "VARIANT");
+        if (!is_variant) {
+            return;
+        }
+        column_pb->set_variant_max_subcolumns_count(variant_max_subcolumns_count);
+    }
+
+    // Fills `tablet_index` as an INVERTED index named `index_name` on column `col_unique_id`.
+    static void construct_tablet_index(TabletIndexPB* tablet_index, int64_t index_id,
+                                       const std::string& index_name, int32_t col_unique_id) {
+        tablet_index->set_index_type(IndexType::INVERTED);
+        tablet_index->set_index_name(index_name);
+        tablet_index->set_index_id(index_id);
+        tablet_index->add_col_unique_id(col_unique_id);
+    }
+};
+
+} // namespace doris
diff --git a/be/test/testutil/variant_util.h b/be/test/testutil/variant_util.h
new file mode 100644
index 00000000000..27b9000ce55
--- /dev/null
+++ b/be/test/testutil/variant_util.h
@@ -0,0 +1,137 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "vec/columns/column_object.h"
+#include "vec/columns/column_string.h"
+#include "vec/common/schema_util.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/json/parse2column.h"
+
+namespace doris {
+
+using namespace vectorized;
+
+// Test helpers that generate deterministic JSON rows (std::rand seeded with 42) and load them
+// into string or variant columns.
+class VariantUtil {
+public:
+    // Appends `size` flat JSON objects to `column_string`.
+    // Each row holds keys key0..key{k-1}, where k = std::rand() % 10 + 1; even-numbered keys map
+    // to the integer 88 and odd-numbered keys to the string "str99".
+    // The text of row i is stored in (*inserted_jsonstr)[i].
+    // Returns, for every key, the number of rows that contain it.
+    static schema_util::PathToNoneNullValues fill_string_column_with_test_data(
+            auto& column_string, int size, std::unordered_map<int, std::string>* inserted_jsonstr) {
+        schema_util::PathToNoneNullValues all_path_stats;
+        std::srand(42);
+        for (int i = 0; i < size; i++) {
+            std::string json_str = "{";
+            const int num_pairs = std::rand() % 10 + 1;
+            for (int j = 0; j < num_pairs; j++) {
+                const std::string key = "key" + std::to_string(j);
+                if (j > 0) {
+                    json_str += ",";
+                }
+                json_str += "\"" + key + "\":";
+                json_str += (j % 2 == 0) ? "88" : "\"str99\"";
+                all_path_stats[key] += 1;
+            }
+            json_str += "}";
+            column_string->insert_data(json_str.data(), json_str.size());
+            (*inserted_jsonstr)[i] = json_str;
+        }
+        return all_path_stats;
+    }
+
+    // Appends `size` nested JSON objects to `column_string`.
+    // Every row starts with {"key0":{"key1":{"key2":88,"key3":"88"},"key4":88}. For each index n
+    // in [3, num_paths), with num_paths = std::rand() % 9 + 2, it then adds either
+    // "key<n>":{"nested<n>":88} or "key<n>":"88", chosen by std::rand() % 2.
+    // The text of row i is stored in (*inserted_jsonstr)[i].
+    // Returns, for every dotted path, the number of rows that contain it.
+    static schema_util::PathToNoneNullValues fill_string_column_with_nested_test_data(
+            auto& column_string, int size, std::unordered_map<int, std::string>* inserted_jsonstr) {
+        schema_util::PathToNoneNullValues all_path_stats;
+        std::srand(42);
+        for (int i = 0; i < size; i++) {
+            const int num_paths = std::rand() % 9 + 2;
+
+            // fixed part, present in every row; ends with a comma on purpose (see pop_back below)
+            std::string json_str =
+                    "{\"key0\":{\"key1\":{\"key2\":88,\"key3\":\"88\"},\"key4\":88},";
+            all_path_stats["key0.key1.key2"] += 1;
+            all_path_stats["key0.key1.key3"] += 1;
+            all_path_stats["key0.key4"] += 1;
+
+            // the three fixed paths above count as paths 0..2
+            for (int current_path = 3; current_path < num_paths; ++current_path) {
+                const std::string index = std::to_string(current_path);
+                const std::string key = "key" + index;
+                if (std::rand() % 2 == 0) {
+                    // nested path: "key<n>":{"nested<n>":88}
+                    json_str += "\"" + key + "\":{\"nested" + index + "\":88},";
+                    all_path_stats[key + ".nested" + index] += 1;
+                } else {
+                    // simple path: "key<n>":"88"
+                    json_str += "\"" + key + "\":\"88\",";
+                    all_path_stats[key] += 1;
+                }
+            }
+
+            // every fragment above ends with ',', so exactly one trailing comma has to go
+            json_str.pop_back();
+            json_str += "}";
+
+            column_string->insert_data(json_str.data(), json_str.size());
+            (*inserted_jsonstr)[i] = json_str;
+        }
+        return all_path_stats;
+    }
+
+    // Generates flat test rows (see fill_string_column_with_test_data) and parses them into
+    // `column_object`. Returns the per-key row counts.
+    static schema_util::PathToNoneNullValues fill_object_column_with_test_data(
+            auto& column_object, int size, std::unordered_map<int, std::string>* inserted_jsonstr) {
+        return fill_object_column(column_object, [&](auto& column_string) {
+            return fill_string_column_with_test_data(column_string, size, inserted_jsonstr);
+        });
+    }
+
+    // Generates nested test rows (see fill_string_column_with_nested_test_data) and parses them
+    // into `column_object`. Returns the per-path row counts.
+    static schema_util::PathToNoneNullValues fill_object_column_with_nested_test_data(
+            auto& column_object, int size, std::unordered_map<int, std::string>* inserted_jsonstr) {
+        return fill_object_column(column_object, [&](auto& column_string) {
+            return fill_string_column_with_nested_test_data(column_string, size,
+                                                            inserted_jsonstr);
+        });
+    }
+
+private:
+    // Creates a temporary string column, lets `fill_strings` populate it, then parses it into
+    // `column_object` with enable_flatten_nested = false. Returns what `fill_strings` returned.
+    static schema_util::PathToNoneNullValues fill_object_column(auto& column_object,
+                                                                auto&& fill_strings) {
+        auto type_string = std::make_shared<vectorized::DataTypeString>();
+        auto column = type_string->create_column();
+        auto* column_string = assert_cast<ColumnString*>(column.get());
+        auto res = fill_strings(column_string);
+        vectorized::ParseConfig config;
+        config.enable_flatten_nested = false;
+        parse_json_to_variant(*column_object, *column_string, config);
+        return res;
+    }
+};
+
+} // namespace doris
diff --git a/be/test/vec/common/schema_util_rowset_test.cpp
b/be/test/vec/common/schema_util_rowset_test.cpp
new file mode 100644
index 00000000000..eb4b2ad4c39
--- /dev/null
+++ b/be/test/vec/common/schema_util_rowset_test.cpp
@@ -0,0 +1,265 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gmock/gmock-more-matchers.h>
+#include <gtest/gtest.h>
+
+#include "olap/rowset/beta_rowset_writer.h"
+#include "olap/rowset/rowset_factory.h"
+#include "olap/rowset/segment_v2/variant_column_writer_impl.h"
+#include "olap/storage_engine.h"
+#include "olap/tablet_schema.h"
+#include "vec/common/schema_util.h"
+#include "vec/json/parse2column.h"
+
+using namespace doris::vectorized;
+
+using namespace doris::segment_v2;
+
+using namespace doris;
+
+constexpr static uint32_t MAX_PATH_LEN = 1024;
+constexpr static std::string_view dest_dir = "/ut_dir/schema_util_test";
+constexpr static std::string_view tmp_dir = "./ut_dir/tmp";
+
+// Fixture that prepares, for every test, a scratch data directory under the current working
+// directory, a tmp-file directory and a StorageEngine registered with ExecEnv.
+class SchemaUtilRowsetTest : public testing::Test {
+public:
+    SchemaUtilRowsetTest() = default;
+    ~SchemaUtilRowsetTest() override = default;
+
+protected:
+    void SetUp() override {
+        // absolute dir
+        char buffer[MAX_PATH_LEN];
+        EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr);
+        _current_dir = std::string(buffer);
+        _absolute_dir = _current_dir + std::string(dest_dir);
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok());
+
+        // tmp dir
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok());
+        std::vector<StorePath> paths;
+        paths.emplace_back(std::string(tmp_dir), 1024000000);
+        auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths);
+        EXPECT_TRUE(tmp_file_dirs->init().ok());
+        ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs));
+
+        // storage engine; ExecEnv takes ownership, the fixture keeps a non-owning pointer
+        doris::EngineOptions options;
+        auto engine = std::make_unique<StorageEngine>(options);
+        _engine_ref = engine.get();
+        _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir);
+        static_cast<void>(_data_dir->update_capacity());
+        EXPECT_TRUE(_data_dir->init(true).ok());
+        ExecEnv::GetInstance()->set_storage_engine(std::move(engine));
+    }
+
+    void TearDown() override {
+        //EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+        _engine_ref = nullptr;
+        ExecEnv::GetInstance()->set_storage_engine(nullptr);
+    }
+
+    // Protected, not private: TEST_F bodies are compiled as members of a class derived from the
+    // fixture and access these fields directly.
+    StorageEngine* _engine_ref = nullptr;
+    std::unique_ptr<DataDir> _data_dir = nullptr;
+    TabletSharedPtr _tablet = nullptr;
+    std::string _absolute_dir;
+    std::string _current_dir;
+};
+
+// Fills `column_pb` as a non-nullable column; VARIANT columns additionally get a
+// max-subcolumns count of 3.
+static void construct_column(ColumnPB* column_pb, int32_t col_unique_id,
+                             const std::string& column_type, const std::string& column_name,
+                             bool is_key = false) {
+    column_pb->set_name(column_name);
+    column_pb->set_type(column_type);
+    column_pb->set_unique_id(col_unique_id);
+    column_pb->set_is_nullable(false);
+    column_pb->set_is_key(is_key);
+
+    const bool is_variant = (column_type == "VARIANT");
+    if (!is_variant) {
+        return;
+    }
+    column_pb->set_variant_max_subcolumns_count(3);
+}
+
+// static void construct_tablet_index(TabletIndexPB* tablet_index, int64_t
index_id, const std::string& index_name, int32_t col_unique_id) {
+// tablet_index->set_index_id(index_id);
+// tablet_index->set_index_name(index_name);
+// tablet_index->set_index_type(IndexType::INVERTED);
+// tablet_index->add_col_unique_id(col_unique_id);
+// }
+
+static std::unordered_map<int32_t, schema_util::PathToNoneNullValues>
all_path_stats;
+// Appends `size` random flat JSON objects to `column_string` and adds, for every generated key,
+// one to all_path_stats[uid][key].
+// std::rand is re-seeded with 42 on every call, so each call produces the same rows. Each row
+// has keys key0..key{k-1} (k = std::rand() % 10 + 1); every value is either a random integer in
+// [0, 100) or the string "str<n>" with n in [0, 100).
+static void fill_string_column_with_test_data(auto& column_string, int size, int uid) {
+    std::srand(42);
+    for (int i = 0; i < size; i++) {
+        std::string json_str = "{";
+        const int num_pairs = std::rand() % 10 + 1;
+        for (int j = 0; j < num_pairs; j++) {
+            const std::string key = "key" + std::to_string(j);
+            if (std::rand() % 2 == 0) {
+                const int value = std::rand() % 100;
+                json_str += "\"" + key + "\" : " + std::to_string(value);
+            } else {
+                const std::string value = "str" + std::to_string(std::rand() % 100);
+                json_str += "\"" + key + "\" : \"" + value + "\"";
+            }
+            if (j < num_pairs - 1) {
+                json_str += ", ";
+            }
+            all_path_stats[uid][key] += 1;
+        }
+        json_str += "}";
+        // the row is inserted straight from the string buffer; no Field wrapper is needed
+        column_string->insert_data(json_str.data(), json_str.size());
+    }
+}
+
+// Generates `size` random JSON rows via fill_string_column_with_test_data (key counts are
+// recorded under `uid`) and parses them into `variant_column`.
+static void fill_varaint_column(auto& variant_column, int size, int uid) {
+    auto json_column = std::make_shared<vectorized::DataTypeString>()->create_column();
+    auto json_strings = assert_cast<ColumnString*>(json_column.get());
+    fill_string_column_with_test_data(json_strings, size, uid);
+
+    vectorized::ParseConfig parse_config;
+    parse_config.enable_flatten_nested = false;
+    parse_json_to_variant(*variant_column, *json_strings, parse_config);
+}
+
+// Fills the five columns of `block` with `size` rows each:
+//   0: row index, 1: random variant (stats under uid 1), 2: constant "V2",
+//   3: random variant (stats under uid 3), 4: row index.
+static void fill_block_with_test_data(vectorized::Block* block, int size) {
+    auto mutable_columns = block->mutate_columns();
+
+    // column 0: key
+    for (int row = 0; row < size; ++row) {
+        vectorized::Field key_field = row;
+        mutable_columns[0]->insert(key_field);
+    }
+
+    // column 1: v1
+    fill_varaint_column(mutable_columns[1], size, 1);
+
+    // column 2: v2
+    for (int row = 0; row < size; ++row) {
+        vectorized::Field v2_field("V2");
+        mutable_columns[2]->insert(v2_field);
+    }
+
+    // column 3: v3
+    fill_varaint_column(mutable_columns[3], size, 3);
+
+    // column 4: v4
+    for (int row = 0; row < size; ++row) {
+        vectorized::Field v4_field(row);
+        mutable_columns[4]->insert(v4_field);
+    }
+}
+static int64_t inc_id = 1000;
+// Builds a writer context for a VISIBLE beta rowset that rolls segments every 200 rows.
+// Each call consumes one value of the file-level counter `inc_id`, which is used both as the
+// rowset id and as the single-version range [id, id].
+static RowsetWriterContext rowset_writer_context(const std::unique_ptr<DataDir>& data_dir,
+                                                 const TabletSchemaSPtr& schema,
+                                                 const std::string& tablet_path) {
+    const int64_t id = inc_id++;
+
+    RowsetId new_rowset_id;
+    new_rowset_id.init(id);
+
+    RowsetWriterContext ctx;
+    ctx.rowset_id = new_rowset_id;
+    ctx.rowset_type = BETA_ROWSET;
+    ctx.rowset_state = VISIBLE;
+    ctx.version = Version(id, id);
+    ctx.data_dir = data_dir.get();
+    ctx.tablet_schema = schema;
+    ctx.tablet_path = tablet_path;
+    ctx.max_rows_per_segment = 200;
+    return ctx;
+}
+
+// Writes one block of 1000 generated rows through `rowset_writer`, flushes it and builds the
+// rowset. Expects the result to consist of 5 segments.
+static RowsetSharedPtr create_rowset(auto& rowset_writer, const TabletSchemaSPtr& tablet_schema) {
+    vectorized::Block data_block = tablet_schema->create_block();
+    fill_block_with_test_data(&data_block, 1000);
+
+    auto status = rowset_writer->add_block(&data_block);
+    EXPECT_TRUE(status.ok()) << status.msg();
+    status = rowset_writer->flush();
+    EXPECT_TRUE(status.ok()) << status.msg();
+
+    RowsetSharedPtr built_rowset;
+    EXPECT_TRUE(rowset_writer->build(built_rowset).ok());
+    EXPECT_TRUE(built_rowset->num_segments() == 5);
+    return built_rowset;
+}
+
+// Writes five rowsets containing two variant columns, then checks that the collected path
+// statistics match the counts recorded while generating the data and that the compaction
+// schema extends each variant column with key0, key1, key2 and the sparse column.
+TEST_F(SchemaUtilRowsetTest, collect_path_stats_and_get_compaction_schema) {
+    // 1. create tablet schema
+    TabletSchemaPB schema_pb;
+    construct_column(schema_pb.add_column(), 0, "INT", "key", true);
+    construct_column(schema_pb.add_column(), 1, "VARIANT", "v1");
+    construct_column(schema_pb.add_column(), 2, "STRING", "v2");
+    construct_column(schema_pb.add_column(), 3, "VARIANT", "v3");
+    construct_column(schema_pb.add_column(), 4, "INT", "v4");
+    TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>();
+    tablet_schema->init_from_pb(schema_pb);
+
+    // 2. create tablet
+    TabletMetaSharedPtr tablet_meta(new TabletMeta(tablet_schema));
+    _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get());
+    EXPECT_TRUE(_tablet->init().ok());
+    EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+
+    // 3. create rowsets
+    std::vector<RowsetSharedPtr> rowsets;
+    for (int i = 0; i < 5; i++) {
+        const auto& writer_result = RowsetFactory::create_rowset_writer(
+                *_engine_ref,
+                rowset_writer_context(_data_dir, tablet_schema, _tablet->tablet_path()), false);
+        EXPECT_TRUE(writer_result.has_value()) << writer_result.error();
+        const auto& rowset_writer = writer_result.value();
+        auto new_rowset = create_rowset(rowset_writer, tablet_schema);
+        EXPECT_TRUE(_tablet->add_rowset(new_rowset).ok());
+        rowsets.push_back(new_rowset);
+    }
+
+    // accumulate the per-column path statistics over all rowsets
+    std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats;
+    for (const auto& rowset : rowsets) {
+        auto collect_status = schema_util::collect_path_stats(rowset, path_stats);
+        EXPECT_TRUE(collect_status.ok()) << collect_status.msg();
+    }
+
+    // every collected (uid, path) count must equal the count recorded by the generator
+    for (const auto& [uid, column_path_stats] : path_stats) {
+        for (const auto& [path, size] : column_path_stats) {
+            EXPECT_EQ(all_path_stats[uid][path], size);
+        }
+    }
+
+    // 4. get compaction schema
+    TabletSchemaSPtr compaction_schema = tablet_schema;
+    auto st = schema_util::get_compaction_schema(rowsets, compaction_schema);
+    EXPECT_TRUE(st.ok()) << st.msg();
+
+    // 5. check compaction schema: group extracted column names by their parent variant column
+    std::unordered_map<int32_t, std::vector<std::string>> compaction_schema_map;
+    for (const auto& column : compaction_schema->columns()) {
+        if (column->parent_unique_id() > 0) {
+            compaction_schema_map[column->parent_unique_id()].push_back(column->name());
+        }
+    }
+    for (auto& [uid, paths] : compaction_schema_map) {
+        EXPECT_EQ(paths.size(), 4);
+        std::sort(paths.begin(), paths.end());
+        EXPECT_TRUE(paths[0].ends_with("__DORIS_VARIANT_SPARSE__"));
+        EXPECT_TRUE(paths[1].ends_with("key0"));
+        EXPECT_TRUE(paths[2].ends_with("key1"));
+        EXPECT_TRUE(paths[3].ends_with("key2"));
+    }
+}
diff --git a/be/test/vec/common/schema_util_test.cpp
b/be/test/vec/common/schema_util_test.cpp
new file mode 100644
index 00000000000..ffa790ddf49
--- /dev/null
+++ b/be/test/vec/common/schema_util_test.cpp
@@ -0,0 +1,357 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/common/schema_util.h"
+
+#include <gmock/gmock-more-matchers.h>
+#include <gtest/gtest.h>
+
+#include "olap/rowset/segment_v2/variant_column_writer_impl.h"
+
+using namespace doris::vectorized;
+
+using namespace doris::segment_v2;
+
+using namespace doris;
+
+// Stateless GoogleTest fixture that groups the schema_util unit tests below.
+class SchemaUtilTest : public ::testing::Test {
+public:
+    SchemaUtilTest() = default;
+    ~SchemaUtilTest() override = default;
+};
+
+// Populates one column definition and one index definition of a tablet schema PB.
+//
+// `column_pb` receives the unique id, name and type and is marked nullable and as a
+// bloom-filter column. `tablet_index` receives the id, name and type of the index
+// and is attached to the column through `col_unique_id`.
+void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id,
+                      const std::string& index_name, int32_t col_unique_id,
+                      const std::string& column_type, const std::string& column_name,
+                      const IndexType& index_type) {
+    // Index side: identity first, then the column it covers.
+    tablet_index->set_index_id(index_id);
+    tablet_index->set_index_name(index_name);
+    tablet_index->set_index_type(index_type);
+    tablet_index->add_col_unique_id(col_unique_id);
+
+    // Column side: identity, then the flags every test column shares.
+    column_pb->set_unique_id(col_unique_id);
+    column_pb->set_name(column_name);
+    column_pb->set_type(column_type);
+    column_pb->set_is_nullable(true);
+    column_pb->set_is_bf_column(true);
+}
+
+// Builds a nullable extracted subcolumn of the given `type` located at `path`
+// under the variant column `col_unique_id`, appends it to `schema` and also
+// stores it in `subcolumns`.
+//
+// The subcolumn itself gets unique id -1. When `type` is ARRAY, a nullable DOUBLE
+// item column with the same parent is added as its sub column.
+void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, int32_t col_unique_id,
+                         std::string_view path, std::vector<TabletColumn>* subcolumns) {
+    vectorized::PathInData subcolumn_path(path);
+    const bool is_array = (type == FieldType::OLAP_FIELD_TYPE_ARRAY);
+
+    TabletColumn extracted;
+    extracted.set_type(type);
+    extracted.set_is_nullable(true);
+    extracted.set_unique_id(-1);
+    extracted.set_parent_unique_id(col_unique_id);
+    extracted.set_path_info(subcolumn_path);
+    extracted.set_name(subcolumn_path.get_path());
+
+    if (is_array) {
+        // The item type is DOUBLE, which does not support inverted index.
+        TabletColumn item;
+        item.set_type(FieldType::OLAP_FIELD_TYPE_DOUBLE);
+        item.set_is_nullable(true);
+        item.set_unique_id(-1);
+        item.set_parent_unique_id(col_unique_id);
+        extracted.add_sub_column(item);
+    }
+
+    // The schema takes a copy; the caller's vector takes the original by move.
+    schema->append_column(extracted);
+    subcolumns->emplace_back(std::move(extracted));
+}
+
+// void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type,
+// int32_t col_unique_id, std::string_view path,
+// std::vector<TabletColumn>* subcolumns) {
+// TabletColumn subcol;
+// subcol.set_type(type);
+// subcol.set_is_nullable(true);
+// subcol.set_unique_id(-1);
+// subcol.set_parent_unique_id(col_unique_id);
+// vectorized::PathInData col_path(path);
+// subcol.set_path_info(col_path);
+// subcol.set_name(col_path.get_path());
+// schema->append_column(subcol);
+// subcolumns->emplace_back(std::move(subcol));
+// }
+
+// Checks schema_util::inherit_column_attributes() on a schema with one INT key
+// column and two VARIANT columns (unique ids 1 and 3), each carrying an inverted
+// index and the bloom-filter flag.
+//
+// Expectations encoded below:
+//   * subcolumns of variant 1 (STRING, INT) end up with an inverted index and
+//     with the bloom-filter flag set;
+//   * subcolumns of variant 3 (ARRAY<DOUBLE>, FLOAT) end up with neither;
+//   * the schema holds 7 inverted indexes in total afterwards.
+TEST_F(SchemaUtilTest, inherit_column_attributes) {
+    TabletSchemaPB schema_pb;
+    schema_pb.set_keys_type(KeysType::DUP_KEYS);
+    schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
+
+    construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "key_index", 0, "INT",
+                     "key", IndexType::INVERTED);
+    construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index", 1,
+                     "VARIANT", "v1", IndexType::INVERTED);
+    construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003, "v3_index", 3,
+                     "VARIANT", "v3", IndexType::INVERTED);
+
+    TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>();
+    tablet_schema->init_from_pb(schema_pb);
+    std::vector<TabletColumn> subcolumns;
+
+    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_STRING, 1, "v1.b", &subcolumns);
+    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_INT, 1, "v1.c", &subcolumns);
+
+    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_ARRAY, 3, "v3.d", &subcolumns);
+    construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_FLOAT, 3, "v3.a", &subcolumns);
+
+    schema_util::inherit_column_attributes(tablet_schema);
+
+    // Inverted index inheritance, checked through the locally kept subcolumn copies.
+    // The parent id is read through the public accessor rather than the private
+    // member so the test does not depend on access control being disabled.
+    for (const auto& col : subcolumns) {
+        switch (col.parent_unique_id()) {
+        case 1:
+            EXPECT_TRUE(tablet_schema->inverted_index(col) != nullptr);
+            break;
+        case 3:
+            EXPECT_TRUE(tablet_schema->inverted_index(col) == nullptr);
+            break;
+        default:
+            ADD_FAILURE() << "unexpected parent unique id " << col.parent_unique_id();
+        }
+    }
+    EXPECT_EQ(tablet_schema->inverted_indexes().size(), 7);
+
+    // Bloom-filter inheritance, checked on the columns stored in the schema itself.
+    for (const auto& col : tablet_schema->columns()) {
+        if (!col->is_extracted_column()) {
+            continue;
+        }
+        switch (col->parent_unique_id()) {
+        case 1:
+            EXPECT_TRUE(col->is_bf_column());
+            break;
+        case 3:
+            EXPECT_TRUE(!col->is_bf_column());
+            break;
+        default:
+            ADD_FAILURE() << "unexpected parent unique id " << col->parent_unique_id();
+        }
+    }
+}
+
+// Appends `key_size` rows to `column_map`, whose key and value columns are both
+// treated as ColumnString.
+//
+// Row i holds between 1 and `value_size` entries; every entry of the row uses
+// the key `prefix + i`, and entry j has the value `prefix + j`. The random
+// generator is re-seeded with 42 on every call, so repeated calls draw the same
+// sequence of entry counts.
+//
+// Returns a map from each generated key to the number of entries written for it.
+static std::unordered_map<std::string, int> construct_column_map_with_random_values(
+        auto& column_map, int key_size, int value_size, const std::string& prefix) {
+    auto& keys_col = assert_cast<ColumnString&>(column_map->get_keys());
+    auto& values_col = assert_cast<ColumnString&>(column_map->get_values());
+    auto& row_offsets = column_map->get_offsets();
+
+    std::srand(42);
+
+    std::unordered_map<std::string, int> entries_per_key;
+    for (int key_idx = 0; key_idx < key_size; ++key_idx) {
+        const std::string key_str = prefix + std::to_string(key_idx);
+        const int entry_count = std::rand() % value_size + 1;
+        entries_per_key[key_str] = entry_count;
+
+        int value_idx = 0;
+        while (value_idx < entry_count) {
+            keys_col.insert_data(key_str.data(), key_str.size());
+            const std::string value_str = prefix + std::to_string(value_idx);
+            values_col.insert_data(value_str.data(), value_str.size());
+            ++value_idx;
+        }
+        // Close the current row at the running number of inserted entries.
+        row_offsets.push_back(keys_col.size());
+    }
+
+    return entries_per_key;
+}
+
+// Exercises schema_util::calculate_variant_stats() over three successive
+// batches that are accumulated into the same VariantStatisticsPB:
+//   1. 200 rows keyed "key_<i>": every key must be reported with its count;
+//   2. 3000 rows keyed "key_<i>": per-path sizes must equal the sum of the
+//      counts from batch 1 and batch 2;
+//   3. MAX_SPARSE_DATA_STATISTICS_SIZE rows keyed "key2_<i>": the number of
+//      tracked paths is expected to equal MAX_SPARSE_DATA_STATISTICS_SIZE and
+//      each reported size must equal the sum over all three batches.
+TEST_F(SchemaUtilTest, calculate_variant_stats) {
+    // Count recorded for `path` in one generated batch, or 0 when the batch did
+    // not contain that path.
+    const auto count_or_zero = [](const std::unordered_map<std::string, int>& counts,
+                                  const auto& path) {
+        const auto found = counts.find(path);
+        return found == counts.end() ? 0 : found->second;
+    };
+
+    VariantStatisticsPB stats;
+    auto column_map = ColumnMap::create(ColumnString::create(), ColumnString::create(),
+                                        ColumnArray::ColumnOffsets::create());
+
+    const auto& key_value_counts =
+            construct_column_map_with_random_values(column_map, 200, 100, "key_");
+
+    // calculate stats
+    schema_util::calculate_variant_stats(*column_map, &stats, 0, 200);
+    EXPECT_EQ(stats.sparse_column_non_null_size_size(), key_value_counts.size());
+
+    for (const auto& kv : key_value_counts) {
+        auto it = stats.sparse_column_non_null_size().find(kv.first);
+        // Fatal on purpose: dereferencing end() below would be undefined behavior.
+        ASSERT_NE(it, stats.sparse_column_non_null_size().end());
+        EXPECT_EQ(it->second, kv.second);
+    }
+
+    // test with different key size
+    column_map->clear();
+    const auto& key_value_counts2 =
+            construct_column_map_with_random_values(column_map, 3000, 100, "key_");
+    schema_util::calculate_variant_stats(*column_map, &stats, 0, 3000);
+    EXPECT_EQ(stats.sparse_column_non_null_size_size(), 3000);
+
+    for (const auto& [path, size] : stats.sparse_column_non_null_size()) {
+        auto first_size = count_or_zero(key_value_counts, path);
+        auto second_size = count_or_zero(key_value_counts2, path);
+        EXPECT_EQ(size, first_size + second_size);
+    }
+
+    // test with max size
+    column_map->clear();
+    const auto& key_value_counts3 = construct_column_map_with_random_values(
+            column_map, VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, 5, "key2_");
+    schema_util::calculate_variant_stats(*column_map, &stats, 0,
+                                         VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE);
+    EXPECT_EQ(VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE,
+              stats.sparse_column_non_null_size_size());
+
+    for (const auto& [path, size] : stats.sparse_column_non_null_size()) {
+        auto first_size = count_or_zero(key_value_counts, path);
+        auto second_size = count_or_zero(key_value_counts2, path);
+        auto third_size = count_or_zero(key_value_counts3, path);
+        EXPECT_EQ(size, first_size + second_size + third_size);
+    }
+}
+
+// With five paths and a limit of three subcolumns, the test expects the three
+// paths with the largest counts in sub_path_set and the remaining two in
+// sparse_path_set.
+TEST_F(SchemaUtilTest, get_subpaths) {
+    // True when `paths` contains `path`.
+    const auto has_path = [](const auto& paths, const char* path) {
+        return paths.find(path) != paths.end();
+    };
+
+    TabletColumn variant;
+    variant.set_unique_id(1);
+    variant.set_variant_max_subcolumns_count(3);
+
+    std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats;
+    path_stats[1] = {
+            {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}};
+
+    // get subpaths
+    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
+    schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info);
+
+    const auto& info = uid_to_paths_set_info[1];
+    EXPECT_EQ(info.sub_path_set.size(), 3);
+    EXPECT_EQ(info.sparse_path_set.size(), 2);
+
+    EXPECT_TRUE(has_path(info.sub_path_set, "path1"));
+    EXPECT_TRUE(has_path(info.sub_path_set, "path2"));
+    EXPECT_TRUE(has_path(info.sub_path_set, "path3"));
+
+    EXPECT_TRUE(has_path(info.sparse_path_set, "path4"));
+    EXPECT_TRUE(has_path(info.sparse_path_set, "path5"));
+}
+
+// With exactly as many paths as the subcolumn limit (three), the test expects
+// all of them in sub_path_set and an empty sparse_path_set.
+TEST_F(SchemaUtilTest, get_subpaths_equal_to_max) {
+    // True when `paths` contains `path`.
+    const auto has_path = [](const auto& paths, const char* path) {
+        return paths.find(path) != paths.end();
+    };
+
+    TabletColumn variant;
+    variant.set_unique_id(1);
+    variant.set_variant_max_subcolumns_count(3);
+
+    std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats;
+    path_stats[1] = {{"path1", 1000}, {"path2", 800}, {"path3", 500}};
+
+    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
+    schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info);
+
+    const auto& info = uid_to_paths_set_info[1];
+    EXPECT_EQ(info.sub_path_set.size(), 3);
+    EXPECT_EQ(info.sparse_path_set.size(), 0);
+
+    EXPECT_TRUE(has_path(info.sub_path_set, "path1"));
+    EXPECT_TRUE(has_path(info.sub_path_set, "path2"));
+    EXPECT_TRUE(has_path(info.sub_path_set, "path3"));
+}
+
+// Runs get_subpaths() for three variant columns (limits 3, 2 and 4) against one
+// shared statistics map and checks each column's result independently. The
+// statistics for uid 4 belong to no variant used here.
+TEST_F(SchemaUtilTest, get_subpaths_multiple_variants) {
+    // True when `paths` contains `path`.
+    const auto has_path = [](const auto& paths, const char* path) {
+        return paths.find(path) != paths.end();
+    };
+
+    TabletColumn variant1;
+    variant1.set_unique_id(1);
+    variant1.set_variant_max_subcolumns_count(3);
+
+    TabletColumn variant2;
+    variant2.set_unique_id(2);
+    variant2.set_variant_max_subcolumns_count(2);
+
+    TabletColumn variant3;
+    variant3.set_unique_id(3);
+    variant3.set_variant_max_subcolumns_count(4);
+
+    std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats;
+    path_stats[1] = {
+            {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}};
+    path_stats[2] = {{"path1", 1000}, {"path2", 800}};
+    path_stats[3] = {{"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}};
+    path_stats[4] = {
+            {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}};
+
+    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
+    schema_util::get_subpaths(variant1, path_stats, uid_to_paths_set_info);
+    schema_util::get_subpaths(variant2, path_stats, uid_to_paths_set_info);
+    schema_util::get_subpaths(variant3, path_stats, uid_to_paths_set_info);
+
+    // References into std::unordered_map stay valid across later insertions.
+    const auto& info1 = uid_to_paths_set_info[1];
+    const auto& info2 = uid_to_paths_set_info[2];
+    const auto& info3 = uid_to_paths_set_info[3];
+
+    EXPECT_EQ(info1.sub_path_set.size(), 3);
+    EXPECT_EQ(info1.sparse_path_set.size(), 2);
+
+    EXPECT_EQ(info2.sub_path_set.size(), 2);
+    EXPECT_EQ(info2.sparse_path_set.size(), 0);
+
+    EXPECT_EQ(info3.sub_path_set.size(), 4);
+    EXPECT_EQ(info3.sparse_path_set.size(), 0);
+
+    EXPECT_TRUE(has_path(info1.sub_path_set, "path1"));
+    EXPECT_TRUE(has_path(info1.sub_path_set, "path2"));
+    EXPECT_TRUE(has_path(info1.sub_path_set, "path3"));
+
+    EXPECT_TRUE(has_path(info1.sparse_path_set, "path4"));
+    EXPECT_TRUE(has_path(info1.sparse_path_set, "path5"));
+
+    EXPECT_TRUE(has_path(info2.sub_path_set, "path1"));
+    EXPECT_TRUE(has_path(info2.sub_path_set, "path2"));
+
+    EXPECT_TRUE(has_path(info3.sub_path_set, "path1"));
+    EXPECT_TRUE(has_path(info3.sub_path_set, "path2"));
+    EXPECT_TRUE(has_path(info3.sub_path_set, "path3"));
+    EXPECT_TRUE(has_path(info3.sub_path_set, "path4"));
+}
+
+// When the statistics map has no entry for the variant's unique id (only uid 2
+// is present, the variant is uid 1), both path sets are expected to be empty.
+TEST_F(SchemaUtilTest, get_subpaths_no_path_stats) {
+    TabletColumn variant;
+    variant.set_unique_id(1);
+    variant.set_variant_max_subcolumns_count(3);
+
+    std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats;
+    path_stats[2] = {{"path1", 1000}, {"path2", 800}};
+
+    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
+    schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info);
+
+    const auto& info = uid_to_paths_set_info[1];
+    EXPECT_EQ(info.sub_path_set.size(), 0);
+    EXPECT_EQ(info.sparse_path_set.size(), 0);
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]