This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 670ee4538d0 branch-3.0: [feat](test)add some be ut for orc/parquet
reader (#49418) (#50414)
670ee4538d0 is described below
commit 670ee4538d01c377c9e493f07e83c7ddc727facb
Author: daidai <[email protected]>
AuthorDate: Wed Apr 30 17:52:38 2025 +0800
branch-3.0: [feat](test)add some be ut for orc/parquet reader (#49418)
(#50414)
bp #49418
---
be/src/vec/exec/format/orc/vorc_reader.cpp | 8 +-
be/test/vec/exec/orc/orc_convert_dict_test.cpp | 237 ++++++++++
.../exec/orc/orc_convert_to_orc_literal_test.cpp | 216 ++++++++++
be/test/vec/exec/orc/orc_memory_stream_test.h | 102 +++++
be/test/vec/exec/orc/orc_reader_fill_data_test.cpp | 475 +++++++++++++++++++++
.../vec/exec/orc/orc_reader_init_column_test.cpp | 359 ++++++++++++++++
6 files changed, 1391 insertions(+), 6 deletions(-)
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 4b4e06d2c37..3a87202184a 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -430,6 +430,8 @@ bool OrcReader::_check_acid_schema(const orc::Type& type) {
return false;
}
}
+ } else {
+ return false;
}
return true;
}
@@ -1425,15 +1427,9 @@ Status OrcReader::_fill_doris_data_column(const
std::string& col_name,
case TypeIndex::Decimal128V3:
return _decode_decimal_column<Decimal128V3, is_filter>(col_name,
data_column, data_type,
cvb,
num_values);
- case TypeIndex::Date:
- return _decode_time_column<VecDateTimeValue, Int64,
orc::LongVectorBatch, is_filter>(
- col_name, data_column, cvb, num_values);
case TypeIndex::DateV2:
return _decode_time_column<DateV2Value<DateV2ValueType>, UInt32,
orc::LongVectorBatch,
is_filter>(col_name, data_column, cvb,
num_values);
- case TypeIndex::DateTime:
- return _decode_time_column<VecDateTimeValue, Int64,
orc::TimestampVectorBatch, is_filter>(
- col_name, data_column, cvb, num_values);
case TypeIndex::DateTimeV2:
return _decode_time_column<DateV2Value<DateTimeV2ValueType>, UInt64,
orc::TimestampVectorBatch,
is_filter>(col_name, data_column, cvb,
diff --git a/be/test/vec/exec/orc/orc_convert_dict_test.cpp
b/be/test/vec/exec/orc/orc_convert_dict_test.cpp
new file mode 100644
index 00000000000..bce08cc63db
--- /dev/null
+++ b/be/test/vec/exec/orc/orc_convert_dict_test.cpp
@@ -0,0 +1,237 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "orc/ColumnPrinter.hh"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_struct.h"
+#include "vec/exec/format/orc/vorc_reader.h"
+
+namespace doris {
+namespace vectorized {
+// Fixture for the dictionary-to-string conversion tests. The tests share no
+// state, so the inherited no-op SetUp()/TearDown() are used as-is.
+class OrcReaderConvertDictTest : public ::testing::Test {};
+
+// Builds an EncodedStringVectorBatch whose dictionary contains `dict_values`
+// packed back to back: entry i occupies
+// dictionaryBlob[dictionaryOffset[i], dictionaryOffset[i + 1]).
+//
+// @param dict_values dictionary entries, in dictionary-index order.
+// @return a batch with an initialized dictionary; the per-row index data is
+//         left untouched.
+std::unique_ptr<orc::EncodedStringVectorBatch> create_encoded_string_batch(
+        const std::vector<std::string>& dict_values) {
+    auto batch =
+            std::make_unique<orc::EncodedStringVectorBatch>(1024 * 1024, *orc::getDefaultPool());
+    batch->dictionary = std::make_unique<orc::StringDictionary>(*orc::getDefaultPool());
+    auto& dictionary = *batch->dictionary;
+
+    // Total payload size of the dictionary blob.
+    size_t blob_bytes = 0;
+    for (const auto& value : dict_values) {
+        blob_bytes += value.size();
+    }
+
+    // Keep the same slack as before (+1024 bytes / +10 offsets).
+    dictionary.dictionaryBlob.resize(blob_bytes + 1024);
+    dictionary.dictionaryOffset.resize(dict_values.size() + 10);
+
+    // Write the first offset explicitly instead of relying on resize() to
+    // zero-fill the buffer; every later offset is derived from it.
+    dictionary.dictionaryOffset[0] = 0;
+
+    size_t write_pos = 0;
+    size_t entry = 0;
+    for (const auto& value : dict_values) {
+        for (const char ch : value) {
+            dictionary.dictionaryBlob[write_pos++] = ch;
+        }
+        // End of this entry == start of the next one.
+        dictionary.dictionaryOffset[++entry] = static_cast<int64_t>(write_pos);
+    }
+
+    return batch;
+}
+
+// Plain STRING dictionary without a null map: every row must resolve to the
+// dictionary entry its index points at.
+TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnBasic) {
+    // Dictionary, row -> entry mapping, and the strings expected per row.
+    const std::vector<std::string> dict_values {"hello", "world", "doris", "test"};
+    const std::vector<int32_t> indices {0, 1, 2, 3, 1, 0};
+    const std::vector<std::string> expected {"hello", "world", "doris",
+                                             "test",  "world", "hello"};
+
+    auto string_batch = create_encoded_string_batch(dict_values);
+
+    // Dictionary index column
+    auto dict_column = ColumnInt32::create();
+    for (const int32_t index : indices) {
+        dict_column->insert(index);
+    }
+
+    // ORC type and reader under test
+    auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING);
+    TFileScanRangeParams params;
+    TFileRangeDesc range;
+    auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+    // Run the conversion
+    auto result_column = reader->_convert_dict_column_to_string_column(
+            dict_column.get(), nullptr, string_batch.get(), orc_type_ptr.get());
+
+    // Check every row against the expectation table
+    auto* string_column = assert_cast<const ColumnString*>(result_column.get());
+    ASSERT_EQ(string_column->size(), expected.size());
+    for (size_t row = 0; row < expected.size(); ++row) {
+        ASSERT_EQ(string_column->get_data_at(row).to_string(), expected[row]);
+    }
+}
+
+// STRING dictionary with a null map: rows flagged as null must come back as
+// empty strings, the others as their dictionary entry.
+TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnWithNulls) {
+    const std::vector<std::string> dict_values {"hello", "world", "doris"};
+    const std::vector<int32_t> indices {0, 1, 2, 1, 0};
+    // Rows 1 and 4 (0-based) are flagged null below, hence the empty strings.
+    const std::vector<std::string> expected {"hello", "", "doris", "world", ""};
+
+    auto string_batch = create_encoded_string_batch(dict_values);
+
+    // Dictionary index column
+    auto dict_column = ColumnInt32::create();
+    for (const int32_t index : indices) {
+        dict_column->insert(index);
+    }
+
+    // Null map: 2nd and 5th elements are null
+    NullMap null_map = {0, 1, 0, 0, 1};
+
+    // ORC type and reader under test
+    auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING);
+    TFileScanRangeParams params;
+    TFileRangeDesc range;
+    auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+    // Run the conversion
+    auto result_column = reader->_convert_dict_column_to_string_column(
+            dict_column.get(), &null_map, string_batch.get(), orc_type_ptr.get());
+
+    // Check every row against the expectation table
+    auto* string_column = assert_cast<const ColumnString*>(result_column.get());
+    ASSERT_EQ(string_column->size(), expected.size());
+    for (size_t row = 0; row < expected.size(); ++row) {
+        ASSERT_EQ(string_column->get_data_at(row).to_string(), expected[row]);
+    }
+}
+
+// CHAR dictionary: entries are right-padded with spaces and the conversion is
+// expected to return them with the trailing spaces removed.
+TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnChar) {
+    // Dictionary data (CHAR type with right-padded spaces)
+    const std::vector<std::string> dict_values {"hello ", "world ", "test "};
+    const std::vector<int32_t> indices {0, 1, 2, 1};
+    // Expected values have the padding stripped.
+    const std::vector<std::string> expected {"hello", "world", "test", "world"};
+
+    auto string_batch = create_encoded_string_batch(dict_values);
+
+    // Dictionary index column
+    auto dict_column = ColumnInt32::create();
+    for (const int32_t index : indices) {
+        dict_column->insert(index);
+    }
+
+    // ORC CHAR type and reader under test
+    auto orc_type_ptr = createPrimitiveType(orc::TypeKind::CHAR);
+    TFileScanRangeParams params;
+    TFileRangeDesc range;
+    auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+    // Run the conversion
+    auto result_column = reader->_convert_dict_column_to_string_column(
+            dict_column.get(), nullptr, string_batch.get(), orc_type_ptr.get());
+
+    // Check every row against the expectation table
+    auto* string_column = assert_cast<const ColumnString*>(result_column.get());
+    ASSERT_EQ(string_column->size(), expected.size());
+    for (size_t row = 0; row < expected.size(); ++row) {
+        ASSERT_EQ(string_column->get_data_at(row).to_string(), expected[row]);
+    }
+}
+
+// Dictionary whose only entry is the empty string: every row must convert to
+// an empty string.
+TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnEmpty) {
+    const std::vector<std::string> dict_values {""};
+    const std::vector<int32_t> indices {0, 0, 0};
+
+    auto string_batch = create_encoded_string_batch(dict_values);
+
+    // Dictionary index column: three rows, all pointing at entry 0
+    auto dict_column = ColumnInt32::create();
+    for (const int32_t index : indices) {
+        dict_column->insert(index);
+    }
+
+    // ORC type and reader under test
+    auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING);
+    TFileScanRangeParams params;
+    TFileRangeDesc range;
+    auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+    // Run the conversion
+    auto result_column = reader->_convert_dict_column_to_string_column(
+            dict_column.get(), nullptr, string_batch.get(), orc_type_ptr.get());
+
+    // All rows are expected to be empty
+    auto* string_column = assert_cast<const ColumnString*>(result_column.get());
+    ASSERT_EQ(string_column->size(), indices.size());
+    for (size_t row = 0; row < indices.size(); ++row) {
+        ASSERT_EQ(string_column->get_data_at(row).to_string(), "");
+    }
+}
+
+// Dictionary with entries of different lengths (including the empty string)
+// combined with a partial null map.
+TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnMixed) {
+    const std::vector<std::string> dict_values {"", "a", "ab", "abc", "abcd"};
+    const std::vector<int32_t> indices {0, 1, 2, 3, 4, 2, 1, 0};
+    // Rows 2 and 5 are flagged null below; rows 0 and 7 are the genuine ""
+    // dictionary entry.
+    const std::vector<std::string> expected {"", "a", "", "abc", "abcd", "", "a", ""};
+
+    auto string_batch = create_encoded_string_batch(dict_values);
+
+    // Dictionary index column
+    auto dict_column = ColumnInt32::create();
+    for (const int32_t index : indices) {
+        dict_column->insert(index);
+    }
+
+    // Partial null values
+    NullMap null_map = {0, 0, 1, 0, 0, 1, 0, 0};
+
+    // ORC type and reader under test
+    auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING);
+    TFileScanRangeParams params;
+    TFileRangeDesc range;
+    auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+    // Run the conversion
+    auto result_column = reader->_convert_dict_column_to_string_column(
+            dict_column.get(), &null_map, string_batch.get(), orc_type_ptr.get());
+
+    // Check every row against the expectation table
+    auto* string_column = assert_cast<const ColumnString*>(result_column.get());
+    ASSERT_EQ(string_column->size(), expected.size());
+    for (size_t row = 0; row < expected.size(); ++row) {
+        ASSERT_EQ(string_column->get_data_at(row).to_string(), expected[row]);
+    }
+}
+
+} // namespace vectorized
+
+} // namespace doris
diff --git a/be/test/vec/exec/orc/orc_convert_to_orc_literal_test.cpp
b/be/test/vec/exec/orc/orc_convert_to_orc_literal_test.cpp
new file mode 100644
index 00000000000..ac79f22a6bb
--- /dev/null
+++ b/be/test/vec/exec/orc/orc_convert_to_orc_literal_test.cpp
@@ -0,0 +1,216 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "orc/ColumnPrinter.hh"
+#include "vec/columns/column_struct.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/exec/format/orc/vorc_reader.cpp"
+
+namespace doris {
+namespace vectorized {
+// Fixture for the convert_to_orc_literal tests. No shared state is needed, so
+// the inherited no-op SetUp()/TearDown() are used as-is.
+class OrcReaderConvertToOrcLiteralTest : public ::testing::Test {};
+
+// Calls convert_to_orc_literal<> once per primitive type. Each scoped section
+// builds a native value, hands its bytes to the conversion together with a
+// matching ORC primitive type, and asserts that the conversion reports success
+// and that the resulting literal holds the expected value.
+// NOTE(review): the two trailing integer arguments look like precision and
+// scale (they are non-zero only in the DECIMAL sections) - confirm against
+// convert_to_orc_literal's declaration.
+TEST_F(OrcReaderConvertToOrcLiteralTest, ConvertTypesTest) {
+ // TINYINT test
+ {
+ int8_t tiny_value = 127;
+ StringRef literal_data(reinterpret_cast<char*>(&tiny_value),
sizeof(tiny_value));
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::BYTE);
+ auto [success, literal] =
+ convert_to_orc_literal<TYPE_TINYINT>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success);
+ ASSERT_EQ(literal.getLong(), 127);
+ }
+
+ // SMALLINT test
+ {
+ int16_t small_value = 32000;
+ StringRef literal_data(reinterpret_cast<char*>(&small_value),
sizeof(small_value));
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::SHORT);
+ auto [success, literal] =
+ convert_to_orc_literal<TYPE_SMALLINT>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success);
+ ASSERT_EQ(literal.getLong(), 32000);
+ }
+
+ // INT test
+ {
+ int32_t int_value = 2147483647;
+ StringRef literal_data(reinterpret_cast<char*>(&int_value),
sizeof(int_value));
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::INT);
+ auto [success, literal] =
+ convert_to_orc_literal<TYPE_INT>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success);
+ ASSERT_EQ(literal.getLong(), 2147483647);
+ }
+
+ // BIGINT test
+ {
+ int64_t big_value = 9223372036854775807LL;
+ StringRef literal_data(reinterpret_cast<char*>(&big_value),
sizeof(big_value));
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::LONG);
+ auto [success, literal] =
+ convert_to_orc_literal<TYPE_BIGINT>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success);
+ ASSERT_EQ(literal.getLong(), 9223372036854775807LL);
+ }
+ // FLOAT test (compared with a tolerance rather than exactly)
+ {
+ float float_value = 3.14159f;
+ StringRef literal_data(reinterpret_cast<char*>(&float_value),
sizeof(float_value));
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::FLOAT);
+ auto [success, literal] =
+ convert_to_orc_literal<TYPE_FLOAT>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success);
+ ASSERT_NEAR(literal.getFloat(), 3.14159f, 0.0001);
+ }
+
+ // DOUBLE test
+ {
+ double double_value = 3.14159265358979323846;
+ StringRef literal_data(reinterpret_cast<char*>(&double_value),
sizeof(double_value));
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DOUBLE);
+ auto [success, literal] =
+ convert_to_orc_literal<TYPE_DOUBLE>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success);
+ ASSERT_DOUBLE_EQ(literal.getFloat(), 3.14159265358979323846);
+ }
+ // STRING test
+ // Unlike the other sections, this one passes the address of the StringRef
+ // object itself rather than literal_data.data.
+ {
+ std::string str_value = "Hello, World!";
+ StringRef literal_data(str_value.data(), str_value.size());
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING);
+ auto [success, literal] =
+ convert_to_orc_literal<TYPE_STRING>(orc_type_ptr.get(),
(void*)&literal_data, 0, 0);
+ ASSERT_TRUE(success);
+ ASSERT_EQ(std::string(literal.getString().data(),
literal.getString().length()),
+ "Hello, World!");
+ }
+
+ // DECIMAL32 test: unscaled 12345 with trailing arguments (9, 4) -> "1.2345"
+ {
+ int32_t decimal32_value = 12345;
+ StringRef literal_data(reinterpret_cast<const char*>(&decimal32_value),
+ sizeof(decimal32_value));
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DECIMAL);
+ auto [success, literal] =
+ convert_to_orc_literal<TYPE_DECIMAL32>(orc_type_ptr.get(),
literal_data.data, 9, 4);
+ ASSERT_TRUE(success);
+ ASSERT_EQ(literal.getDecimal().toString(), "1.2345");
+ }
+
+ // DECIMAL64 test: trailing arguments (18, 6)
+ {
+ int64_t decimal64_value = 123456789012345LL;
+ StringRef literal_data(reinterpret_cast<const char*>(&decimal64_value),
+ sizeof(decimal64_value));
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DECIMAL);
+ auto [success, literal] =
convert_to_orc_literal<TYPE_DECIMAL64>(orc_type_ptr.get(),
+
literal_data.data, 18, 6);
+ ASSERT_TRUE(success);
+ ASSERT_EQ(literal.getDecimal().toString(), "123456789.012345");
+ }
+
+ // DECIMAL128 test: trailing arguments (38, 9)
+ {
+ int128_t decimal128_value = 1234512345;
+ StringRef literal_data(reinterpret_cast<const
char*>(&decimal128_value),
+ sizeof(decimal128_value));
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DECIMAL);
+ auto [success, literal] = convert_to_orc_literal<TYPE_DECIMAL128I>(
+ orc_type_ptr.get(), literal_data.data, 38, 9);
+ ASSERT_TRUE(success);
+ ASSERT_EQ(literal.getDecimal().toString(), "1.234512345");
+ }
+
+ // DATE type test
+ // date_value, success and literal are reused for the boundary cases below,
+ // so the statements in this section are order-dependent.
+ {
+ // Normal date
+ VecDateTimeValue date_value;
+ date_value.from_date_str("2024-03-14", 10);
+ StringRef literal_data(reinterpret_cast<const char*>(&date_value),
sizeof(date_value));
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DATE);
+ auto [success, literal] =
+ convert_to_orc_literal<TYPE_DATE>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success);
+
+ // Verify converted day offset
+ int64_t expected_days = 19796; // Day count for 2024-03-14
+ ASSERT_EQ(literal.getDate(), expected_days);
+
+ // Boundary date - minimum value
+ date_value.from_date_str("0001-01-01", 10);
+ literal_data = StringRef(reinterpret_cast<const char*>(&date_value),
sizeof(date_value));
+ std::tie(success, literal) =
+ convert_to_orc_literal<TYPE_DATE>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success); // expected day offset: -719162
+ ASSERT_EQ(literal.getDate(), -719162);
+
+ // Boundary date - maximum value
+ date_value.from_date_str("9999-12-31", 10);
+ literal_data = StringRef(reinterpret_cast<const char*>(&date_value),
sizeof(date_value));
+ std::tie(success, literal) =
+ convert_to_orc_literal<TYPE_DATE>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success); // expected day offset: 2932896
+ ASSERT_EQ(literal.getDate(), 2932896);
+ }
+
+ // DATETIME type test
+ // The expected values are epoch milliseconds; they match the datetime
+ // strings when those are read as UTC (1710374400000 ms == 19796 days).
+ {
+ // Normal timestamp
+ VecDateTimeValue datetime_value;
+ datetime_value.from_date_str("2024-03-14 15:30:45", 19);
+ StringRef literal_data(reinterpret_cast<const char*>(&datetime_value),
+ sizeof(datetime_value));
+ auto orc_type_ptr = createPrimitiveType(orc::TypeKind::TIMESTAMP);
+ auto [success, literal] =
+ convert_to_orc_literal<TYPE_DATETIME>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success);
+
+ // Verify the millisecond value of the converted timestamp
+ ASSERT_EQ(literal.getTimestamp().getMillis(), 1710430245000); // 2024-03-14 15:30:45
+
+ // Midnight time
+ datetime_value.from_date_str("2024-03-14 00:00:00", 19);
+ literal_data =
+ StringRef(reinterpret_cast<const char*>(&datetime_value),
sizeof(datetime_value));
+ std::tie(success, literal) =
+ convert_to_orc_literal<TYPE_DATETIME>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success);
+ ASSERT_EQ(literal.getTimestamp().getMillis(), 1710374400000); // 2024-03-14 00:00:00
+
+ // Leap year handling
+ datetime_value.from_date_str("2024-02-29 12:00:00", 19);
+ literal_data =
+ StringRef(reinterpret_cast<const char*>(&datetime_value),
sizeof(datetime_value));
+ std::tie(success, literal) =
+ convert_to_orc_literal<TYPE_DATETIME>(orc_type_ptr.get(),
literal_data.data, 0, 0);
+ ASSERT_TRUE(success);
+ ASSERT_EQ(literal.getTimestamp().getMillis(), 1709208000000); // 2024-02-29 12:00:00
+ }
+}
+} // namespace vectorized
+} // namespace doris
diff --git a/be/test/vec/exec/orc/orc_memory_stream_test.h
b/be/test/vec/exec/orc/orc_memory_stream_test.h
new file mode 100644
index 00000000000..52c9daad591
--- /dev/null
+++ b/be/test/vec/exec/orc/orc_memory_stream_test.h
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+#include <string>
+
+#include "orc/ColumnPrinter.hh"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_map.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_struct.h"
+#include "vec/columns/columns_number.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type_array.h"
+#include "vec/data_types/data_type_date.h"
+#include "vec/data_types/data_type_date_time.h"
+#include "vec/data_types/data_type_decimal.h"
+#include "vec/data_types/data_type_map.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/data_types/data_type_struct.h"
+#include "vec/exec/format/orc/vorc_reader.h"
+
+namespace doris {
+namespace vectorized {
+
+// orc::OutputStream that appends everything written to it into a
+// fixed-capacity in-memory buffer, so tests can produce ORC data without
+// touching the filesystem.
+//
+// The buffer is allocated once and never grows, which keeps the pointer
+// returned by getData() stable for the lifetime of the stream. The buffer is
+// owned through std::unique_ptr, so the stream cannot be copied (a copy of the
+// previous raw pointer would have been freed twice).
+class MemoryOutputStream : public orc::OutputStream {
+public:
+    // @param capacity maximum number of bytes that can ever be written.
+    explicit MemoryOutputStream(size_t capacity)
+            : data(new char[capacity]),
+              name("MemoryOutputStream"),
+              bufferCapacity(capacity),
+              length(0),
+              naturalWriteSize(2048) {}
+
+    ~MemoryOutputStream() override = default;
+
+    // Number of bytes written so far.
+    uint64_t getLength() const override { return length; }
+
+    uint64_t getNaturalWriteSize() const override { return naturalWriteSize; }
+
+    // Appends `size` bytes from `buf`.
+    // @throws std::length_error if the write would run past the capacity given
+    //         to the constructor (previously this overflowed the heap buffer).
+    void write(const void* buf, size_t size) override {
+        // length <= bufferCapacity always holds, so the subtraction is safe.
+        if (size > bufferCapacity - length) {
+            throw std::length_error("MemoryOutputStream: write exceeds buffer capacity");
+        }
+        if (size == 0) {
+            return; // nothing to copy; also avoids memcpy on a possibly null buf
+        }
+        memcpy(data.get() + length, buf, size);
+        length += size;
+    }
+
+    const std::string& getName() const override { return name; }
+
+    // Start of the buffer; the first getLength() bytes are valid.
+    const char* getData() const { return data.get(); }
+
+    void close() override {}
+
+private:
+    std::unique_ptr<char[]> data;
+    std::string name;
+    uint64_t bufferCapacity;
+    uint64_t length, naturalWriteSize;
+};
+
+// orc::InputStream that serves reads from a caller-provided memory range.
+// The stream only stores the pointer: it neither copies nor frees the data,
+// so the buffer must outlive the stream.
+class MemoryInputStream : public orc::InputStream {
+public:
+    // @param _buffer start of the readable bytes (borrowed, not owned).
+    // @param _size   number of readable bytes at `_buffer`.
+    MemoryInputStream(const char* _buffer, size_t _size)
+            : buffer(_buffer), size(_size), naturalReadSize(1024), name("MemoryInputStream") {}
+
+    ~MemoryInputStream() override = default;
+
+    uint64_t getLength() const override { return size; }
+
+    uint64_t getNaturalReadSize() const override { return naturalReadSize; }
+
+    // Copies `length` bytes starting at `offset` into `buf`.
+    // @throws std::out_of_range if [offset, offset + length) is not fully
+    //         inside the buffer (previously this read out of bounds).
+    void read(void* buf, uint64_t length, uint64_t offset) override {
+        // Written so that offset + length cannot overflow.
+        if (offset > size || length > size - offset) {
+            throw std::out_of_range("MemoryInputStream: read past end of buffer");
+        }
+        if (length == 0) {
+            return; // nothing to copy
+        }
+        memcpy(buf, buffer + offset, length);
+    }
+
+    const std::string& getName() const override { return name; }
+
+private:
+    const char* buffer;
+    uint64_t size, naturalReadSize;
+    std::string name;
+};
+} // namespace vectorized
+} // namespace doris
\ No newline at end of file
diff --git a/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp
b/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp
new file mode 100644
index 00000000000..d896419a338
--- /dev/null
+++ b/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp
@@ -0,0 +1,475 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "orc/ColumnPrinter.hh"
+#include "orc_memory_stream_test.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_struct.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type_array.h"
+#include "vec/data_types/data_type_decimal.h"
+#include "vec/data_types/data_type_map.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_struct.h"
+#include "vec/exec/format/orc/vorc_reader.h"
+
+namespace doris {
+namespace vectorized {
+// Fixture for the _fill_doris_data_column tests. The tests share no state, so
+// the inherited no-op SetUp()/TearDown() are used as-is.
+class OrcReaderFillDataTest : public ::testing::Test {};
+
+// Builds a LongVectorBatch with room for `size` rows and fills it from
+// `values`.
+//
+// @param size   number of rows to fill.
+// @param values per-row values; read only for rows that are not null.
+// @param nulls  per-row null flags. Honoured only when it has exactly `size`
+//               entries; with any other length every row is marked not-null.
+// @return the populated batch; hasNulls mirrors whether `nulls` was honoured.
+std::unique_ptr<orc::LongVectorBatch> create_long_batch(size_t size,
+                                                        const std::vector<int64_t>& values,
+                                                        const std::vector<bool>& nulls = {}) {
+    auto batch = std::make_unique<orc::LongVectorBatch>(size, *orc::getDefaultPool());
+    batch->resize(size);
+    batch->notNull.resize(size);
+
+    // The null flags only count when they cover every row.
+    const bool has_nulls = (nulls.size() == size);
+    batch->hasNulls = has_nulls;
+
+    for (size_t row = 0; row < size; ++row) {
+        const bool is_null = has_nulls && nulls[row];
+        batch->notNull[row] = !is_null;
+        if (!is_null) {
+            batch->data[row] = values[row];
+        }
+    }
+    return batch;
+}
+
+// Fills an Int64 column from a LongVectorBatch without nulls and checks that
+// every value arrives unchanged.
+TEST_F(OrcReaderFillDataTest, TestFillLongColumn) {
+    const std::vector<int64_t> values {1, 2, 3, 4, 5};
+    const size_t row_count = values.size();
+
+    // Source batch and destination column
+    auto batch = create_long_batch(row_count, values);
+    auto column = ColumnInt64::create();
+    auto data_type = std::make_shared<DataTypeInt64>();
+    auto orc_type_ptr = createPrimitiveType(orc::TypeKind::LONG);
+
+    // Reader under test
+    TFileScanRangeParams params;
+    TFileRangeDesc range;
+    auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+    MutableColumnPtr mutable_column = column->assume_mutable();
+    const Status fill_status = reader->_fill_doris_data_column<false>(
+            "test_long", mutable_column, data_type, orc_type_ptr.get(), batch.get(), row_count);
+
+    ASSERT_TRUE(fill_status.ok());
+    ASSERT_EQ(column->size(), row_count);
+
+    for (size_t row = 0; row < row_count; ++row) {
+        ASSERT_EQ(column->get_int(row), values[row]);
+    }
+}
+
+// Fills an Int64 column from a LongVectorBatch that contains nulls. Only the
+// non-null rows are compared; the content of null rows is not checked.
+TEST_F(OrcReaderFillDataTest, TestFillLongColumnWithNull) {
+    const std::vector<int64_t> values {1, 2, 3, 4, 5};
+    const std::vector<bool> nulls {false, true, false, true, false};
+    const size_t row_count = values.size();
+
+    // Source batch and destination column
+    auto batch = create_long_batch(row_count, values, nulls);
+    auto column = ColumnInt64::create();
+    auto data_type = std::make_shared<DataTypeInt64>();
+    auto orc_type_ptr = createPrimitiveType(orc::TypeKind::LONG);
+
+    // Reader under test
+    TFileScanRangeParams params;
+    TFileRangeDesc range;
+    auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+    MutableColumnPtr mutable_column = column->assume_mutable();
+    const Status fill_status = reader->_fill_doris_data_column<false>(
+            "test_long_with_null", mutable_column, data_type, orc_type_ptr.get(), batch.get(),
+            row_count);
+
+    ASSERT_TRUE(fill_status.ok());
+    ASSERT_EQ(column->size(), row_count);
+
+    for (size_t row = 0; row < row_count; ++row) {
+        if (nulls[row]) {
+            continue; // null rows carry no meaningful value
+        }
+        ASSERT_EQ(column->get_int(row), values[row]);
+    }
+}
+
+TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) {
+ // Array type test
+ {
+ using namespace orc;
+ std::unique_ptr<orc::Type>
type(orc::Type::buildTypeFromString("struct<col1:array<int>>"));
+
+ WriterOptions options;
+ options.setMemoryPool(orc::getDefaultPool());
+
+ MemoryOutputStream memStream(100 * 1024 * 1024);
+ std::unique_ptr<orc::Writer> writer = orc::createWriter(*type,
&memStream, options);
+
+ std::unique_ptr<orc::ColumnVectorBatch> batch =
writer->createRowBatch(1024);
+ orc::StructVectorBatch* structBatch =
dynamic_cast<orc::StructVectorBatch*>(batch.get());
+ orc::ListVectorBatch* listBatch =
+ dynamic_cast<orc::ListVectorBatch*>(structBatch->fields[0]);
+ orc::LongVectorBatch* intBatch =
+ dynamic_cast<orc::LongVectorBatch*>(listBatch->elements.get());
+ int64_t* data = intBatch->data.data();
+ int64_t* offsets = listBatch->offsets.data();
+ uint64_t rowCount = 20;
+ uint64_t offset = 0;
+ uint64_t maxListLength = 5;
+ for (uint64_t i = 0; i < rowCount; ++i) {
+ offsets[i] = static_cast<int64_t>(offset);
+ for (uint64_t length = i % maxListLength + 1; length != 0;
--length) {
+ data[offset++] = static_cast<int64_t>(i);
+ }
+ }
+ offsets[rowCount] = static_cast<int64_t>(offset);
+
+ structBatch->numElements = rowCount;
+ listBatch->numElements = rowCount;
+
+ TFileScanRangeParams params;
+ TFileRangeDesc range;
+ auto reader = OrcReader::create_unique(params, range, "", nullptr,
true);
+
+ auto doris_struct_type = std::make_shared<DataTypeStruct>(
+ std::vector<DataTypePtr> {
+
std::make_shared<DataTypeArray>(std::make_shared<DataTypeInt32>())},
+ std::vector<std::string> {"col1"});
+ MutableColumnPtr doris_column =
doris_struct_type->create_column()->assume_mutable();
+
+ Status status = reader->_fill_doris_data_column<false>(
+ "test", doris_column, doris_struct_type, type.get(),
structBatch, rowCount);
+
+ ASSERT_TRUE(status.ok());
+ std::string line;
+ std::unique_ptr<orc::ColumnPrinter> printer =
orc::createColumnPrinter(line, type.get());
+ printer->reset(*structBatch);
+
+ for (int i = 0; i < rowCount; i++) {
+ line.clear();
+ printer->printRow(i);
+ std::cout << "line = " << line << "\n";
+ }
+ Block block {std::vector<ColumnWithTypeAndName> {
+ {doris_column->get_ptr(), doris_struct_type, "cc"}}};
+ std::cout << block.dump_data() << "\n";
+
+ ASSERT_EQ(block.dump_data(),
+ "+-----------------------------+\n"
+ "|cc(Struct(col1:Array(Int32)))|\n"
+ "+-----------------------------+\n"
+ "| {[0]}|\n"
+ "| {[1, 1]}|\n"
+ "| {[2, 2, 2]}|\n"
+ "| {[3, 3, 3, 3]}|\n"
+ "| {[4, 4, 4, 4, 4]}|\n"
+ "| {[5]}|\n"
+ "| {[6, 6]}|\n"
+ "| {[7, 7, 7]}|\n"
+ "| {[8, 8, 8, 8]}|\n"
+ "| {[9, 9, 9, 9, 9]}|\n"
+ "| {[10]}|\n"
+ "| {[11, 11]}|\n"
+ "| {[12, 12, 12]}|\n"
+ "| {[13, 13, 13, 13]}|\n"
+ "| {[14, 14, 14, 14, 14]}|\n"
+ "| {[15]}|\n"
+ "| {[16, 16]}|\n"
+ "| {[17, 17, 17]}|\n"
+ "| {[18, 18, 18, 18]}|\n"
+ "| {[19, 19, 19, 19, 19]}|\n"
+ "+-----------------------------+\n");
+ }
+
+ {
+ using namespace orc;
+ auto type =
std::unique_ptr<Type>(Type::buildTypeFromString("struct<col1:int,col2:int>"));
+
+ size_t rowCount = 10;
+ MemoryOutputStream memStream(100 * 1024 * 1024);
+ WriterOptions options;
+ options.setMemoryPool(getDefaultPool());
+ auto writer = createWriter(*type, &memStream, options);
+ auto batch = writer->createRowBatch(rowCount);
+ auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+ auto& longBatch1 =
dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]);
+ auto& longBatch2 =
dynamic_cast<LongVectorBatch&>(*structBatch.fields[1]);
+ structBatch.numElements = rowCount;
+ longBatch1.numElements = rowCount;
+ longBatch2.numElements = rowCount;
+ for (size_t i = 0; i < rowCount; ++i) {
+ longBatch1.data[i] = static_cast<int64_t>(i * 100);
+ longBatch2.data[i] = static_cast<int64_t>(i * 300);
+ }
+
+ std::string line;
+ std::unique_ptr<orc::ColumnPrinter> printer =
orc::createColumnPrinter(line, type.get());
+ printer->reset(structBatch);
+
+ for (int i = 0; i < rowCount; i++) {
+ line.clear();
+ printer->printRow(i);
+ std::cout << "line = " << line << "\n";
+ }
+
+ TFileScanRangeParams params;
+ TFileRangeDesc range;
+ auto reader = OrcReader::create_unique(params, range, "", nullptr,
true);
+
+ auto doris_struct_type = std::make_shared<DataTypeStruct>(
+ std::vector<DataTypePtr> {std::make_shared<DataTypeInt32>(),
+ std::make_shared<DataTypeInt32>()},
+ std::vector<std::string> {"col1", "col2"});
+ MutableColumnPtr doris_column =
doris_struct_type->create_column()->assume_mutable();
+
+ Status status = reader->_fill_doris_data_column<false>(
+ "test", doris_column, doris_struct_type, type.get(),
&structBatch, rowCount);
+
+ ASSERT_TRUE(status.ok());
+
+ Block block {std::vector<ColumnWithTypeAndName> {
+ {doris_column->get_ptr(), doris_struct_type, "cc"}}};
+ std::cout << block.dump_data() << "\n";
+
+ ASSERT_EQ(block.dump_data(),
+ "+----------------------------------+\n"
+ "|cc(Struct(col1:Int32, col2:Int32))|\n"
+ "+----------------------------------+\n"
+ "| {0, 0}|\n"
+ "| {100, 300}|\n"
+ "| {200, 600}|\n"
+ "| {300, 900}|\n"
+ "| {400, 1200}|\n"
+ "| {500, 1500}|\n"
+ "| {600, 1800}|\n"
+ "| {700, 2100}|\n"
+ "| {800, 2400}|\n"
+ "| {900, 2700}|\n"
+ "+----------------------------------+\n");
+ }
+
+ {
+ using namespace orc;
+
+ const uint64_t maxPrecision = 18;
+ MemoryOutputStream memStream(100 * 1024 * 102);
+ MemoryPool* pool = getDefaultPool();
+ std::unique_ptr<Type>
type(Type::buildTypeFromString("struct<col1:decimal(18,5)>"));
+ WriterOptions options;
+ options.setMemoryPool(pool);
+
+ uint64_t rowCount = 5;
+ std::unique_ptr<Writer> writer = createWriter(*type, &memStream,
options);
+ std::unique_ptr<ColumnVectorBatch> batch =
+ writer->createRowBatch(2 * rowCount + 2 * maxPrecision);
+ StructVectorBatch* structBatch =
dynamic_cast<StructVectorBatch*>(batch.get());
+ Decimal64VectorBatch* decBatch =
+ dynamic_cast<Decimal64VectorBatch*>(structBatch->fields[0]);
+ decBatch->scale = 5;
+ decBatch->precision = 18;
+ // write positive decimals
+ for (uint64_t i = 0; i < rowCount; ++i) {
+ decBatch->values[i] = static_cast<int64_t>(i + 10000);
+ }
+
+ // write negative decimals
+ for (uint64_t i = rowCount; i < 2 * rowCount; ++i) {
+ decBatch->values[i] = static_cast<int64_t>(i - 10000);
+ }
+
+ // write all precision decimals
+ int64_t dec = 0;
+ for (uint64_t i = 2 * rowCount; i < 2 * rowCount + 2 * maxPrecision; i
+= 2) {
+ dec = dec * 10 + 9;
+ decBatch->values[i] = dec;
+ decBatch->values[i + 1] = -dec;
+ }
+ rowCount = 2 * (rowCount + maxPrecision);
+ structBatch->numElements = decBatch->numElements = rowCount;
+
+ std::string line;
+ std::unique_ptr<orc::ColumnPrinter> printer =
orc::createColumnPrinter(line, type.get());
+ printer->reset(*structBatch);
+
+ for (int i = 0; i < rowCount; i++) {
+ line.clear();
+ printer->printRow(i);
+ std::cout << "line = " << line << "\n";
+ }
+
+ TFileScanRangeParams params;
+ TFileRangeDesc range;
+ auto reader = OrcReader::create_unique(params, range, "", nullptr,
true);
+
+ auto doris_struct_type = std::make_shared<DataTypeStruct>(
+ std::vector<DataTypePtr>
{std::make_shared<DataTypeDecimal<Decimal64>>(18, 5)},
+ std::vector<std::string> {"col1"});
+ MutableColumnPtr doris_column =
doris_struct_type->create_column()->assume_mutable();
+ reader->_decimal_scale_params.resize(0);
+ reader->_decimal_scale_params_index = 0;
+ Status status = reader->_fill_doris_data_column<false>(
+ "test", doris_column, doris_struct_type, type.get(),
structBatch, rowCount);
+
+ ASSERT_TRUE(status.ok());
+
+ Block block {std::vector<ColumnWithTypeAndName> {
+ {doris_column->get_ptr(), doris_struct_type, "cc"}}};
+ std::cout << block.dump_data() << "\n";
+ ASSERT_EQ(block.dump_data(),
+ "+-------------------------------+\n"
+ "|cc(Struct(col1:Decimal(18, 5)))|\n"
+ "+-------------------------------+\n"
+ "| {0.10000}|\n"
+ "| {0.10001}|\n"
+ "| {0.10002}|\n"
+ "| {0.10003}|\n"
+ "| {0.10004}|\n"
+ "| {-0.09995}|\n"
+ "| {-0.09994}|\n"
+ "| {-0.09993}|\n"
+ "| {-0.09992}|\n"
+ "| {-0.09991}|\n"
+ "| {0.00009}|\n"
+ "| {-0.00009}|\n"
+ "| {0.00099}|\n"
+ "| {-0.00099}|\n"
+ "| {0.00999}|\n"
+ "| {-0.00999}|\n"
+ "| {0.09999}|\n"
+ "| {-0.09999}|\n"
+ "| {0.99999}|\n"
+ "| {-0.99999}|\n"
+ "| {9.99999}|\n"
+ "| {-9.99999}|\n"
+ "| {99.99999}|\n"
+ "| {-99.99999}|\n"
+ "| {999.99999}|\n"
+ "| {-999.99999}|\n"
+ "| {9999.99999}|\n"
+ "| {-9999.99999}|\n"
+ "| {99999.99999}|\n"
+ "| {-99999.99999}|\n"
+ "| {999999.99999}|\n"
+ "| {-999999.99999}|\n"
+ "| {9999999.99999}|\n"
+ "| {-9999999.99999}|\n"
+ "| {99999999.99999}|\n"
+ "| {-99999999.99999}|\n"
+ "| {999999999.99999}|\n"
+ "| {-999999999.99999}|\n"
+ "| {9999999999.99999}|\n"
+ "| {-9999999999.99999}|\n"
+ "| {99999999999.99999}|\n"
+ "| {-99999999999.99999}|\n"
+ "| {999999999999.99999}|\n"
+ "| {-999999999999.99999}|\n"
+ "| {9999999999999.99999}|\n"
+ "| {-9999999999999.99999}|\n"
+ "+-------------------------------+\n");
+ }
+
+ {
+ using namespace orc;
+ size_t rowCount = 10;
+ MemoryOutputStream memStream(100 * 1024 * 1024);
+ MemoryPool* pool = getDefaultPool();
+ auto type =
std::unique_ptr<Type>(Type::buildTypeFromString("map<int,float>"));
+ WriterOptions options;
+ options.setMemoryPool(pool);
+ auto writer = createWriter(*type, &memStream, options);
+ auto batch = writer->createRowBatch(rowCount * 10);
+ auto& mapBatch = dynamic_cast<MapVectorBatch&>(*batch);
+ int64_t* offsets = mapBatch.offsets.data();
+ auto& keyBatch = dynamic_cast<LongVectorBatch&>(*(mapBatch.keys));
+ auto& valueBatch =
dynamic_cast<DoubleVectorBatch&>(*(mapBatch.elements));
+
+ mapBatch.numElements = rowCount;
+ uint64_t Offset = 0;
+
+ for (size_t i = 0; i < rowCount; ++i) {
+ offsets[i] = static_cast<int64_t>(Offset);
+ for (int j = 0; j < i / 2; j++) {
+ keyBatch.data[Offset] = i * 100;
+ valueBatch.data[Offset] = i * 3.;
+ Offset++;
+ }
+ }
+ offsets[rowCount] = static_cast<int64_t>(Offset);
+
+ keyBatch.numElements = Offset;
+ valueBatch.numElements = Offset;
+
+ std::string line;
+ std::unique_ptr<orc::ColumnPrinter> printer =
orc::createColumnPrinter(line, type.get());
+ printer->reset(mapBatch);
+
+ for (int i = 0; i < rowCount; i++) {
+ line.clear();
+
+ printer->printRow(i);
+ std::cout << "line = " << line << "\n";
+ }
+
+ TFileScanRangeParams params;
+ TFileRangeDesc range;
+ auto reader = OrcReader::create_unique(params, range, "", nullptr,
true);
+
+ auto doris_struct_type =
std::make_shared<DataTypeMap>(std::make_shared<DataTypeInt32>(),
+
std::make_shared<DataTypeFloat32>());
+ MutableColumnPtr doris_column =
doris_struct_type->create_column()->assume_mutable();
+
+ Status status = reader->_fill_doris_data_column<false>(
+ "test", doris_column, doris_struct_type, type.get(),
&mapBatch, rowCount);
+
+ ASSERT_TRUE(status.ok());
+
+ Block block {std::vector<ColumnWithTypeAndName> {
+ {doris_column->get_ptr(), doris_struct_type, "cc"}}};
+ std::cout << block.dump_data() << "\n";
+ ASSERT_EQ(block.dump_data(),
+ "+-----------------------+\n"
+ "|cc(Map(Int32, Float32))|\n"
+ "+-----------------------+\n"
+ "| {}|\n"
+ "| {}|\n"
+ "| {200:6}|\n"
+ "| {300:9}|\n"
+ "| {400:12, 400:12}|\n"
+ "| {500:15, 500:15}|\n"
+ "|{600:18, 600:18, 600...|\n"
+ "|{700:21, 700:21, 700...|\n"
+ "|{800:24, 800:24, 800...|\n"
+ "|{900:27, 900:27, 900...|\n"
+ "+-----------------------+\n");
+ }
+}
+} // namespace vectorized
+} // namespace doris
\ No newline at end of file
diff --git a/be/test/vec/exec/orc/orc_reader_init_column_test.cpp
b/be/test/vec/exec/orc/orc_reader_init_column_test.cpp
new file mode 100644
index 00000000000..44cc9cdfc59
--- /dev/null
+++ b/be/test/vec/exec/orc/orc_reader_init_column_test.cpp
@@ -0,0 +1,359 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "orc/ColumnPrinter.hh"
+#include "orc_memory_stream_test.h"
+#include "vec/core/types.h"
+#include "vec/exec/format/orc/vorc_reader.h"
+
+namespace doris {
+namespace vectorized {
+// Fixture shared by the OrcReader column-initialisation tests in this file.
+// It carries no state and performs no per-test set-up or tear-down; it only
+// provides the common test-suite name used by the TEST_F cases.
+class OrcReaderInitColumnTest : public ::testing::Test {};
+// Checks which ORC column names OrcReader::_init_read_columns() stores in `_read_cols` for four
+// schema / scan-parameter combinations. Each case serialises an in-memory ORC file, injects the
+// resulting orc reader into a fresh OrcReader, points `_column_names` at the requested columns,
+// asserts that the call succeeds and finally compares `_read_cols` with the expected list.
+TEST_F(OrcReaderInitColumnTest, InitReadColumn) {
+    using namespace orc;
+
+    // Writes an ORC file with `schema` into `sink` (a single row batch that is added without
+    // being filled in) and returns an orc reader over the bytes stored in `sink`.
+    // NOTE(review): the input stream is built from sink.getData()/getLength(), so `sink`
+    // presumably has to outlive the returned reader - confirm in orc_memory_stream_test.h.
+    auto open_as_orc_file = [](const Type& schema, MemoryOutputStream& sink) {
+        const size_t batch_capacity = 10;
+        MemoryPool* pool = getDefaultPool();
+
+        WriterOptions writer_options;
+        writer_options.setMemoryPool(pool);
+        auto writer = createWriter(schema, &sink, writer_options);
+        auto batch = writer->createRowBatch(batch_capacity);
+        writer->add(*batch);
+        writer->close();
+
+        ReaderOptions reader_options;
+        reader_options.setMemoryPool(*pool);
+        return createReader(
+                std::make_unique<MemoryInputStream>(sink.getData(), sink.getLength()),
+                reader_options);
+    };
+
+    // Case 1: the requested column name matches a file column name directly.
+    {
+        MemoryOutputStream mem_stream(100 * 1024 * 1024);
+        const std::unique_ptr<Type> schema(
+                Type::buildTypeFromString("struct<col1:int,col2:int>"));
+
+        TFileScanRangeParams params;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+        reader->_reader = open_as_orc_file(*schema, mem_stream);
+
+        std::vector<std::string> column_names {"col1"};
+        reader->_column_names = &column_names;
+
+        Status st = reader->_init_read_columns();
+        ASSERT_TRUE(st.ok()) << "st =" << st;
+
+        const std::list<std::string> expected {"col1"};
+        ASSERT_EQ(expected, reader->_read_cols);
+    }
+
+    // Case 2: `_is_hive1_orc_or_use_idx` is forced on and the scan params map the slot "xxxxx"
+    // to schema position 0; the file column at that position ("col1") is expected to be read.
+    {
+        MemoryOutputStream mem_stream(100 * 1024 * 1024);
+        const std::unique_ptr<Type> schema(
+                Type::buildTypeFromString("struct<col1:int,col2:int>"));
+
+        TFileScanRangeParams params;
+        params.slot_name_to_schema_pos.insert({"xxxxx", 0});
+        params.__isset.slot_name_to_schema_pos = true;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+        reader->_reader = open_as_orc_file(*schema, mem_stream);
+        reader->_is_hive1_orc_or_use_idx = true;
+
+        std::vector<std::string> column_names {"xxxxx"};
+        reader->_column_names = &column_names;
+
+        Status st = reader->_init_read_columns();
+        ASSERT_TRUE(st.ok()) << "st =" << st;
+
+        const std::list<std::string> expected {"col1"};
+        ASSERT_EQ(expected, reader->_read_cols);
+    }
+
+    // Case 3: the file columns are named `_col0`, `_col1`, `_col2` and the scan params map the
+    // slots a/b/c to positions 0/1/2. The flag is NOT set by the test here.
+    // NOTE(review): presumably the `_colN` naming makes the reader switch to position-based
+    // lookup on its own - confirm against OrcReader::_init_read_columns().
+    {
+        MemoryOutputStream mem_stream(100 * 1024 * 1024);
+        const std::unique_ptr<Type> schema(
+                Type::buildTypeFromString("struct<_col0:int,_col1:int,_col2:bigint>"));
+
+        TFileScanRangeParams params;
+        params.slot_name_to_schema_pos.insert({"a", 0});
+        params.slot_name_to_schema_pos.insert({"b", 1});
+        params.slot_name_to_schema_pos.insert({"c", 2});
+        params.__isset.slot_name_to_schema_pos = true;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+        reader->_reader = open_as_orc_file(*schema, mem_stream);
+
+        std::vector<std::string> column_names {"b", "c"};
+        reader->_column_names = &column_names;
+
+        Status st = reader->_init_read_columns();
+        ASSERT_TRUE(st.ok()) << "st =" << st;
+
+        const std::list<std::string> expected {"_col1", "_col2"};
+        ASSERT_EQ(expected, reader->_read_cols);
+    }
+
+    // Case 4: transactional layout. The user columns live inside the "row" struct with
+    // mixed-case names; the lower-case requests "col1"/"column3" are expected to resolve to
+    // "row.CoL1"/"row.colUMN3" once `_is_acid` is set.
+    {
+        auto row_type = createStructType();
+        row_type->addStructField("CoL1", createPrimitiveType(orc::TypeKind::LONG));
+        row_type->addStructField("col2", createPrimitiveType(orc::TypeKind::LONG));
+        row_type->addStructField("colUMN3", createPrimitiveType(orc::TypeKind::LONG));
+
+        auto acid_type = createStructType();
+        acid_type->addStructField("operation", createPrimitiveType(orc::TypeKind::INT));
+        acid_type->addStructField("originalTransaction",
+                                  createPrimitiveType(orc::TypeKind::LONG));
+        acid_type->addStructField("bucket", createPrimitiveType(orc::TypeKind::INT));
+        acid_type->addStructField("rowId", createPrimitiveType(orc::TypeKind::LONG));
+        acid_type->addStructField("currentTransaction",
+                                  createPrimitiveType(orc::TypeKind::LONG));
+        acid_type->addStructField("row", std::move(row_type));
+
+        MemoryOutputStream mem_stream(100 * 1024 * 1024);
+
+        TFileScanRangeParams params;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+        reader->_reader = open_as_orc_file(*acid_type, mem_stream);
+        reader->_is_acid = true;
+
+        std::vector<std::string> column_names {"col1", "column3"};
+        reader->_column_names = &column_names;
+
+        Status st = reader->_init_read_columns();
+        ASSERT_TRUE(st.ok()) << "st =" << st;
+
+        const std::list<std::string> expected {"row.CoL1", "row.colUMN3"};
+        ASSERT_EQ(expected, reader->_read_cols);
+    }
+}
+
+// Exercises OrcReader::_check_acid_schema() with two schemas it must accept (exact and
+// differently-cased ACID field names) and three kinds it must reject (too few fields, one
+// wrong field name, non-struct root types).
+TEST_F(OrcReaderInitColumnTest, CheckAcidSchemaTest) {
+    TFileScanRangeParams scan_params;
+    TFileRangeDesc scan_range;
+    auto reader = OrcReader::create_unique(scan_params, scan_range, "", nullptr, true);
+
+    // Builds a struct made of five primitive fields (INT, LONG, INT, LONG, LONG) followed by
+    // an empty struct field, named after the six arguments in that order.
+    auto six_field_struct = [](const char* operation, const char* original_txn,
+                               const char* bucket, const char* row_id,
+                               const char* current_txn, const char* row) {
+        auto schema = orc::createStructType();
+        schema->addStructField(operation, orc::createPrimitiveType(orc::TypeKind::INT));
+        schema->addStructField(original_txn, orc::createPrimitiveType(orc::TypeKind::LONG));
+        schema->addStructField(bucket, orc::createPrimitiveType(orc::TypeKind::INT));
+        schema->addStructField(row_id, orc::createPrimitiveType(orc::TypeKind::LONG));
+        schema->addStructField(current_txn, orc::createPrimitiveType(orc::TypeKind::LONG));
+        schema->addStructField(row, orc::createStructType());
+        return schema;
+    };
+
+    // 1. The canonical ACID layout is recognised.
+    {
+        auto canonical = six_field_struct("operation", "originalTransaction", "bucket", "rowId",
+                                          "currentTransaction", "row");
+        ASSERT_TRUE(reader->_check_acid_schema(*canonical));
+    }
+
+    // 2. The same layout with different letter casing is recognised as well.
+    {
+        auto mixed_case = six_field_struct("OPERATION", "OriginalTransaction", "Bucket",
+                                           "ROWID", "currentTRANSACTION", "ROW");
+        ASSERT_TRUE(reader->_check_acid_schema(*mixed_case));
+    }
+
+    // 3. A struct holding only the first two ACID fields is rejected.
+    {
+        auto too_short = orc::createStructType();
+        too_short->addStructField("operation", orc::createPrimitiveType(orc::TypeKind::INT));
+        too_short->addStructField("originalTransaction",
+                                  orc::createPrimitiveType(orc::TypeKind::LONG));
+        ASSERT_FALSE(reader->_check_acid_schema(*too_short));
+    }
+
+    // 4. Six fields, but the second one carries an unexpected name: rejected.
+    {
+        auto misnamed = six_field_struct("operation", "wrongName", "bucket", "rowId",
+                                         "currentTransaction", "row");
+        ASSERT_FALSE(reader->_check_acid_schema(*misnamed));
+    }
+
+    // 5. Root types that are not structs are rejected.
+    {
+        auto int_root = orc::createPrimitiveType(orc::TypeKind::INT);
+        ASSERT_FALSE(reader->_check_acid_schema(*int_root));
+
+        auto string_root = orc::createPrimitiveType(orc::TypeKind::STRING);
+        ASSERT_FALSE(reader->_check_acid_schema(*string_root));
+    }
+}
+
+// Exercises OrcReader::_remove_acid(): for an ACID-shaped schema the returned type must be the
+// nested "row" struct, while non-ACID structs and primitive types must come back as the very
+// same object that was passed in.
+TEST_F(OrcReaderInitColumnTest, RemoveAcidTest) {
+    TFileScanRangeParams scan_params;
+    TFileRangeDesc scan_range;
+    auto reader = OrcReader::create_unique(scan_params, scan_range, "", nullptr, true);
+
+    // Wraps `row` into a struct that starts with the five ACID metadata fields and ends with
+    // `row` itself stored under the field name "row".
+    auto with_acid_envelope = [](std::unique_ptr<orc::Type> row) {
+        auto envelope = orc::createStructType();
+        envelope->addStructField("operation", orc::createPrimitiveType(orc::TypeKind::INT));
+        envelope->addStructField("originalTransaction",
+                                 orc::createPrimitiveType(orc::TypeKind::LONG));
+        envelope->addStructField("bucket", orc::createPrimitiveType(orc::TypeKind::INT));
+        envelope->addStructField("rowId", orc::createPrimitiveType(orc::TypeKind::LONG));
+        envelope->addStructField("currentTransaction",
+                                 orc::createPrimitiveType(orc::TypeKind::LONG));
+        envelope->addStructField("row", std::move(row));
+        return envelope;
+    };
+
+    // 1. ACID schema with a flat two-column row: the row struct is what comes back.
+    {
+        auto row = orc::createStructType();
+        row->addStructField("id", orc::createPrimitiveType(orc::TypeKind::INT));
+        row->addStructField("name", orc::createPrimitiveType(orc::TypeKind::STRING));
+        auto acid_schema = with_acid_envelope(std::move(row));
+
+        const orc::Type& stripped = reader->_remove_acid(*acid_schema);
+        ASSERT_EQ(stripped.getKind(), orc::TypeKind::STRUCT);
+        ASSERT_EQ(stripped.getSubtypeCount(), 2); // id and name fields
+        ASSERT_EQ(stripped.getFieldName(0), "id");
+        ASSERT_EQ(stripped.getFieldName(1), "name");
+    }
+
+    // 2. A plain (non-ACID) struct is handed back untouched.
+    {
+        auto plain = orc::createStructType();
+        plain->addStructField("field1", orc::createPrimitiveType(orc::TypeKind::INT));
+        plain->addStructField("field2", orc::createPrimitiveType(orc::TypeKind::STRING));
+
+        const orc::Type& returned = reader->_remove_acid(*plain);
+        ASSERT_EQ(&returned, plain.get()); // Should return the same type
+        ASSERT_EQ(returned.getSubtypeCount(), 2);
+        ASSERT_EQ(returned.getFieldName(0), "field1");
+        ASSERT_EQ(returned.getFieldName(1), "field2");
+    }
+
+    // 3. A primitive (non-struct) type is handed back untouched.
+    {
+        auto primitive = orc::createPrimitiveType(orc::TypeKind::INT);
+        const orc::Type& returned = reader->_remove_acid(*primitive);
+        ASSERT_EQ(&returned, primitive.get());
+        ASSERT_EQ(returned.getKind(), orc::TypeKind::INT);
+    }
+
+    // 4. ACID schema whose row mixes a scalar, a list and a map column.
+    {
+        auto row = orc::createStructType();
+        row->addStructField("id", orc::createPrimitiveType(orc::TypeKind::INT));
+        row->addStructField("tags",
+                            orc::createListType(orc::createPrimitiveType(orc::TypeKind::STRING)));
+        row->addStructField("properties",
+                            orc::createMapType(orc::createPrimitiveType(orc::TypeKind::STRING),
+                                               orc::createPrimitiveType(orc::TypeKind::INT)));
+        auto acid_schema = with_acid_envelope(std::move(row));
+
+        const orc::Type& stripped = reader->_remove_acid(*acid_schema);
+
+        // Shape of the returned struct.
+        ASSERT_EQ(stripped.getKind(), orc::TypeKind::STRUCT);
+        ASSERT_EQ(stripped.getSubtypeCount(), 3); // id, tags, properties
+        ASSERT_EQ(stripped.getFieldName(0), "id");
+        ASSERT_EQ(stripped.getFieldName(1), "tags");
+        ASSERT_EQ(stripped.getFieldName(2), "properties");
+
+        // Kinds of the individual columns.
+        ASSERT_EQ(stripped.getSubtype(0)->getKind(), orc::TypeKind::INT);
+        ASSERT_EQ(stripped.getSubtype(1)->getKind(), orc::TypeKind::LIST);
+        ASSERT_EQ(stripped.getSubtype(2)->getKind(), orc::TypeKind::MAP);
+    }
+}
+
+} // namespace vectorized
+} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]