github-actions[bot] commented on code in PR #32873: URL: https://github.com/apache/doris/pull/32873#discussion_r1547044974
########## be/src/vec/exec/format/column_type_convert.h: ########## @@ -0,0 +1,502 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once +#include <gen_cpp/parquet_types.h> Review Comment: warning: 'gen_cpp/parquet_types.h' file not found [clang-diagnostic-error] ```cpp #include <gen_cpp/parquet_types.h> ^ ``` ########## be/src/vec/exec/format/column_type_convert.cpp: ########## @@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/exec/format/column_type_convert.h" + +namespace doris::vectorized::converter { + +#define FOR_LOGICAL_NUMERIC_TYPES(M) \ + M(TYPE_BOOLEAN) \ + M(TYPE_TINYINT) \ + M(TYPE_SMALLINT) \ + M(TYPE_INT) \ + M(TYPE_BIGINT) \ + M(TYPE_LARGEINT) \ + M(TYPE_FLOAT) \ + M(TYPE_DOUBLE) + +#define FOR_LOGICAL_DECIMAL_TYPES(M) \ + M(TYPE_DECIMALV2) \ + M(TYPE_DECIMAL32) \ + M(TYPE_DECIMAL64) \ + M(TYPE_DECIMAL128I) \ + M(TYPE_DECIMAL256) + +#define FOR_LOGICAL_TIME_TYPES(M) \ + M(TYPE_DATETIME) \ + M(TYPE_DATE) \ + M(TYPE_DATETIMEV2) \ + M(TYPE_DATEV2) + +#define FOR_ALL_LOGICAL_TYPES(M) \ + M(TYPE_BOOLEAN) \ + M(TYPE_TINYINT) \ + M(TYPE_SMALLINT) \ + M(TYPE_INT) \ + M(TYPE_BIGINT) \ + M(TYPE_LARGEINT) \ + M(TYPE_FLOAT) \ + M(TYPE_DOUBLE) \ + M(TYPE_DECIMALV2) \ + M(TYPE_DECIMAL32) \ + M(TYPE_DECIMAL64) \ + M(TYPE_DECIMAL128I) \ + M(TYPE_DECIMAL256) \ + M(TYPE_DATETIME) \ + M(TYPE_DATE) \ + M(TYPE_DATETIMEV2) \ + M(TYPE_DATEV2) + +bool ColumnTypeConverter::is_numeric_type(PrimitiveType type) { + switch (type) { + case TYPE_BOOLEAN: + case TYPE_TINYINT: + case TYPE_SMALLINT: + case TYPE_INT: + case TYPE_BIGINT: + case TYPE_LARGEINT: + case TYPE_FLOAT: + case TYPE_DOUBLE: + return true; + default: + return false; + } +} + +bool ColumnTypeConverter::is_decimal_type(doris::PrimitiveType type) { + switch (type) { + case TYPE_DECIMALV2: + case TYPE_DECIMAL32: + case TYPE_DECIMAL64: + case TYPE_DECIMAL128I: + case TYPE_DECIMAL256: + return true; + default: + return false; + } +} + +ColumnPtr ColumnTypeConverter::get_column(const TypeDescriptor& src_type, ColumnPtr& dst_column, + const DataTypePtr& dst_type) { + if (is_consistent()) { + if (_cached_src_type == nullptr) { + _cached_src_type = + DataTypeFactory::instance().create_data_type(src_type, dst_type->is_nullable()); + } + return dst_column; + } + + if (_cached_src_column == nullptr) { + _cached_src_type = + DataTypeFactory::instance().create_data_type(src_type, dst_type->is_nullable()); + _cached_src_column = + 
DataTypeFactory::instance().create_data_type(src_type, false)->create_column(); + } + // remove the old cached data + _cached_src_column->assume_mutable()->clear(); + + if (dst_type->is_nullable()) { + // In order to share null map between parquet converted src column and dst column to avoid copying. It is very tricky that will + // call mutable function `doris_nullable_column->get_null_map_column_ptr()` which will set `_need_update_has_null = true`. + // Because some operations such as agg will call `has_null()` to set `_need_update_has_null = false`. + auto doris_nullable_column = + const_cast<ColumnNullable*>(static_cast<const ColumnNullable*>(dst_column.get())); + return ColumnNullable::create(_cached_src_column, + doris_nullable_column->get_null_map_column_ptr()); + } + + return _cached_src_column; +} + +std::unique_ptr<ColumnTypeConverter> ColumnTypeConverter::get_converter( Review Comment: warning: function 'get_converter' exceeds recommended size/complexity thresholds [readability-function-size] ```cpp std::unique_ptr<ColumnTypeConverter> ColumnTypeConverter::get_converter( ^ ``` <details> <summary>Additional context</summary> **be/src/vec/exec/format/column_type_convert.cpp:124:** 165 lines including whitespace and comments (threshold 80) ```cpp std::unique_ptr<ColumnTypeConverter> ColumnTypeConverter::get_converter( ^ ``` </details> ########## be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h: ########## @@ -106,12 +104,12 @@ class DeltaDecoder : public Decoder { * Block * [min delta] [list of bitwidths of the mini blocks] [miniblocks] */ -template <typename T, tparquet::Type::type PhysicalType> +template <typename T> class DeltaBitPackDecoder final : public DeltaDecoder { public: using UT = std::make_unsigned_t<T>; - DeltaBitPackDecoder() : DeltaDecoder(new FixLengthPlainDecoder<PhysicalType>()) {} + DeltaBitPackDecoder() : DeltaDecoder(new FixLengthPlainDecoder()) {} Review Comment: warning: use '= default' to define a trivial default constructor 
[modernize-use-equals-default] ```suggestion DeltaBitPackDecoder() : DeltaDecoder(new FixLengthPlainDecoder()) = default; ``` ########## be/src/vec/exec/format/parquet/parquet_column_convert.h: ########## @@ -17,104 +17,17 @@ #pragma once -#include <gen_cpp/PlanNodes_types.h> -#include <gen_cpp/Types_types.h> #include <gen_cpp/parquet_types.h> Review Comment: warning: 'gen_cpp/parquet_types.h' file not found [clang-diagnostic-error] ```cpp #include <gen_cpp/parquet_types.h> ^ ``` ########## be/src/vec/exec/format/parquet/parquet_column_convert.h: ########## @@ -525,262 +398,81 @@ } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { value /= scale_params.scale_factor; } - data[start_idx + i] = (DecimalPhysicalType)value; + data[start_idx + i] = (DecimalType)value; } return Status::OK(); } }; -template <typename DecimalType, typename ValueCopyType> -class StringToDecimalString : public ColumnConvert { -public: - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - convert_null(src_col, dst_col); - - size_t rows = src_col->size(); - - auto buf = static_cast<const ColumnString*>(src_col.get())->get_chars().data(); - auto& offset = static_cast<const ColumnString*>(src_col.get())->get_offsets(); - - auto data = static_cast<ColumnString*>(dst_col.get()); - for (int i = 0; i < rows; i++) { - int len = offset[i] - offset[i - 1]; - // When Decimal in parquet is stored in byte arrays, binary and fixed, - // the unscaled number must be encoded as two's complement using big-endian byte order. 
- ValueCopyType value = 0; - memcpy(reinterpret_cast<char*>(&value), buf + offset[i - 1], len); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - len) * 8); - std::string ans = reinterpret_cast<DecimalType&>(value).to_string( - _convert_params->field_schema->parquet_schema.scale); - data->insert_data(ans.data(), ans.size()); - } - return Status::OK(); - } -}; - -class Int32ToDateString : public ColumnConvert { -public: - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - convert_null(src_col, dst_col); +class Int32ToDate : public PhysicalToLogicalConverter { + Status physical_convert(ColumnPtr& src_physical_col) override { + ColumnPtr src_col = remove_nullable(src_physical_col); + MutableColumnPtr dst_col = remove_nullable(_src_logical_column)->assume_mutable(); size_t rows = src_col->size(); + size_t start_idx = dst_col->size(); + dst_col->reserve(start_idx + rows); auto& src_data = static_cast<const ColumnVector<int32>*>(src_col.get())->get_data(); + auto& data = static_cast<ColumnDateV2*>(dst_col.get())->get_data(); date_day_offset_dict& date_dict = date_day_offset_dict::get(); - auto str_col = static_cast<ColumnString*>(dst_col.get()); - char buf[50]; for (int i = 0; i < rows; i++) { int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days; - DateV2Value<DateV2ValueType> value = date_dict[date_value]; - char* end = value.to_string(buf); - str_col->insert_data(buf, end - buf); + data.push_back_without_reserve(date_dict[date_value].to_date_int_val()); } return Status::OK(); } }; -class Int96ToTimestampString : public ColumnConvert { -public: - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - convert_null(src_col, dst_col); +struct Int64ToTimestamp : public PhysicalToLogicalConverter { + Status physical_convert(ColumnPtr& src_physical_col) override { + ColumnPtr src_col = remove_nullable(src_physical_col); + MutableColumnPtr dst_col = 
remove_nullable(_src_logical_column)->assume_mutable(); - auto& src_data = static_cast<const ColumnVector<Int8>*>(src_col.get())->get_data(); - auto dst_data = static_cast<ColumnString*>(dst_col.get()); + size_t rows = src_col->size(); + size_t start_idx = dst_col->size(); + dst_col->resize(start_idx + rows); - size_t rows = src_col->size() / sizeof(ParquetInt96); - ParquetInt96* data = (ParquetInt96*)src_data.data(); + auto src_data = static_cast<const ColumnVector<int64_t>*>(src_col.get())->get_data().data(); + auto& data = static_cast<ColumnVector<UInt64>*>(dst_col.get())->get_data(); - char buf[50]; for (int i = 0; i < rows; i++) { - uint64_t num = 0; + int64_t x = src_data[i]; + auto& num = data[start_idx + i]; auto& value = reinterpret_cast<DateV2Value<DateTimeV2ValueType>&>(num); - int64_t micros = data[i].to_timestamp_micros(); - value.from_unixtime(micros / 1000000, *_convert_params->ctz); - value.set_microsecond(micros % 1000000); - char* end = value.to_string(buf); - dst_data->insert_data(buf, end - buf); + value.from_unixtime(x / _convert_params->second_mask, *_convert_params->ctz); + value.set_microsecond((x % _convert_params->second_mask) * + (_convert_params->scale_to_nano_factor / 1000)); } return Status::OK(); } }; -inline Status get_converter(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, - std::shared_ptr<const IDataType> dst_data_type, - std::unique_ptr<ColumnConvert>* converter, FieldSchema* field_schema, - cctz::time_zone* ctz) { - std::unique_ptr<ParquetConvert::ConvertParams> convert_params = - std::make_unique<ParquetConvert::ConvertParams>(); - convert_params->init(field_schema, ctz); - auto dst_type = remove_nullable(dst_data_type)->get_type_id(); - switch (dst_type) { -#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ - case NUMERIC_TYPE: \ - switch (parquet_physical_type) { \ - case tparquet::Type::BOOLEAN: \ - *converter = std::make_unique< \ - NumberToNumberConvert<tparquet::Type::BOOLEAN, 
CPP_NUMERIC_TYPE>>(); \ - break; \ - case tparquet::Type::INT32: \ - *converter = std::make_unique< \ - NumberToNumberConvert<tparquet::Type::INT32, CPP_NUMERIC_TYPE>>(); \ - break; \ - case tparquet::Type::INT64: \ - *converter = std::make_unique< \ - NumberToNumberConvert<tparquet::Type::INT64, CPP_NUMERIC_TYPE>>(); \ - break; \ - case tparquet::Type::FLOAT: \ - *converter = std::make_unique< \ - NumberToNumberConvert<tparquet::Type::FLOAT, CPP_NUMERIC_TYPE>>(); \ - break; \ - case tparquet::Type::DOUBLE: \ - *converter = std::make_unique< \ - NumberToNumberConvert<tparquet::Type::DOUBLE, CPP_NUMERIC_TYPE>>(); \ - break; \ - default: \ - break; \ - } \ - break; - FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) -#undef DISPATCH - - case TypeIndex::String: { - if (tparquet::Type::FIXED_LEN_BYTE_ARRAY == parquet_physical_type) { - if (show_type == PrimitiveType::TYPE_DECIMAL32) { - *converter = std::make_unique<StringToDecimalString<Decimal32, Int32>>(); - break; - } else if (show_type == PrimitiveType::TYPE_DECIMAL64) { - *converter = std::make_unique<StringToDecimalString<Decimal64, Int64>>(); - break; - } else if (show_type == PrimitiveType::TYPE_DECIMALV2) { - *converter = std::make_unique<StringToDecimalString<Decimal128V2, Int128>>(); - break; - } else if (show_type == PrimitiveType::TYPE_DECIMAL128I) { - *converter = std::make_unique<StringToDecimalString<Decimal128V2, Int128>>(); - break; - } - - } else if (tparquet::Type::INT96 == parquet_physical_type) { - *converter = std::make_unique<Int96ToTimestampString>(); - break; - } else if (tparquet::Type::INT32 == parquet_physical_type) { - if (show_type == PrimitiveType::TYPE_DATEV2) { - *converter = std::make_unique<Int32ToDateString>(); - break; - } - } - - if (parquet_physical_type == tparquet::Type::BOOLEAN) { - *converter = std::make_unique<NumberToStringConvert<tparquet::Type::BOOLEAN>>(); - } else if (parquet_physical_type == tparquet::Type::INT32) { - *converter = 
std::make_unique<NumberToStringConvert<tparquet::Type::INT32>>(); - - } else if (parquet_physical_type == tparquet::Type::INT64) { - *converter = std::make_unique<NumberToStringConvert<tparquet::Type::INT64>>(); +struct Int96toTimestamp : public PhysicalToLogicalConverter { + Status physical_convert(ColumnPtr& src_physical_col) override { Review Comment: warning: method 'physical_convert' can be made static [readability-convert-member-functions-to-static] ```suggestion static Status physical_convert(ColumnPtr& src_physical_col) override { ``` ########## be/src/vec/exec/format/parquet/parquet_column_convert.h: ########## @@ -193,195 +106,161 @@ } template <typename DecimalPrimitiveType> - void init_decimal_converter(DataTypePtr& data_type) { + void init_decimal_converter(int dst_scale) { if (field_schema == nullptr || decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) { return; } auto scale = field_schema->parquet_schema.scale; - auto* decimal_type = static_cast<DataTypeDecimal<DecimalPrimitiveType>*>( - const_cast<IDataType*>(remove_nullable(data_type).get())); - auto dest_scale = decimal_type->get_scale(); - if (dest_scale > scale) { + if (dst_scale > scale) { decimal_scale.scale_type = DecimalScaleParams::SCALE_UP; decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(dest_scale - scale); - } else if (dest_scale < scale) { + DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(dst_scale - scale); + } else if (dst_scale < scale) { decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN; decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(scale - dest_scale); + DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(scale - dst_scale); } else { decimal_scale.scale_type = DecimalScaleParams::NO_SCALE; decimal_scale.scale_factor = 1; } } }; -/* -* parquet_physical_type : The type of data stored in parquet. 
-* Read data into columns returned by get_column according to the physical type of parquet. -* show_type : The data format that should be displayed. -* doris_column : What type of column does the upper layer need to put the data in. -* -* example : -* In hive, if decimal is stored as FIXED_LENBYTE_ARRAY in parquet, -* then we use `ALTER TABLE TableName CHANGE COLUMN Col_Decimal Col_Decimal String;` -* to convert this column to string type. -* parquet_type : FIXED_LEN_BYTE_ARRAY. -* ans_data_type : ColumnInt8 -* show_type : Decimal. -* doris_column : ColumnString. -*/ -ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, - ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert); - -struct ColumnConvert { - virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); } - - virtual ~ColumnConvert() = default; - - void convert_null(ColumnPtr& src_col, MutableColumnPtr& dst_col) { - src_col = remove_nullable(src_col); - dst_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); - } +class PhysicalToLogicalConverter { +protected: + ColumnPtr _cached_src_physical_column = nullptr; + DataTypePtr _cached_src_physical_type = nullptr; + ColumnPtr _src_logical_column = nullptr; + std::unique_ptr<converter::ColumnTypeConverter> _logical_converter = nullptr; + + std::string _error_msg; -public: std::unique_ptr<ConvertParams> _convert_params; -}; -template <tparquet::Type::type parquet_physical_type, typename dst_type> -struct NumberToNumberConvert : public ColumnConvert { - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - using ColumnType = typename PhysicalTypeTraits<parquet_physical_type>::ColumnType; - convert_null(src_col, dst_col); +public: + static std::unique_ptr<PhysicalToLogicalConverter> get_converter( + FieldSchema* field_schema, const TypeDescriptor& src_logical_type, + const DataTypePtr& dst_logical_type, cctz::time_zone* ctz); - size_t rows = 
src_col->size(); - auto& src_data = static_cast<const ColumnType*>(src_col.get())->get_data(); + static bool is_parquet_native_type(PrimitiveType type); - size_t start_idx = dst_col->size(); - dst_col->resize(start_idx + rows); - auto& data = static_cast<ColumnVector<dst_type>&>(*dst_col.get()).get_data(); - for (int i = 0; i < rows; i++) { - dst_type value = static_cast<dst_type>(src_data[i]); - data[start_idx + i] = value; - } + static bool is_decimal_type(PrimitiveType type); - return Status::OK(); + PhysicalToLogicalConverter() = default; + virtual ~PhysicalToLogicalConverter() = default; + + virtual Status physical_convert(ColumnPtr& src_physical_col) { return Status::OK(); } + + Status convert(ColumnPtr& src_physical_col, MutableColumnPtr& dst_logical_col) { + // convert physical values and save in _src_logical_column + RETURN_IF_ERROR(physical_convert(src_physical_col)); + return _logical_converter->convert(_src_logical_column, dst_logical_col); } -}; -template <tparquet::Type::type parquet_physical_type> -struct NumberToStringConvert : public ColumnConvert { - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - using ColumnType = typename PhysicalTypeTraits<parquet_physical_type>::ColumnType; - convert_null(src_col, dst_col); + virtual ColumnPtr get_physical_column(tparquet::Type::type src_physical_type, + const TypeDescriptor& src_logical_type, + ColumnPtr& dst_logical_column, + const DataTypePtr& dst_logical_type); - size_t rows = src_col->size(); - auto& src_data = static_cast<const ColumnType*>(src_col.get())->get_data(); + DataTypePtr& get_physical_type() { return _cached_src_physical_type; } - char buf[100]; - auto str_col = static_cast<ColumnString*>(dst_col.get()); - for (int i = 0; i < rows; i++) { - if constexpr (parquet_physical_type == tparquet::Type::FLOAT) { - int len = FastFloatToBuffer(src_data[i], buf, true); - str_col->insert_data(buf, len); + virtual bool is_consistent() { return false; } - } else if constexpr 
(parquet_physical_type == tparquet::Type::DOUBLE) { - int len = FastDoubleToBuffer(src_data[i], buf, true); - str_col->insert_data(buf, len); - } else if constexpr (parquet_physical_type == tparquet::Type::INT32) { - char* end = FastInt32ToBufferLeft(src_data[i], buf); - str_col->insert_data(buf, end - buf); + virtual bool support() { return true; } - } else if constexpr (parquet_physical_type == tparquet::Type::INT64) { - char* end = FastInt64ToBufferLeft(src_data[i], buf); - str_col->insert_data(buf, end - buf); + std::string get_error_msg() { return _error_msg; } +}; - } else { - string value = std::to_string(src_data[i]); - str_col->insert_data(value.data(), value.size()); - } - } - return Status::OK(); - } +class ConsistentPhysicalConverter : public PhysicalToLogicalConverter { + bool is_consistent() override { return true; } }; -struct Int96toTimestamp : public ColumnConvert { +class UnsupportedConverter : public PhysicalToLogicalConverter { public: - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - convert_null(src_col, dst_col); + UnsupportedConverter(std::string error_msg) { _error_msg = error_msg; } + + UnsupportedConverter(tparquet::Type::type src_physical_type, + const TypeDescriptor& src_logical_type) { + std::string src_physical_str = tparquet::to_string(src_physical_type); + std::string src_logical_str = + std::string(getTypeName(DataTypeFactory::instance() + .create_data_type(src_logical_type, false) + ->get_type_id())); + _error_msg = src_physical_str + " => " + src_logical_str; + } - size_t rows = src_col->size() / sizeof(ParquetInt96); - auto& src_data = static_cast<const ColumnVector<Int8>*>(src_col.get())->get_data(); - auto ParquetInt96_data = (ParquetInt96*)src_data.data(); - size_t start_idx = dst_col->size(); - dst_col->resize(start_idx + rows); - auto& data = static_cast<ColumnVector<UInt64>*>(dst_col.get())->get_data(); + bool support() override { return false; } - for (int i = 0; i < rows; i++) { - ParquetInt96 
src_cell_data = ParquetInt96_data[i]; - auto& dst_value = - reinterpret_cast<DateV2Value<DateTimeV2ValueType>&>(data[start_idx + i]); - - int64_t timestamp_with_micros = src_cell_data.to_timestamp_micros(); - dst_value.from_unixtime(timestamp_with_micros / 1000000, *_convert_params->ctz); - dst_value.set_microsecond(timestamp_with_micros % 1000000); - } - return Status::OK(); + Status physical_convert(ColumnPtr& src_physical_col) override { + return Status::InternalError("Unsupported physical to logical type: {}", _error_msg); } }; -struct Int64ToTimestamp : public ColumnConvert { -public: - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - convert_null(src_col, dst_col); - - size_t rows = src_col->size(); - size_t start_idx = dst_col->size(); - dst_col->resize(start_idx + rows); - - auto src_data = static_cast<const ColumnVector<int64_t>*>(src_col.get())->get_data().data(); - auto& data = static_cast<ColumnVector<UInt64>*>(dst_col.get())->get_data(); - - for (int i = 0; i < rows; i++) { - int64_t x = src_data[i]; - auto& num = data[start_idx + i]; - auto& value = reinterpret_cast<DateV2Value<DateTimeV2ValueType>&>(num); - value.from_unixtime(x / _convert_params->second_mask, *_convert_params->ctz); - value.set_microsecond((x % _convert_params->second_mask) * - (_convert_params->scale_to_nano_factor / 1000)); +// for tinyint, smallint +template <PrimitiveType IntPrimitiveType> +class LittleIntPhysicalConverter : public PhysicalToLogicalConverter { + Status physical_convert(ColumnPtr& src_physical_col) override { + using DstCppType = typename PrimitiveTypeTraits<IntPrimitiveType>::CppType; + using DstColumnType = typename PrimitiveTypeTraits<IntPrimitiveType>::ColumnType; + ColumnPtr from_col = remove_nullable(src_physical_col); + MutableColumnPtr to_col = remove_nullable(_src_logical_column)->assume_mutable(); + + size_t rows = from_col->size(); + // always comes from tparquet::Type::INT32 + auto& src_data = static_cast<const 
ColumnInt32*>(from_col.get())->get_data(); + size_t start_idx = to_col->size(); + to_col->resize(start_idx + rows); + auto& data = static_cast<DstColumnType&>(*to_col.get()).get_data(); + for (int i = 0; i < rows; ++i) { + data[start_idx + i] = static_cast<DstCppType>(src_data[i]); } + return Status::OK(); } }; -class Int32ToDate : public ColumnConvert { -public: - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - convert_null(src_col, dst_col); - - size_t rows = src_col->size(); - size_t start_idx = dst_col->size(); - dst_col->reserve(start_idx + rows); - - auto& src_data = static_cast<const ColumnVector<int32>*>(src_col.get())->get_data(); - auto& data = static_cast<ColumnDateV2*>(dst_col.get())->get_data(); - date_day_offset_dict& date_dict = date_day_offset_dict::get(); +class FixedSizeBinaryConverter : public PhysicalToLogicalConverter { +private: + int _type_length; - for (int i = 0; i < rows; i++) { - int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days; - data.push_back_without_reserve(date_dict[date_value].to_date_int_val()); +public: + FixedSizeBinaryConverter(int type_length) : _type_length(type_length) {} + + Status physical_convert(ColumnPtr& src_physical_col) override { Review Comment: warning: method 'physical_convert' can be made const [readability-make-member-function-const] ```suggestion Status physical_convert(ColumnPtr& src_physical_col) const override { ``` ########## be/src/vec/exec/format/parquet/fix_length_plain_decoder.h: ########## @@ -18,25 +18,21 @@ #pragma once #include <gen_cpp/parquet_types.h> Review Comment: warning: 'gen_cpp/parquet_types.h' file not found [clang-diagnostic-error] ```cpp #include <gen_cpp/parquet_types.h> ^ ``` ########## be/src/vec/exec/format/parquet/parquet_column_convert.h: ########## @@ -525,262 +398,81 @@ } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { value /= scale_params.scale_factor; } - data[start_idx + i] = (DecimalPhysicalType)value; + 
data[start_idx + i] = (DecimalType)value; } return Status::OK(); } }; -template <typename DecimalType, typename ValueCopyType> -class StringToDecimalString : public ColumnConvert { -public: - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - convert_null(src_col, dst_col); - - size_t rows = src_col->size(); - - auto buf = static_cast<const ColumnString*>(src_col.get())->get_chars().data(); - auto& offset = static_cast<const ColumnString*>(src_col.get())->get_offsets(); - - auto data = static_cast<ColumnString*>(dst_col.get()); - for (int i = 0; i < rows; i++) { - int len = offset[i] - offset[i - 1]; - // When Decimal in parquet is stored in byte arrays, binary and fixed, - // the unscaled number must be encoded as two's complement using big-endian byte order. - ValueCopyType value = 0; - memcpy(reinterpret_cast<char*>(&value), buf + offset[i - 1], len); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - len) * 8); - std::string ans = reinterpret_cast<DecimalType&>(value).to_string( - _convert_params->field_schema->parquet_schema.scale); - data->insert_data(ans.data(), ans.size()); - } - return Status::OK(); - } -}; - -class Int32ToDateString : public ColumnConvert { -public: - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - convert_null(src_col, dst_col); +class Int32ToDate : public PhysicalToLogicalConverter { + Status physical_convert(ColumnPtr& src_physical_col) override { + ColumnPtr src_col = remove_nullable(src_physical_col); + MutableColumnPtr dst_col = remove_nullable(_src_logical_column)->assume_mutable(); size_t rows = src_col->size(); + size_t start_idx = dst_col->size(); + dst_col->reserve(start_idx + rows); auto& src_data = static_cast<const ColumnVector<int32>*>(src_col.get())->get_data(); + auto& data = static_cast<ColumnDateV2*>(dst_col.get())->get_data(); date_day_offset_dict& date_dict = date_day_offset_dict::get(); - auto str_col = 
static_cast<ColumnString*>(dst_col.get()); - char buf[50]; for (int i = 0; i < rows; i++) { int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days; - DateV2Value<DateV2ValueType> value = date_dict[date_value]; - char* end = value.to_string(buf); - str_col->insert_data(buf, end - buf); + data.push_back_without_reserve(date_dict[date_value].to_date_int_val()); } return Status::OK(); } }; -class Int96ToTimestampString : public ColumnConvert { -public: - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - convert_null(src_col, dst_col); +struct Int64ToTimestamp : public PhysicalToLogicalConverter { + Status physical_convert(ColumnPtr& src_physical_col) override { Review Comment: warning: method 'physical_convert' can be made static [readability-convert-member-functions-to-static] ```suggestion static Status physical_convert(ColumnPtr& src_physical_col) override { ``` ########## be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h: ########## @@ -417,9 +410,8 @@ return Status::OK(); } -template <typename T, tparquet::Type::type PhysicalType> -Status DeltaBitPackDecoder<T, PhysicalType>::_get_internal(T* buffer, int num_values, - int* out_num_values) { +template <typename T> +Status DeltaBitPackDecoder<T>::_get_internal(T* buffer, int num_values, int* out_num_values) { Review Comment: warning: function '_get_internal' has cognitive complexity of 76 (threshold 50) [readability-function-cognitive-complexity] ```cpp Status DeltaBitPackDecoder<T>::_get_internal(T* buffer, int num_values, int* out_num_values) { ^ ``` <details> <summary>Additional context</summary> **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:415:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (num_values == 0) { ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:420:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp while (i < num_values) { ^ ``` 
**be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:421:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp if (PREDICT_FALSE(_values_remaining_current_mini_block == 0)) { ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:422:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp if (PREDICT_FALSE(!_block_initialized)) { ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:425:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp if (i == num_values) { ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:433:** +5, including nesting penalty of 4, nesting level increased to 5 ```cpp if (_total_value_count != 1) { ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:434:** +6, including nesting penalty of 5, nesting level increased to 6 ```cpp RETURN_IF_ERROR(_init_block()); ^ ``` **be/src/common/status.h:541:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:434:** +7, including nesting penalty of 6, nesting level increased to 7 ```cpp RETURN_IF_ERROR(_init_block()); ^ ``` **be/src/common/status.h:543:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:438:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp RETURN_IF_ERROR(_init_block()); ^ ``` **be/src/common/status.h:541:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:438:** +5, including nesting penalty of 4, nesting level increased to 5 ```cpp RETURN_IF_ERROR(_init_block()); ^ ``` **be/src/common/status.h:543:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:439:** +1, nesting level increased to 3 ```cpp } else { ^ ``` 
**be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:441:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp if (_mini_block_idx < _mini_blocks_per_block) { ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:442:** +5, including nesting penalty of 4, nesting level increased to 5 ```cpp RETURN_IF_ERROR(_init_mini_block(_delta_bit_widths.data()[_mini_block_idx])); ^ ``` **be/src/common/status.h:541:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:442:** +6, including nesting penalty of 5, nesting level increased to 6 ```cpp RETURN_IF_ERROR(_init_mini_block(_delta_bit_widths.data()[_mini_block_idx])); ^ ``` **be/src/common/status.h:543:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:443:** +1, nesting level increased to 4 ```cpp } else { ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:444:** +5, including nesting penalty of 4, nesting level increased to 5 ```cpp RETURN_IF_ERROR(_init_block()); ^ ``` **be/src/common/status.h:541:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:444:** +6, including nesting penalty of 5, nesting level increased to 6 ```cpp RETURN_IF_ERROR(_init_block()); ^ ``` **be/src/common/status.h:543:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:451:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp for (int j = 0; j < values_decode; ++j) { ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:452:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp if (!_bit_reader->GetValue(_delta_bit_width, buffer + i + j)) { ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:456:** +2, including nesting 
penalty of 1, nesting level increased to 2 ```cpp for (int j = 0; j < values_decode; ++j) { ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:468:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (PREDICT_FALSE(_total_values_remaining == 0)) { ^ ``` **be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h:469:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp if (!_bit_reader->Advance(_delta_bit_width * _values_remaining_current_mini_block)) { ^ ``` </details> ########## be/src/vec/exec/format/parquet/fix_length_plain_decoder.h: ########## @@ -47,106 +43,35 @@ ColumnSelectVector& select_vector, bool is_dict_filter); Status skip_values(size_t num_values) override; - -protected: - template <bool has_filter> - Status _decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); - - template <bool has_filter> - Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); }; -template <tparquet::Type::type PhysicalType> -Status FixLengthPlainDecoder<PhysicalType>::skip_values(size_t num_values) { +Status FixLengthPlainDecoder::skip_values(size_t num_values) { _offset += _type_length * num_values; if (UNLIKELY(_offset > _data->size)) { return Status::IOError("Out-of-bounds access in parquet data decoder"); } return Status::OK(); } -template <tparquet::Type::type PhysicalType> -Status FixLengthPlainDecoder<PhysicalType>::decode_values(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector, - bool is_dict_filter) { +Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, Review Comment: warning: method 'decode_values' can be made static [readability-convert-member-functions-to-static] ```suggestion static Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ``` ########## be/src/vec/exec/format/column_type_convert.cpp: ########## 
@@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/format/column_type_convert.h" + +namespace doris::vectorized::converter { + +#define FOR_LOGICAL_NUMERIC_TYPES(M) \ + M(TYPE_BOOLEAN) \ + M(TYPE_TINYINT) \ + M(TYPE_SMALLINT) \ + M(TYPE_INT) \ + M(TYPE_BIGINT) \ + M(TYPE_LARGEINT) \ + M(TYPE_FLOAT) \ + M(TYPE_DOUBLE) + +#define FOR_LOGICAL_DECIMAL_TYPES(M) \ + M(TYPE_DECIMALV2) \ + M(TYPE_DECIMAL32) \ + M(TYPE_DECIMAL64) \ + M(TYPE_DECIMAL128I) \ + M(TYPE_DECIMAL256) + +#define FOR_LOGICAL_TIME_TYPES(M) \ + M(TYPE_DATETIME) \ + M(TYPE_DATE) \ + M(TYPE_DATETIMEV2) \ + M(TYPE_DATEV2) + +#define FOR_ALL_LOGICAL_TYPES(M) \ + M(TYPE_BOOLEAN) \ + M(TYPE_TINYINT) \ + M(TYPE_SMALLINT) \ + M(TYPE_INT) \ + M(TYPE_BIGINT) \ + M(TYPE_LARGEINT) \ + M(TYPE_FLOAT) \ + M(TYPE_DOUBLE) \ + M(TYPE_DECIMALV2) \ + M(TYPE_DECIMAL32) \ + M(TYPE_DECIMAL64) \ + M(TYPE_DECIMAL128I) \ + M(TYPE_DECIMAL256) \ + M(TYPE_DATETIME) \ + M(TYPE_DATE) \ + M(TYPE_DATETIMEV2) \ + M(TYPE_DATEV2) + +bool ColumnTypeConverter::is_numeric_type(PrimitiveType type) { + switch (type) { + case TYPE_BOOLEAN: + case TYPE_TINYINT: + case TYPE_SMALLINT: + case TYPE_INT: + case TYPE_BIGINT: + case TYPE_LARGEINT: + case TYPE_FLOAT: 
+ case TYPE_DOUBLE: + return true; + default: + return false; + } +} + +bool ColumnTypeConverter::is_decimal_type(doris::PrimitiveType type) { + switch (type) { + case TYPE_DECIMALV2: + case TYPE_DECIMAL32: + case TYPE_DECIMAL64: + case TYPE_DECIMAL128I: + case TYPE_DECIMAL256: + return true; + default: + return false; + } +} + +ColumnPtr ColumnTypeConverter::get_column(const TypeDescriptor& src_type, ColumnPtr& dst_column, + const DataTypePtr& dst_type) { + if (is_consistent()) { + if (_cached_src_type == nullptr) { + _cached_src_type = + DataTypeFactory::instance().create_data_type(src_type, dst_type->is_nullable()); + } + return dst_column; + } + + if (_cached_src_column == nullptr) { + _cached_src_type = + DataTypeFactory::instance().create_data_type(src_type, dst_type->is_nullable()); + _cached_src_column = + DataTypeFactory::instance().create_data_type(src_type, false)->create_column(); + } + // remove the old cached data + _cached_src_column->assume_mutable()->clear(); + + if (dst_type->is_nullable()) { + // In order to share null map between parquet converted src column and dst column to avoid copying. It is very tricky that will + // call mutable function `doris_nullable_column->get_null_map_column_ptr()` which will set `_need_update_has_null = true`. + // Because some operations such as agg will call `has_null()` to set `_need_update_has_null = false`. 
+ auto doris_nullable_column = + const_cast<ColumnNullable*>(static_cast<const ColumnNullable*>(dst_column.get())); + return ColumnNullable::create(_cached_src_column, + doris_nullable_column->get_null_map_column_ptr()); + } + + return _cached_src_column; +} + +std::unique_ptr<ColumnTypeConverter> ColumnTypeConverter::get_converter( Review Comment: warning: function 'get_converter' has cognitive complexity of 108 (threshold 50) [readability-function-cognitive-complexity] ```cpp std::unique_ptr<ColumnTypeConverter> ColumnTypeConverter::get_converter( ^ ``` <details> <summary>Additional context</summary> **be/src/vec/exec/format/column_type_convert.cpp:129:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (src_primitive_type == dst_primitive_type) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:132:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (is_string_type(src_primitive_type) && is_string_type(dst_primitive_type)) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:132:** +1 ```cpp if (is_string_type(src_primitive_type) && is_string_type(dst_primitive_type)) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:138:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (is_numeric_type(src_primitive_type) && is_numeric_type(dst_primitive_type)) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:138:** +1 ```cpp if (is_numeric_type(src_primitive_type) && is_numeric_type(dst_primitive_type)) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:139:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp switch (src_primitive_type) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:163:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:163:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp 
FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:163:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:163:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:163:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:163:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:163:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:163:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:172:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (is_string_type(dst_primitive_type)) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:174:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp if (is_numeric_type(src_primitive_type)) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:175:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp switch (src_primitive_type) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:184:** +1, nesting level increased to 2 ```cpp } else if (is_decimal_type(src_primitive_type)) { // decimal type to string ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:185:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp switch (src_primitive_type) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:194:** +1, nesting level 
increased to 2 ```cpp } else if (is_date_type(src_primitive_type)) { // date and datetime type to string ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:195:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp switch (src_primitive_type) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:210:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (is_string_type(src_primitive_type)) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:211:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp switch (dst_primitive_type) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:224:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (src_primitive_type == TYPE_DATEV2 && dst_primitive_type == TYPE_DATETIMEV2) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:224:** +1 ```cpp if (src_primitive_type == TYPE_DATEV2 && dst_primitive_type == TYPE_DATETIMEV2) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:227:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (src_primitive_type == TYPE_DATETIMEV2 && dst_primitive_type == TYPE_DATEV2) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:227:** +1 ```cpp if (src_primitive_type == TYPE_DATETIMEV2 && dst_primitive_type == TYPE_DATEV2) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:232:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (is_numeric_type(src_primitive_type) && is_decimal_type(dst_primitive_type)) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:232:** +1 ```cpp if (is_numeric_type(src_primitive_type) && is_decimal_type(dst_primitive_type)) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:234:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp switch (src_primitive_type) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:253:** +3, including nesting penalty of 2, nesting level 
increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:253:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:253:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:253:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:253:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:253:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:253:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:253:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:261:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (is_decimal_type(src_primitive_type) && is_numeric_type(dst_primitive_type)) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:261:** +1 ```cpp if (is_decimal_type(src_primitive_type) && is_numeric_type(dst_primitive_type)) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:263:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp switch (dst_primitive_type) { ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:282:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` 
**be/src/vec/exec/format/column_type_convert.cpp:282:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:282:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:282:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:282:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:282:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:282:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/column_type_convert.cpp:282:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) ^ ``` </details> ########## be/src/vec/exec/format/parquet/parquet_column_convert.h: ########## @@ -193,195 +106,161 @@ } template <typename DecimalPrimitiveType> - void init_decimal_converter(DataTypePtr& data_type) { + void init_decimal_converter(int dst_scale) { if (field_schema == nullptr || decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) { return; } auto scale = field_schema->parquet_schema.scale; - auto* decimal_type = static_cast<DataTypeDecimal<DecimalPrimitiveType>*>( - const_cast<IDataType*>(remove_nullable(data_type).get())); - auto dest_scale = decimal_type->get_scale(); - if (dest_scale > scale) { + if (dst_scale > scale) { decimal_scale.scale_type = DecimalScaleParams::SCALE_UP; decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(dest_scale - 
scale); - } else if (dest_scale < scale) { + DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(dst_scale - scale); + } else if (dst_scale < scale) { decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN; decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(scale - dest_scale); + DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(scale - dst_scale); } else { decimal_scale.scale_type = DecimalScaleParams::NO_SCALE; decimal_scale.scale_factor = 1; } } }; -/* -* parquet_physical_type : The type of data stored in parquet. -* Read data into columns returned by get_column according to the physical type of parquet. -* show_type : The data format that should be displayed. -* doris_column : What type of column does the upper layer need to put the data in. -* -* example : -* In hive, if decimal is stored as FIXED_LENBYTE_ARRAY in parquet, -* then we use `ALTER TABLE TableName CHANGE COLUMN Col_Decimal Col_Decimal String;` -* to convert this column to string type. -* parquet_type : FIXED_LEN_BYTE_ARRAY. -* ans_data_type : ColumnInt8 -* show_type : Decimal. -* doris_column : ColumnString. 
-*/ -ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, - ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert); - -struct ColumnConvert { - virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); } - - virtual ~ColumnConvert() = default; - - void convert_null(ColumnPtr& src_col, MutableColumnPtr& dst_col) { - src_col = remove_nullable(src_col); - dst_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); - } +class PhysicalToLogicalConverter { +protected: + ColumnPtr _cached_src_physical_column = nullptr; + DataTypePtr _cached_src_physical_type = nullptr; + ColumnPtr _src_logical_column = nullptr; + std::unique_ptr<converter::ColumnTypeConverter> _logical_converter = nullptr; + + std::string _error_msg; -public: std::unique_ptr<ConvertParams> _convert_params; -}; -template <tparquet::Type::type parquet_physical_type, typename dst_type> -struct NumberToNumberConvert : public ColumnConvert { - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - using ColumnType = typename PhysicalTypeTraits<parquet_physical_type>::ColumnType; - convert_null(src_col, dst_col); +public: + static std::unique_ptr<PhysicalToLogicalConverter> get_converter( + FieldSchema* field_schema, const TypeDescriptor& src_logical_type, + const DataTypePtr& dst_logical_type, cctz::time_zone* ctz); - size_t rows = src_col->size(); - auto& src_data = static_cast<const ColumnType*>(src_col.get())->get_data(); + static bool is_parquet_native_type(PrimitiveType type); - size_t start_idx = dst_col->size(); - dst_col->resize(start_idx + rows); - auto& data = static_cast<ColumnVector<dst_type>&>(*dst_col.get()).get_data(); - for (int i = 0; i < rows; i++) { - dst_type value = static_cast<dst_type>(src_data[i]); - data[start_idx + i] = value; - } + static bool is_decimal_type(PrimitiveType type); - return Status::OK(); + PhysicalToLogicalConverter() = default; + virtual 
~PhysicalToLogicalConverter() = default; + + virtual Status physical_convert(ColumnPtr& src_physical_col) { return Status::OK(); } + + Status convert(ColumnPtr& src_physical_col, MutableColumnPtr& dst_logical_col) { Review Comment: warning: method 'convert' can be made static [readability-convert-member-functions-to-static] ```suggestion static Status convert(ColumnPtr& src_physical_col, MutableColumnPtr& dst_logical_col) { ``` ########## be/src/vec/exec/format/parquet/parquet_column_convert.cpp: ########## @@ -20,67 +20,278 @@ #include <cctz/time_zone.h> #include "vec/columns/column_nullable.h" -namespace doris::vectorized { -namespace ParquetConvert { +namespace doris::vectorized::parquet { const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); -ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, - ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert) { - ColumnPtr ans_column = doris_column; - DataTypePtr tmp_data_type; - - switch (parquet_physical_type) { - case tparquet::Type::type::BOOLEAN: - tmp_data_type = std::make_shared<DataTypeUInt8>(); - break; - case tparquet::Type::type::INT32: - tmp_data_type = std::make_shared<DataTypeInt32>(); - break; - case tparquet::Type::type::INT64: - tmp_data_type = std::make_shared<DataTypeInt64>(); - break; - case tparquet::Type::type::FLOAT: - tmp_data_type = std::make_shared<DataTypeFloat32>(); - break; - case tparquet::Type::type::DOUBLE: - tmp_data_type = std::make_shared<DataTypeFloat64>(); - break; - case tparquet::Type::type::BYTE_ARRAY: - case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: - tmp_data_type = std::make_shared<DataTypeString>(); - break; - case tparquet::Type::type::INT96: - tmp_data_type = std::make_shared<DataTypeInt8>(); - break; +#define FOR_LOGICAL_DECIMAL_TYPES(M) \ + M(TYPE_DECIMALV2) \ + M(TYPE_DECIMAL32) \ + M(TYPE_DECIMAL64) \ + M(TYPE_DECIMAL128I) + +bool PhysicalToLogicalConverter::is_parquet_native_type(PrimitiveType type) { + 
switch (type) { + case TYPE_BOOLEAN: + case TYPE_INT: + case TYPE_BIGINT: + case TYPE_FLOAT: + case TYPE_DOUBLE: + case TYPE_STRING: + case TYPE_CHAR: + case TYPE_VARCHAR: + return true; + default: + return false; } +} + +bool PhysicalToLogicalConverter::is_decimal_type(doris::PrimitiveType type) { + switch (type) { + case TYPE_DECIMAL32: + case TYPE_DECIMAL64: + case TYPE_DECIMAL128I: + case TYPE_DECIMALV2: + return true; + default: + return false; + } +} - if (tmp_data_type->get_type_id() == remove_nullable(doris_type)->get_type_id()) { - if (tmp_data_type->get_type_id() == TypeIndex::String && - (show_type == PrimitiveType::TYPE_DECIMAL32 || - show_type == PrimitiveType::TYPE_DECIMAL64 || - show_type == PrimitiveType::TYPE_DECIMALV2 || - show_type == PrimitiveType::TYPE_DECIMAL128I)) { - *need_convert = true; - ans_column = tmp_data_type->create_column(); +ColumnPtr PhysicalToLogicalConverter::get_physical_column(tparquet::Type::type src_physical_type, + const TypeDescriptor& src_logical_type, + ColumnPtr& dst_logical_column, + const DataTypePtr& dst_logical_type) { + if (is_consistent() && _logical_converter->is_consistent()) { + if (_cached_src_physical_type == nullptr) { + _cached_src_physical_type = DataTypeFactory::instance().create_data_type( + src_logical_type, dst_logical_type->is_nullable()); + } + return dst_logical_column; + } + + if (_cached_src_physical_column == nullptr) { + switch (src_physical_type) { + case tparquet::Type::type::BOOLEAN: + _cached_src_physical_type = std::make_shared<DataTypeUInt8>(); + break; + case tparquet::Type::type::INT32: + _cached_src_physical_type = std::make_shared<DataTypeInt32>(); + break; + case tparquet::Type::type::INT64: + _cached_src_physical_type = std::make_shared<DataTypeInt64>(); + break; + case tparquet::Type::type::FLOAT: + _cached_src_physical_type = std::make_shared<DataTypeFloat32>(); + break; + case tparquet::Type::type::DOUBLE: + _cached_src_physical_type = std::make_shared<DataTypeFloat64>(); + 
break; + case tparquet::Type::type::BYTE_ARRAY: + _cached_src_physical_type = std::make_shared<DataTypeString>(); + break; + case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: + _cached_src_physical_type = std::make_shared<DataTypeUInt8>(); + break; + case tparquet::Type::type::INT96: + _cached_src_physical_type = std::make_shared<DataTypeInt8>(); + break; + } + _cached_src_physical_column = _cached_src_physical_type->create_column(); + if (dst_logical_type->is_nullable()) { + _cached_src_physical_type = make_nullable(_cached_src_physical_type); + } + } + // remove the old cached data + _cached_src_physical_column->assume_mutable()->clear(); + if (is_consistent()) { + if (dst_logical_type->is_nullable()) { + auto doris_nullable_column = const_cast<ColumnNullable*>( + static_cast<const ColumnNullable*>(dst_logical_column.get())); + _src_logical_column = ColumnNullable::create( + _cached_src_physical_column, doris_nullable_column->get_null_map_column_ptr()); } else { - *need_convert = false; + _src_logical_column = _cached_src_physical_column; } } else { - ans_column = tmp_data_type->create_column(); - *need_convert = true; + _src_logical_column = _logical_converter->get_column(src_logical_type, dst_logical_column, + dst_logical_type); } - if (*need_convert && doris_type->is_nullable()) { + if (dst_logical_type->is_nullable()) { // In order to share null map between parquet converted src column and dst column to avoid copying. It is very tricky that will // call mutable function `doris_nullable_column->get_null_map_column_ptr()` which will set `_need_update_has_null = true`. // Because some operations such as agg will call `has_null()` to set `_need_update_has_null = false`. 
- auto doris_nullable_column = - const_cast<ColumnNullable*>(static_cast<const ColumnNullable*>(doris_column.get())); - ans_column = ColumnNullable::create(ans_column, - doris_nullable_column->get_null_map_column_ptr()); + auto doris_nullable_column = const_cast<ColumnNullable*>( + static_cast<const ColumnNullable*>(dst_logical_column.get())); + return ColumnNullable::create(_cached_src_physical_column, + doris_nullable_column->get_null_map_column_ptr()); + } + + return _cached_src_physical_column; +} + +static void get_decimal_converter(FieldSchema* field_schema, const TypeDescriptor& src_logical_type, Review Comment: warning: function 'get_decimal_converter' exceeds recommended size/complexity thresholds [readability-function-size] ```cpp static void get_decimal_converter(FieldSchema* field_schema, const TypeDescriptor& src_logical_type, ^ ``` <details> <summary>Additional context</summary> **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:132:** 103 lines including whitespace and comments (threshold 80) ```cpp static void get_decimal_converter(FieldSchema* field_schema, const TypeDescriptor& src_logical_type, ^ ``` </details> ########## be/src/vec/exec/format/parquet/parquet_column_convert.cpp: ########## @@ -20,67 +20,278 @@ #include <cctz/time_zone.h> #include "vec/columns/column_nullable.h" -namespace doris::vectorized { -namespace ParquetConvert { +namespace doris::vectorized::parquet { const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); -ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, - ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert) { - ColumnPtr ans_column = doris_column; - DataTypePtr tmp_data_type; - - switch (parquet_physical_type) { - case tparquet::Type::type::BOOLEAN: - tmp_data_type = std::make_shared<DataTypeUInt8>(); - break; - case tparquet::Type::type::INT32: - tmp_data_type = std::make_shared<DataTypeInt32>(); - break; - case tparquet::Type::type::INT64: - 
tmp_data_type = std::make_shared<DataTypeInt64>(); - break; - case tparquet::Type::type::FLOAT: - tmp_data_type = std::make_shared<DataTypeFloat32>(); - break; - case tparquet::Type::type::DOUBLE: - tmp_data_type = std::make_shared<DataTypeFloat64>(); - break; - case tparquet::Type::type::BYTE_ARRAY: - case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: - tmp_data_type = std::make_shared<DataTypeString>(); - break; - case tparquet::Type::type::INT96: - tmp_data_type = std::make_shared<DataTypeInt8>(); - break; +#define FOR_LOGICAL_DECIMAL_TYPES(M) \ + M(TYPE_DECIMALV2) \ + M(TYPE_DECIMAL32) \ + M(TYPE_DECIMAL64) \ + M(TYPE_DECIMAL128I) + +bool PhysicalToLogicalConverter::is_parquet_native_type(PrimitiveType type) { + switch (type) { + case TYPE_BOOLEAN: + case TYPE_INT: + case TYPE_BIGINT: + case TYPE_FLOAT: + case TYPE_DOUBLE: + case TYPE_STRING: + case TYPE_CHAR: + case TYPE_VARCHAR: + return true; + default: + return false; } +} + +bool PhysicalToLogicalConverter::is_decimal_type(doris::PrimitiveType type) { + switch (type) { + case TYPE_DECIMAL32: + case TYPE_DECIMAL64: + case TYPE_DECIMAL128I: + case TYPE_DECIMALV2: + return true; + default: + return false; + } +} - if (tmp_data_type->get_type_id() == remove_nullable(doris_type)->get_type_id()) { - if (tmp_data_type->get_type_id() == TypeIndex::String && - (show_type == PrimitiveType::TYPE_DECIMAL32 || - show_type == PrimitiveType::TYPE_DECIMAL64 || - show_type == PrimitiveType::TYPE_DECIMALV2 || - show_type == PrimitiveType::TYPE_DECIMAL128I)) { - *need_convert = true; - ans_column = tmp_data_type->create_column(); +ColumnPtr PhysicalToLogicalConverter::get_physical_column(tparquet::Type::type src_physical_type, + const TypeDescriptor& src_logical_type, + ColumnPtr& dst_logical_column, + const DataTypePtr& dst_logical_type) { + if (is_consistent() && _logical_converter->is_consistent()) { + if (_cached_src_physical_type == nullptr) { + _cached_src_physical_type = DataTypeFactory::instance().create_data_type( + 
src_logical_type, dst_logical_type->is_nullable()); + } + return dst_logical_column; + } + + if (_cached_src_physical_column == nullptr) { + switch (src_physical_type) { + case tparquet::Type::type::BOOLEAN: + _cached_src_physical_type = std::make_shared<DataTypeUInt8>(); + break; + case tparquet::Type::type::INT32: + _cached_src_physical_type = std::make_shared<DataTypeInt32>(); + break; + case tparquet::Type::type::INT64: + _cached_src_physical_type = std::make_shared<DataTypeInt64>(); + break; + case tparquet::Type::type::FLOAT: + _cached_src_physical_type = std::make_shared<DataTypeFloat32>(); + break; + case tparquet::Type::type::DOUBLE: + _cached_src_physical_type = std::make_shared<DataTypeFloat64>(); + break; + case tparquet::Type::type::BYTE_ARRAY: + _cached_src_physical_type = std::make_shared<DataTypeString>(); + break; + case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: + _cached_src_physical_type = std::make_shared<DataTypeUInt8>(); + break; + case tparquet::Type::type::INT96: + _cached_src_physical_type = std::make_shared<DataTypeInt8>(); + break; + } + _cached_src_physical_column = _cached_src_physical_type->create_column(); + if (dst_logical_type->is_nullable()) { + _cached_src_physical_type = make_nullable(_cached_src_physical_type); + } + } + // remove the old cached data + _cached_src_physical_column->assume_mutable()->clear(); + if (is_consistent()) { + if (dst_logical_type->is_nullable()) { + auto doris_nullable_column = const_cast<ColumnNullable*>( + static_cast<const ColumnNullable*>(dst_logical_column.get())); + _src_logical_column = ColumnNullable::create( + _cached_src_physical_column, doris_nullable_column->get_null_map_column_ptr()); } else { - *need_convert = false; + _src_logical_column = _cached_src_physical_column; } } else { - ans_column = tmp_data_type->create_column(); - *need_convert = true; + _src_logical_column = _logical_converter->get_column(src_logical_type, dst_logical_column, + dst_logical_type); } - if (*need_convert && 
doris_type->is_nullable()) { + if (dst_logical_type->is_nullable()) { // In order to share null map between parquet converted src column and dst column to avoid copying. It is very tricky that will // call mutable function `doris_nullable_column->get_null_map_column_ptr()` which will set `_need_update_has_null = true`. // Because some operations such as agg will call `has_null()` to set `_need_update_has_null = false`. - auto doris_nullable_column = - const_cast<ColumnNullable*>(static_cast<const ColumnNullable*>(doris_column.get())); - ans_column = ColumnNullable::create(ans_column, - doris_nullable_column->get_null_map_column_ptr()); + auto doris_nullable_column = const_cast<ColumnNullable*>( + static_cast<const ColumnNullable*>(dst_logical_column.get())); + return ColumnNullable::create(_cached_src_physical_column, + doris_nullable_column->get_null_map_column_ptr()); + } + + return _cached_src_physical_column; +} + +static void get_decimal_converter(FieldSchema* field_schema, const TypeDescriptor& src_logical_type, Review Comment: warning: function 'get_decimal_converter' has cognitive complexity of 131 (threshold 50) [readability-function-cognitive-complexity] ```cpp static void get_decimal_converter(FieldSchema* field_schema, const TypeDescriptor& src_logical_type, ^ ``` <details> <summary>Additional context</summary> **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:141:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (is_decimal(remove_nullable(dst_logical_type))) { ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:144:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (src_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:145:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp switch (src_logical_primitive) { ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** 
+3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:166:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:171:** +1, nesting level increased to 1 ```cpp } else if (src_physical_type == 
tparquet::Type::BYTE_ARRAY) { ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:172:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp switch (src_logical_primitive) { ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` 
**be/src/vec/exec/format/parquet/parquet_column_convert.cpp:190:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:195:** +1, nesting level increased to 1 ```cpp } else if (src_physical_type == tparquet::Type::INT32 || ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:197:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp switch (src_logical_primitive) { ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` 
**be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` 
**be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` 
**be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 3 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:230:** +1, nesting level increased to 4 ```cpp FOR_LOGICAL_DECIMAL_TYPES(DISPATCH) ^ ``` **be/src/vec/exec/format/parquet/parquet_column_convert.cpp:235:** +1, nesting level increased to 1 ```cpp } else { ^ ``` </details> ########## be/src/vec/exec/format/parquet/parquet_column_convert.h: ########## @@ -525,262 +398,81 @@ } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { value /= scale_params.scale_factor; } - data[start_idx + i] = (DecimalPhysicalType)value; + data[start_idx + i] = (DecimalType)value; } return Status::OK(); } }; -template <typename DecimalType, typename ValueCopyType> -class StringToDecimalString : public ColumnConvert { -public: - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - convert_null(src_col, dst_col); - - size_t rows = src_col->size(); - - auto buf = static_cast<const ColumnString*>(src_col.get())->get_chars().data(); - auto& offset = static_cast<const ColumnString*>(src_col.get())->get_offsets(); - - auto data = static_cast<ColumnString*>(dst_col.get()); - for (int i = 0; i < rows; i++) { - int len = offset[i] - offset[i - 1]; - // When Decimal in parquet is stored in byte arrays, binary and fixed, - // the unscaled number must be encoded as two's complement using big-endian byte order. 
- ValueCopyType value = 0; - memcpy(reinterpret_cast<char*>(&value), buf + offset[i - 1], len); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - len) * 8); - std::string ans = reinterpret_cast<DecimalType&>(value).to_string( - _convert_params->field_schema->parquet_schema.scale); - data->insert_data(ans.data(), ans.size()); - } - return Status::OK(); - } -}; - -class Int32ToDateString : public ColumnConvert { -public: - Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { - convert_null(src_col, dst_col); +class Int32ToDate : public PhysicalToLogicalConverter { + Status physical_convert(ColumnPtr& src_physical_col) override { Review Comment: warning: method 'physical_convert' can be made static [readability-convert-member-functions-to-static] ```suggestion static Status physical_convert(ColumnPtr& src_physical_col) override { ``` ########## be/src/vec/exec/format/parquet/vparquet_column_reader.cpp: ########## @@ -480,13 +480,19 @@ Status ScalarColumnReader::_try_load_dict_page(bool* loaded, bool* has_dict) { Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr& type, Review Comment: warning: function 'read_column_data' has cognitive complexity of 81 (threshold 50) [readability-function-cognitive-complexity] ```cpp Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr& type, ^ ``` <details> <summary>Additional context</summary> **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:484:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (_converter == nullptr) { ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:487:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp if (!_converter->support()) { ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:496:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp do { ^ ``` 
**be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:497:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp if (_chunk_reader->remaining_num_values() == 0) { ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:498:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp if (!_chunk_reader->has_next_page()) { ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:503:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp RETURN_IF_ERROR(_chunk_reader->next_page()); ^ ``` **be/src/common/status.h:541:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:503:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp RETURN_IF_ERROR(_chunk_reader->next_page()); ^ ``` **be/src/common/status.h:543:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:505:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp if (_nested_column) { ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:506:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); ^ ``` **be/src/common/status.h:541:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:506:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); ^ ``` **be/src/common/status.h:543:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:507:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp RETURN_IF_ERROR(_read_nested_column(resolved_column, resolved_type, select_vector, ^ ``` 
**be/src/common/status.h:541:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:507:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp RETURN_IF_ERROR(_read_nested_column(resolved_column, resolved_type, select_vector, ^ ``` **be/src/common/status.h:543:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:517:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp if (read_ranges.size() == 0) { ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:520:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp RETURN_IF_ERROR(_chunk_reader->skip_page()); ^ ``` **be/src/common/status.h:541:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:520:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp RETURN_IF_ERROR(_chunk_reader->skip_page()); ^ ``` **be/src/common/status.h:543:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:522:** +1, nesting level increased to 2 ```cpp } else { ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:526:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) { ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:526:** +1 ```cpp if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) { ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:532:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp if (batch_size >= remaining_num_values && ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:532:** +1 ```cpp if (batch_size >= remaining_num_values && ^ 
``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:537:** +5, including nesting penalty of 4, nesting level increased to 5 ```cpp RETURN_IF_ERROR(_chunk_reader->skip_page()); ^ ``` **be/src/common/status.h:541:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:537:** +6, including nesting penalty of 5, nesting level increased to 6 ```cpp RETURN_IF_ERROR(_chunk_reader->skip_page()); ^ ``` **be/src/common/status.h:543:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:539:** +5, including nesting penalty of 4, nesting level increased to 5 ```cpp if (!_chunk_reader->has_next_page()) { ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:544:** +1 ```cpp skip_whole_batch = batch_size <= remaining_num_values && ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:546:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp if (skip_whole_batch) { ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:551:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); ^ ``` **be/src/common/status.h:541:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:551:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); ^ ``` **be/src/common/status.h:543:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:576:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) { ^ ``` </details> -- This is an automated message from the Apache Git 
Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org For additional commands, e-mail: commits-help@doris.apache.org