This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit ee69ed1d0386f41689269a522e2aed490e52987d Author: Daniel Vanko <[email protected]> AuthorDate: Wed Jun 25 11:42:51 2025 +0200 IMPALA-13625: Allow reading Parquet int32/int64 as decimal without logical types This patch allows reading columns with integer logical type as decimals. This can occur when we're trying to read files that were written as INT but the column was altered to a suitable DECIMAL. In this case the precision is based on the physical type and equals 10 and 19 for int32 and int64, respectively (one more than the maximum precision of a 4-byte and an 8-byte decimal). Test: * add new e2e tests Change-Id: I56006eb3cca28c81ec8467d77b35005fbf669680 Reviewed-on: http://gerrit.cloudera.org:8080/22922 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/parquet/parquet-data-converter.h | 21 ++++++ be/src/exec/parquet/parquet-metadata-utils.cc | 23 +++++- .../queries/QueryTest/parquet-type-widening.test | 82 +++++++++++++++++++++- 3 files changed, 123 insertions(+), 3 deletions(-) diff --git a/be/src/exec/parquet/parquet-data-converter.h b/be/src/exec/parquet/parquet-data-converter.h index eb042362e..93c403bef 100644 --- a/be/src/exec/parquet/parquet-data-converter.h +++ b/be/src/exec/parquet/parquet-data-converter.h @@ -70,6 +70,21 @@ class ParquetDataConverter { } int32_t GetPrecision() const { + // If logical type is INTEGER, the precision is determined by the physical type. 
+ if (parquet_element_->__isset.logicalType + && UNLIKELY(parquet_element_->logicalType.__isset.INTEGER)) { + switch (parquet_element_->type) { + case parquet::Type::INT32: + return ColumnType::MAX_DECIMAL4_PRECISION + 1; + case parquet::Type::INT64: + return ColumnType::MAX_DECIMAL8_PRECISION + 1; + default: + DCHECK(false) << "Unexpected physical type for INTEGER logical type: " + << to_string(parquet_element_->type); + break; + } + } + if (parquet_element_->__isset.logicalType && parquet_element_->logicalType.__isset.DECIMAL) { return parquet_element_->logicalType.DECIMAL.precision; @@ -77,6 +92,7 @@ class ParquetDataConverter { return parquet_element_->precision; } + /// Returns true if we need to do a conversion from the Parquet type to the slot type. bool CheckIfNeedsConversion() { if (!MATERIALIZED) return false; @@ -87,6 +103,11 @@ class ParquetDataConverter { return true; } if (col_type_->type == TYPE_DECIMAL) { + // If the logical type is INTEGER for a Decimal slot, conversion is needed. + if (parquet_element_->__isset.logicalType + && UNLIKELY(parquet_element_->logicalType.__isset.INTEGER)) { + return true; + } if (col_type_->precision != GetPrecision()) { // Decimal values can be stored by Decimal4Value (4 bytes), Decimal8Value, and // Decimal16Value. We only need to do a conversion for different precision if diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc index 2cb1c3bad..f2749f6ac 100644 --- a/be/src/exec/parquet/parquet-metadata-utils.cc +++ b/be/src/exec/parquet/parquet-metadata-utils.cc @@ -89,6 +89,11 @@ bool IsSupportedType(PrimitiveType impala_type, return encodings->second.find(parquet_type) != encodings->second.end(); } +/// Returns true if Parquet's logical type is INTEGER. 
+bool IsIntLogicalType(const parquet::SchemaElement& element) { + return element.__isset.logicalType && element.logicalType.__isset.INTEGER; +} + /// Returns true if encoding 'e' is supported by Impala, false otherwise. static bool IsEncodingSupported(parquet::Encoding::type e) { switch (e) { @@ -208,7 +213,21 @@ int32_t GetScale(const parquet::SchemaElement& schema_element) { } // Precision is required, this should be called after checking IsPrecisionSet() +// unless logical type is INTEGER, in which case the precision is based on physical type. int32_t GetPrecision(const parquet::SchemaElement& schema_element) { + if (UNLIKELY(IsIntLogicalType(schema_element))) { + switch (schema_element.type) { + case parquet::Type::INT32: + return ColumnType::MAX_DECIMAL4_PRECISION + 1; + case parquet::Type::INT64: + return ColumnType::MAX_DECIMAL8_PRECISION + 1; + default: + DCHECK(false) << "Unexpected physical type for INTEGER logical type: " + << to_string(schema_element.type); + break; + } + } + DCHECK(IsPrecisionSet(schema_element)); if (schema_element.__isset.logicalType && schema_element.logicalType.__isset.DECIMAL) { return schema_element.logicalType.DECIMAL.precision; @@ -370,7 +389,7 @@ Status ParquetMetadataUtils::ValidateColumn(const char* filename, // We require that the precision be a positive value, and not larger than the // precision in table schema. - if (!IsPrecisionSet(schema_element)) { + if (!IsPrecisionSet(schema_element) && !IsIntLogicalType(schema_element)) { ErrorMsg msg(TErrorCode::PARQUET_MISSING_PRECISION, filename, schema_element.name); return Status(msg); } else { @@ -388,7 +407,7 @@ Status ParquetMetadataUtils::ValidateColumn(const char* filename, } } - if (!is_converted_type_decimal) { + if (!is_converted_type_decimal && !IsIntLogicalType(schema_element)) { // TODO: is this validation useful? It is not required at all to read the data and // might only serve to reject otherwise perfectly readable files. 
ErrorMsg msg(TErrorCode::PARQUET_BAD_CONVERTED_TYPE, filename, diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-type-widening.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-type-widening.test index f0f11c788..74c951e23 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet-type-widening.test +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-type-widening.test @@ -6,4 +6,84 @@ select * from primitive_type_widening; 10,20,30,40,50,60,70,80,90,1230.4560546875 ---- TYPES SMALLINT,INT,BIGINT,DOUBLE,INT,BIGINT,DOUBLE,INT,DOUBLE,DOUBLE -==== \ No newline at end of file +==== +---- QUERY +# IMPALA-13625: Allow reading Parquet int32/int64 as decimal without logical types +create table read_int_as_decimal (c1 int, c2 bigint) stored as parquet; +insert into read_int_as_decimal +values + (1, cast(10000000000 as bigint)), + (MIN_INT(), MIN_BIGINT()), + (MAX_INT(), MAX_BIGINT()); +select * from read_int_as_decimal order by c1; +---- RESULTS +-2147483648,-9223372036854775808 +1,10000000000 +2147483647,9223372036854775807 +---- TYPES +INT, BIGINT +==== +---- QUERY +alter table read_int_as_decimal change c1 c1 decimal(10,0); +select c1 from read_int_as_decimal order by c1; +---- RESULTS +-2147483648 +1 +2147483647 +---- TYPES +DECIMAL +==== +---- QUERY +alter table read_int_as_decimal change c1 c1 decimal(8,0); +select c1 from read_int_as_decimal order by c1; +---- CATCH +column 'c1' has a precision that does not match the table metadata precision. File metadata precision: 10, table metadata precision: 8. 
+==== +---- QUERY +alter table read_int_as_decimal change c1 c1 decimal(20,4); +select c1 from read_int_as_decimal order by c1; +---- RESULTS +-2147483648.0000 +1.0000 +2147483647.0000 +---- TYPES +DECIMAL +==== +---- QUERY +alter table read_int_as_decimal change c1 c1 decimal(12,4); +select c1 from read_int_as_decimal order by c1; +---- RESULTS +1.0000 +NULL +NULL +==== +---- QUERY +alter table read_int_as_decimal change c2 c2 decimal(19,0); +select c2 from read_int_as_decimal order by c2; +---- RESULTS +-9223372036854775808 +10000000000 +9223372036854775807 +==== +---- QUERY +alter table read_int_as_decimal change c2 c2 decimal(2,0); +select c2 from read_int_as_decimal order by c2; +---- CATCH +column 'c2' has a precision that does not match the table metadata precision. File metadata precision: 19, table metadata precision: 2. +==== +---- QUERY +alter table read_int_as_decimal change c2 c2 decimal(25,5); +select c2 from read_int_as_decimal order by c2; +---- RESULTS +-9223372036854775808.00000 +10000000000.00000 +9223372036854775807.00000 +==== +---- QUERY +alter table read_int_as_decimal change c2 c2 decimal(20,5); +select c2 from read_int_as_decimal order by c2; +---- RESULTS +10000000000.00000 +NULL +NULL +====
