This is an automated email from the ASF dual-hosted git repository. michaelsmith pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit bbfba13ed4d084681b542d7c5e1b5156576a603b Author: Daniel Becker <[email protected]> AuthorDate: Tue May 14 16:01:46 2024 +0200 IMPALA-13079: Add support for FLOAT/DOUBLE in Iceberg metadata tables Until now, the float and double data types were not supported in Iceberg metadata tables. This commit adds support for them. Testing: - added a test table that contains all primitive types (except for decimal, which is still not supported), a struct, an array and a map - added a test query that queries the `files` metadata table of the above table - the 'readable_metrics' struct contains lower and upper bounds for all columns in the original table, with the original type Change-Id: I2171c9aa9b6d2b634b8c511263b1610cb1d7cb29 Reviewed-on: http://gerrit.cloudera.org:8080/21425 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/iceberg-metadata/iceberg-row-reader.cc | 32 +++++++++++ be/src/exec/iceberg-metadata/iceberg-row-reader.h | 8 +++ .../functional/functional_schema_template.sql | 67 ++++++++++++++++++++++ .../datasets/functional/schema_constraints.csv | 1 + .../queries/QueryTest/iceberg-metadata-tables.test | 14 +++++ 5 files changed, 122 insertions(+) diff --git a/be/src/exec/iceberg-metadata/iceberg-row-reader.cc b/be/src/exec/iceberg-metadata/iceberg-row-reader.cc index 4d259b5a1..685a12518 100644 --- a/be/src/exec/iceberg-metadata/iceberg-row-reader.cc +++ b/be/src/exec/iceberg-metadata/iceberg-row-reader.cc @@ -46,6 +46,8 @@ Status IcebergRowReader::InitJNI() { RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/Boolean", &boolean_cl_)); RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/Integer", &integer_cl_)); RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/Long", &long_cl_)); + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/Float", &float_cl_)); + RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/Double", &double_cl_)); 
RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/lang/CharSequence", &char_sequence_cl_)); RETURN_IF_ERROR(JniUtil::GetGlobalClassRef(env, "java/nio/ByteBuffer", @@ -60,6 +62,10 @@ Status IcebergRowReader::InitJNI() { &integer_value_)); RETURN_IF_ERROR(JniUtil::GetMethodID(env, long_cl_, "longValue", "()J", &long_value_)); + RETURN_IF_ERROR(JniUtil::GetMethodID(env, float_cl_, "floatValue", "()F", + &float_value_)); + RETURN_IF_ERROR(JniUtil::GetMethodID(env, double_cl_, "doubleValue", "()D", + &double_value_)); RETURN_IF_ERROR(JniUtil::GetMethodID(env, char_sequence_cl_, "toString", "()Ljava/lang/String;", &char_sequence_to_string_)); return Status::OK(); @@ -108,6 +114,12 @@ Status IcebergRowReader::WriteSlot(JNIEnv* env, const jobject* struct_like_row, } case TYPE_BIGINT: { // java.lang.Long RETURN_IF_ERROR(WriteLongSlot(env, accessed_value, slot)); break; + } case TYPE_FLOAT: { // java.lang.Float + RETURN_IF_ERROR(WriteFloatSlot(env, accessed_value, slot)); + break; + } case TYPE_DOUBLE: { // java.lang.Double + RETURN_IF_ERROR(WriteDoubleSlot(env, accessed_value, slot)); + break; } case TYPE_TIMESTAMP: { // org.apache.iceberg.types.TimestampType RETURN_IF_ERROR(WriteTimeStampSlot(env, accessed_value, slot)); break; @@ -188,6 +200,26 @@ Status IcebergRowReader::WriteLongSlot(JNIEnv* env, const jobject &accessed_valu return Status::OK(); } +Status IcebergRowReader::WriteFloatSlot(JNIEnv* env, const jobject &accessed_value, + void* slot) { + DCHECK(accessed_value != nullptr); + DCHECK(env->IsInstanceOf(accessed_value, float_cl_) == JNI_TRUE); + jfloat result = env->CallFloatMethod(accessed_value, float_value_); + RETURN_ERROR_IF_EXC(env); + *reinterpret_cast<float*>(slot) = result; + return Status::OK(); +} + +Status IcebergRowReader::WriteDoubleSlot(JNIEnv* env, const jobject &accessed_value, + void* slot) { + DCHECK(accessed_value != nullptr); + DCHECK(env->IsInstanceOf(accessed_value, double_cl_) == JNI_TRUE); + jdouble result = 
env->CallDoubleMethod(accessed_value, double_value_); + RETURN_ERROR_IF_EXC(env); + *reinterpret_cast<double*>(slot) = result; + return Status::OK(); +} + Status IcebergRowReader::WriteTimeStampSlot(JNIEnv* env, const jobject &accessed_value, void* slot) { DCHECK(accessed_value != nullptr); diff --git a/be/src/exec/iceberg-metadata/iceberg-row-reader.h b/be/src/exec/iceberg-metadata/iceberg-row-reader.h index 34f21d773..67a51d2fe 100644 --- a/be/src/exec/iceberg-metadata/iceberg-row-reader.h +++ b/be/src/exec/iceberg-metadata/iceberg-row-reader.h @@ -57,6 +57,8 @@ class IcebergRowReader { inline static jclass boolean_cl_ = nullptr; inline static jclass integer_cl_ = nullptr; inline static jclass long_cl_ = nullptr; + inline static jclass float_cl_ = nullptr; + inline static jclass double_cl_ = nullptr; inline static jclass char_sequence_cl_ = nullptr; inline static jclass byte_buffer_cl_ = nullptr; @@ -66,6 +68,8 @@ class IcebergRowReader { inline static jmethodID boolean_value_ = nullptr; inline static jmethodID integer_value_ = nullptr; inline static jmethodID long_value_ = nullptr; + inline static jmethodID float_value_ = nullptr; + inline static jmethodID double_value_ = nullptr; inline static jmethodID char_sequence_to_string_ = nullptr; /// The scan node that started this row reader. @@ -91,6 +95,10 @@ class IcebergRowReader { WARN_UNUSED_RESULT; Status WriteLongSlot(JNIEnv* env, const jobject &accessed_value, void* slot) WARN_UNUSED_RESULT; + Status WriteFloatSlot(JNIEnv* env, const jobject &accessed_value, void* slot) + WARN_UNUSED_RESULT; + Status WriteDoubleSlot(JNIEnv* env, const jobject &accessed_value, void* slot) + WARN_UNUSED_RESULT; /// Iceberg TimeStamp is parsed into TimestampValue. 
Status WriteTimeStampSlot(JNIEnv* env, const jobject &accessed_value, void* slot) WARN_UNUSED_RESULT; diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index 0290e6ba0..762f6e319 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -3893,6 +3893,73 @@ SELECT * FROM {db_name}{db_suffix}.iceberg_query_metadata; ---- DATASET functional ---- BASE_TABLE_NAME +iceberg_metadata_alltypes +---- CREATE +CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( + b boolean, + i int, + l bigint, + f float, + d double, + ts timestamp, + dt date, + s string, + bn binary, + -- TODO IMPALA-13080: Add decimal. + strct struct<i: int>, + arr array<double>, + mp map<int, float> +) +STORED BY ICEBERG +LOCATION '/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_metadata_alltypes' +TBLPROPERTIES('format-version'='2'); +---- DEPENDENT_LOAD_HIVE +INSERT INTO {db_name}{db_suffix}.{table_name} VALUES ( + false, + 1, + -10, + 2e-10, + -2e-100, + to_utc_timestamp("2024-05-14 14:51:12", "UTC"), + to_date("2024-05-14"), + "Some string", + "bin1", + named_struct("i", 10), + array(cast(10.0 as double), cast(20.0 as double)), + map(10, cast(10.0 as float), 100, cast(100.0 as float)) +), +( + NULL, + 5, + 150, + 2e15, + double('NaN'), + to_utc_timestamp("2025-06-15 18:51:12", "UTC"), + to_date("2025-06-15"), + "A string", + NULL, + named_struct("i", -150), + array(cast(-10.0 as double), cast(-2e100 as double)), + map(10, cast(0.5 as float), 101, cast(1e3 as float)) +), +( + true, + 5, + 150, + float('NaN'), + 2e100, + NULL, + NULL, + NULL, + "bin2", + named_struct("i", -150), + array(cast(-12.0 as double), cast(-2e100 as double)), + map(10, cast(0.5 as float), 101, cast(1e3 as float)) +); +==== +---- DATASET +functional +---- BASE_TABLE_NAME iceberg_with_key_metadata ---- CREATE CREATE EXTERNAL TABLE IF NOT EXISTS 
{db_name}{db_suffix}.{table_name} diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv index 84046aa7b..58b705e78 100644 --- a/testdata/datasets/functional/schema_constraints.csv +++ b/testdata/datasets/functional/schema_constraints.csv @@ -110,6 +110,7 @@ table_name:iceberg_lineitem_sixblocks, constraint:restrict_to, table_format:parq table_name:iceberg_spark_compaction_with_dangling_delete, constraint:restrict_to, table_format:parquet/none/none table_name:iceberg_v2_equality_delete_schema_evolution, constraint:restrict_to, table_format:parquet/none/none table_name:iceberg_query_metadata, constraint:restrict_to, table_format:parquet/none/none +table_name:iceberg_metadata_alltypes, constraint:restrict_to, table_format:parquet/none/none table_name:iceberg_view, constraint:restrict_to, table_format:parquet/none/none # TODO: Support Avro. Data loading currently fails for Avro because complex types diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-metadata-tables.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-metadata-tables.test index 339940e19..d8f947fad 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-metadata-tables.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-metadata-tables.test @@ -845,6 +845,20 @@ select value_counts, readable_metrics.d.lower_bound, readable_metrics.d.upper_bo STRING,DATE,DATE ==== +#### +# Query the `files` metadata table of a table that contains all types - because of lower +# and upper bounds, the 'readable_metrics' struct of the metadata table will also contain +# all types. +# TODO IMPALA-13080: Add DECIMAL. 
+#### +---- QUERY +select * from functional_parquet.iceberg_metadata_alltypes.`files`; +---- RESULTS +0,regex:'.*\.parquet','PARQUET',0,3,3648,'{1:32,2:63,3:71,4:43,5:55,6:47,7:39,8:58,9:47,13:63,14:96,15:75,16:78}','{1:3,2:3,3:3,4:3,5:3,6:3,7:3,8:3,9:3,13:3,14:6,15:6,16:6}','{1:1,2:0,3:0,4:0,5:0,6:1,7:1,8:1,9:1,13:0,14:0,15:0,16:0}','{16:0,4:1,5:1,14:0}','{1:"AA==",2:"AQAAAA==",3:"9v////////8=",4:"/+ZbLw==",5:"MAWO5C7/O6s=",6:"AFgLImsYBgA=",7:"kU0AAA==",8:"QSBzdHJpbmc=",9:"YmluMQ==",13:"av///w==",14:"fcOUJa1JwtQ=",16:"AAAAPw=="}','{1:"AQ==",2:"BQAAAA==",3:"lgAAAAAAAAA=",4:"qV/jWA==",5:" [...] +---- TYPES +INT,STRING,STRING,INT,BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,BINARY,STRING,STRING,INT,STRING +==== + #### # Describe all the metadata tables once ####
