This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch branch-4.1.1 in repository https://gitbox.apache.org/repos/asf/impala.git
commit 051c59bd80dfb8291dd4ef61419db7db2c41cab4 Author: Gergely Fürnstáhl <[email protected]> AuthorDate: Wed Apr 20 10:34:27 2022 +0200 IMPALA-9410: Support resolving ORC file columns by names Added a query option and implementation to be able to resolve columns by name. Changed the secondary resolution strategy for Iceberg ORC tables to name-based resolution. Testing: Added a new test dimension for ORC tests, and added results to the now-working Iceberg migrated table test. Change-Id: I29562a059160c19eb58ccea76aa959d2e408f8de Reviewed-on: http://gerrit.cloudera.org:8080/18397 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Reviewed-on: http://gerrit.cloudera.org:8080/18906 Reviewed-by: Zoltan Borok-Nagy <[email protected]> Tested-by: Quanlong Huang <[email protected]> --- be/src/exec/hdfs-orc-scanner.cc | 5 +- be/src/exec/orc-metadata-utils.cc | 93 ++++- be/src/exec/orc-metadata-utils.h | 11 +- be/src/service/query-options.cc | 7 + be/src/service/query-options.h | 462 ++++++++++----------- common/thrift/ImpalaService.thrift | 3 + common/thrift/Query.thrift | 3 + .../queries/QueryTest/iceberg-migrated-tables.test | 8 +- tests/common/test_dimensions.py | 7 + tests/query_test/test_nested_types.py | 23 +- tests/query_test/test_scanners.py | 1 + 11 files changed, 378 insertions(+), 245 deletions(-) diff --git a/be/src/exec/hdfs-orc-scanner.cc b/be/src/exec/hdfs-orc-scanner.cc index cb02dac52..f6643e9b4 100644 --- a/be/src/exec/hdfs-orc-scanner.cc +++ b/be/src/exec/hdfs-orc-scanner.cc @@ -357,8 +357,9 @@ Status HdfsOrcScanner::Open(ScannerContext* context) { RETURN_IF_ERROR(footer_status); bool is_table_full_acid = scan_node_->hdfs_table()->IsTableFullAcid(); - schema_resolver_.reset(new OrcSchemaResolver(*scan_node_->hdfs_table(), - &reader_->getType(), filename(), is_table_full_acid)); + schema_resolver_.reset( + new OrcSchemaResolver(*scan_node_->hdfs_table(), &reader_->getType(), filename(), + is_table_full_acid, 
state_->query_options().orc_schema_resolution)); bool is_file_full_acid = schema_resolver_->HasFullAcidV2Schema(); acid_original_file_ = is_table_full_acid && !is_file_full_acid; if (is_table_full_acid) { diff --git a/be/src/exec/orc-metadata-utils.cc b/be/src/exec/orc-metadata-utils.cc index 890db5792..b08e5c61f 100644 --- a/be/src/exec/orc-metadata-utils.cc +++ b/be/src/exec/orc-metadata-utils.cc @@ -37,16 +37,21 @@ inline int GetFieldIdFromStr(const std::string& str) { } OrcSchemaResolver::OrcSchemaResolver(const HdfsTableDescriptor& tbl_desc, - const orc::Type* root, const char* filename, bool is_table_acid) : - tbl_desc_(tbl_desc), root_(root), filename_(filename), + const orc::Type* root, const char* filename, bool is_table_acid, + TSchemaResolutionStrategy::type schema_resolution) + : schema_resolution_strategy_(schema_resolution), + tbl_desc_(tbl_desc), + root_(root), + filename_(filename), is_table_full_acid_(is_table_acid) { DetermineFullAcidSchema(); - schema_resolution_strategy_ = TSchemaResolutionStrategy::POSITION; if (tbl_desc_.IsIcebergTable() && root_->getSubtypeCount() > 0) { // Use FIELD_ID-based column resolution for Iceberg tables if possible. 
const orc::Type* first_child = root_->getSubtype(0); if (first_child->hasAttributeKey(ICEBERG_FIELD_ID)) { schema_resolution_strategy_ = TSchemaResolutionStrategy::FIELD_ID; + } else { + schema_resolution_strategy_ = TSchemaResolutionStrategy::NAME; } } } @@ -55,6 +60,8 @@ Status OrcSchemaResolver::ResolveColumn(const SchemaPath& col_path, const orc::Type** node, bool* pos_field, bool* missing_field) const { if (schema_resolution_strategy_ == TSchemaResolutionStrategy::POSITION) { return ResolveColumnByPosition(col_path, node, pos_field, missing_field); + } else if (schema_resolution_strategy_ == TSchemaResolutionStrategy::NAME) { + return ResolveColumnByName(col_path, node, pos_field, missing_field); } else if (schema_resolution_strategy_ == TSchemaResolutionStrategy::FIELD_ID) { return ResolveColumnByIcebergFieldId(col_path, node, pos_field, missing_field); } else { @@ -162,6 +169,86 @@ Status OrcSchemaResolver::ValidateMap(const ColumnType& type, return Status::OK(); } +Status OrcSchemaResolver::ResolveColumnByName(const SchemaPath& col_path, + const orc::Type** node, bool* pos_field, bool* missing_field) const { + const ColumnType* table_col_type = nullptr; + *node = root_; + *pos_field = false; + *missing_field = false; + if (col_path.empty()) return Status::OK(); + SchemaPath table_path, file_path; + TranslateColPaths(col_path, &table_path, &file_path); + + int i = 0; + + // Resolve table and file ACID differences + int table_idx = table_path[i]; + int file_idx = file_path[i]; + if (table_idx == -1 || file_idx == -1) { + DCHECK_NE(table_idx, file_idx); + if (table_idx == -1) { + DCHECK_EQ(*node, root_); + *node = (*node)->getSubtype(file_idx); + } else { + DCHECK(table_col_type == nullptr); + table_col_type = &tbl_desc_.col_descs()[table_idx].type(); + } + i++; + } + + for (; i < table_path.size(); ++i) { + table_idx = table_path[i]; + if (table_col_type == nullptr) { + // non ACID table, or top level user column in ACID table + table_col_type = 
&tbl_desc_.col_descs()[table_idx].type(); + const std::string& name = tbl_desc_.col_descs()[table_idx].name(); + *node = FindChildWithName(*node, name); + if (*node == nullptr) { + *missing_field = true; + return Status::OK(); + } + RETURN_IF_ERROR(ValidateType(*table_col_type, **node, table_path, i)); + continue; + } else if (table_col_type->type == TYPE_STRUCT) { + // Resolve struct field by name. + DCHECK_LT(table_idx, table_col_type->field_names.size()); + const std::string& name = table_col_type->field_names[table_idx]; + *node = FindChildWithName(*node, name); + } else if (table_col_type->type == TYPE_ARRAY) { + if (table_idx == SchemaPathConstants::ARRAY_POS) { + *pos_field = true; + break; // return *node as the ARRAY node + } + DCHECK_EQ(table_idx, SchemaPathConstants::ARRAY_ITEM); + *node = (*(node))->getSubtype(table_idx); + } else if (table_col_type->type == TYPE_MAP) { + DCHECK(table_idx == SchemaPathConstants::MAP_KEY + || table_idx == SchemaPathConstants::MAP_VALUE); + // At this point we've found a MAP with a matching name. It's safe to resolve + // the child (key or value) by position. 
+ *node = (*(node))->getSubtype(table_idx); + } + if (*node == nullptr) { + *missing_field = true; + return Status::OK(); + } + table_col_type = &table_col_type->children[table_idx]; + RETURN_IF_ERROR(ValidateType(*table_col_type, **node, table_path, i)); + } + return Status::OK(); +} + +const orc::Type* OrcSchemaResolver::FindChildWithName( + const orc::Type* node, const std::string& name) const { + for (int i = 0; i < node->getSubtypeCount(); ++i) { + const orc::Type* child = node->getSubtype(i); + DCHECK(child != nullptr); + const std::string& fieldName = node->getFieldName(i); + if (iequals(fieldName, name)) return child; + } + return nullptr; +} + Status OrcSchemaResolver::ResolveColumnByIcebergFieldId(const SchemaPath& col_path, const orc::Type** node, bool* pos_field, bool* missing_field) const { const ColumnType* table_col_type = nullptr; diff --git a/be/src/exec/orc-metadata-utils.h b/be/src/exec/orc-metadata-utils.h index af23afb11..aaab7d46f 100644 --- a/be/src/exec/orc-metadata-utils.h +++ b/be/src/exec/orc-metadata-utils.h @@ -42,7 +42,8 @@ constexpr int CURRENT_TRANSCACTION_TYPE_ID = 5; class OrcSchemaResolver { public: OrcSchemaResolver(const HdfsTableDescriptor& tbl_desc, const orc::Type* root, - const char* filename, bool is_table_acid); + const char* filename, bool is_table_acid, + TSchemaResolutionStrategy::type schema_resolution); /// Resolve SchemaPath into orc::Type (ORC column representation) /// 'pos_field' is set to true if 'col_path' reference the index field of an array @@ -94,12 +95,20 @@ class OrcSchemaResolver { Status ResolveColumnByPosition(const SchemaPath& col_path, const orc::Type** node, bool* pos_field, bool* missing_field) const; + /// Resolve column based on name. + Status ResolveColumnByName(const SchemaPath& col_path, const orc::Type** node, + bool* pos_field, bool* missing_field) const; + /// Resolve column based on the Iceberg field ids. 
This way we will retrieve the /// Iceberg field ids from the HMS table via 'col_path', then find the corresponding /// field in the ORC file. Status ResolveColumnByIcebergFieldId(const SchemaPath& col_path, const orc::Type** node, bool* pos_field, bool* missing_field) const; + /// Finds child of 'node' whose column name matches to provided 'name'. + const orc::Type* FindChildWithName( + const orc::Type* node, const std::string& name) const; + /// Finds child of 'node' that has Iceberg field id equals to 'field_id'. const orc::Type* FindChildWithFieldId(const orc::Type* node, const int field_id) const; diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index c5a1206ff..20105b5e3 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -1222,6 +1222,13 @@ Status impala::SetQueryOption(const string& key, const string& value, query_options->__set_test_replan(IsTrue(value)); break; } + case TImpalaQueryOptions::ORC_SCHEMA_RESOLUTION: { + TSchemaResolutionStrategy::type enum_type; + RETURN_IF_ERROR(GetThriftEnum(value, "orc schema resolution", + _TSchemaResolutionStrategy_VALUES_TO_NAMES, &enum_type)); + query_options->__set_orc_schema_resolution(enum_type); + break; + } default: if (IsRemovedQueryOption(key)) { LOG(WARNING) << "Ignoring attempt to set removed query option '" << key << "'"; diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index fa9691b2c..48444a4f9 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -48,241 +48,233 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type> // option in the enum TImpalaQueryOptions (defined in ImpalaService.thrift) // plus one. Thus, the second argument to the DCHECK has to be updated every // time we add or remove a query option to/from the enum TImpalaQueryOptions. 
-#define QUERY_OPTS_TABLE\ - DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\ - TImpalaQueryOptions::LOCK_MAX_WAIT_TIME_S + 1);\ - REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\ - QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\ - REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\ - QUERY_OPT_FN(batch_size, BATCH_SIZE, TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(debug_action, DEBUG_ACTION, TQueryOptionLevel::DEVELOPMENT)\ - REMOVED_QUERY_OPT_FN(default_order_by_limit, DEFAULT_ORDER_BY_LIMIT)\ - REMOVED_QUERY_OPT_FN(disable_cached_reads, DISABLE_CACHED_READS)\ - QUERY_OPT_FN(disable_outermost_topn, DISABLE_OUTERMOST_TOPN,\ - TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(disable_codegen, DISABLE_CODEGEN, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(explain_level, EXPLAIN_LEVEL, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(hbase_cache_blocks, HBASE_CACHE_BLOCKS, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(hbase_caching, HBASE_CACHING, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(max_errors, MAX_ERRORS, TQueryOptionLevel::ADVANCED)\ - REMOVED_QUERY_OPT_FN(max_io_buffers, MAX_IO_BUFFERS)\ - QUERY_OPT_FN(max_scan_range_length, MAX_SCAN_RANGE_LENGTH,\ - TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(mem_limit, MEM_LIMIT, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(num_nodes, NUM_NODES, TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(num_scanner_threads, NUM_SCANNER_THREADS, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(compression_codec, COMPRESSION_CODEC, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(parquet_file_size, PARQUET_FILE_SIZE, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(request_pool, REQUEST_POOL, TQueryOptionLevel::REGULAR)\ - REMOVED_QUERY_OPT_FN(reservation_request_timeout, RESERVATION_REQUEST_TIMEOUT)\ - QUERY_OPT_FN(sync_ddl, SYNC_DDL, TQueryOptionLevel::REGULAR)\ - REMOVED_QUERY_OPT_FN(v_cpu_cores, V_CPU_CORES)\ - REMOVED_QUERY_OPT_FN(rm_initial_mem, 
RM_INITIAL_MEM)\ - QUERY_OPT_FN(query_timeout_s, QUERY_TIMEOUT_S, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(buffer_pool_limit, BUFFER_POOL_LIMIT, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(appx_count_distinct, APPX_COUNT_DISTINCT, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(disable_unsafe_spills, DISABLE_UNSAFE_SPILLS, TQueryOptionLevel::REGULAR)\ - REMOVED_QUERY_OPT_FN(seq_compression_mode, SEQ_COMPRESSION_MODE)\ - QUERY_OPT_FN(exec_single_node_rows_threshold, EXEC_SINGLE_NODE_ROWS_THRESHOLD,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(optimize_partition_key_scans, OPTIMIZE_PARTITION_KEY_SCANS,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(replica_preference, REPLICA_PREFERENCE, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(schedule_random_replica, SCHEDULE_RANDOM_REPLICA,\ - TQueryOptionLevel::ADVANCED)\ - REMOVED_QUERY_OPT_FN(scan_node_codegen_threshold, SCAN_NODE_CODEGEN_THRESHOLD)\ - QUERY_OPT_FN(disable_streaming_preaggregations, DISABLE_STREAMING_PREAGGREGATIONS,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(runtime_filter_mode, RUNTIME_FILTER_MODE, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(runtime_bloom_filter_size, RUNTIME_BLOOM_FILTER_SIZE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(runtime_filter_wait_time_ms, RUNTIME_FILTER_WAIT_TIME_MS,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(disable_row_runtime_filtering, DISABLE_ROW_RUNTIME_FILTERING,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(max_num_runtime_filters, MAX_NUM_RUNTIME_FILTERS,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(parquet_annotate_strings_utf8, PARQUET_ANNOTATE_STRINGS_UTF8,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(parquet_fallback_schema_resolution, PARQUET_FALLBACK_SCHEMA_RESOLUTION,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(mt_dop, MT_DOP, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(s3_skip_insert_staging, S3_SKIP_INSERT_STAGING,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(runtime_filter_min_size, RUNTIME_FILTER_MIN_SIZE,\ - 
TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(runtime_filter_max_size, RUNTIME_FILTER_MAX_SIZE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(prefetch_mode, PREFETCH_MODE, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(strict_mode, STRICT_MODE, TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(scratch_limit, SCRATCH_LIMIT, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(enable_expr_rewrites, ENABLE_EXPR_REWRITES, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(enable_cnf_rewrites, ENABLE_CNF_REWRITES, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(decimal_v2, DECIMAL_V2, TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(parquet_dictionary_filtering, PARQUET_DICTIONARY_FILTERING,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(parquet_array_resolution, PARQUET_ARRAY_RESOLUTION,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(parquet_read_statistics, PARQUET_READ_STATISTICS,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(default_join_distribution_mode, DEFAULT_JOIN_DISTRIBUTION_MODE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(disable_codegen_rows_threshold, DISABLE_CODEGEN_ROWS_THRESHOLD,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(default_spillable_buffer_size, DEFAULT_SPILLABLE_BUFFER_SIZE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(min_spillable_buffer_size, MIN_SPILLABLE_BUFFER_SIZE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(max_row_size, MAX_ROW_SIZE, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(idle_session_timeout, IDLE_SESSION_TIMEOUT, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(compute_stats_min_sample_size, COMPUTE_STATS_MIN_SAMPLE_SIZE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(exec_time_limit_s, EXEC_TIME_LIMIT_S, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(shuffle_distinct_exprs, SHUFFLE_DISTINCT_EXPRS,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(max_mem_estimate_for_admission, MAX_MEM_ESTIMATE_FOR_ADMISSION,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(thread_reservation_limit, THREAD_RESERVATION_LIMIT,\ - TQueryOptionLevel::REGULAR)\ 
- QUERY_OPT_FN(thread_reservation_aggregate_limit, THREAD_RESERVATION_AGGREGATE_LIMIT,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(kudu_read_mode, KUDU_READ_MODE, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(allow_erasure_coded_files, ALLOW_ERASURE_CODED_FILES,\ - TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(timezone, TIMEZONE, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(scan_bytes_limit, SCAN_BYTES_LIMIT,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(cpu_limit_s, CPU_LIMIT_S, TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(topn_bytes_limit, TOPN_BYTES_LIMIT, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(client_identifier, CLIENT_IDENTIFIER, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(resource_trace_ratio, RESOURCE_TRACE_RATIO, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(num_remote_executor_candidates, NUM_REMOTE_EXECUTOR_CANDIDATES,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(num_rows_produced_limit, NUM_ROWS_PRODUCED_LIMIT,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(\ - planner_testcase_mode, PLANNER_TESTCASE_MODE, TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(default_file_format, DEFAULT_FILE_FORMAT, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(parquet_timestamp_type, PARQUET_TIMESTAMP_TYPE,\ - TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(parquet_read_page_index, PARQUET_READ_PAGE_INDEX,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(parquet_write_page_index, PARQUET_WRITE_PAGE_INDEX,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(parquet_page_row_count_limit, PARQUET_PAGE_ROW_COUNT_LIMIT,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(disable_hdfs_num_rows_estimate, DISABLE_HDFS_NUM_ROWS_ESTIMATE,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(default_hints_insert_statement, DEFAULT_HINTS_INSERT_STATEMENT,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(spool_query_results, SPOOL_QUERY_RESULTS,\ - TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(default_transactional_type, DEFAULT_TRANSACTIONAL_TYPE,\ - TQueryOptionLevel::ADVANCED)\ - 
QUERY_OPT_FN(statement_expression_limit, STATEMENT_EXPRESSION_LIMIT,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(max_statement_length_bytes, MAX_STATEMENT_LENGTH_BYTES,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(disable_data_cache, DISABLE_DATA_CACHE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(max_result_spooling_mem, MAX_RESULT_SPOOLING_MEM,\ - TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(max_spilled_result_spooling_mem, MAX_SPILLED_RESULT_SPOOLING_MEM,\ - TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(disable_hbase_num_rows_estimate, DISABLE_HBASE_NUM_ROWS_ESTIMATE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(fetch_rows_timeout_ms, FETCH_ROWS_TIMEOUT_MS,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(now_string, NOW_STRING, TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(parquet_object_store_split_size, PARQUET_OBJECT_STORE_SPLIT_SIZE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(mem_limit_executors, MEM_LIMIT_EXECUTORS, TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(broadcast_bytes_limit, BROADCAST_BYTES_LIMIT, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(preagg_bytes_limit, PREAGG_BYTES_LIMIT, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(max_cnf_exprs, MAX_CNF_EXPRS, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(kudu_snapshot_read_timestamp_micros, KUDU_SNAPSHOT_READ_TIMESTAMP_MICROS,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(retry_failed_queries, RETRY_FAILED_QUERIES, TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(enabled_runtime_filter_types, ENABLED_RUNTIME_FILTER_TYPES,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(async_codegen, ASYNC_CODEGEN, TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(enable_distinct_semi_join_optimization,\ - ENABLE_DISTINCT_SEMI_JOIN_OPTIMIZATION, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(sort_run_bytes_limit, SORT_RUN_BYTES_LIMIT, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(max_fs_writers, MAX_FS_WRITERS, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(refresh_updated_hms_partitions,\ - 
REFRESH_UPDATED_HMS_PARTITIONS, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(spool_all_results_for_retries, SPOOL_ALL_RESULTS_FOR_RETRIES,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(runtime_filter_error_rate, RUNTIME_FILTER_ERROR_RATE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(use_local_tz_for_unix_timestamp_conversions,\ - USE_LOCAL_TZ_FOR_UNIX_TIMESTAMP_CONVERSIONS, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(convert_legacy_hive_parquet_utc_timestamps,\ - CONVERT_LEGACY_HIVE_PARQUET_UTC_TIMESTAMPS, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(enable_outer_join_to_inner_transformation,\ - ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(targeted_kudu_scan_range_length, TARGETED_KUDU_SCAN_RANGE_LENGTH,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(report_skew_limit, REPORT_SKEW_LIMIT,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(optimize_simple_limit, OPTIMIZE_SIMPLE_LIMIT,\ - TQueryOptionLevel::REGULAR)\ - QUERY_OPT_FN(use_dop_for_costing, USE_DOP_FOR_COSTING,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(broadcast_to_partition_factor, BROADCAST_TO_PARTITION_FACTOR,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(join_rows_produced_limit, JOIN_ROWS_PRODUCED_LIMIT,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(utf8_mode, UTF8_MODE, TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(analytic_rank_pushdown_threshold,\ - ANALYTIC_RANK_PUSHDOWN_THRESHOLD, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(minmax_filter_threshold, MINMAX_FILTER_THRESHOLD,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(minmax_filtering_level, MINMAX_FILTERING_LEVEL,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(compute_column_minmax_stats, COMPUTE_COLUMN_MINMAX_STATS,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(show_column_minmax_stats, SHOW_COLUMN_MINMAX_STATS,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(default_ndv_scale, DEFAULT_NDV_SCALE, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(kudu_replica_selection, 
KUDU_REPLICA_SELECTION,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(delete_stats_in_truncate, DELETE_STATS_IN_TRUNCATE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(parquet_bloom_filtering, PARQUET_BLOOM_FILTERING,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(minmax_filter_sorted_columns, MINMAX_FILTER_SORTED_COLUMNS,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(minmax_filter_fast_code_path, MINMAX_FILTER_FAST_CODE_PATH,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(enable_kudu_transaction, ENABLE_KUDU_TRANSACTION,\ - TQueryOptionLevel::DEVELOPMENT)\ - QUERY_OPT_FN(minmax_filter_partition_columns, MINMAX_FILTER_PARTITION_COLUMNS,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(parquet_bloom_filter_write, PARQUET_BLOOM_FILTER_WRITE,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(orc_read_statistics, ORC_READ_STATISTICS,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(enable_async_ddl_execution, ENABLE_ASYNC_DDL_EXECUTION,\ - TQueryOptionLevel::ADVANCED) \ - QUERY_OPT_FN(enable_async_load_data_execution, ENABLE_ASYNC_LOAD_DATA_EXECUTION,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(parquet_late_materialization_threshold,\ - PARQUET_LATE_MATERIALIZATION_THRESHOLD, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(parquet_dictionary_runtime_filter_entry_limit,\ - PARQUET_DICTIONARY_RUNTIME_FILTER_ENTRY_LIMIT, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(abort_java_udf_on_exception,\ - ABORT_JAVA_UDF_ON_EXCEPTION, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(orc_async_read, ORC_ASYNC_READ, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(runtime_in_list_filter_entry_limit,\ - RUNTIME_IN_LIST_FILTER_ENTRY_LIMIT, TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(enable_replan, ENABLE_REPLAN,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(test_replan, TEST_REPLAN,\ - TQueryOptionLevel::ADVANCED)\ - QUERY_OPT_FN(lock_max_wait_time_s, LOCK_MAX_WAIT_TIME_S, TQueryOptionLevel::REGULAR)\ - ; +#define QUERY_OPTS_TABLE \ + 
DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(), \ + TImpalaQueryOptions::ORC_SCHEMA_RESOLUTION + 1); \ + REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED) \ + QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR) \ + REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS) \ + QUERY_OPT_FN(batch_size, BATCH_SIZE, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(debug_action, DEBUG_ACTION, TQueryOptionLevel::DEVELOPMENT) \ + REMOVED_QUERY_OPT_FN(default_order_by_limit, DEFAULT_ORDER_BY_LIMIT) \ + REMOVED_QUERY_OPT_FN(disable_cached_reads, DISABLE_CACHED_READS) \ + QUERY_OPT_FN( \ + disable_outermost_topn, DISABLE_OUTERMOST_TOPN, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(disable_codegen, DISABLE_CODEGEN, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(explain_level, EXPLAIN_LEVEL, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(hbase_cache_blocks, HBASE_CACHE_BLOCKS, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(hbase_caching, HBASE_CACHING, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(max_errors, MAX_ERRORS, TQueryOptionLevel::ADVANCED) \ + REMOVED_QUERY_OPT_FN(max_io_buffers, MAX_IO_BUFFERS) \ + QUERY_OPT_FN( \ + max_scan_range_length, MAX_SCAN_RANGE_LENGTH, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(mem_limit, MEM_LIMIT, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(num_nodes, NUM_NODES, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(num_scanner_threads, NUM_SCANNER_THREADS, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(compression_codec, COMPRESSION_CODEC, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(parquet_file_size, PARQUET_FILE_SIZE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(request_pool, REQUEST_POOL, TQueryOptionLevel::REGULAR) \ + REMOVED_QUERY_OPT_FN(reservation_request_timeout, RESERVATION_REQUEST_TIMEOUT) \ + QUERY_OPT_FN(sync_ddl, SYNC_DDL, TQueryOptionLevel::REGULAR) \ + REMOVED_QUERY_OPT_FN(v_cpu_cores, V_CPU_CORES) \ + REMOVED_QUERY_OPT_FN(rm_initial_mem, 
RM_INITIAL_MEM) \ + QUERY_OPT_FN(query_timeout_s, QUERY_TIMEOUT_S, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(buffer_pool_limit, BUFFER_POOL_LIMIT, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(appx_count_distinct, APPX_COUNT_DISTINCT, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(disable_unsafe_spills, DISABLE_UNSAFE_SPILLS, TQueryOptionLevel::REGULAR) \ + REMOVED_QUERY_OPT_FN(seq_compression_mode, SEQ_COMPRESSION_MODE) \ + QUERY_OPT_FN(exec_single_node_rows_threshold, EXEC_SINGLE_NODE_ROWS_THRESHOLD, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(optimize_partition_key_scans, OPTIMIZE_PARTITION_KEY_SCANS, \ + TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(replica_preference, REPLICA_PREFERENCE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + schedule_random_replica, SCHEDULE_RANDOM_REPLICA, TQueryOptionLevel::ADVANCED) \ + REMOVED_QUERY_OPT_FN(scan_node_codegen_threshold, SCAN_NODE_CODEGEN_THRESHOLD) \ + QUERY_OPT_FN(disable_streaming_preaggregations, DISABLE_STREAMING_PREAGGREGATIONS, \ + TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(runtime_filter_mode, RUNTIME_FILTER_MODE, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN( \ + runtime_bloom_filter_size, RUNTIME_BLOOM_FILTER_SIZE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(runtime_filter_wait_time_ms, RUNTIME_FILTER_WAIT_TIME_MS, \ + TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(disable_row_runtime_filtering, DISABLE_ROW_RUNTIME_FILTERING, \ + TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN( \ + max_num_runtime_filters, MAX_NUM_RUNTIME_FILTERS, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(parquet_annotate_strings_utf8, PARQUET_ANNOTATE_STRINGS_UTF8, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(parquet_fallback_schema_resolution, PARQUET_FALLBACK_SCHEMA_RESOLUTION, \ + TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(mt_dop, MT_DOP, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN( \ + s3_skip_insert_staging, S3_SKIP_INSERT_STAGING, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN( \ + runtime_filter_min_size, 
RUNTIME_FILTER_MIN_SIZE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + runtime_filter_max_size, RUNTIME_FILTER_MAX_SIZE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(prefetch_mode, PREFETCH_MODE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(strict_mode, STRICT_MODE, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(scratch_limit, SCRATCH_LIMIT, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(enable_expr_rewrites, ENABLE_EXPR_REWRITES, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(enable_cnf_rewrites, ENABLE_CNF_REWRITES, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(decimal_v2, DECIMAL_V2, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(parquet_dictionary_filtering, PARQUET_DICTIONARY_FILTERING, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + parquet_array_resolution, PARQUET_ARRAY_RESOLUTION, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN( \ + parquet_read_statistics, PARQUET_READ_STATISTICS, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(default_join_distribution_mode, DEFAULT_JOIN_DISTRIBUTION_MODE, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(disable_codegen_rows_threshold, DISABLE_CODEGEN_ROWS_THRESHOLD, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(default_spillable_buffer_size, DEFAULT_SPILLABLE_BUFFER_SIZE, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + min_spillable_buffer_size, MIN_SPILLABLE_BUFFER_SIZE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(max_row_size, MAX_ROW_SIZE, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(idle_session_timeout, IDLE_SESSION_TIMEOUT, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(compute_stats_min_sample_size, COMPUTE_STATS_MIN_SAMPLE_SIZE, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(exec_time_limit_s, EXEC_TIME_LIMIT_S, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN( \ + shuffle_distinct_exprs, SHUFFLE_DISTINCT_EXPRS, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(max_mem_estimate_for_admission, MAX_MEM_ESTIMATE_FOR_ADMISSION, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + 
thread_reservation_limit, THREAD_RESERVATION_LIMIT, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(thread_reservation_aggregate_limit, THREAD_RESERVATION_AGGREGATE_LIMIT, \ + TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(kudu_read_mode, KUDU_READ_MODE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(allow_erasure_coded_files, ALLOW_ERASURE_CODED_FILES, \ + TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(timezone, TIMEZONE, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(scan_bytes_limit, SCAN_BYTES_LIMIT, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(cpu_limit_s, CPU_LIMIT_S, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(topn_bytes_limit, TOPN_BYTES_LIMIT, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(client_identifier, CLIENT_IDENTIFIER, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(resource_trace_ratio, RESOURCE_TRACE_RATIO, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(num_remote_executor_candidates, NUM_REMOTE_EXECUTOR_CANDIDATES, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + num_rows_produced_limit, NUM_ROWS_PRODUCED_LIMIT, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + planner_testcase_mode, PLANNER_TESTCASE_MODE, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(default_file_format, DEFAULT_FILE_FORMAT, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN( \ + parquet_timestamp_type, PARQUET_TIMESTAMP_TYPE, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN( \ + parquet_read_page_index, PARQUET_READ_PAGE_INDEX, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + parquet_write_page_index, PARQUET_WRITE_PAGE_INDEX, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(parquet_page_row_count_limit, PARQUET_PAGE_ROW_COUNT_LIMIT, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(disable_hdfs_num_rows_estimate, DISABLE_HDFS_NUM_ROWS_ESTIMATE, \ + TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(default_hints_insert_statement, DEFAULT_HINTS_INSERT_STATEMENT, \ + TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(spool_query_results, SPOOL_QUERY_RESULTS, TQueryOptionLevel::DEVELOPMENT) 
\ + QUERY_OPT_FN(default_transactional_type, DEFAULT_TRANSACTIONAL_TYPE, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(statement_expression_limit, STATEMENT_EXPRESSION_LIMIT, \ + TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(max_statement_length_bytes, MAX_STATEMENT_LENGTH_BYTES, \ + TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(disable_data_cache, DISABLE_DATA_CACHE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + max_result_spooling_mem, MAX_RESULT_SPOOLING_MEM, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(max_spilled_result_spooling_mem, MAX_SPILLED_RESULT_SPOOLING_MEM, \ + TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(disable_hbase_num_rows_estimate, DISABLE_HBASE_NUM_ROWS_ESTIMATE, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + fetch_rows_timeout_ms, FETCH_ROWS_TIMEOUT_MS, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(now_string, NOW_STRING, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(parquet_object_store_split_size, PARQUET_OBJECT_STORE_SPLIT_SIZE, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(mem_limit_executors, MEM_LIMIT_EXECUTORS, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN( \ + broadcast_bytes_limit, BROADCAST_BYTES_LIMIT, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(preagg_bytes_limit, PREAGG_BYTES_LIMIT, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(max_cnf_exprs, MAX_CNF_EXPRS, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(kudu_snapshot_read_timestamp_micros, KUDU_SNAPSHOT_READ_TIMESTAMP_MICROS, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(retry_failed_queries, RETRY_FAILED_QUERIES, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(enabled_runtime_filter_types, ENABLED_RUNTIME_FILTER_TYPES, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(async_codegen, ASYNC_CODEGEN, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(enable_distinct_semi_join_optimization, \ + ENABLE_DISTINCT_SEMI_JOIN_OPTIMIZATION, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(sort_run_bytes_limit, SORT_RUN_BYTES_LIMIT, TQueryOptionLevel::ADVANCED) 
\ + QUERY_OPT_FN(max_fs_writers, MAX_FS_WRITERS, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(refresh_updated_hms_partitions, REFRESH_UPDATED_HMS_PARTITIONS, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(spool_all_results_for_retries, SPOOL_ALL_RESULTS_FOR_RETRIES, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + runtime_filter_error_rate, RUNTIME_FILTER_ERROR_RATE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(use_local_tz_for_unix_timestamp_conversions, \ + USE_LOCAL_TZ_FOR_UNIX_TIMESTAMP_CONVERSIONS, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(convert_legacy_hive_parquet_utc_timestamps, \ + CONVERT_LEGACY_HIVE_PARQUET_UTC_TIMESTAMPS, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(enable_outer_join_to_inner_transformation, \ + ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(targeted_kudu_scan_range_length, TARGETED_KUDU_SCAN_RANGE_LENGTH, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(report_skew_limit, REPORT_SKEW_LIMIT, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(optimize_simple_limit, OPTIMIZE_SIMPLE_LIMIT, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(use_dop_for_costing, USE_DOP_FOR_COSTING, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(broadcast_to_partition_factor, BROADCAST_TO_PARTITION_FACTOR, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + join_rows_produced_limit, JOIN_ROWS_PRODUCED_LIMIT, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(utf8_mode, UTF8_MODE, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(analytic_rank_pushdown_threshold, ANALYTIC_RANK_PUSHDOWN_THRESHOLD, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + minmax_filter_threshold, MINMAX_FILTER_THRESHOLD, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + minmax_filtering_level, MINMAX_FILTERING_LEVEL, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(compute_column_minmax_stats, COMPUTE_COLUMN_MINMAX_STATS, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + show_column_minmax_stats, SHOW_COLUMN_MINMAX_STATS, 
TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(default_ndv_scale, DEFAULT_NDV_SCALE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + kudu_replica_selection, KUDU_REPLICA_SELECTION, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + delete_stats_in_truncate, DELETE_STATS_IN_TRUNCATE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + parquet_bloom_filtering, PARQUET_BLOOM_FILTERING, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(minmax_filter_sorted_columns, MINMAX_FILTER_SORTED_COLUMNS, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(minmax_filter_fast_code_path, MINMAX_FILTER_FAST_CODE_PATH, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN( \ + enable_kudu_transaction, ENABLE_KUDU_TRANSACTION, TQueryOptionLevel::DEVELOPMENT) \ + QUERY_OPT_FN(minmax_filter_partition_columns, MINMAX_FILTER_PARTITION_COLUMNS, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(parquet_bloom_filter_write, PARQUET_BLOOM_FILTER_WRITE, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(orc_read_statistics, ORC_READ_STATISTICS, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(enable_async_ddl_execution, ENABLE_ASYNC_DDL_EXECUTION, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(enable_async_load_data_execution, ENABLE_ASYNC_LOAD_DATA_EXECUTION, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(parquet_late_materialization_threshold, \ + PARQUET_LATE_MATERIALIZATION_THRESHOLD, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(parquet_dictionary_runtime_filter_entry_limit, \ + PARQUET_DICTIONARY_RUNTIME_FILTER_ENTRY_LIMIT, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(abort_java_udf_on_exception, ABORT_JAVA_UDF_ON_EXCEPTION, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(orc_async_read, ORC_ASYNC_READ, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(runtime_in_list_filter_entry_limit, RUNTIME_IN_LIST_FILTER_ENTRY_LIMIT, \ + TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(enable_replan, ENABLE_REPLAN, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(test_replan, TEST_REPLAN, 
TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(lock_max_wait_time_s, LOCK_MAX_WAIT_TIME_S, TQueryOptionLevel::REGULAR) \ + QUERY_OPT_FN(orc_schema_resolution, ORC_SCHEMA_RESOLUTION, TQueryOptionLevel::REGULAR); /// Enforce practical limits on some query options to avoid undesired query state. static const int64_t SPILLABLE_BUFFER_LIMIT = 1LL << 40; // 1 TB diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift index 8a118d241..60844d6c6 100644 --- a/common/thrift/ImpalaService.thrift +++ b/common/thrift/ImpalaService.thrift @@ -733,6 +733,9 @@ enum TImpalaQueryOptions { // Maximum wait time on HMS ACID lock in seconds. LOCK_MAX_WAIT_TIME_S = 145 + + // Determines how to resolve ORC files' schemas. Valid values are "position" and "name". + ORC_SCHEMA_RESOLUTION = 146; } // The summary of a DML statement. diff --git a/common/thrift/Query.thrift b/common/thrift/Query.thrift index 4d45ad1c8..3e4726c14 100644 --- a/common/thrift/Query.thrift +++ b/common/thrift/Query.thrift @@ -591,6 +591,9 @@ struct TQueryOptions { // See comment in ImpalaService.thrift 146: optional i32 lock_max_wait_time_s = 300 + + // See comment in ImpalaService.thrift + 147: optional TSchemaResolutionStrategy orc_schema_resolution = 0; } // Impala currently has three types of sessions: Beeswax, HiveServer2 and external diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-tables.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-tables.test index db40813e5..d1549c529 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-tables.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-migrated-tables.test @@ -74,10 +74,12 @@ BIGINT, DOUBLE, DECIMAL # * Partition FLOAT column to DOUBLE # * Partition DECIMAL(5,3) column to DECIMAL(8,3) # * Non-partition column has been moved to end of the schema -# Currently this fails due to IMPALA-9410 select * from 
functional_parquet.iceberg_legacy_partition_schema_evolution_orc ----- CATCH -Parse error in possibly corrupt ORC file +---- RESULTS +1,1.100000023841858,2.718,2 +1,1.100000023841858,3.141,1 +---- TYPES +BIGINT, DOUBLE, DECIMAL, INT ==== ---- QUERY # Read only the partition columns. diff --git a/tests/common/test_dimensions.py b/tests/common/test_dimensions.py index 4ac912b47..5a5788aac 100644 --- a/tests/common/test_dimensions.py +++ b/tests/common/test_dimensions.py @@ -164,6 +164,13 @@ def hs2_text_constraint(v): v.get_value('table_format').file_format == 'text' and v.get_value('table_format').compression_codec == 'none') + +def orc_schema_resolution_constraint(v): + """ Constraint to use multiple orc_schema_resolution only in case of orc files""" + file_format = v.get_value('table_format').file_format + orc_schema_resolution = v.get_value('orc_schema_resolution') + return file_format == 'orc' or orc_schema_resolution == 0 + # Common sets of values for the exec option vectors ALL_BATCH_SIZES = [0] diff --git a/tests/query_test/test_nested_types.py b/tests/query_test/test_nested_types.py index ef64e1181..23e46e6e5 100644 --- a/tests/query_test/test_nested_types.py +++ b/tests/query_test/test_nested_types.py @@ -37,7 +37,7 @@ from tests.common.skip import ( ) from tests.common.test_dimensions import (create_exec_option_dimension, create_exec_option_dimension_from_dict, create_client_protocol_dimension, - create_orc_dimension) + create_orc_dimension, orc_schema_resolution_constraint) from tests.common.test_vector import ImpalaTestDimension from tests.util.filesystem_utils import WAREHOUSE, get_fs_path, IS_HDFS @@ -48,6 +48,13 @@ class TestNestedTypes(ImpalaTestSuite): def get_workload(self): return 'functional-query' + @staticmethod + def orc_schema_resolution_constraint(vector): + """ Constraint to use multiple orc_schema_resolution only in case of orc files""" + file_format = vector.get_value('table_format').file_format + orc_schema_resolution = 
vector.get_value('orc_schema_resolution') + return file_format == 'orc' or orc_schema_resolution == 0 + @classmethod def add_test_dimensions(cls): super(TestNestedTypes, cls).add_test_dimensions() @@ -55,6 +62,8 @@ class TestNestedTypes(ImpalaTestSuite): v.get_value('table_format').file_format in ['parquet', 'orc']) cls.ImpalaTestMatrix.add_dimension( ImpalaTestDimension('mt_dop', 0, 2)) + cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('orc_schema_resolution', 0, 1)) + cls.ImpalaTestMatrix.add_constraint(orc_schema_resolution_constraint) def test_scanner_basic(self, vector): """Queries that do not materialize arrays.""" @@ -135,6 +144,8 @@ class TestNestedStructsInSelectList(ImpalaTestSuite): cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension()) cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('protocol') == 'hs2') + cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('orc_schema_resolution', 0, 1)) + cls.ImpalaTestMatrix.add_constraint(orc_schema_resolution_constraint) def test_struct_in_select_list(self, vector, unique_database): """Queries where a struct column is in the select list""" @@ -170,6 +181,8 @@ class TestNestedTArraysInSelectList(ImpalaTestSuite): create_exec_option_dimension_from_dict({ 'disable_codegen': ['False', 'True']})) cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension()) + cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('orc_schema_resolution', 0, 1)) + cls.ImpalaTestMatrix.add_constraint(orc_schema_resolution_constraint) def test_array_in_select_list(self, vector, unique_database): """Queries where an array column is in the select list""" @@ -215,6 +228,8 @@ class TestComputeStatsWithNestedTypes(ImpalaTestSuite): super(TestComputeStatsWithNestedTypes, cls).add_test_dimensions() cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('table_format').file_format in ['parquet', 'orc']) + cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('orc_schema_resolution', 0, 1)) + 
cls.ImpalaTestMatrix.add_constraint(orc_schema_resolution_constraint) def test_compute_stats_with_structs(self, vector): """COMPUTE STATS and SHOW COLUMN STATS for tables with structs""" @@ -232,6 +247,8 @@ class TestZippingUnnest(ImpalaTestSuite): super(TestZippingUnnest, cls).add_test_dimensions() cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('table_format').file_format in ['parquet', 'orc']) + cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('orc_schema_resolution', 0, 1)) + cls.ImpalaTestMatrix.add_constraint(orc_schema_resolution_constraint) def test_zipping_unnest_in_from_clause(self, vector): """Queries where zipping unnest is executed by providing UNNEST() in the from clause. @@ -261,6 +278,8 @@ class TestNestedTypesNoMtDop(ImpalaTestSuite): super(TestNestedTypesNoMtDop, cls).add_test_dimensions() cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('table_format').file_format in ['parquet', 'orc']) + cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('orc_schema_resolution', 0, 1)) + cls.ImpalaTestMatrix.add_constraint(orc_schema_resolution_constraint) def test_tpch(self, vector): """Queries over the larger nested TPCH dataset.""" @@ -818,6 +837,8 @@ class TestMaxNestingDepth(ImpalaTestSuite): super(TestMaxNestingDepth, cls).add_test_dimensions() cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('table_format').file_format in ['parquet', 'orc']) + cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('orc_schema_resolution', 0, 1)) + cls.ImpalaTestMatrix.add_constraint(orc_schema_resolution_constraint) def test_max_nesting_depth(self, vector, unique_database): """Tests that Impala can scan Parquet and ORC files having complex types of diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 53a15f0ae..112b53dc0 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -1477,6 +1477,7 @@ class TestOrc(ImpalaTestSuite): super(TestOrc, 
cls).add_test_dimensions() cls.ImpalaTestMatrix.add_constraint( lambda v: v.get_value('table_format').file_format == 'orc') + cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('orc_schema_resolution', 0, 1)) @SkipIfS3.hdfs_block_size @SkipIfGCS.hdfs_block_size
