This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit d2e495e83a3962277a538360752d20e0ad5ab323 Author: stiga-huang <[email protected]> AuthorDate: Thu Aug 8 15:53:58 2024 +0800 IMPALA-13284: Loading test data on Apache Hive3 There are some failures in loading test data on Apache Hive 3.1.3: - STORED AS JSONFILE is not supported - STORED BY ICEBERG is not supported. Similarly, STORED BY ICEBERG STORED AS AVRO is not supported. - Missing the jar of iceberg-hive-runtime in CLASSPATH of HMS and Tez jobs. - Creating a table in Impala is not translated to an EXTERNAL table in HMS - Hive INSERT on insert-only tables failed in generating InsertEvents (HIVE-20067). This patch fixes the syntax issues by using the old syntax of Apache Hive 3.1.3: - Convert STORED AS JSONFILE to ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe' - Convert STORED BY ICEBERG to STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' - Convert STORED BY ICEBERG STORED AS AVRO to the above one with tblproperties('write.format.default'='avro') Most of the conversions are done in generate-schema-statements.py. One exception is in testdata/bin/load-dependent-tables.sql where we need to generate a new file with the conversion when using it. The missing jar of iceberg-hive-runtime is added into HIVE_AUX_JARS_PATH in bin/impala-config.sh. Note that this is only needed by Apache Hive3 since CDP Hive3 has the jar of hive-iceberg-handler in its lib folder. To fix the failure of InsertEvents, we add the patch for HIVE-20067 and modify testdata/bin/patch_hive.sh to also recompile the submodule standalone-metastore. Modified some statements in testdata/datasets/functional/functional_schema_template.sql to be more reliable in retry.
Tests - Verified the testdata can be loaded in ubuntu-20.04-from-scratch Change-Id: I8f52c91602da8822b0f46f19dc4111c7187ce400 Reviewed-on: http://gerrit.cloudera.org:8080/21657 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- bin/impala-config.sh | 7 ++++ testdata/bin/create-load-data.sh | 11 +++++- testdata/bin/generate-schema-statements.py | 29 ++++++++++++-- testdata/bin/load-dependent-tables.sql | 1 - testdata/bin/patch_hive.sh | 3 +- testdata/cluster/hive/README | 3 ++ testdata/cluster/hive/patch3-HIVE-20067.diff | 46 ++++++++++++++++++++++ .../functional/functional_schema_template.sql | 10 +++-- 8 files changed, 100 insertions(+), 10 deletions(-) diff --git a/bin/impala-config.sh b/bin/impala-config.sh index d56060ad0..72a75de8c 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -1025,6 +1025,13 @@ export HIVE_CONF_DIR="$IMPALA_FE_DIR/./src/test/resources" export POSTGRES_JDBC_DRIVER="${IMPALA_FE_DIR}/target/dependency/postgresql-${IMPALA_POSTGRES_JDBC_DRIVER_VERSION}.jar" export HIVE_AUX_JARS_PATH="$POSTGRES_JDBC_DRIVER" +# Add the jar of iceberg-hive-runtime to have HiveIcebergStorageHandler. +# Only needed by Apache Hive3 since CDP Hive3 has the jar of hive-iceberg-handler in its +# lib folder. 
+if $USE_APACHE_HIVE; then + export HIVE_AUX_JARS_PATH="$HIVE_AUX_JARS_PATH:\ +$IMPALA_HOME/fe/target/dependency/iceberg-hive-runtime-${IMPALA_ICEBERG_VERSION}.jar" +fi export AUX_CLASSPATH="" ### Tell hive not to use jline export HADOOP_USER_CLASSPATH_FIRST=true diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh index 07b2971a2..44ea40c4c 100755 --- a/testdata/bin/create-load-data.sh +++ b/testdata/bin/create-load-data.sh @@ -382,8 +382,15 @@ function copy-and-load-dependent-tables { # For tables that rely on loading data from local fs test-wareload-house # TODO: Find a good way to integrate this with the normal data loading scripts - beeline -n $USER -u "${JDBC_URL}" -f\ - ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql + SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql + if $USE_APACHE_HIVE; then + # Apache Hive 3.1 doesn't support "STORED AS JSONFILE" (HIVE-19899) + NEW_SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables-hive3.sql + sed "s/STORED AS JSONFILE/ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'"\ + $SQL_FILE > $NEW_SQL_FILE + SQL_FILE=$NEW_SQL_FILE + fi + beeline -n $USER -u "${JDBC_URL}" -f $SQL_FILE } function create-internal-hbase-table { diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py index 134bd3ff7..da96eaca9 100755 --- a/testdata/bin/generate-schema-statements.py +++ b/testdata/bin/generate-schema-statements.py @@ -232,7 +232,7 @@ HINT_SHUFFLE = "/* +shuffle, clustered */" def build_create_statement(table_template, table_name, db_name, db_suffix, file_format, compression, hdfs_location, - force_reload): + force_reload, is_hive_stmt): create_stmt = '' if (force_reload): tbl_type = 'TABLE' @@ -247,12 +247,34 @@ def build_create_statement(table_template, table_name, db_name, db_suffix, # Remove location part from the format string table_template = table_template.replace("LOCATION '{hdfs_location}'", "") - create_stmt += 
table_template.format( + stmt = table_template.format( db_name=db_name, db_suffix=db_suffix, table_name=table_name, file_format=FILE_FORMAT_TO_STORED_AS_MAP[file_format], hdfs_location=hdfs_location) + # Apache Hive 3.1 doesn't support "STORED BY ICEBERG STORED AS AVRO" and + # "STORED AS JSONFILE" (HIVE-25162, HIVE-19899) + if is_hive_stmt and os.environ['USE_APACHE_HIVE'] == "true": + if "STORED AS JSONFILE" in stmt: + stmt = stmt.replace("STORED AS JSONFILE", + "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'") + elif "STORED BY ICEBERG" in stmt: + if "STORED AS" not in stmt: + stmt = stmt.replace( + "STORED BY ICEBERG", + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'") + else: + assert "TBLPROPERTIES" not in stmt,\ + ("Cannot convert STORED BY ICEBERG STORED AS file_format with TBLPROPERTIES " + "also in the statement:\n" + stmt) + iceberg_file_format = re.search(r"STORED AS (\w+)", stmt).group(1) + stmt = re.sub(r"STORED BY ICEBERG\s+STORED AS \w+", + ("STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'" + " TBLPROPERTIES('write.format.default'='{}')").format( + iceberg_file_format), + stmt) + create_stmt += stmt return create_stmt @@ -835,7 +857,8 @@ def generate_statements(output_name, test_vectors, sections, if table_template: output.create.append(build_create_statement(table_template, table_name, db_name, - db_suffix, create_file_format, create_codec, data_path, force_reload)) + db_suffix, create_file_format, create_codec, data_path, force_reload, + create_hive)) # HBASE create table if file_format == 'hbase': # If the HBASE_COLUMN_FAMILIES section does not exist, default to 'd' diff --git a/testdata/bin/load-dependent-tables.sql b/testdata/bin/load-dependent-tables.sql index 57af65ccc..eb18992f5 100644 --- a/testdata/bin/load-dependent-tables.sql +++ b/testdata/bin/load-dependent-tables.sql @@ -94,7 +94,6 @@ LOCATION '/test-warehouse/chars_formats_text'; DROP TABLE IF EXISTS functional_json.chars_formats; 
CREATE EXTERNAL TABLE functional_json.chars_formats (cs CHAR(5), cl CHAR(140), vc VARCHAR(32)) -ROW FORMAT delimited fields terminated by ',' escaped by '\\' STORED AS JSONFILE LOCATION '/test-warehouse/chars_formats_json'; diff --git a/testdata/bin/patch_hive.sh b/testdata/bin/patch_hive.sh index c87cee11c..c974b9eee 100755 --- a/testdata/bin/patch_hive.sh +++ b/testdata/bin/patch_hive.sh @@ -75,7 +75,8 @@ done # 3. Repackage the hive submodules affected by the patch if [[ "${HIVE_REBUILD}" = "true" ]]; then echo "Repackage the hive-exec module" - ${IMPALA_HOME}/bin/mvn-quiet.sh -pl ql clean package -Dmaven.test.skip + ${IMPALA_HOME}/bin/mvn-quiet.sh -pl ql,standalone-metastore clean package \ + -Dmaven.test.skip cp $HIVE_SRC_DIR/ql/target/hive-exec-${APACHE_HIVE_VERSION}.jar $HIVE_HOME/lib/ fi popd diff --git a/testdata/cluster/hive/README b/testdata/cluster/hive/README index c80b84227..2347bbd77 100644 --- a/testdata/cluster/hive/README +++ b/testdata/cluster/hive/README @@ -10,3 +10,6 @@ Bump guava version to 28.1-jre. Fix HIVE-22717. 
patch2-HIVE-20038.diff: Update queries on non-bucketed + partitioned tables throws NPE + +patch3-HIVE-20067.diff: +Fix failures in firing InsertEvent for on insert-only tables diff --git a/testdata/cluster/hive/patch3-HIVE-20067.diff b/testdata/cluster/hive/patch3-HIVE-20067.diff new file mode 100644 index 000000000..e0fa43125 --- /dev/null +++ b/testdata/cluster/hive/patch3-HIVE-20067.diff @@ -0,0 +1,46 @@ +diff --git a/ql/src/test/queries/clientpositive/mm_all.q b/ql/src/test/queries/clientpositive/mm_all.q +index 61dd3e7475..a524c29ef5 100644 +--- a/ql/src/test/queries/clientpositive/mm_all.q ++++ b/ql/src/test/queries/clientpositive/mm_all.q +@@ -3,6 +3,7 @@ + + -- MASK_LINEAGE + ++set hive.metastore.dml.events=true; + set hive.mapred.mode=nonstrict; + set hive.explain.user=false; + set hive.fetch.task.conversion=none; +diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java +index aa014e9317..60ad7db60e 100644 +--- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java ++++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java +@@ -18,7 +18,9 @@ + + package org.apache.hadoop.hive.metastore.events; + +-import com.google.common.collect.Lists; ++import java.util.ArrayList; ++import java.util.List; ++ + import org.apache.hadoop.classification.InterfaceAudience; + import org.apache.hadoop.classification.InterfaceStability; + import org.apache.hadoop.hive.metastore.IHMSHandler; +@@ -33,8 +35,7 @@ + import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; + import org.apache.thrift.TException; + +-import java.util.ArrayList; +-import java.util.List; ++import com.google.common.collect.Lists; + + @InterfaceAudience.Public + @InterfaceStability.Stable +@@ -65,7 +66,7 @@ public InsertEvent(String catName, String db, String table, List<String> 
partVal + // TODO MS-SPLIT Switch this back once HiveMetaStoreClient is moved. + //req.setCapabilities(HiveMetaStoreClient.TEST_VERSION); + req.setCapabilities(new ClientCapabilities( +- Lists.newArrayList(ClientCapability.TEST_CAPABILITY))); ++ Lists.newArrayList(ClientCapability.TEST_CAPABILITY, ClientCapability.INSERT_ONLY_TABLES))); + try { + this.tableObj = handler.get_table_req(req).getTable(); + if (partVals != null) { diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index 559b78c45..a8541728c 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -2620,7 +2620,10 @@ materialized_view -- The create materialized view command is moved down so that the database's -- managed directory has been created. Otherwise the command would fail. This -- is a bug in Hive. -CREATE MATERIALIZED VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name} +-- Always drop the view first since IF NOT EXISTS is ignored in CREATE VIEW +-- in Apache Hive3 (HIVE-20462, HIVE-21675). 
+DROP MATERIALIZED VIEW IF EXISTS {db_name}{db_suffix}.{table_name}; +CREATE MATERIALIZED VIEW {db_name}{db_suffix}.{table_name} AS SELECT * FROM {db_name}{db_suffix}.insert_only_transactional_table; ===== ---- DATASET @@ -3840,6 +3843,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( ) STORED BY ICEBERG STORED AS AVRO LOCATION '/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_avro_format'; +---- DEPENDENT_LOAD_HIVE INSERT INTO TABLE {db_name}{db_suffix}.{table_name} values(1, 'A', 0.5, true),(2, 'B', 1.5, true),(3, 'C', 2.5, false); ==== ---- DATASET @@ -3887,7 +3891,7 @@ functional ---- BASE_TABLE_NAME iceberg_view ---- CREATE -CREATE VIEW {db_name}{db_suffix}.{table_name} AS +CREATE VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name} AS SELECT * FROM {db_name}{db_suffix}.iceberg_query_metadata; ==== ---- DATASET @@ -3995,7 +3999,7 @@ functional ---- BASE_TABLE_NAME iceberg_lineitem_sixblocks ---- CREATE -CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} +CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} LIKE PARQUET '/test-warehouse/lineitem_sixblocks_iceberg/lineitem_sixblocks.parquet' STORED AS PARQUET LOCATION '/test-warehouse/lineitem_sixblocks_iceberg/';
