This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit d2e495e83a3962277a538360752d20e0ad5ab323 Author: stiga-huang <[email protected]> AuthorDate: Thu Aug 8 15:53:58 2024 +0800 IMPALA-13284: Loading test data on Apache Hive3 There are some failures in loading test data on Apache Hive 3.1.3: - STORED AS JSONFILE is not supported - STORED BY ICEBERG is not supported. Similarly, STORED BY ICEBERG STORED AS AVRO is not supported. - Missing the jar of iceberg-hive-runtime in CLASSPATH of HMS and Tez jobs. - Creating a table in Impala is not translated to an EXTERNAL table in HMS - Hive INSERT on insert-only tables failed in generating InsertEvents (HIVE-20067). This patch fixes the syntax issues by using the old syntax of Apache Hive 3.1.3: - Convert STORED AS JSONFILE to ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe' - Convert STORED BY ICEBERG to STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' - Convert STORED BY ICEBERG STORED AS AVRO to the above one with tblproperties('write.format.default'='avro') Most of the conversions are done in generate-schema-statements.py. One exception is in testdata/bin/load-dependent-tables.sql where we need to generate a new file with the conversion when using it. The missing jar of iceberg-hive-runtime is added into HIVE_AUX_JARS_PATH in bin/impala-config.sh. Note that this is only needed by Apache Hive3 since CDP Hive3 has the jar of hive-iceberg-handler in its lib folder. To fix the failure of InsertEvents, we add the patch for HIVE-20067 and modify testdata/bin/patch_hive.sh to also recompile the submodule standalone-metastore. Modified some statements in testdata/datasets/functional/functional_schema_template.sql to be more reliable in retry.
Tests - Verified the testdata can be loaded in ubuntu-20.04-from-scratch Change-Id: I8f52c91602da8822b0f46f19dc4111c7187ce400 Reviewed-on: http://gerrit.cloudera.org:8080/21657 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- bin/impala-config.sh | 7 ++++ testdata/bin/create-load-data.sh | 11 +++++- testdata/bin/generate-schema-statements.py | 29 ++++++++++++-- testdata/bin/load-dependent-tables.sql | 1 - testdata/bin/patch_hive.sh | 3 +- testdata/cluster/hive/README | 3 ++ testdata/cluster/hive/patch3-HIVE-20067.diff | 46 ++++++++++++++++++++++ .../functional/functional_schema_template.sql | 10 +++-- 8 files changed, 100 insertions(+), 10 deletions(-) diff --git a/bin/impala-config.sh b/bin/impala-config.sh index d56060ad0..72a75de8c 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -1025,6 +1025,13 @@ export HIVE_CONF_DIR="$IMPALA_FE_DIR/./src/test/resources" export POSTGRES_JDBC_DRIVER="${IMPALA_FE_DIR}/target/dependency/postgresql-${IMPALA_POSTGRES_JDBC_DRIVER_VERSION}.jar" export HIVE_AUX_JARS_PATH="$POSTGRES_JDBC_DRIVER" +# Add the jar of iceberg-hive-runtime to have HiveIcebergStorageHandler. +# Only needed by Apache Hive3 since CDP Hive3 has the jar of hive-iceberg-handler in its +# lib folder. 
+if $USE_APACHE_HIVE; then + export HIVE_AUX_JARS_PATH="$HIVE_AUX_JARS_PATH:\ +$IMPALA_HOME/fe/target/dependency/iceberg-hive-runtime-${IMPALA_ICEBERG_VERSION}.jar" +fi export AUX_CLASSPATH="" ### Tell hive not to use jline export HADOOP_USER_CLASSPATH_FIRST=true diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh index 07b2971a2..44ea40c4c 100755 --- a/testdata/bin/create-load-data.sh +++ b/testdata/bin/create-load-data.sh @@ -382,8 +382,15 @@ function copy-and-load-dependent-tables { # For tables that rely on loading data from local fs test-wareload-house # TODO: Find a good way to integrate this with the normal data loading scripts - beeline -n $USER -u "${JDBC_URL}" -f\ - ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql + SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql + if $USE_APACHE_HIVE; then + # Apache Hive 3.1 doesn't support "STORED AS JSONFILE" (HIVE-19899) + NEW_SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables-hive3.sql + sed "s/STORED AS JSONFILE/ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'"\ + $SQL_FILE > $NEW_SQL_FILE + SQL_FILE=$NEW_SQL_FILE + fi + beeline -n $USER -u "${JDBC_URL}" -f $SQL_FILE } function create-internal-hbase-table { diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py index 134bd3ff7..da96eaca9 100755 --- a/testdata/bin/generate-schema-statements.py +++ b/testdata/bin/generate-schema-statements.py @@ -232,7 +232,7 @@ HINT_SHUFFLE = "/* +shuffle, clustered */" def build_create_statement(table_template, table_name, db_name, db_suffix, file_format, compression, hdfs_location, - force_reload): + force_reload, is_hive_stmt): create_stmt = '' if (force_reload): tbl_type = 'TABLE' @@ -247,12 +247,34 @@ def build_create_statement(table_template, table_name, db_name, db_suffix, # Remove location part from the format string table_template = table_template.replace("LOCATION '{hdfs_location}'", "") - create_stmt += 
table_template.format( + stmt = table_template.format( db_name=db_name, db_suffix=db_suffix, table_name=table_name, file_format=FILE_FORMAT_TO_STORED_AS_MAP[file_format], hdfs_location=hdfs_location) + # Apache Hive 3.1 doesn't support "STORED BY ICEBERG STORED AS AVRO" and + # "STORED AS JSONFILE" (HIVE-25162, HIVE-19899) + if is_hive_stmt and os.environ['USE_APACHE_HIVE'] == "true": + if "STORED AS JSONFILE" in stmt: + stmt = stmt.replace("STORED AS JSONFILE", + "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'") + elif "STORED BY ICEBERG" in stmt: + if "STORED AS" not in stmt: + stmt = stmt.replace( + "STORED BY ICEBERG", + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'") + else: + assert "TBLPROPERTIES" not in stmt,\ + ("Cannot convert STORED BY ICEBERG STORED AS file_format with TBLPROPERTIES " + "also in the statement:\n" + stmt) + iceberg_file_format = re.search(r"STORED AS (\w+)", stmt).group(1) + stmt = re.sub(r"STORED BY ICEBERG\s+STORED AS \w+", + ("STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'" + " TBLPROPERTIES('write.format.default'='{}')").format( + iceberg_file_format), + stmt) + create_stmt += stmt return create_stmt @@ -835,7 +857,8 @@ def generate_statements(output_name, test_vectors, sections, if table_template: output.create.append(build_create_statement(table_template, table_name, db_name, - db_suffix, create_file_format, create_codec, data_path, force_reload)) + db_suffix, create_file_format, create_codec, data_path, force_reload, + create_hive)) # HBASE create table if file_format == 'hbase': # If the HBASE_COLUMN_FAMILIES section does not exist, default to 'd' diff --git a/testdata/bin/load-dependent-tables.sql b/testdata/bin/load-dependent-tables.sql index 57af65ccc..eb18992f5 100644 --- a/testdata/bin/load-dependent-tables.sql +++ b/testdata/bin/load-dependent-tables.sql @@ -94,7 +94,6 @@ LOCATION '/test-warehouse/chars_formats_text'; DROP TABLE IF EXISTS functional_json.chars_formats; 
CREATE EXTERNAL TABLE functional_json.chars_formats (cs CHAR(5), cl CHAR(140), vc VARCHAR(32)) -ROW FORMAT delimited fields terminated by ',' escaped by '\\' STORED AS JSONFILE LOCATION '/test-warehouse/chars_formats_json'; diff --git a/testdata/bin/patch_hive.sh b/testdata/bin/patch_hive.sh index c87cee11c..c974b9eee 100755 --- a/testdata/bin/patch_hive.sh +++ b/testdata/bin/patch_hive.sh @@ -75,7 +75,8 @@ done # 3. Repackage the hive submodules affected by the patch if [[ "${HIVE_REBUILD}" = "true" ]]; then echo "Repackage the hive-exec module" - ${IMPALA_HOME}/bin/mvn-quiet.sh -pl ql clean package -Dmaven.test.skip + ${IMPALA_HOME}/bin/mvn-quiet.sh -pl ql,standalone-metastore clean package \ + -Dmaven.test.skip cp $HIVE_SRC_DIR/ql/target/hive-exec-${APACHE_HIVE_VERSION}.jar $HIVE_HOME/lib/ fi popd diff --git a/testdata/cluster/hive/README b/testdata/cluster/hive/README index c80b84227..2347bbd77 100644 --- a/testdata/cluster/hive/README +++ b/testdata/cluster/hive/README @@ -10,3 +10,6 @@ Bump guava version to 28.1-jre. Fix HIVE-22717. 
patch2-HIVE-20038.diff: Update queries on non-bucketed + partitioned tables throws NPE + +patch3-HIVE-20067.diff: +Fix failures in firing InsertEvent for on insert-only tables diff --git a/testdata/cluster/hive/patch3-HIVE-20067.diff b/testdata/cluster/hive/patch3-HIVE-20067.diff new file mode 100644 index 000000000..e0fa43125 --- /dev/null +++ b/testdata/cluster/hive/patch3-HIVE-20067.diff @@ -0,0 +1,46 @@ +diff --git a/ql/src/test/queries/clientpositive/mm_all.q b/ql/src/test/queries/clientpositive/mm_all.q +index 61dd3e7475..a524c29ef5 100644 +--- a/ql/src/test/queries/clientpositive/mm_all.q ++++ b/ql/src/test/queries/clientpositive/mm_all.q +@@ -3,6 +3,7 @@ + + -- MASK_LINEAGE + ++set hive.metastore.dml.events=true; + set hive.mapred.mode=nonstrict; + set hive.explain.user=false; + set hive.fetch.task.conversion=none; +diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java +index aa014e9317..60ad7db60e 100644 +--- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java ++++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java +@@ -18,7 +18,9 @@ + + package org.apache.hadoop.hive.metastore.events; + +-import com.google.common.collect.Lists; ++import java.util.ArrayList; ++import java.util.List; ++ + import org.apache.hadoop.classification.InterfaceAudience; + import org.apache.hadoop.classification.InterfaceStability; + import org.apache.hadoop.hive.metastore.IHMSHandler; +@@ -33,8 +35,7 @@ + import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; + import org.apache.thrift.TException; + +-import java.util.ArrayList; +-import java.util.List; ++import com.google.common.collect.Lists; + + @InterfaceAudience.Public + @InterfaceStability.Stable +@@ -65,7 +66,7 @@ public InsertEvent(String catName, String db, String table, List<String> 
partVal + // TODO MS-SPLIT Switch this back once HiveMetaStoreClient is moved. + //req.setCapabilities(HiveMetaStoreClient.TEST_VERSION); + req.setCapabilities(new ClientCapabilities( +- Lists.newArrayList(ClientCapability.TEST_CAPABILITY))); ++ Lists.newArrayList(ClientCapability.TEST_CAPABILITY, ClientCapability.INSERT_ONLY_TABLES))); + try { + this.tableObj = handler.get_table_req(req).getTable(); + if (partVals != null) { diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index 559b78c45..a8541728c 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -2620,7 +2620,10 @@ materialized_view -- The create materialized view command is moved down so that the database's -- managed directory has been created. Otherwise the command would fail. This -- is a bug in Hive. -CREATE MATERIALIZED VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name} +-- Always drop the view first since IF NOT EXISTS is ignored in CREATE VIEW +-- in Apache Hive3 (HIVE-20462, HIVE-21675). 
+DROP MATERIALIZED VIEW IF EXISTS {db_name}{db_suffix}.{table_name}; +CREATE MATERIALIZED VIEW {db_name}{db_suffix}.{table_name} AS SELECT * FROM {db_name}{db_suffix}.insert_only_transactional_table; ===== ---- DATASET @@ -3840,6 +3843,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} ( ) STORED BY ICEBERG STORED AS AVRO LOCATION '/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_avro_format'; +---- DEPENDENT_LOAD_HIVE INSERT INTO TABLE {db_name}{db_suffix}.{table_name} values(1, 'A', 0.5, true),(2, 'B', 1.5, true),(3, 'C', 2.5, false); ==== ---- DATASET @@ -3887,7 +3891,7 @@ functional ---- BASE_TABLE_NAME iceberg_view ---- CREATE -CREATE VIEW {db_name}{db_suffix}.{table_name} AS +CREATE VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name} AS SELECT * FROM {db_name}{db_suffix}.iceberg_query_metadata; ==== ---- DATASET @@ -3995,7 +3999,7 @@ functional ---- BASE_TABLE_NAME iceberg_lineitem_sixblocks ---- CREATE -CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} +CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} LIKE PARQUET '/test-warehouse/lineitem_sixblocks_iceberg/lineitem_sixblocks.parquet' STORED AS PARQUET LOCATION '/test-warehouse/lineitem_sixblocks_iceberg/';
