This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new c12ac6c27 IMPALA-11607: Support for TPC-H and TPC-DS test datasets 
stored as Iceberg tables
c12ac6c27 is described below

commit c12ac6c27b2df1eae693b44c157d65499f491d21
Author: LPL <[email protected]>
AuthorDate: Fri Sep 30 17:09:48 2022 +0800

    IMPALA-11607: Support for TPC-H and TPC-DS test datasets stored as Iceberg 
tables
    
    Impala 4.1.0 has officially supported Iceberg tables, but we don't have
    tools to generate TPC-H and TPC-DS test datasets for Iceberg tables. We
    should use load-data.py to generate test datasets, e.g:
    "${IMPALA_HOME}/bin/load-data.py -w tpcds -f -s 10
    --table_formats=text/none,iceberg/none".
    
    Testing:
     - load-data.py -w tpch -s 1 --table_formats=text/none,parquet/none,
     iceberg/none -e exhaustive -f
     - load-data.py -w tpcds -s 1 --table_formats=text/none,parquet/zstd,
     iceberg/zstd -e exhaustive -f
     - single_node_perf_run.py --workloads=tpch --scale=30 --iterations=3 --
     table_formats=parquet/lz4,iceberg/lz4 --num_impalads=1 --load 3ee3dc847
    
      Workload: TPCH, Scale Factor: 30
    
      Table Format: parquet/lz4/block
      +----------+----------------+-----------+
      | Query    | Time Taken (s) | Client ID |
      +----------+----------------+-----------+
      | TPCH-Q1  | 7.73           | 1         |
      | TPCH-Q1  | 7.50           | 1         |
      | TPCH-Q1  | 7.55           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q10 | 3.65           | 1         |
      | TPCH-Q10 | 3.70           | 1         |
      | TPCH-Q10 | 3.69           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q11 | 0.68           | 1         |
      | TPCH-Q11 | 0.68           | 1         |
      | TPCH-Q11 | 0.67           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q12 | 1.33           | 1         |
      | TPCH-Q12 | 1.08           | 1         |
      | TPCH-Q12 | 1.08           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q13 | 10.63          | 1         |
      | TPCH-Q13 | 10.36          | 1         |
      | TPCH-Q13 | 10.52          | 1         |
      +----------+----------------+-----------+
      | TPCH-Q14 | 1.63           | 1         |
      | TPCH-Q14 | 1.79           | 1         |
      | TPCH-Q14 | 1.63           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q15 | 1.44           | 1         |
      | TPCH-Q15 | 1.54           | 1         |
      | TPCH-Q15 | 1.45           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q16 | 2.35           | 1         |
      | TPCH-Q16 | 2.35           | 1         |
      | TPCH-Q16 | 2.40           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q17 | 1.20           | 1         |
      | TPCH-Q17 | 1.13           | 1         |
      | TPCH-Q17 | 1.10           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q18 | 17.50          | 1         |
      | TPCH-Q18 | 17.04          | 1         |
      | TPCH-Q18 | 14.43          | 1         |
      +----------+----------------+-----------+
      | TPCH-Q19 | 1.61           | 1         |
      | TPCH-Q19 | 1.56           | 1         |
      | TPCH-Q19 | 1.56           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q2  | 0.68           | 1         |
      | TPCH-Q2  | 0.68           | 1         |
      | TPCH-Q2  | 0.68           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q20 | 1.09           | 1         |
      | TPCH-Q20 | 1.09           | 1         |
      | TPCH-Q20 | 1.15           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q21 | 24.94          | 1         |
      | TPCH-Q21 | 26.42          | 1         |
      | TPCH-Q21 | 25.62          | 1         |
      +----------+----------------+-----------+
      | TPCH-Q22 | 3.69           | 1         |
      | TPCH-Q22 | 3.58           | 1         |
      | TPCH-Q22 | 3.24           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q3  | 3.58           | 1         |
      | TPCH-Q3  | 3.68           | 1         |
      | TPCH-Q3  | 3.89           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q4  | 1.37           | 1         |
      | TPCH-Q4  | 1.37           | 1         |
      | TPCH-Q4  | 1.32           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q5  | 1.68           | 1         |
      | TPCH-Q5  | 1.69           | 1         |
      | TPCH-Q5  | 1.65           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q6  | 0.46           | 1         |
      | TPCH-Q6  | 0.47           | 1         |
      | TPCH-Q6  | 0.47           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q7  | 2.54           | 1         |
      | TPCH-Q7  | 2.49           | 1         |
      | TPCH-Q7  | 2.54           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q8  | 1.90           | 1         |
      | TPCH-Q8  | 1.90           | 1         |
      | TPCH-Q8  | 1.90           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q9  | 12.72          | 1         |
      | TPCH-Q9  | 12.02          | 1         |
      | TPCH-Q9  | 12.67          | 1         |
      +----------+----------------+-----------+
    
      Table Format: iceberg/lz4/block
      +----------+----------------+-----------+
      | Query    | Time Taken (s) | Client ID |
      +----------+----------------+-----------+
      | TPCH-Q1  | 7.51           | 1         |
      | TPCH-Q1  | 7.61           | 1         |
      | TPCH-Q1  | 7.60           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q10 | 3.79           | 1         |
      | TPCH-Q10 | 3.73           | 1         |
      | TPCH-Q10 | 3.67           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q11 | 0.75           | 1         |
      | TPCH-Q11 | 0.88           | 1         |
      | TPCH-Q11 | 0.69           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q12 | 1.18           | 1         |
      | TPCH-Q12 | 1.13           | 1         |
      | TPCH-Q12 | 1.04           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q13 | 10.11          | 1         |
      | TPCH-Q13 | 9.76           | 1         |
      | TPCH-Q13 | 10.31          | 1         |
      +----------+----------------+-----------+
      | TPCH-Q14 | 1.64           | 1         |
      | TPCH-Q14 | 1.64           | 1         |
      | TPCH-Q14 | 1.53           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q15 | 1.44           | 1         |
      | TPCH-Q15 | 1.39           | 1         |
      | TPCH-Q15 | 1.35           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q16 | 2.24           | 1         |
      | TPCH-Q16 | 2.15           | 1         |
      | TPCH-Q16 | 2.15           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q17 | 1.00           | 1         |
      | TPCH-Q17 | 1.00           | 1         |
      | TPCH-Q17 | 0.99           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q18 | 15.14          | 1         |
      | TPCH-Q18 | 14.94          | 1         |
      | TPCH-Q18 | 15.04          | 1         |
      +----------+----------------+-----------+
      | TPCH-Q19 | 1.63           | 1         |
      | TPCH-Q19 | 1.66           | 1         |
      | TPCH-Q19 | 1.60           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q2  | 0.69           | 1         |
      | TPCH-Q2  | 0.75           | 1         |
      | TPCH-Q2  | 0.69           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q20 | 1.10           | 1         |
      | TPCH-Q20 | 1.09           | 1         |
      | TPCH-Q20 | 1.21           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q21 | 25.38          | 1         |
      | TPCH-Q21 | 25.48          | 1         |
      | TPCH-Q21 | 25.18          | 1         |
      +----------+----------------+-----------+
      | TPCH-Q22 | 3.54           | 1         |
      | TPCH-Q22 | 3.59           | 1         |
      | TPCH-Q22 | 3.49           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q3  | 3.95           | 1         |
      | TPCH-Q3  | 3.96           | 1         |
      | TPCH-Q3  | 3.79           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q4  | 1.43           | 1         |
      | TPCH-Q4  | 1.43           | 1         |
      | TPCH-Q4  | 1.43           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q5  | 1.84           | 1         |
      | TPCH-Q5  | 1.80           | 1         |
      | TPCH-Q5  | 1.71           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q6  | 0.56           | 1         |
      | TPCH-Q6  | 0.47           | 1         |
      | TPCH-Q6  | 0.53           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q7  | 2.75           | 1         |
      | TPCH-Q7  | 2.59           | 1         |
      | TPCH-Q7  | 2.49           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q8  | 1.90           | 1         |
      | TPCH-Q8  | 2.00           | 1         |
      | TPCH-Q8  | 1.91           | 1         |
      +----------+----------------+-----------+
      | TPCH-Q9  | 13.34          | 1         |
      | TPCH-Q9  | 12.18          | 1         |
      | TPCH-Q9  | 12.71          | 1         |
      +----------+----------------+-----------+
    
    Change-Id: Ic8bbc5209e7e649f67a48144a2844b35d9f9c7f1
    Reviewed-on: http://gerrit.cloudera.org:8080/19114
    Reviewed-by: Zoltan Borok-Nagy <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 testdata/bin/generate-schema-statements.py | 39 ++++++++++++++++++++++++------
 tests/common/test_dimensions.py            |  3 ++-
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/testdata/bin/generate-schema-statements.py 
b/testdata/bin/generate-schema-statements.py
index e37e485b8..8f28e310d 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -203,6 +203,7 @@ FILE_FORMAT_MAP = {
   'avro': 'AVRO',
   'hbase': "'org.apache.hadoop.hive.hbase.HBaseStorageHandler'",
   'kudu': "KUDU",
+  'iceberg': "ICEBERG",
   }
 
 HIVE_TO_AVRO_TYPE_MAP = {
@@ -309,7 +310,8 @@ def build_table_template(file_format, columns, 
partition_columns, row_format, tb
 
   tblproperties_clause = "TBLPROPERTIES (\n{0}\n)"
 
-  external = "" if is_transactional(tblproperties) else "EXTERNAL"
+  external = "" if is_transactional(tblproperties) or 
is_iceberg_table(file_format) \
+              else "EXTERNAL"
 
   if file_format == 'avro':
     # TODO Is this flag ever used?
@@ -319,7 +321,8 @@ def build_table_template(file_format, columns, 
partition_columns, row_format, tb
     else:
       tblproperties["avro.schema.url"] = "hdfs://%s/%s/%s/{table_name}.json" \
         % (options.hdfs_namenode, options.hive_warehouse_dir, avro_schema_dir)
-  elif file_format in ['parquet', 'orc']:  # columnar formats don't need row 
format
+  # columnar formats don't need row format
+  elif file_format in ['parquet', 'orc', 'iceberg']:
     row_format_stmt = str()
   elif file_format == 'kudu':
     # Use partitioned_by to set a trivial hash distribution
@@ -341,6 +344,22 @@ def build_table_template(file_format, columns, 
partition_columns, row_format, tb
   else:
     tblproperties_clause = 
tblproperties_clause.format(",\n".join(all_tblproperties))
 
+  columns_str = ""
+  if is_iceberg_table(file_format):
+    for col in columns.split("\n"):
+      # The primary keys and foreign keys of the Iceberg tables should be 
omitted
+      if col.lower().startswith(("primary key", "foreign key")):
+        continue
+      # Omit PRIMARY KEY declaration e.g 'col_i INT PRIMARY KEY,'
+      col = re.sub(r"(?i)\s*primary\s*key\s*", " ", col)
+      # Iceberg tables do not support TINYINT and SMALLINT
+      col = re.sub(r"(?i)\s*tinyint\s*", " INT", col)
+      col = re.sub(r"(?i)\s*smallint\s*", " INT", col)
+      columns_str = columns_str + col + ",\n"
+    columns_str = columns_str.strip().strip(",")
+  else:
+    columns_str = ",\n".join(columns.split("\n"))
+
   # Note: columns are ignored but allowed if a custom serde is specified
   # (e.g. Avro)
   stmt = """
@@ -357,13 +376,12 @@ LOCATION '{{hdfs_location}}'
     external=external,
     table_comment=table_comment_stmt,
     row_format=row_format_stmt,
-    columns=',\n'.join(columns.split('\n')),
+    columns=columns_str,
     primary_keys=primary_keys_clause,
     partitioned_by=partitioned_by,
     tblproperties=tblproperties_clause,
     file_format_string=file_format_string
     ).strip()
-
   # Remove empty lines from the stmt string.  There is an empty line for
   # each of the sections that didn't have anything (e.g. partitioned_by)
   stmt = os.linesep.join([s for s in stmt.splitlines() if s])
@@ -454,8 +472,9 @@ def build_impala_parquet_codec_statement(codec):
 
 def build_insert_into_statement(insert, db_name, db_suffix, table_name, 
file_format,
                                 hdfs_path, for_impala=False):
-  insert_hint = "/* +shuffle, clustered */" \
-    if for_impala and file_format == 'parquet' else ""
+  insert_hint = ""
+  if for_impala and (file_format == 'parquet' or 
is_iceberg_table(file_format)):
+    insert_hint = "/* +shuffle, clustered */"
   insert_statement = insert.format(db_name=db_name,
                                    db_suffix=db_suffix,
                                    table_name=table_name,
@@ -509,7 +528,7 @@ def build_insert(insert, db_name, db_suffix, file_format,
     output += build_codec_enabled_statement(codec) + "\n"
     output += build_compression_codec_statement(codec, compression_type,
                                                 file_format) + "\n"
-  elif file_format == 'parquet':
+  elif file_format == 'parquet' or is_iceberg_table(file_format):
     # This is for Impala parquet, add the appropriate codec statement
     output += build_impala_parquet_codec_statement(codec) + "\n"
   output += build_insert_into_statement(insert, db_name, db_suffix,
@@ -807,7 +826,7 @@ def generate_statements(output_name, test_vectors, sections,
                                                               db_suffix, 
table_name))
           else:
             print 'Empty base table load for %s. Skipping load generation' % 
table_name
-        elif file_format in ['kudu', 'parquet']:
+        elif file_format in ['kudu', 'parquet', 'iceberg']:
           if insert_hive:
             hive_output.load.append(build_insert(insert_hive, db_name, 
db_suffix,
                 file_format, codec, compression_type, table_name, data_path))
@@ -845,6 +864,10 @@ def is_transactional(table_properties):
   return table_properties.get('transactional', "").lower() == 'true'
 
 
+def is_iceberg_table(file_format):
+  return file_format == 'iceberg'
+
+
 def parse_schema_template_file(file_name):
   VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 
'PARTITION_COLUMNS',
                          'ROW_FORMAT', 'CREATE', 'CREATE_HIVE', 'CREATE_KUDU', 
'COMMENT',
diff --git a/tests/common/test_dimensions.py b/tests/common/test_dimensions.py
index 5a5788aac..e551d572b 100644
--- a/tests/common/test_dimensions.py
+++ b/tests/common/test_dimensions.py
@@ -31,7 +31,8 @@ WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
 # of what specific table format to target along with the exec options 
(num_nodes, etc)
 # to use when running the query.
 class TableFormatInfo(object):
-  KNOWN_FILE_FORMATS = ['text', 'seq', 'rc', 'parquet', 'orc', 'avro', 
'hbase', 'kudu']
+  KNOWN_FILE_FORMATS = ['text', 'seq', 'rc', 'parquet', 'orc', 'avro', 'hbase',
+                        'kudu', 'iceberg']
   KNOWN_COMPRESSION_CODECS = ['none', 'snap', 'gzip', 'bzip', 'def', 'zstd', 
'lz4']
   KNOWN_COMPRESSION_TYPES = ['none', 'block', 'record']
 

Reply via email to