This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit a9db5f1d72d637b499b670bb5442973f8ae3b116 Author: Csaba Ringhofer <[email protected]> AuthorDate: Fri Feb 21 16:53:14 2025 +0100 IMPALA-13778: Update ignored_dir_prefix_list There are some missing prefixes that are not yet ignored during recursive listing: - -tmp. (Hive changed to this new tmp dir from _tmp. in HIVE-27536) - _impala_insert_staging Impala may bump into these often, so it is better to ignore them by default. Change-Id: I023244525dd333af1c5bfbad06708f3ec86aeacf Reviewed-on: http://gerrit.cloudera.org:8080/22518 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/util/backend-gflag-util.cc | 3 ++- .../java/org/apache/impala/common/FileSystemUtil.java | 4 ++++ tests/metadata/test_recursive_listing.py | 18 ++++++++++-------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/be/src/util/backend-gflag-util.cc b/be/src/util/backend-gflag-util.cc index 292a050db..ccc8feb95 100644 --- a/be/src/util/backend-gflag-util.cc +++ b/be/src/util/backend-gflag-util.cc @@ -222,7 +222,8 @@ DEFINE_bool(use_hms_column_order_for_hbase_tables, false, "Use the column order in HMS for HBase tables instead of ordering the columns by " "family/qualifier. 
Keeping the default as false for backward compatibility."); -DEFINE_string(ignored_dir_prefix_list, ".,_tmp.,_spark_metadata", +DEFINE_string(ignored_dir_prefix_list, + ".,_tmp.,-tmp.,_spark_metadata,_impala_insert_staging", "Comma separated list to specify the prefix for tmp/staging dirs that catalogd should" " skip in loading file metadata."); diff --git a/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java b/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java index 1d8d8c437..e17880db1 100644 --- a/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java +++ b/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java @@ -1044,7 +1044,9 @@ public class FileSystemUtil { public static final String DOT = "."; public static final String HIVE_TEMP_FILE_PREFIX = "_tmp."; + public static final String HIVE_NEW_TEMP_FILE_PREFIX = "-tmp."; public static final String SPARK_TEMP_FILE_PREFIX = "_spark_metadata"; + public static final String IMPALA_STAGING_DIR_PREFIX = "_impala_insert_staging"; /** * Prefix string used by tools like hive/spark/flink to write certain temporary or @@ -1059,7 +1061,9 @@ public class FileSystemUtil { || BackendConfig.INSTANCE.getIgnoredDirPrefixList() == null) { TMP_DIR_PREFIX_LIST.add(DOT); TMP_DIR_PREFIX_LIST.add(HIVE_TEMP_FILE_PREFIX); + TMP_DIR_PREFIX_LIST.add(HIVE_NEW_TEMP_FILE_PREFIX); TMP_DIR_PREFIX_LIST.add(SPARK_TEMP_FILE_PREFIX); + TMP_DIR_PREFIX_LIST.add(IMPALA_STAGING_DIR_PREFIX); LOG.warn("BackendConfig.INSTANCE uninitialized. Use hard-coded prefix-list."); } else { String s = BackendConfig.INSTANCE.getIgnoredDirPrefixList(); diff --git a/tests/metadata/test_recursive_listing.py b/tests/metadata/test_recursive_listing.py index 9ec8bc135..957575fb2 100644 --- a/tests/metadata/test_recursive_listing.py +++ b/tests/metadata/test_recursive_listing.py @@ -108,14 +108,16 @@ class TestRecursiveListing(ImpalaTestSuite): # Create files in the nested hidden directories and refresh. 
Make sure it does not # show up - self.filesystem_client.make_dir("{0}/.hive-staging".format(part_path)) - self.filesystem_client.create_file( - "{0}/.hive-staging/file3.txt".format(part_path), - "data-should-be-ignored-by-impala") - self.filesystem_client.make_dir("{0}/_tmp.base_000000_1".format(part_path)) - self.filesystem_client.create_file( - "{0}/_tmp.base_000000_1/000000_0.manifest".format(part_path), - "manifest-file_contents") + dir_file_list = [ + (".hive-staging", "file3.txt"), + ("_tmp.base_000000_1", "000000_0.manifest"), + ("-tmp.base_000000_1", "000000_0.manifest"), + ("_impala_insert_staging", "bc4e15747fc7d788-f632b3b300000000_944410164_data.0.txt") + ] + for (dir, file) in dir_file_list: + self.filesystem_client.make_dir("{0}/{1}".format(part_path, dir)) + self.filesystem_client.create_file( + "{0}/{1}/{2}".format(part_path, dir, file), "shouldntreadthis") self.execute_query_expect_success(self.client, "refresh {0}".format(fq_tbl_name)) assert len(self._show_files(fq_tbl_name)) == 3 assert len(self._get_rows(fq_tbl_name)) == 3
