This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 4680cfd341e5245088cfce1d6d8507e7314182f1 Author: Gabor Kaszab <[email protected]> AuthorDate: Mon Sep 23 14:16:00 2024 +0200 IMPALA-11265: Part1: Clear GroupedContentFiles once used GroupedContentFiles stores the file descriptors in Iceberg's format and is used for creating file descriptors in Impala's format. Once this creation is done, we no longer have to keep these Iceberg ContentFiles. Dropping these could significantly reduce the memory footprint of an Iceberg table. Measurements: I have a test table that has 110k files. The measurements showed that cleaning the GroupedContentFiles could reduce the memory size of this particular table from 140MB to 80MB. Change-Id: I1efdd2a46c9675f7461535259e5892ed213a6b21 Reviewed-on: http://gerrit.cloudera.org:8080/21847 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- fe/src/main/java/org/apache/impala/catalog/IcebergTable.java | 3 +++ .../java/org/apache/impala/catalog/iceberg/GroupedContentFiles.java | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java index 37a75ee0c..1f245a665 100644 --- a/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java @@ -423,6 +423,9 @@ public class IcebergTable extends Table implements FeIcebergTable { setIcebergTableStats(); loadAllColumnStats(msClient, catalogTimeline); setAvroSchema(msClient, msTbl, fileStore_, catalogTimeline); + + // We no longer need to keep Iceberg's content files in memory. 
+ hdfsTable_.setIcebergFiles(null); } catch (Exception e) { throw new IcebergTableLoadingException("Error loading metadata for Iceberg table " + icebergTableLocation_, e); diff --git a/fe/src/main/java/org/apache/impala/catalog/iceberg/GroupedContentFiles.java b/fe/src/main/java/org/apache/impala/catalog/iceberg/GroupedContentFiles.java index dbd170734..2a3d6c544 100644 --- a/fe/src/main/java/org/apache/impala/catalog/iceberg/GroupedContentFiles.java +++ b/fe/src/main/java/org/apache/impala/catalog/iceberg/GroupedContentFiles.java @@ -37,6 +37,10 @@ import com.google.common.collect.Iterables; * - data files without deleted rows * - data files with deleted rows * - delete files (position and equality) + * + * For memory footprint considerations it's not recommended to keep the content of this + * store in the memory for a long time. The desired usage is to initialize it, use it to + * create some catalog objects in Impala and then drop it. */ public class GroupedContentFiles { public List<DataFile> dataFilesWithoutDeletes = new ArrayList<>();
