This is an automated email from the ASF dual-hosted git repository.

dbecker pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 52b11ab6aa0dbd2d99e6d583d740858b9f892b5f
Author: Sai Hemanth Gantasala <[email protected]>
AuthorDate: Wed Feb 7 17:44:42 2024 -0800

    IMPALA-12487: Skip reloading file metadata for ALTER_TABLE events with
    trivial changes in StorageDescriptor
    
    IMPALA-11534 skips reloading file metadata for some trivial ALTER_TABLE
    events. However, ALTER_TABLE events that have trivial changes in
    StorageDescriptor are not handled in IMPALA-11534. The only changes
    that require file metadata reload are: location, rowformat, fileformat,
    serde, and storedAsSubDirectories. The file metadata reload can be
    skipped for all other changes in SD.
    
    Testing:
    1) Manual testing by changing SD parameters in local environment.
    2) Added unit tests for the same in MetastoreEventsProcessorTest class.
    
    Change-Id: I6fd9a9504bf93d2529dc7accbf436ad83e51d8ac
    Reviewed-on: http://gerrit.cloudera.org:8080/21019
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../org/apache/impala/compat/MetastoreShim.java    |   2 +-
 .../impala/catalog/CatalogServiceCatalog.java      |  13 ++-
 .../main/java/org/apache/impala/catalog/Table.java |   6 +-
 .../impala/catalog/events/MetastoreEvents.java     |  76 ++++++++++++---
 .../events/MetastoreEventsProcessorTest.java       | 108 +++++++++++++++++++++
 5 files changed, 186 insertions(+), 19 deletions(-)

diff --git 
a/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java 
b/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java
index 000dccdf8..9ecad6284 100644
--- a/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java
+++ b/fe/src/compat-hive-3/java/org/apache/impala/compat/MetastoreShim.java
@@ -945,7 +945,7 @@ public class MetastoreShim extends Hive3MetastoreShimBase {
           }
         } else {
           catalog_.reloadTableIfExists(entry.getKey().getDb(), 
entry.getKey().getTbl(),
-               "CommitTxnEvent", getEventId(), 
/*isSkipFileMetadataReload*/false);
+              "CommitTxnEvent", getEventId(), 
/*isSkipFileMetadataReload*/false);
         }
       }
     }
diff --git 
a/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java 
b/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java
index 817f97f50..b0f9d5181 100644
--- a/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java
+++ b/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java
@@ -2761,6 +2761,7 @@ public class CatalogServiceCatalog extends Catalog {
         }
         catalogTimeline.markEvent("Loaded table");
       }
+      boolean isFullReloadOnTable = !isSkipFileMetadataReload;
       if (currentHmsEventId != -1 && syncToLatestEventId) {
         // fetch latest event id from HMS before starting table load and set 
that event
         // id as table's last synced id. It may happen that while the table 
was being
@@ -2770,16 +2771,20 @@ public class CatalogServiceCatalog extends Catalog {
         // We are not replaying new events for the table in the end because 
certain
         // events like ALTER_TABLE in MetastoreEventProcessor call this method 
which
         // might trigger an endless loop cycle
-        tbl.setLastSyncedEventId(currentHmsEventId);
+        if (isFullReloadOnTable) {
+          tbl.setLastSyncedEventId(currentHmsEventId);
+        } else {
+          tbl.setLastSyncedEventId(eventId);
+        }
       }
       tbl.setCatalogVersion(newCatalogVersion);
       LOG.info(String.format("Refreshed table metadata: %s", 
tbl.getFullName()));
       // Set the last refresh event id as current HMS event id since all the 
metadata
       // until the current HMS event id is refreshed at this point.
-      if (currentHmsEventId > eventId) {
-        tbl.setLastRefreshEventId(currentHmsEventId);
+      if (currentHmsEventId > eventId && isFullReloadOnTable) {
+        tbl.setLastRefreshEventId(currentHmsEventId, isFullReloadOnTable);
       } else {
-        tbl.setLastRefreshEventId(eventId);
+        tbl.setLastRefreshEventId(eventId, isFullReloadOnTable);
       }
       return tbl.toTCatalogObject(resultType);
     } finally {
diff --git a/fe/src/main/java/org/apache/impala/catalog/Table.java 
b/fe/src/main/java/org/apache/impala/catalog/Table.java
index 09c5086e0..8fd187d95 100644
--- a/fe/src/main/java/org/apache/impala/catalog/Table.java
+++ b/fe/src/main/java/org/apache/impala/catalog/Table.java
@@ -1103,6 +1103,10 @@ public abstract class Table extends CatalogObjectImpl 
implements FeTable {
   public long getLastRefreshEventId() { return lastRefreshEventId_; }
 
   public void setLastRefreshEventId(long eventId) {
+    setLastRefreshEventId(eventId, true);
+  }
+
+  public void setLastRefreshEventId(long eventId, boolean 
isSetLastSyncEventId) {
     if (eventId > lastRefreshEventId_) {
       lastRefreshEventId_ = eventId;
     }
@@ -1111,7 +1115,7 @@ public abstract class Table extends CatalogObjectImpl 
implements FeTable {
     // TODO: Should we reset lastSyncedEvent Id if it is less than event Id?
     // If we don't reset it - we may start syncing table from an event id which
     // is less than refresh event id
-    if (lastSyncedEventId_ < eventId) {
+    if (lastSyncedEventId_ < eventId && isSetLastSyncEventId) {
       setLastSyncedEventId(eventId);
     }
   }
diff --git 
a/fe/src/main/java/org/apache/impala/catalog/events/MetastoreEvents.java 
b/fe/src/main/java/org/apache/impala/catalog/events/MetastoreEvents.java
index d5324004b..2bb6d8682 100644
--- a/fe/src/main/java/org/apache/impala/catalog/events/MetastoreEvents.java
+++ b/fe/src/main/java/org/apache/impala/catalog/events/MetastoreEvents.java
@@ -40,6 +40,7 @@ import org.apache.hadoop.hive.metastore.api.Database;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.metastore.api.NotificationEvent;
 import org.apache.hadoop.hive.metastore.api.Partition;
+import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
 import org.apache.hadoop.hive.metastore.api.Table;
 import org.apache.hadoop.hive.metastore.api.TxnToWriteId;
 import org.apache.hadoop.hive.metastore.messaging.AbortTxnMessage;
@@ -1892,12 +1893,19 @@ public class MetastoreEvents {
 
       // There are lot of other alter statements which doesn't require file 
metadata
       // reload but these are the most common types for alter statements.
+      boolean skipFileMetadata = false;
       if (isFieldSchemaChanged(beforeTable, afterTable) ||
-          isTableOwnerChanged(beforeTable.getOwner(), afterTable.getOwner()) ||
-          !isCustomTblPropsChanged(whitelistedTblProperties, beforeTable, 
afterTable)) {
-        return true;
+          isTableOwnerChanged(beforeTable.getOwner(), afterTable.getOwner())) {
+        skipFileMetadata = true;
+      } else if (!Objects.equals(beforeTable.getSd(), afterTable.getSd())) {
+        if (isTrivialSdPropsChanged(beforeTable.getSd(), afterTable.getSd())) {
+          skipFileMetadata = true;
+        }
+      } else if (!isCustomTblPropsChanged(whitelistedTblProperties, 
beforeTable,
+          afterTable)) {
+        skipFileMetadata = true;
       }
-      return false;
+      return skipFileMetadata;
     }
 
     private boolean isFieldSchemaChanged(
@@ -1907,8 +1915,9 @@ public class MetastoreEvents {
       List<FieldSchema> afterCols = afterTable.getSd().getCols();
       // check if columns are added or removed
       if (beforeCols.size() != afterCols.size()) {
-        infoLog("Change in number of columns detected for table {}.{} from {} 
to {}",
-            dbName_, tblName_, beforeCols.size(), afterCols.size());
+        infoLog("Change in number of columns detected for table {}.{} from {} 
to {}. " +
+            "So file metadata reload can be skipped.", dbName_, tblName_,
+            beforeCols.size(), afterCols.size());
         return true;
       }
       // check if columns are replaced or column definition is changed
@@ -1916,10 +1925,10 @@ public class MetastoreEvents {
       for (int i = 0; i < beforeCols.size(); i++) {
         if (!beforeCols.get(i).getName().equals(afterCols.get(i).getName()) ||
             !beforeCols.get(i).getType().equals(afterCols.get(i).getType())) {
-          infoLog("Change in table schema detected for table {}.{} from {} to 
{} ",
-              tblName_, dbName_, beforeCols.get(i).getName() + " (" +
-                  beforeCols.get(i).getType() +")", afterCols.get(i).getName() 
+ " (" +
-                  afterCols.get(i).getType() + ")");
+          infoLog("Change in table schema detected for table {}.{} from {} 
({}) " +
+              "to {} ({}). So file metadata reload can be skipped.", dbName_, 
tblName_,
+              beforeCols.get(i).getName(), beforeCols.get(i).getType(),
+              afterCols.get(i).getName(), afterCols.get(i).getType());
           return true;
         }
       }
@@ -1929,7 +1938,8 @@ public class MetastoreEvents {
     private boolean isTableOwnerChanged(String ownerBefore, String ownerAfter) 
{
       if (!Objects.equals(ownerBefore, ownerAfter)) {
         infoLog("Change in Ownership detected for table {}.{}, oldOwner: {}, " 
+
-            "newOwner: {}", dbName_, tblName_, ownerBefore, ownerAfter);
+            "newOwner: {}. So file metadata reload can be skipped.",
+            dbName_, tblName_, ownerBefore, ownerAfter);
         return true;
       }
       return false;
@@ -1944,14 +1954,54 @@ public class MetastoreEvents {
         String configAfter = afterTable.getParameters().get(whitelistConfig);
         if (!Objects.equals(configBefore, configAfter)) {
           infoLog("Change in whitelisted table properties detected for table 
{}.{} " +
-              "whitelisted config: {}, value before: {}, value after: {}", 
dbName_,
-              tblName_, whitelistConfig, configBefore, configAfter);
+              "whitelisted config: {}, value before: {}, value after: {}. So 
file " +
+              "metadata should be reloaded.", dbName_, tblName_, 
whitelistConfig,
+              configBefore, configAfter);
           return true;
         }
       }
       return false;
     }
 
+    // Check if the trivial SD properties are changed during the alter 
statement.
+    // Also, the caller should make sure that 'beforeSd' and 'afterSd' are not 
equal.
+    private boolean isTrivialSdPropsChanged(StorageDescriptor beforeSd,
+        StorageDescriptor afterSd) {
+      Preconditions.checkNotNull(beforeSd, "beforeSd is null");
+      Preconditions.checkNotNull(afterSd, "afterSd is null");
+      StorageDescriptor previousSD = 
normalizeStorageDescriptor(beforeSd.deepCopy());
+      StorageDescriptor currentSD = 
normalizeStorageDescriptor(afterSd.deepCopy());
+      if (!Objects.equals(previousSD, currentSD)) {
+        infoLog("Non-trivial change in table storage descriptor (SD) detected 
for " +
+            "table {}.{}. So file metadata should be reloaded. SD before: {}, 
SD " +
+            "after: {}", dbName_, tblName_, beforeSd.toString(), 
afterSd.toString());
+        return false;
+      }
+      infoLog("Trivial changes in table storage descriptor (SD) detected for 
table " +
+          "{}.{}. So file metadata reload can be skipped.", dbName_, tblName_);
+      return true;
+    }
+
+    /**
+     * Normalize the storage descriptor by unsetting the trivial fields in SD 
like
+     * columns, compressed, numBuckets, bucketCols,SkewedInfo, and
+     * setStoredAsSubDirectories.
+     */
+    private StorageDescriptor normalizeStorageDescriptor(StorageDescriptor sd) 
{
+      sd.unsetCols();
+      sd.unsetCompressed();
+      sd.unsetNumBuckets();
+      sd.unsetBucketCols();
+      sd.unsetSortCols();
+      sd.unsetSkewedInfo();
+      // setStoredAsSubDirectories = null or 'false' are trivial changes, so 
we normalize
+      // this value to false if it is null.
+      if (!sd.isSetStoredAsSubDirectories()) {
+        sd.setStoredAsSubDirectories(false);
+      }
+      return sd;
+    }
+
     /**
      * Detects a event sync flag was turned on in this event
      */
diff --git 
a/fe/src/test/java/org/apache/impala/catalog/events/MetastoreEventsProcessorTest.java
 
b/fe/src/test/java/org/apache/impala/catalog/events/MetastoreEventsProcessorTest.java
index 6820292e7..81461e633 100644
--- 
a/fe/src/test/java/org/apache/impala/catalog/events/MetastoreEventsProcessorTest.java
+++ 
b/fe/src/test/java/org/apache/impala/catalog/events/MetastoreEventsProcessorTest.java
@@ -176,6 +176,8 @@ import org.junit.Assume;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import com.google.common.collect.Lists;
 
@@ -197,6 +199,8 @@ public class MetastoreEventsProcessorTest {
   private static CatalogServiceTestCatalog catalog_;
   private static CatalogOpExecutor catalogOpExecutor_;
   private static MetastoreEventsProcessor eventsProcessor_;
+  private static final Logger LOG =
+      LoggerFactory.getLogger(MetastoreEventsProcessorTest.class);
 
   @BeforeClass
   public static void setUpTestEnvironment() throws TException, ImpalaException 
{
@@ -3084,6 +3088,110 @@ public class MetastoreEventsProcessorTest {
     assertNotEquals(fileMetadataLoadAfter, fileMetadataLoadBefore);
   }
 
+  /**
+   * Some of the the alter table events on Storage Descriptor doesn't require 
to reload
+   * file metadata, this test asserts that file metadata is not reloaded for 
such alter
+   * table events
+   */
+  @Test
+  public void testAlterTableSdVerifyMetadataReload() throws Exception {
+    createDatabase(TEST_DB_NAME, null);
+    final String testTblName = "testSdFileMetadataReload";
+    createTable(testTblName, true);
+    List<List<String>> partVals = new ArrayList<>();
+    partVals.add(Arrays.asList("1"));
+    addPartitions(TEST_DB_NAME, testTblName, partVals);
+    eventsProcessor_.processEvents();
+    // load the table first
+    loadTable(testTblName);
+    HdfsTable tbl = (HdfsTable) catalog_.getTable(TEST_DB_NAME, testTblName);
+    // get file metadata load counter before altering the partition
+    long fileMetadataLoadBefore =
+        
tbl.getMetrics().getCounter(HdfsTable.NUM_LOAD_FILEMETADATA_METRIC).getCount();
+
+    // Test 1: Set file format
+    LOG.info("Test changes in file format for an Alter table statement");
+    org.apache.hadoop.hive.metastore.api.Table hmsTbl = 
catalog_.getTable(TEST_DB_NAME,
+        testTblName).getMetaStoreTable();
+    hmsTbl.getSd().setInputFormat("org.apache.hadoop.mapred.TextInputFormat");
+    hmsTbl.getSd().setOutputFormat(
+        "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat");
+    long fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, 
hmsTbl);
+    assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter);
+    fileMetadataLoadBefore = fileMetadataLoadAfter;
+
+    // Test 2: set rowformat
+    LOG.info("Test changes in row format for Alter table statement");
+    hmsTbl = catalog_.getTable(TEST_DB_NAME, testTblName).getMetaStoreTable();
+    hmsTbl.getSd().getSerdeInfo().setSerializerClass(
+        "org.apache.hadoop.hive.contrib.serde2.RegexSerDe");
+    fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, 
hmsTbl);
+    assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter);
+    fileMetadataLoadBefore = fileMetadataLoadAfter;
+
+    // Test 3: set location
+    LOG.info("Test changes in table location for Alter table statement");
+    hmsTbl = catalog_.getTable(TEST_DB_NAME, testTblName).getMetaStoreTable();
+    assertNotNull("Location is expected to be set to proceed forward in the 
test",
+        hmsTbl.getSd().getLocation());
+    String tblLocation = hmsTbl.getSd().getLocation() + "_changed";
+    hmsTbl.getSd().setLocation(tblLocation);
+    fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, 
hmsTbl);
+    assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter);
+    fileMetadataLoadBefore = fileMetadataLoadAfter;
+
+    // Test 4: set storedAsSubDirectories from false to unset
+    LOG.info("Test changes in storedAsSubDirectories from false to unset");
+    hmsTbl = tbl.getMetaStoreTable().deepCopy();
+    hmsTbl.getSd().unsetStoredAsSubDirectories();
+    fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, 
hmsTbl);
+    assertEquals(fileMetadataLoadBefore, fileMetadataLoadAfter);
+
+    // Test 5: set storedAsSubDirectories to true
+    LOG.info("Test changes in storedAsSubDirectories from unset to true");
+    hmsTbl.getSd().setStoredAsSubDirectories(true);
+    fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, 
hmsTbl);
+    assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter);
+    fileMetadataLoadBefore = fileMetadataLoadAfter;
+
+    // Test 6: set storedAsSubDirectories from true to false
+    LOG.info("Test changes in storedAsSubDirectories from true to false");
+    hmsTbl.getSd().setStoredAsSubDirectories(false);
+    fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, 
hmsTbl);
+    assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter);
+
+    // Test 7: set storedAsSubDirectories from unset to false
+    LOG.info("Test changes in storedAsSubDirectories from unset to false");
+    // first set the value to unset from false
+    hmsTbl.getSd().unsetStoredAsSubDirectories();
+    fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, 
hmsTbl);
+    fileMetadataLoadBefore = fileMetadataLoadAfter;
+    hmsTbl.getSd().setStoredAsSubDirectories(false);
+    fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, 
hmsTbl);
+    assertEquals(fileMetadataLoadBefore, fileMetadataLoadAfter);
+
+    // Test 8: keep SD unchanged by setting the same value and also change 
non-trivial
+    // TblProperties to verify file metadata is reloaded
+    LOG.info("Test changes in non-trivial tbl props and same SD");
+    hmsTbl = tbl.getMetaStoreTable().deepCopy();
+    hmsTbl.getSd().setStoredAsSubDirectories(false);
+    hmsTbl.getParameters().put("EXTERNAL", "false");
+    fileMetadataLoadAfter = processAlterTableAndReturnMetric(testTblName, 
hmsTbl);
+    assertEquals(fileMetadataLoadBefore + 1, fileMetadataLoadAfter);
+  }
+
+  private long processAlterTableAndReturnMetric(String testTblName,
+      org.apache.hadoop.hive.metastore.api.Table msTbl) throws Exception {
+    try (MetaStoreClient msClient = catalog_.getMetaStoreClient()) {
+      msClient.getHiveClient().alter_table_with_environmentContext(
+          TEST_DB_NAME, testTblName, msTbl, null);
+    }
+    eventsProcessor_.processEvents();
+    HdfsTable tbl = (HdfsTable) catalog_.getTable(TEST_DB_NAME, testTblName);
+    return
+        
tbl.getMetrics().getCounter(HdfsTable.NUM_LOAD_FILEMETADATA_METRIC).getCount();
+  }
+
   /**
    * For an external table, the test asserts file metadata is not reloaded 
during
    * AlterPartitionEvent processing if only partition parameters are changed

Reply via email to