[impala] 02/02: IMPALA-10971: Fix data cache metrics for instant evictions

joemcdonnell Fri, 02 Jun 2023 15:39:06 -0700

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 3c49ebad6b1c55305064ef905ef78b242b024c1f
Author: Joe McDonnell <[email protected]>
AuthorDate: Tue Dec 20 14:04:09 2022 -0800

    IMPALA-10971: Fix data cache metrics for instant evictions
    
    When inserting an entry into the data cache, the
    Insert() can fail and instantly evict the new entry.
    This is currently only true for the LIRS cache
    eviction policy. When this happens, the cache metrics
    are not maintained properly. The instant eviction
    results in an unmatched decrement to the total bytes
    and num entries counters.
    
    This moves the increments to happen prior to the
    Insert() call, which fixes the instant eviction
    case. The behavior does not change noticeably for
    the successful Insert() case.
    
    Testing:
     - Added custom cluster test to verify this case
    
    Change-Id: I5db3eaca2f4459844e3270846d905f76265b6b3e
    Reviewed-on: http://gerrit.cloudera.org:8080/19930
    Reviewed-by: Joe McDonnell <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/runtime/io/data-cache.cc         | 18 +++++++++++------
 tests/custom_cluster/test_data_cache.py | 35 +++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/be/src/runtime/io/data-cache.cc b/be/src/runtime/io/data-cache.cc
index 2cf45041d..fa532f165 100644
--- a/be/src/runtime/io/data-cache.cc
+++ b/be/src/runtime/io/data-cache.cc
@@ -731,6 +731,17 @@ bool DataCache::Partition::InsertIntoCache(const Slice& 
key, CacheFile* cache_fi
     }
   }
 
+  // IMPALA-10971: These metrics need to be incremented prior to the Insert(), 
because
+  // the Insert() can fail and instantly evict the entry. When that happens,
+  // the total bytes and num entries are decremented. Without this 
corresponding
+  // increment, the counts will be incorrect.
+  // Trace replays do not keep metrics
+  if (LIKELY(!trace_replay_)) {
+    
ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_TOTAL_BYTES->Increment(charge_len);
+    ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_NUM_ENTRIES->Increment(1);
+    ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_NUM_WRITES->Increment(1);
+  }
+
   // Insert the new entry into the cache.
   CacheEntry entry(cache_file, insertion_offset, buffer_len, checksum);
   memcpy(meta_cache_->MutableValue(&pending_handle), &entry, 
sizeof(CacheEntry));
@@ -739,16 +750,11 @@ bool DataCache::Partition::InsertIntoCache(const Slice& 
key, CacheFile* cache_fi
   if (UNLIKELY(handle.get() == nullptr)){
     // Trace replays do not keep metrics
     if (LIKELY(!trace_replay_)) {
+      // EvictedEntry() already ran and decremented the other counters.
       ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_INSTANT_EVICTIONS->Increment(1);
     }
     return false;
   }
-  // Trace replays do not keep metrics
-  if (LIKELY(!trace_replay_)) {
-    
ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_TOTAL_BYTES->Increment(charge_len);
-    ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_NUM_ENTRIES->Increment(1);
-    ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_NUM_WRITES->Increment(1);
-  }
   return true;
 }
 
diff --git a/tests/custom_cluster/test_data_cache.py 
b/tests/custom_cluster/test_data_cache.py
index 988c08dfc..3d874692d 100644
--- a/tests/custom_cluster/test_data_cache.py
+++ b/tests/custom_cluster/test_data_cache.py
@@ -177,3 +177,38 @@ class TestDataCache(CustomClusterTestSuite):
       start_args=CACHE_START_ARGS, cluster_size=1)
   def test_data_cache_disablement_lirs(self, vector):
     self.__test_data_cache_disablement(vector)
+
+  @pytest.mark.execute_serially
+  @CustomClusterTestSuite.with_args(
+      impalad_args=get_impalad_args("LIRS", high_write_concurrency=False),
+      start_args="--data_cache_dir=/tmp --data_cache_size=9MB",
+      cluster_size=1)
+  def test_data_cache_lirs_instant_evictions(self, vector):
+    # The setup for this test is intricate. For Allocate() to succeed, the 
request
+    # needs to be smaller than the protected size (95% of the cache). For 
Insert() to
+    # fail, the request needs to be larger than the unprotected size (5% of 
the cache).
+    # So, for an 8MB cache store to fail, the cache needs to be > 8.4MB (8MB / 
0.95)
+    # and less than 160MB (8MB / 0.05). This sets it to 9MB, which should 
result in
+    # 8MB cache inserts to be instantly evicted.
+    QUERY = "select count(*) from tpch.lineitem"
+    self.execute_query(QUERY)
+    assert 
self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') > 0
+    assert 
self.get_metric('impala-server.io-mgr.remote-data-cache-miss-count') > 0
+    assert 
self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') >= 0
+    assert 
self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') >= 0
+    assert 
self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') >= 0
+    assert 
self.get_metric('impala-server.io-mgr.remote-data-cache-instant-evictions') > 0
+
+    # Run the query multiple times and verify that none of the counters go 
negative
+    instant_evictions_before = \
+        
self.get_metric('impala-server.io-mgr.remote-data-cache-instant-evictions')
+    for i in range(10):
+      self.execute_query(QUERY)
+    instant_evictions_after = \
+        
self.get_metric('impala-server.io-mgr.remote-data-cache-instant-evictions')
+    assert instant_evictions_after - instant_evictions_before > 0
+
+    # All the counters remain positive
+    assert 
self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') >= 0
+    assert 
self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') >= 0
+    assert 
self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') >= 0

[impala] 02/02: IMPALA-10971: Fix data cache metrics for instant evictions

Reply via email to