This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 3c49ebad6b1c55305064ef905ef78b242b024c1f Author: Joe McDonnell <[email protected]> AuthorDate: Tue Dec 20 14:04:09 2022 -0800 IMPALA-10971: Fix data cache metrics for instant evictions When inserting an entry into the data cache, the Insert() can fail and instantly evict the new entry. This is currently only true for the LIRS cache eviction policy. When this happens, the cache metrics are not maintained properly. The instant eviction results in an unmatched decrement to the total bytes and num entries counters. This moves the increments to happen prior to the Insert() call, which fixes the instant eviction case. The behavior does not change noticeably for the successful Insert() case. Testing: - Added custom cluster test to verify this case Change-Id: I5db3eaca2f4459844e3270846d905f76265b6b3e Reviewed-on: http://gerrit.cloudera.org:8080/19930 Reviewed-by: Joe McDonnell <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/runtime/io/data-cache.cc | 18 +++++++++++------ tests/custom_cluster/test_data_cache.py | 35 +++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/be/src/runtime/io/data-cache.cc b/be/src/runtime/io/data-cache.cc index 2cf45041d..fa532f165 100644 --- a/be/src/runtime/io/data-cache.cc +++ b/be/src/runtime/io/data-cache.cc @@ -731,6 +731,17 @@ bool DataCache::Partition::InsertIntoCache(const Slice& key, CacheFile* cache_fi } } + // IMPALA-10971: These metrics need to be incremented prior to the Insert(), because + // the Insert() can fail and instantly evict the entry. When that happens, + // the total bytes and num entries are decremented. Without this corresponding + // increment, the counts will be incorrect. + // Trace replays do not keep metrics + if (LIKELY(!trace_replay_)) { + ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_TOTAL_BYTES->Increment(charge_len); + ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_NUM_ENTRIES->Increment(1); + ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_NUM_WRITES->Increment(1); + } + // Insert the new entry into the cache. CacheEntry entry(cache_file, insertion_offset, buffer_len, checksum); memcpy(meta_cache_->MutableValue(&pending_handle), &entry, sizeof(CacheEntry)); @@ -739,16 +750,11 @@ bool DataCache::Partition::InsertIntoCache(const Slice& key, CacheFile* cache_fi if (UNLIKELY(handle.get() == nullptr)){ // Trace replays do not keep metrics if (LIKELY(!trace_replay_)) { + // EvictedEntry() already ran and decremented the other counters. ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_INSTANT_EVICTIONS->Increment(1); } return false; } - // Trace replays do not keep metrics - if (LIKELY(!trace_replay_)) { - ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_TOTAL_BYTES->Increment(charge_len); - ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_NUM_ENTRIES->Increment(1); - ImpaladMetrics::IO_MGR_REMOTE_DATA_CACHE_NUM_WRITES->Increment(1); - } return true; } diff --git a/tests/custom_cluster/test_data_cache.py b/tests/custom_cluster/test_data_cache.py index 988c08dfc..3d874692d 100644 --- a/tests/custom_cluster/test_data_cache.py +++ b/tests/custom_cluster/test_data_cache.py @@ -177,3 +177,38 @@ class TestDataCache(CustomClusterTestSuite): start_args=CACHE_START_ARGS, cluster_size=1) def test_data_cache_disablement_lirs(self, vector): self.__test_data_cache_disablement(vector) + + @pytest.mark.execute_serially + @CustomClusterTestSuite.with_args( + impalad_args=get_impalad_args("LIRS", high_write_concurrency=False), + start_args="--data_cache_dir=/tmp --data_cache_size=9MB", + cluster_size=1) + def test_data_cache_lirs_instant_evictions(self, vector): + # The setup for this test is intricate. For Allocate() to succeed, the request + # needs to be smaller than the protected size (95% of the cache). For Insert() to + # fail, the request needs to be larger than the unprotected size (5% of the cache). + # So, for an 8MB cache store to fail, the cache needs to be > 8.4MB (8MB / 0.95) + # and less than 160MB (8MB / 0.05). This sets it to 9MB, which should result in + # 8MB cache inserts to be instantly evicted. + QUERY = "select count(*) from tpch.lineitem" + self.execute_query(QUERY) + assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') > 0 + assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-count') > 0 + assert self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') >= 0 + assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') >= 0 + assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') >= 0 + assert self.get_metric('impala-server.io-mgr.remote-data-cache-instant-evictions') > 0 + + # Run the query multiple times and verify that none of the counters go negative + instant_evictions_before = \ + self.get_metric('impala-server.io-mgr.remote-data-cache-instant-evictions') + for i in range(10): + self.execute_query(QUERY) + instant_evictions_after = \ + self.get_metric('impala-server.io-mgr.remote-data-cache-instant-evictions') + assert instant_evictions_after - instant_evictions_before > 0 + + # All the counters remain positive + assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') >= 0 + assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') >= 0 + assert self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') >= 0
