This is an automated email from the ASF dual-hosted git repository.
michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new fbc4d701b IMPALA-12492: Add metrics for total pending events and lag
time of the event-processor
fbc4d701b is described below
commit fbc4d701b1caa2c376c73a11e919d4ca38d64d48
Author: stiga-huang <[email protected]>
AuthorDate: Wed Oct 11 16:36:03 2023 +0800
IMPALA-12492: Add metrics for total pending events and lag time of the
event-processor
We have last-synced-event-time and latest-event-time on the catalogd
WebUI. It's tedious for users to calculate their difference to get the
lag time. This patch adds the lag time to the WebUI directly. The unit is
set to TIME_S so it's shown in a human readable format. Also adds the
total pending events in the same page.
Tests
- Manually verified the metrics by generating lots of HMS events,
hard-coding EVENTS_BATCH_SIZE_PER_RPC to 5 and adding some sleeps in
event processing, so that the lag is easier to observe.
Change-Id: Id329879bfacba9c8415f920b57939dc571d1aad9
Reviewed-on: http://gerrit.cloudera.org:8080/20560
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
be/src/util/event-metrics.cc | 29 +++++++++++++++++++++++++++++
be/src/util/event-metrics.h | 12 ++++++++++++
common/thrift/metrics.json | 20 ++++++++++++++++++++
3 files changed, 61 insertions(+)
diff --git a/be/src/util/event-metrics.cc b/be/src/util/event-metrics.cc
index 061025c6d..d06ddd91d 100644
--- a/be/src/util/event-metrics.cc
+++ b/be/src/util/event-metrics.cc
@@ -70,6 +70,9 @@ string MetastoreEventMetrics::LATEST_EVENT_ID_METRIC_NAME =
"events-processor.latest-event-id";
string MetastoreEventMetrics::LATEST_EVENT_TIME_METRIC_NAME =
"events-processor.latest-event-time";
+string MetastoreEventMetrics::PENDING_EVENTS_METRIC_NAME =
+ "events-processor.pending-events";
+string MetastoreEventMetrics::LAG_TIME_METRIC_NAME =
"events-processor.lag-time";
IntCounter* MetastoreEventMetrics::NUM_EVENTS_RECEIVED_COUNTER = nullptr;
IntCounter* MetastoreEventMetrics::NUM_EVENTS_SKIPPED_COUNTER = nullptr;
@@ -95,6 +98,8 @@ IntCounter* MetastoreEventMetrics::LAST_SYNCED_EVENT_ID =
nullptr;
IntCounter* MetastoreEventMetrics::LAST_SYNCED_EVENT_TIME = nullptr;
IntCounter* MetastoreEventMetrics::LATEST_EVENT_ID = nullptr;
IntCounter* MetastoreEventMetrics::LATEST_EVENT_TIME = nullptr;
+IntCounter* MetastoreEventMetrics::PENDING_EVENTS = nullptr;
+IntCounter* MetastoreEventMetrics::LAG_TIME = nullptr;
// Initialize all the metrics for the events metric group
void MetastoreEventMetrics::InitMetastoreEventMetrics(MetricGroup*
metric_group) {
@@ -146,6 +151,8 @@ void
MetastoreEventMetrics::InitMetastoreEventMetrics(MetricGroup* metric_group)
event_metrics->AddCounter(LATEST_EVENT_ID_METRIC_NAME, 0);
LATEST_EVENT_TIME =
event_metrics->AddCounter(LATEST_EVENT_TIME_METRIC_NAME, 0);
+ PENDING_EVENTS = event_metrics->AddCounter(PENDING_EVENTS_METRIC_NAME, 0);
+ LAG_TIME = event_metrics->AddCounter(LAG_TIME_METRIC_NAME, 0);
}
void MetastoreEventMetrics::refresh(TEventProcessorMetrics* response) {
@@ -213,5 +220,27 @@ void
MetastoreEventMetrics::refresh(TEventProcessorMetrics* response) {
if (response->__isset.latest_event_time) {
LATEST_EVENT_TIME->SetValue(response->latest_event_time);
}
+ // last_synced_event_time is 0 at the startup until we have synced any
events.
+ if (response->__isset.latest_event_time &&
response->__isset.last_synced_event_time
+ && response->last_synced_event_time > 0) {
+ // latest_event_time and last_synced_event_time are updated by different
threads.
+ // It's possible that latest_event_time is stale and smaller than
+ // last_synced_event_time. Set the lag to 0 in this case.
+ if (response->latest_event_time <= response->last_synced_event_time) {
+ LAG_TIME->SetValue(0);
+ } else {
+ LAG_TIME->SetValue(response->latest_event_time -
response->last_synced_event_time);
+ }
+ }
+ if (response->__isset.latest_event_id &&
response->__isset.last_synced_event_id) {
+ // Same as above, latest_event_id and last_synced_event_id are updated by
different
+ // threads. Set the value to 0 if latest_event_id is stale.
+ if (response->latest_event_id <= response->last_synced_event_id) {
+ PENDING_EVENTS->SetValue(0);
+ } else {
+ PENDING_EVENTS->SetValue(
+ response->latest_event_id - response->last_synced_event_id);
+ }
+ }
}
} // namespace impala
diff --git a/be/src/util/event-metrics.h b/be/src/util/event-metrics.h
index e314c9bcf..fd6dc1f9f 100644
--- a/be/src/util/event-metrics.h
+++ b/be/src/util/event-metrics.h
@@ -85,6 +85,12 @@ class MetastoreEventMetrics {
/// Latest metastore event time
static IntCounter* LATEST_EVENT_TIME;
+ /// Number of events pending to be synced
+ static IntCounter* PENDING_EVENTS;
+
+ /// Lag time of the event processing
+ static IntCounter* LAG_TIME;
+
private:
/// Following metric names must match with the key in metrics.json
@@ -135,6 +141,12 @@ class MetastoreEventMetrics {
/// Metric name for the event time of the latest metastore event
static std::string LATEST_EVENT_TIME_METRIC_NAME;
+
+ /// Metric name for the number of pending events to be synced
+ static std::string PENDING_EVENTS_METRIC_NAME;
+
+ /// Metric name for the lag time of the event processing
+ static std::string LAG_TIME_METRIC_NAME;
};
} // namespace impala
diff --git a/common/thrift/metrics.json b/common/thrift/metrics.json
index d9391818c..ae53d06d9 100644
--- a/common/thrift/metrics.json
+++ b/common/thrift/metrics.json
@@ -3157,6 +3157,26 @@
"kind" : "COUNTER",
"key" : "events-processor.latest-event-time"
},
+ {
+ "description": "Number of pending events to be synced, i.e. the difference
between latest-event-id and last-synced-event-id",
+ "contexts": [
+ "CATALOGSERVER"
+ ],
+ "label": "Pending Events",
+ "units": "NONE",
+ "kind" : "COUNTER",
+ "key" : "events-processor.pending-events"
+ },
+ {
+ "description": "Lag time of the event processing, i.e. the difference
between latest-event-time and last-synced-event-time",
+ "contexts": [
+ "CATALOGSERVER"
+ ],
+ "label": "Lag Time",
+ "units": "TIME_S",
+ "kind" : "COUNTER",
+ "key" : "events-processor.lag-time"
+ },
{
"description": "Total number of executor groups that have at least one
executor",
"contexts": [