This is an automated email from the ASF dual-hosted git repository.
jtuglu1 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git
The following commit(s) were added to refs/heads/master by this push:
new 62afd819c59 fix: fix prolonged query timeout bug in
SetAndVerifyContextQueryRunner (#19393)
62afd819c59 is described below
commit 62afd819c590cc85afad7b3e565e5c827853e11b
Author: jtuglu1 <[email protected]>
AuthorDate: Fri May 1 13:47:05 2026 -0700
fix: fix prolonged query timeout bug in SetAndVerifyContextQueryRunner
(#19393)
There's a bug in the SetAndVerifyContextQueryRunner where it
unconditionally overwrites QUERY_FAIL_TIME with startTimeMillis + timeout,
where startTimeMillis is when that runner instance was created — not when the
original query started.
This can happen when a SQL query generates inner sub-queries first (e.g.
CTEs), then outer native queries after those complete. The outer queries
re-enter the broker's query pipeline and get a fresh
SetAndVerifyContextQueryRunner created some ms offset into execution. For
example, with timeout=300000ms inherited from context and a 49000ms offset,
their deadline becomes now + 300s = original_start + 349s.
This can cause queries to run past their allotted query timeout, hogging
resources and breaking operator/user assumptions.
---
.../server/SetAndVerifyContextQueryRunner.java | 21 +++--
.../server/SetAndVerifyContextQueryRunnerTest.java | 90 ++++++++++++++++++++++
2 files changed, 106 insertions(+), 5 deletions(-)
diff --git
a/server/src/main/java/org/apache/druid/server/SetAndVerifyContextQueryRunner.java
b/server/src/main/java/org/apache/druid/server/SetAndVerifyContextQueryRunner.java
index a44aa2eb834..248d2174be3 100644
---
a/server/src/main/java/org/apache/druid/server/SetAndVerifyContextQueryRunner.java
+++
b/server/src/main/java/org/apache/druid/server/SetAndVerifyContextQueryRunner.java
@@ -19,7 +19,6 @@
package org.apache.druid.server;
-import com.google.common.collect.ImmutableMap;
import org.apache.druid.client.DirectDruidClient;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.query.Queries;
@@ -30,6 +29,8 @@ import org.apache.druid.query.QueryRunner;
import org.apache.druid.query.context.ResponseContext;
import org.apache.druid.server.initialization.ServerConfig;
+import java.util.Map;
+
/**
* Use this QueryRunner to set and verify Query contexts.
*/
@@ -70,15 +71,25 @@ public class SetAndVerifyContextQueryRunner<T> implements
QueryRunner<T>
);
// DirectDruidClient.QUERY_FAIL_TIME is used by DirectDruidClient and
JsonParserIterator to determine when to
// fail with a timeout exception
- final long failTime;
final QueryContext context = newQuery.context();
+ final long existingFailTime =
context.getLong(DirectDruidClient.QUERY_FAIL_TIME, -1L);
+ long computedFailTime;
if (context.hasTimeout()) {
- failTime = this.startTimeMillis + context.getTimeout();
+ computedFailTime = this.startTimeMillis + context.getTimeout();
} else {
- failTime = this.startTimeMillis + serverConfig.getMaxQueryTimeout();
+ computedFailTime = this.startTimeMillis +
serverConfig.getMaxQueryTimeout();
+ }
+ // Clamp overflow
+ if (computedFailTime <= 0) {
+ computedFailTime = Long.MAX_VALUE;
}
+ // Never extend an existing deadline. This prevents nested native queries
and queued historicals
+ // from pushing the effective timeout past the original deadline.
+ final long failTime = (existingFailTime >= 0)
+ ? Math.min(existingFailTime, computedFailTime)
+ : computedFailTime;
return newQuery.withOverriddenContext(
- ImmutableMap.of(DirectDruidClient.QUERY_FAIL_TIME, failTime > 0 ?
failTime : Long.MAX_VALUE)
+ Map.of(DirectDruidClient.QUERY_FAIL_TIME, failTime)
);
}
}
diff --git
a/server/src/test/java/org/apache/druid/server/SetAndVerifyContextQueryRunnerTest.java
b/server/src/test/java/org/apache/druid/server/SetAndVerifyContextQueryRunnerTest.java
index 81abd5186cb..2395c511b08 100644
---
a/server/src/test/java/org/apache/druid/server/SetAndVerifyContextQueryRunnerTest.java
+++
b/server/src/test/java/org/apache/druid/server/SetAndVerifyContextQueryRunnerTest.java
@@ -139,4 +139,94 @@ public class SetAndVerifyContextQueryRunnerTest
System.currentTimeMillis() <
transformed.context().getLong(DirectDruidClient.QUERY_FAIL_TIME)
);
}
+
+ @Test
+ public void testExistingQueryFailTimeIsPreservedWhenEarlierThanComputed()
+ {
+ // Simulates the case where a parent runner (e.g. SQL layer) already set
QUERY_FAIL_TIME,
+ // and a nested native query runner is created later in execution. The
existing deadline
+ // must not be extended.
+ long existingFailTime = System.currentTimeMillis() + 1000L; // 1s from now
(earlier than computed)
+ Query<ScanResultValue> query = new Druids.ScanQueryBuilder()
+ .dataSource("foo")
+ .intervals(new
MultipleIntervalSegmentSpec(ImmutableList.of(Intervals.ETERNITY)))
+ .context(ImmutableMap.of(
+ QueryContexts.TIMEOUT_KEY, 300_000,
+ DirectDruidClient.QUERY_FAIL_TIME, existingFailTime
+ ))
+ .build();
+
+ ServerConfig defaultConfig = new ServerConfig();
+
+ QueryRunner<ScanResultValue> mockRunner =
EasyMock.createMock(QueryRunner.class);
+ SetAndVerifyContextQueryRunner<ScanResultValue> queryRunner = new
SetAndVerifyContextQueryRunner<>(defaultConfig, mockRunner);
+
+ Query<ScanResultValue> transformed =
queryRunner.withTimeoutAndMaxScatterGatherBytes(query, defaultConfig);
+
+ // The existing fail time (1s from now) is earlier than computed
(startTime + 300s),
+ // so it must be preserved — not extended to startTime + 300s.
+ Assert.assertEquals(existingFailTime, (long)
transformed.context().getLong(DirectDruidClient.QUERY_FAIL_TIME));
+ }
+
+ @Test
+ public void testExistingQueryFailTimeIsOverriddenWhenLaterThanComputed()
+ {
+ // If the existing QUERY_FAIL_TIME is somehow later than what the current
config would compute
+ // (e.g. a very loose parent timeout), the stricter computed value wins.
+ long existingFailTime = System.currentTimeMillis() + 600_000L; // 10 min
from now
+ Query<ScanResultValue> query = new Druids.ScanQueryBuilder()
+ .dataSource("foo")
+ .intervals(new
MultipleIntervalSegmentSpec(ImmutableList.of(Intervals.ETERNITY)))
+ .context(ImmutableMap.of(
+ QueryContexts.TIMEOUT_KEY, 1,
+ DirectDruidClient.QUERY_FAIL_TIME, existingFailTime
+ ))
+ .build();
+
+ ServerConfig defaultConfig = new ServerConfig();
+
+ QueryRunner<ScanResultValue> mockRunner =
EasyMock.createMock(QueryRunner.class);
+ SetAndVerifyContextQueryRunner<ScanResultValue> queryRunner = new
SetAndVerifyContextQueryRunner<>(defaultConfig, mockRunner);
+
+ Query<ScanResultValue> transformed =
queryRunner.withTimeoutAndMaxScatterGatherBytes(query, defaultConfig);
+
+ // Computed fail time is startTime + 1ms, which is earlier than the
existing 10-min deadline.
+ // The computed (stricter) value must win.
+ Assert.assertTrue(
+ transformed.context().getLong(DirectDruidClient.QUERY_FAIL_TIME) <
existingFailTime
+ );
+ }
+
+ @Test
+ public void testExistingQueryFailTimeIsPreservedWhenComputedOverflows()
+ {
+ // When timeout is Long.MAX_VALUE, startTimeMillis + timeout overflows to
a negative value.
+ // The legitimate (small) existingFailTime must still win — not be erased
by the overflow.
+ long existingFailTime = System.currentTimeMillis() + 1000L;
+ Query<ScanResultValue> query = new Druids.ScanQueryBuilder()
+ .dataSource("foo")
+ .intervals(new
MultipleIntervalSegmentSpec(ImmutableList.of(Intervals.ETERNITY)))
+ .context(ImmutableMap.of(
+ QueryContexts.TIMEOUT_KEY, Long.MAX_VALUE,
+ DirectDruidClient.QUERY_FAIL_TIME, existingFailTime
+ ))
+ .build();
+
+ // Explicit max so the test does not depend on ServerConfig default values.
+ ServerConfig defaultConfig = new ServerConfig()
+ {
+ @Override
+ public long getMaxQueryTimeout()
+ {
+ return Long.MAX_VALUE;
+ }
+ };
+
+ QueryRunner<ScanResultValue> mockRunner =
EasyMock.createMock(QueryRunner.class);
+ SetAndVerifyContextQueryRunner<ScanResultValue> queryRunner = new
SetAndVerifyContextQueryRunner<>(defaultConfig, mockRunner);
+
+ Query<ScanResultValue> transformed =
queryRunner.withTimeoutAndMaxScatterGatherBytes(query, defaultConfig);
+
+ Assert.assertEquals(existingFailTime, (long)
transformed.context().getLong(DirectDruidClient.QUERY_FAIL_TIME));
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]