This is an automated email from the ASF dual-hosted git repository.

jtuglu1 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git


The following commit(s) were added to refs/heads/master by this push:
     new 62afd819c59 fix: fix prolonged query timeout bug in 
SetAndVerifyContextQueryRunner (#19393)
62afd819c59 is described below

commit 62afd819c590cc85afad7b3e565e5c827853e11b
Author: jtuglu1 <[email protected]>
AuthorDate: Fri May 1 13:47:05 2026 -0700

    fix: fix prolonged query timeout bug in SetAndVerifyContextQueryRunner 
(#19393)
    
    There's a bug in the SetAndVerifyContextQueryRunner where it 
unconditionally overwrites QUERY_FAIL_TIME with startTimeMillis + timeout, 
where startTimeMillis is when that runner instance was created — not when the 
original query started.
    
    This can happen when a SQL query generates inner sub-queries first (e.g. 
CTEs), then outer native queries after those complete. The outer queries 
re-enter the broker's query pipeline and get a fresh 
SetAndVerifyContextQueryRunner created some ms offset into execution. For 
example, with timeout=300000ms inherited from context and a 49000ms offset, 
their deadline becomes now + 300s = original_start + 349s.
    
    This can cause queries to run past their allotted query timeout, hogging 
resources and breaking operator/user assumptions.
---
 .../server/SetAndVerifyContextQueryRunner.java     | 21 +++--
 .../server/SetAndVerifyContextQueryRunnerTest.java | 90 ++++++++++++++++++++++
 2 files changed, 106 insertions(+), 5 deletions(-)

diff --git 
a/server/src/main/java/org/apache/druid/server/SetAndVerifyContextQueryRunner.java
 
b/server/src/main/java/org/apache/druid/server/SetAndVerifyContextQueryRunner.java
index a44aa2eb834..248d2174be3 100644
--- 
a/server/src/main/java/org/apache/druid/server/SetAndVerifyContextQueryRunner.java
+++ 
b/server/src/main/java/org/apache/druid/server/SetAndVerifyContextQueryRunner.java
@@ -19,7 +19,6 @@
 
 package org.apache.druid.server;
 
-import com.google.common.collect.ImmutableMap;
 import org.apache.druid.client.DirectDruidClient;
 import org.apache.druid.java.util.common.guava.Sequence;
 import org.apache.druid.query.Queries;
@@ -30,6 +29,8 @@ import org.apache.druid.query.QueryRunner;
 import org.apache.druid.query.context.ResponseContext;
 import org.apache.druid.server.initialization.ServerConfig;
 
+import java.util.Map;
+
 /**
  * Use this QueryRunner to set and verify Query contexts.
  */
@@ -70,15 +71,25 @@ public class SetAndVerifyContextQueryRunner<T> implements 
QueryRunner<T>
     );
     // DirectDruidClient.QUERY_FAIL_TIME is used by DirectDruidClient and 
JsonParserIterator to determine when to
     // fail with a timeout exception
-    final long failTime;
     final QueryContext context = newQuery.context();
+    final long existingFailTime = 
context.getLong(DirectDruidClient.QUERY_FAIL_TIME, -1L);
+    long computedFailTime;
     if (context.hasTimeout()) {
-      failTime = this.startTimeMillis + context.getTimeout();
+      computedFailTime = this.startTimeMillis + context.getTimeout();
     } else {
-      failTime = this.startTimeMillis + serverConfig.getMaxQueryTimeout();
+      computedFailTime = this.startTimeMillis + 
serverConfig.getMaxQueryTimeout();
+    }
+    // Clamp overflow
+    if (computedFailTime <= 0) {
+      computedFailTime = Long.MAX_VALUE;
     }
+    // Never extend an existing deadline. This prevents nested native queries 
and queued historicals
+    // from pushing the effective timeout past the original deadline.
+    final long failTime = (existingFailTime >= 0)
+        ? Math.min(existingFailTime, computedFailTime)
+        : computedFailTime;
     return newQuery.withOverriddenContext(
-        ImmutableMap.of(DirectDruidClient.QUERY_FAIL_TIME, failTime > 0 ? 
failTime : Long.MAX_VALUE)
+        Map.of(DirectDruidClient.QUERY_FAIL_TIME, failTime)
     );
   }
 }
diff --git 
a/server/src/test/java/org/apache/druid/server/SetAndVerifyContextQueryRunnerTest.java
 
b/server/src/test/java/org/apache/druid/server/SetAndVerifyContextQueryRunnerTest.java
index 81abd5186cb..2395c511b08 100644
--- 
a/server/src/test/java/org/apache/druid/server/SetAndVerifyContextQueryRunnerTest.java
+++ 
b/server/src/test/java/org/apache/druid/server/SetAndVerifyContextQueryRunnerTest.java
@@ -139,4 +139,94 @@ public class SetAndVerifyContextQueryRunnerTest
         System.currentTimeMillis() < 
transformed.context().getLong(DirectDruidClient.QUERY_FAIL_TIME)
     );
   }
+
+  @Test
+  public void testExistingQueryFailTimeIsPreservedWhenEarlierThanComputed()
+  {
+    // Simulates the case where a parent runner (e.g. SQL layer) already set 
QUERY_FAIL_TIME,
+    // and a nested native query runner is created later in execution. The 
existing deadline
+    // must not be extended.
+    long existingFailTime = System.currentTimeMillis() + 1000L; // 1s from now 
(earlier than computed)
+    Query<ScanResultValue> query = new Druids.ScanQueryBuilder()
+        .dataSource("foo")
+        .intervals(new 
MultipleIntervalSegmentSpec(ImmutableList.of(Intervals.ETERNITY)))
+        .context(ImmutableMap.of(
+            QueryContexts.TIMEOUT_KEY, 300_000,
+            DirectDruidClient.QUERY_FAIL_TIME, existingFailTime
+        ))
+        .build();
+
+    ServerConfig defaultConfig = new ServerConfig();
+
+    QueryRunner<ScanResultValue> mockRunner = 
EasyMock.createMock(QueryRunner.class);
+    SetAndVerifyContextQueryRunner<ScanResultValue> queryRunner = new 
SetAndVerifyContextQueryRunner<>(defaultConfig, mockRunner);
+
+    Query<ScanResultValue> transformed = 
queryRunner.withTimeoutAndMaxScatterGatherBytes(query, defaultConfig);
+
+    // The existing fail time (1s from now) is earlier than computed 
(startTime + 300s),
+    // so it must be preserved — not extended to startTime + 300s.
+    Assert.assertEquals(existingFailTime, (long) 
transformed.context().getLong(DirectDruidClient.QUERY_FAIL_TIME));
+  }
+
+  @Test
+  public void testExistingQueryFailTimeIsOverriddenWhenLaterThanComputed()
+  {
+    // If the existing QUERY_FAIL_TIME is somehow later than what the current 
config would compute
+    // (e.g. a very loose parent timeout), the stricter computed value wins.
+    long existingFailTime = System.currentTimeMillis() + 600_000L; // 10 min 
from now
+    Query<ScanResultValue> query = new Druids.ScanQueryBuilder()
+        .dataSource("foo")
+        .intervals(new 
MultipleIntervalSegmentSpec(ImmutableList.of(Intervals.ETERNITY)))
+        .context(ImmutableMap.of(
+            QueryContexts.TIMEOUT_KEY, 1,
+            DirectDruidClient.QUERY_FAIL_TIME, existingFailTime
+        ))
+        .build();
+
+    ServerConfig defaultConfig = new ServerConfig();
+
+    QueryRunner<ScanResultValue> mockRunner = 
EasyMock.createMock(QueryRunner.class);
+    SetAndVerifyContextQueryRunner<ScanResultValue> queryRunner = new 
SetAndVerifyContextQueryRunner<>(defaultConfig, mockRunner);
+
+    Query<ScanResultValue> transformed = 
queryRunner.withTimeoutAndMaxScatterGatherBytes(query, defaultConfig);
+
+    // Computed fail time is startTime + 1ms, which is earlier than the 
existing 10-min deadline.
+    // The computed (stricter) value must win.
+    Assert.assertTrue(
+        transformed.context().getLong(DirectDruidClient.QUERY_FAIL_TIME) < 
existingFailTime
+    );
+  }
+
+  @Test
+  public void testExistingQueryFailTimeIsPreservedWhenComputedOverflows()
+  {
+    // When timeout is Long.MAX_VALUE, startTimeMillis + timeout overflows to 
a negative value.
+    // The legitimate (small) existingFailTime must still win — not be erased 
by the overflow.
+    long existingFailTime = System.currentTimeMillis() + 1000L;
+    Query<ScanResultValue> query = new Druids.ScanQueryBuilder()
+        .dataSource("foo")
+        .intervals(new 
MultipleIntervalSegmentSpec(ImmutableList.of(Intervals.ETERNITY)))
+        .context(ImmutableMap.of(
+            QueryContexts.TIMEOUT_KEY, Long.MAX_VALUE,
+            DirectDruidClient.QUERY_FAIL_TIME, existingFailTime
+        ))
+        .build();
+
+    // Explicit max so the test does not depend on ServerConfig default values.
+    ServerConfig defaultConfig = new ServerConfig()
+    {
+      @Override
+      public long getMaxQueryTimeout()
+      {
+        return Long.MAX_VALUE;
+      }
+    };
+
+    QueryRunner<ScanResultValue> mockRunner = 
EasyMock.createMock(QueryRunner.class);
+    SetAndVerifyContextQueryRunner<ScanResultValue> queryRunner = new 
SetAndVerifyContextQueryRunner<>(defaultConfig, mockRunner);
+
+    Query<ScanResultValue> transformed = 
queryRunner.withTimeoutAndMaxScatterGatherBytes(query, defaultConfig);
+
+    Assert.assertEquals(existingFailTime, (long) 
transformed.context().getLong(DirectDruidClient.QUERY_FAIL_TIME));
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to