gus-asf commented on code in PR #2493: URL: https://github.com/apache/solr/pull/2493#discussion_r1698484974
########## solr/core/src/test/org/apache/solr/handler/component/TestHttpShardHandlerFactory.java: ########## @@ -154,4 +156,63 @@ public void testLiveNodesToHostUrl() { assertThat(hostSet, hasItem("1.2.3.4:9000")); assertThat(hostSet, hasItem("1.2.3.4:9001")); } + + public void testHttpShardHandlerWithResponse() { + HttpShardHandlerFactory httpShardHandlerFactory = new HttpShardHandlerFactory(); + HttpShardHandler shardHandler = (HttpShardHandler) httpShardHandlerFactory.getShardHandler(); + + long timeAllowedInMillis = -1; + // setting one pending request. + shardHandler.setPendingRequest(1); + + ShardResponse shardResponse = new ShardResponse(); + shardResponse.setShard("shard_1"); + ShardRequest shardRequest = new ShardRequest(); + // one shard + shardRequest.actualShards = new String[] {"shard_1"}; + shardResponse.setShardRequest(shardRequest); + + ExecutorService exec = ExecutorUtil.newMDCAwareCachedThreadPool("timeAllowedTest"); + try { + // generating shardresponse for one shard + exec.submit(() -> shardHandler.setResponse(shardResponse)); + } finally { + ExecutorUtil.shutdownAndAwaitTermination(exec); + } + ShardResponse gotResponse = + shardHandler.takeCompletedIncludingErrorsWithTimeout(timeAllowedInMillis); + + assertEquals(shardResponse, gotResponse); + } + + @Test + public void testHttpShardHandlerWithPartialResponse() { + HttpShardHandlerFactory httpShardHandlerFactory = new HttpShardHandlerFactory(); + HttpShardHandler shardHandler = (HttpShardHandler) httpShardHandlerFactory.getShardHandler(); + + long timeAllowedInMillis = 100; + // setting two pending requests. 
+ shardHandler.setPendingRequest(2); + + ShardResponse shardResponse = new ShardResponse(); + shardResponse.setShard("shard_1"); + ShardRequest shardRequest = new ShardRequest(); + // two shards + shardRequest.actualShards = new String[] {"shard_1", "shard_2"}; + shardResponse.setShardRequest(shardRequest); + + ExecutorService exec = ExecutorUtil.newMDCAwareCachedThreadPool("timeAllowedTest"); + try { + // generating shardresponse for one shard only + exec.submit(() -> shardHandler.setResponse(shardResponse)); + } finally { + ExecutorUtil.shutdownAndAwaitTermination(exec); + } + + // partial response + ShardResponse gotResponse = + shardHandler.takeCompletedIncludingErrorsWithTimeout(timeAllowedInMillis); + + assertEquals(shardResponse, gotResponse); Review Comment: for cases where .equals() is not implemented by a complex class `assertTrue(shardResponse == gotResponse)` is potentially clearer. ########## solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java: ########## @@ -211,23 +216,33 @@ public ShardResponse takeCompletedIncludingErrors() { */ @Override public ShardResponse takeCompletedOrError() { - return take(true); + return take(true, -1); } - private ShardResponse take(boolean bailOnError) { + private ShardResponse take(boolean bailOnError, long maxAllowedTimeInMillis) { try { - while (pending.get() > 0) { - ShardResponse rsp = responses.take(); - responseFutureMap.remove(rsp); + long deadline = System.nanoTime(); + if (maxAllowedTimeInMillis > 0) { + deadline += TimeUnit.MILLISECONDS.toNanos(maxAllowedTimeInMillis); + } else { + deadline = System.nanoTime() + TimeUnit.DAYS.toNanos(1); Review Comment: This duplicates the 24 * 60 * 60 * 1000 ms calculation elsewhere. Should try not to duplicate constants like this, especially expressed in different ways that make it difficult to see that they are the same value. 
########## solr/core/src/test/org/apache/solr/TestTimeAllowedSearch.java: ########## @@ -0,0 +1,104 @@ +package org.apache.solr; + +import com.carrotsearch.randomizedtesting.generators.RandomStrings; +import java.util.Calendar; +import java.util.GregorianCalendar; +import java.util.Locale; +import java.util.TimeZone; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.cloud.MiniSolrCloudCluster; +import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ShardParams; + +public class TestTimeAllowedSearch extends SolrCloudTestCase { + + /** + * This test demonstrates timeAllowed expectation at @{@link + * org.apache.solr.handler.component.HttpShardHandler} level This test creates collection with + * 'implicit` router, which has two shards shard_1 has 100000 docs, so that query should take some + * time shard_2 has only 1 doc to demonstrate the HttpSHardHandler timeout Then it execute + * substring query with TIME_ALLOWED 50, assuming this query will time out on shard_1 + */ + public void testTimeAllowed() throws Exception { + MiniSolrCloudCluster cluster = + configureCluster(2).addConfig("conf", configset("cloud-minimal")).configure(); + try { + CloudSolrClient client = cluster.getSolrClient(); + String COLLECTION_NAME = "test_coll"; + CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf", 2, 1) + .setRouterName("implicit") + .setShards("shard_1,shard_2") + .process(cluster.getSolrClient()); + cluster.waitForActiveCollection(COLLECTION_NAME, 2, 2); + UpdateRequest ur = new UpdateRequest(); + for (int i = 0; i < 100000; i++) { Review Comment: Do we really need 100k 
docs? That's really a really heavy test which will increase the burn rate for SSD disks. Also note that java allows you to write numbers like this as 100_000 which makes them far easier to read. ########## solr/core/src/test/org/apache/solr/TestTimeAllowedSearch.java: ########## @@ -0,0 +1,104 @@ +package org.apache.solr; + +import com.carrotsearch.randomizedtesting.generators.RandomStrings; +import java.util.Calendar; +import java.util.GregorianCalendar; +import java.util.Locale; +import java.util.TimeZone; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.cloud.MiniSolrCloudCluster; +import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ShardParams; + +public class TestTimeAllowedSearch extends SolrCloudTestCase { + + /** + * This test demonstrates timeAllowed expectation at @{@link + * org.apache.solr.handler.component.HttpShardHandler} level This test creates collection with + * 'implicit` router, which has two shards shard_1 has 100000 docs, so that query should take some + * time shard_2 has only 1 doc to demonstrate the HttpSHardHandler timeout Then it execute + * substring query with TIME_ALLOWED 50, assuming this query will time out on shard_1 + */ + public void testTimeAllowed() throws Exception { + MiniSolrCloudCluster cluster = + configureCluster(2).addConfig("conf", configset("cloud-minimal")).configure(); + try { + CloudSolrClient client = cluster.getSolrClient(); + String COLLECTION_NAME = "test_coll"; + CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf", 2, 1) + .setRouterName("implicit") + .setShards("shard_1,shard_2") + 
.process(cluster.getSolrClient()); + cluster.waitForActiveCollection(COLLECTION_NAME, 2, 2); + UpdateRequest ur = new UpdateRequest(); + for (int i = 0; i < 100000; i++) { + SolrInputDocument doc = new SolrInputDocument(); + doc.addField("id", "" + i); + final String s = + RandomStrings.randomAsciiLettersOfLengthBetween(random(), 10, 100) + .toLowerCase(Locale.ROOT); + doc.setField("subject_s", s); + doc.setField("_route_", "shard_1"); + ur.add(doc); + } + + // adding "abc" in each shard as we will have query *abc* + SolrInputDocument doc = new SolrInputDocument(); + doc.addField("id", "" + 10000); + doc.setField("subject_s", "abc"); + doc.setField("_route_", "shard_2"); + ur.add(doc); + + doc = new SolrInputDocument(); + doc.addField("id", "" + 100001); + doc.setField("subject_s", "abc"); + doc.setField("_route_", "shard_1"); + ur.add(doc); + + ur.commit(client, COLLECTION_NAME); + + // warm up query + SolrQuery query = new SolrQuery(); + query.setQuery("subject_s:*abcd*"); + query.set(ShardParams.SHARDS_TOLERANT, "true"); + QueryResponse response = client.query(COLLECTION_NAME, query); + + query = new SolrQuery(); + query.setQuery("subject_s:*abc*"); + query.set(CommonParams.TIME_ALLOWED, 40); + query.set(ShardParams.SHARDS_TOLERANT, "true"); + response = client.query(COLLECTION_NAME, query); + assertTrue( + "Should have found 1 doc (shard_2) as timeallowed is 25ms found:" Review Comment: Oh this is a testing anti-pattern (not that we don't already use lots of those in solr tests but...) This relies on the performance characteristics of both the machine and a large amount of unrelated code. Upgrades to machines or to (for example) lucene could make this test suddenly fail. Typically the better strategy is to introduce an artificial delay that can't change. See org.apache.solr.search.ExpensiveSearchComponent and org.apache.solr.util.TestInjection#injectUpdateRandomPause for examples. David's suggested local param is another option. 
########## solr/core/src/java/org/apache/solr/handler/component/SearchHandler.java: ########## @@ -555,9 +556,10 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw // this loop) boolean tolerant = ShardParams.getShardsTolerantAsBool(rb.req.getParams()); while (rb.outgoing.size() == 0) { + long timeAllowed = maxTimeAllowed - (long) req.getRequestTimer().getTime(); Review Comment: This calculation is already managed by org.apache.solr.search.TimeAllowedLimit ########## solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java: ########## @@ -211,23 +216,33 @@ public ShardResponse takeCompletedIncludingErrors() { */ @Override public ShardResponse takeCompletedOrError() { - return take(true); + return take(true, -1); } - private ShardResponse take(boolean bailOnError) { + private ShardResponse take(boolean bailOnError, long maxAllowedTimeInMillis) { try { - while (pending.get() > 0) { - ShardResponse rsp = responses.take(); - responseFutureMap.remove(rsp); + long deadline = System.nanoTime(); + if (maxAllowedTimeInMillis > 0) { + deadline += TimeUnit.MILLISECONDS.toNanos(maxAllowedTimeInMillis); + } else { + deadline = System.nanoTime() + TimeUnit.DAYS.toNanos(1); + } + ShardResponse previousResponse = null; + while (pending.get() > 0) { Review Comment: Probably should find a way to use `QueryLimits.getCurrentLimits().shouldExit();` which will automatically respect any other query limits such as CPU or memory limits if added in the future (SOLR-17150) ########## solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java: ########## @@ -211,23 +216,33 @@ public ShardResponse takeCompletedIncludingErrors() { */ @Override public ShardResponse takeCompletedOrError() { - return take(true); + return take(true, -1); } - private ShardResponse take(boolean bailOnError) { + private ShardResponse take(boolean bailOnError, long maxAllowedTimeInMillis) { try { - while (pending.get() > 0) { - ShardResponse rsp = 
responses.take(); - responseFutureMap.remove(rsp); + long deadline = System.nanoTime(); + if (maxAllowedTimeInMillis > 0) { + deadline += TimeUnit.MILLISECONDS.toNanos(maxAllowedTimeInMillis); + } else { + deadline = System.nanoTime() + TimeUnit.DAYS.toNanos(1); + } + ShardResponse previousResponse = null; + while (pending.get() > 0) { + long waitTime = deadline - System.nanoTime(); + ShardResponse rsp = responses.poll(waitTime, TimeUnit.NANOSECONDS); pending.decrementAndGet(); + if (rsp == null) return previousResponse; Review Comment: Such a feature runs the risk of a cascading failure. Think 2 replicas... load is slowly increasing, finally one replica gets a largish query and a bunch of stuff times out... and then all those requests redistribute to the other replica making it slow and timing out a bunch more requests, most of which... you guessed it re-distribute to the first, lagging replica... meanwhile the user on the far end gets to wait for 2 requests to time out instead of 1... This would be less of a problem with cpuTimeAllowed hopefully but I'm still not sure it makes sense in the broader scheme of things. One use case for timeAllowed is that the person deploying solr wants to ensure that the end-user doesn't wait more than X before getting some sort of response. Users click away from sites that lag, and an error possibly improves the chance that they remain engaged. Re-requesting would make it difficult to achieve this. Think 3 replicas and setting the timeout to 1/3 the desired limit... then 2/3 of the available time is wasted bouncing around and failing... would certainly require a flag to toggle at a minimum. 
########## solr/core/src/test/org/apache/solr/TestTimeAllowedSearch.java: ########## @@ -0,0 +1,104 @@ +package org.apache.solr; + +import com.carrotsearch.randomizedtesting.generators.RandomStrings; +import java.util.Calendar; +import java.util.GregorianCalendar; +import java.util.Locale; +import java.util.TimeZone; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.cloud.MiniSolrCloudCluster; +import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ShardParams; + +public class TestTimeAllowedSearch extends SolrCloudTestCase { + + /** + * This test demonstrates timeAllowed expectation at @{@link + * org.apache.solr.handler.component.HttpShardHandler} level This test creates collection with + * 'implicit` router, which has two shards shard_1 has 100000 docs, so that query should take some + * time shard_2 has only 1 doc to demonstrate the HttpSHardHandler timeout Then it execute + * substring query with TIME_ALLOWED 50, assuming this query will time out on shard_1 + */ + public void testTimeAllowed() throws Exception { + MiniSolrCloudCluster cluster = + configureCluster(2).addConfig("conf", configset("cloud-minimal")).configure(); + try { + CloudSolrClient client = cluster.getSolrClient(); + String COLLECTION_NAME = "test_coll"; + CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf", 2, 1) + .setRouterName("implicit") + .setShards("shard_1,shard_2") + .process(cluster.getSolrClient()); + cluster.waitForActiveCollection(COLLECTION_NAME, 2, 2); + UpdateRequest ur = new UpdateRequest(); + for (int i = 0; i < 100000; i++) { + SolrInputDocument doc = new 
SolrInputDocument(); + doc.addField("id", "" + i); + final String s = + RandomStrings.randomAsciiLettersOfLengthBetween(random(), 10, 100) + .toLowerCase(Locale.ROOT); + doc.setField("subject_s", s); + doc.setField("_route_", "shard_1"); + ur.add(doc); + } + + // adding "abc" in each shard as we will have query *abc* + SolrInputDocument doc = new SolrInputDocument(); + doc.addField("id", "" + 10000); + doc.setField("subject_s", "abc"); + doc.setField("_route_", "shard_2"); + ur.add(doc); + + doc = new SolrInputDocument(); + doc.addField("id", "" + 100001); + doc.setField("subject_s", "abc"); + doc.setField("_route_", "shard_1"); + ur.add(doc); + + ur.commit(client, COLLECTION_NAME); + + // warm up query + SolrQuery query = new SolrQuery(); + query.setQuery("subject_s:*abcd*"); + query.set(ShardParams.SHARDS_TOLERANT, "true"); + QueryResponse response = client.query(COLLECTION_NAME, query); + + query = new SolrQuery(); + query.setQuery("subject_s:*abc*"); + query.set(CommonParams.TIME_ALLOWED, 40); + query.set(ShardParams.SHARDS_TOLERANT, "true"); + response = client.query(COLLECTION_NAME, query); + assertTrue( + "Should have found 1 doc (shard_2) as timeallowed is 25ms found:" + + response.getResults().getNumFound(), + response.getResults().getNumFound() == 1); + + query = new SolrQuery(); + query.setQuery("subject_s:*abc*"); + query.set(ShardParams.SHARDS_TOLERANT, "true"); + response = client.query(COLLECTION_NAME, query); + assertTrue( + "Should have found few docs as timeallowed is unlimited ", + response.getResults().getNumFound() > 1); + } finally { + cluster.shutdown(); + } + } + + public void testTimeZone() { Review Comment: Appears unused, please remove. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org For additional commands, e-mail: issues-h...@solr.apache.org