gus-asf commented on code in PR #2493: URL: https://github.com/apache/solr/pull/2493#discussion_r1698484974
########## solr/core/src/test/org/apache/solr/handler/component/TestHttpShardHandlerFactory.java: ########## @@ -154,4 +156,63 @@ public void testLiveNodesToHostUrl() { assertThat(hostSet, hasItem("1.2.3.4:9000")); assertThat(hostSet, hasItem("1.2.3.4:9001")); } + + public void testHttpShardHandlerWithResponse() { + HttpShardHandlerFactory httpShardHandlerFactory = new HttpShardHandlerFactory(); + HttpShardHandler shardHandler = (HttpShardHandler) httpShardHandlerFactory.getShardHandler(); + + long timeAllowedInMillis = -1; + // setting one pending request. + shardHandler.setPendingRequest(1); + + ShardResponse shardResponse = new ShardResponse(); + shardResponse.setShard("shard_1"); + ShardRequest shardRequest = new ShardRequest(); + // one shard + shardRequest.actualShards = new String[] {"shard_1"}; + shardResponse.setShardRequest(shardRequest); + + ExecutorService exec = ExecutorUtil.newMDCAwareCachedThreadPool("timeAllowedTest"); + try { + // generating shardresponse for one shard + exec.submit(() -> shardHandler.setResponse(shardResponse)); + } finally { + ExecutorUtil.shutdownAndAwaitTermination(exec); + } + ShardResponse gotResponse = + shardHandler.takeCompletedIncludingErrorsWithTimeout(timeAllowedInMillis); + + assertEquals(shardResponse, gotResponse); + } + + @Test + public void testHttpShardHandlerWithPartialResponse() { + HttpShardHandlerFactory httpShardHandlerFactory = new HttpShardHandlerFactory(); + HttpShardHandler shardHandler = (HttpShardHandler) httpShardHandlerFactory.getShardHandler(); + + long timeAllowedInMillis = 100; + // setting two pending requests. 
+ shardHandler.setPendingRequest(2); + + ShardResponse shardResponse = new ShardResponse(); + shardResponse.setShard("shard_1"); + ShardRequest shardRequest = new ShardRequest(); + // two shards + shardRequest.actualShards = new String[] {"shard_1", "shard_2"}; + shardResponse.setShardRequest(shardRequest); + + ExecutorService exec = ExecutorUtil.newMDCAwareCachedThreadPool("timeAllowedTest"); + try { + // generating shardresponse for one shard only + exec.submit(() -> shardHandler.setResponse(shardResponse)); + } finally { + ExecutorUtil.shutdownAndAwaitTermination(exec); + } + + // partial response + ShardResponse gotResponse = + shardHandler.takeCompletedIncludingErrorsWithTimeout(timeAllowedInMillis); + + assertEquals(shardResponse, gotResponse); Review Comment: for cases where .equals() is not implemented by a complex class `assertTrue(shardResponse == gotResponse)` is potentially clearer. ########## solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java: ########## @@ -211,23 +216,33 @@ public ShardResponse takeCompletedIncludingErrors() { */ @Override public ShardResponse takeCompletedOrError() { - return take(true); + return take(true, -1); } - private ShardResponse take(boolean bailOnError) { + private ShardResponse take(boolean bailOnError, long maxAllowedTimeInMillis) { try { - while (pending.get() > 0) { - ShardResponse rsp = responses.take(); - responseFutureMap.remove(rsp); + long deadline = System.nanoTime(); + if (maxAllowedTimeInMillis > 0) { + deadline += TimeUnit.MILLISECONDS.toNanos(maxAllowedTimeInMillis); + } else { + deadline = System.nanoTime() + TimeUnit.DAYS.toNanos(1); Review Comment: This duplicates the 24 * 60 * 60 * 1000 ms calculation elsewhere. Should try not to duplicate constants like this, especially expressed in different ways that make it difficult to see that they are the same value. 
########## solr/core/src/test/org/apache/solr/TestTimeAllowedSearch.java: ########## @@ -0,0 +1,104 @@ +package org.apache.solr; + +import com.carrotsearch.randomizedtesting.generators.RandomStrings; +import java.util.Calendar; +import java.util.GregorianCalendar; +import java.util.Locale; +import java.util.TimeZone; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.cloud.MiniSolrCloudCluster; +import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ShardParams; + +public class TestTimeAllowedSearch extends SolrCloudTestCase { + + /** + * This test demonstrates timeAllowed expectation at @{@link + * org.apache.solr.handler.component.HttpShardHandler} level This test creates collection with + * 'implicit` router, which has two shards shard_1 has 100000 docs, so that query should take some + * time shard_2 has only 1 doc to demonstrate the HttpSHardHandler timeout Then it execute + * substring query with TIME_ALLOWED 50, assuming this query will time out on shard_1 + */ + public void testTimeAllowed() throws Exception { + MiniSolrCloudCluster cluster = + configureCluster(2).addConfig("conf", configset("cloud-minimal")).configure(); + try { + CloudSolrClient client = cluster.getSolrClient(); + String COLLECTION_NAME = "test_coll"; + CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf", 2, 1) + .setRouterName("implicit") + .setShards("shard_1,shard_2") + .process(cluster.getSolrClient()); + cluster.waitForActiveCollection(COLLECTION_NAME, 2, 2); + UpdateRequest ur = new UpdateRequest(); + for (int i = 0; i < 100000; i++) { Review Comment: Do we really need 100k 
docs? That's really a really heavy test which will increase the burn rate for SSD disks. Also note that java allows you to write numbers like this as 100_000 which makes them far easier to read. ########## solr/core/src/test/org/apache/solr/TestTimeAllowedSearch.java: ########## @@ -0,0 +1,104 @@ +package org.apache.solr; + +import com.carrotsearch.randomizedtesting.generators.RandomStrings; +import java.util.Calendar; +import java.util.GregorianCalendar; +import java.util.Locale; +import java.util.TimeZone; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.cloud.MiniSolrCloudCluster; +import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ShardParams; + +public class TestTimeAllowedSearch extends SolrCloudTestCase { + + /** + * This test demonstrates timeAllowed expectation at @{@link + * org.apache.solr.handler.component.HttpShardHandler} level This test creates collection with + * 'implicit` router, which has two shards shard_1 has 100000 docs, so that query should take some + * time shard_2 has only 1 doc to demonstrate the HttpSHardHandler timeout Then it execute + * substring query with TIME_ALLOWED 50, assuming this query will time out on shard_1 + */ + public void testTimeAllowed() throws Exception { + MiniSolrCloudCluster cluster = + configureCluster(2).addConfig("conf", configset("cloud-minimal")).configure(); + try { + CloudSolrClient client = cluster.getSolrClient(); + String COLLECTION_NAME = "test_coll"; + CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf", 2, 1) + .setRouterName("implicit") + .setShards("shard_1,shard_2") + 
.process(cluster.getSolrClient()); + cluster.waitForActiveCollection(COLLECTION_NAME, 2, 2); + UpdateRequest ur = new UpdateRequest(); + for (int i = 0; i < 100000; i++) { + SolrInputDocument doc = new SolrInputDocument(); + doc.addField("id", "" + i); + final String s = + RandomStrings.randomAsciiLettersOfLengthBetween(random(), 10, 100) + .toLowerCase(Locale.ROOT); + doc.setField("subject_s", s); + doc.setField("_route_", "shard_1"); + ur.add(doc); + } + + // adding "abc" in each shard as we will have query *abc* + SolrInputDocument doc = new SolrInputDocument(); + doc.addField("id", "" + 10000); + doc.setField("subject_s", "abc"); + doc.setField("_route_", "shard_2"); + ur.add(doc); + + doc = new SolrInputDocument(); + doc.addField("id", "" + 100001); + doc.setField("subject_s", "abc"); + doc.setField("_route_", "shard_1"); + ur.add(doc); + + ur.commit(client, COLLECTION_NAME); + + // warm up query + SolrQuery query = new SolrQuery(); + query.setQuery("subject_s:*abcd*"); + query.set(ShardParams.SHARDS_TOLERANT, "true"); + QueryResponse response = client.query(COLLECTION_NAME, query); + + query = new SolrQuery(); + query.setQuery("subject_s:*abc*"); + query.set(CommonParams.TIME_ALLOWED, 40); + query.set(ShardParams.SHARDS_TOLERANT, "true"); + response = client.query(COLLECTION_NAME, query); + assertTrue( + "Should have found 1 doc (shard_2) as timeallowed is 25ms found:" Review Comment: Oh this is a testing anti-pattern (not that we don't already use lots of those in solr tests but...) This relies on the performance characteristics of both the machine and a large amount of unrelated code. Upgrades to machines or to (for example) lucene could make this test suddenly fail. Typically the better strategy is to introduce an artificial delay that can't change. See org.apache.solr.search.ExpensiveSearchComponent and org.apache.solr.util.TestInjection#injectUpdateRandomPause for examples. David's suggested local param is another option. 
########## solr/core/src/java/org/apache/solr/handler/component/SearchHandler.java: ########## @@ -555,9 +556,10 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw // this loop) boolean tolerant = ShardParams.getShardsTolerantAsBool(rb.req.getParams()); while (rb.outgoing.size() == 0) { + long timeAllowed = maxTimeAllowed - (long) req.getRequestTimer().getTime(); Review Comment: This calculation is already managed by org.apache.solr.search.TimeAllowedLimit ########## solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java: ########## @@ -211,23 +216,33 @@ public ShardResponse takeCompletedIncludingErrors() { */ @Override public ShardResponse takeCompletedOrError() { - return take(true); + return take(true, -1); } - private ShardResponse take(boolean bailOnError) { + private ShardResponse take(boolean bailOnError, long maxAllowedTimeInMillis) { try { - while (pending.get() > 0) { - ShardResponse rsp = responses.take(); - responseFutureMap.remove(rsp); + long deadline = System.nanoTime(); + if (maxAllowedTimeInMillis > 0) { + deadline += TimeUnit.MILLISECONDS.toNanos(maxAllowedTimeInMillis); + } else { + deadline = System.nanoTime() + TimeUnit.DAYS.toNanos(1); + } + ShardResponse previousResponse = null; + while (pending.get() > 0) { Review Comment: Probably should find a way to use `QueryLimits.getCurrentLimits().shouldExit();` which will automatically respect any other query limits such as CPU or memory limits if added in the future (SOLR-17150) ########## solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java: ########## @@ -211,23 +216,33 @@ public ShardResponse takeCompletedIncludingErrors() { */ @Override public ShardResponse takeCompletedOrError() { - return take(true); + return take(true, -1); } - private ShardResponse take(boolean bailOnError) { + private ShardResponse take(boolean bailOnError, long maxAllowedTimeInMillis) { try { - while (pending.get() > 0) { - ShardResponse rsp = 
responses.take(); - responseFutureMap.remove(rsp); + long deadline = System.nanoTime(); + if (maxAllowedTimeInMillis > 0) { + deadline += TimeUnit.MILLISECONDS.toNanos(maxAllowedTimeInMillis); + } else { + deadline = System.nanoTime() + TimeUnit.DAYS.toNanos(1); + } + ShardResponse previousResponse = null; + while (pending.get() > 0) { + long waitTime = deadline - System.nanoTime(); + ShardResponse rsp = responses.poll(waitTime, TimeUnit.NANOSECONDS); pending.decrementAndGet(); + if (rsp == null) return previousResponse; Review Comment: Such a feature runs the risk of a cascading failure. Think 2 replicas... load is slowly increasing, finally one replica gets a largish query and a bunch of stuff times out... and then all those requests redistribute to the other replica making it slow and timing out a bunch more requests, most of which... you guessed it re-distribute to the first, lagging replica... meanwhile the user on the far end gets to wait for 2 requests to time out instead of 1... This would be less of a problem with cpuTimeAllowed hopefully but I'm still not sure it makes sense in the broader scheme of things. One use case for timeAllowed is that the person deploying solr wants to ensure that the end-user doesn't wait more than X before getting some sort of response. Users click away from sites that lag, and an error possibly improves the chance that they remain engaged. Re-requesting would make it difficult to achieve this. Think 3 replicas and setting the timeout to 1/3 the desired limit... then 2/3 of the available time is wasted bouncing around and failing... would certainly require a flag to toggle at a minimum. 
########## solr/core/src/test/org/apache/solr/TestTimeAllowedSearch.java: ########## @@ -0,0 +1,104 @@ +package org.apache.solr; + +import com.carrotsearch.randomizedtesting.generators.RandomStrings; +import java.util.Calendar; +import java.util.GregorianCalendar; +import java.util.Locale; +import java.util.TimeZone; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.cloud.MiniSolrCloudCluster; +import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ShardParams; + +public class TestTimeAllowedSearch extends SolrCloudTestCase { + + /** + * This test demonstrates timeAllowed expectation at @{@link + * org.apache.solr.handler.component.HttpShardHandler} level This test creates collection with + * 'implicit` router, which has two shards shard_1 has 100000 docs, so that query should take some + * time shard_2 has only 1 doc to demonstrate the HttpSHardHandler timeout Then it execute + * substring query with TIME_ALLOWED 50, assuming this query will time out on shard_1 + */ + public void testTimeAllowed() throws Exception { + MiniSolrCloudCluster cluster = + configureCluster(2).addConfig("conf", configset("cloud-minimal")).configure(); + try { + CloudSolrClient client = cluster.getSolrClient(); + String COLLECTION_NAME = "test_coll"; + CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf", 2, 1) + .setRouterName("implicit") + .setShards("shard_1,shard_2") + .process(cluster.getSolrClient()); + cluster.waitForActiveCollection(COLLECTION_NAME, 2, 2); + UpdateRequest ur = new UpdateRequest(); + for (int i = 0; i < 100000; i++) { + SolrInputDocument doc = new 
SolrInputDocument(); + doc.addField("id", "" + i); + final String s = + RandomStrings.randomAsciiLettersOfLengthBetween(random(), 10, 100) + .toLowerCase(Locale.ROOT); + doc.setField("subject_s", s); + doc.setField("_route_", "shard_1"); + ur.add(doc); + } + + // adding "abc" in each shard as we will have query *abc* + SolrInputDocument doc = new SolrInputDocument(); + doc.addField("id", "" + 10000); + doc.setField("subject_s", "abc"); + doc.setField("_route_", "shard_2"); + ur.add(doc); + + doc = new SolrInputDocument(); + doc.addField("id", "" + 100001); + doc.setField("subject_s", "abc"); + doc.setField("_route_", "shard_1"); + ur.add(doc); + + ur.commit(client, COLLECTION_NAME); + + // warm up query + SolrQuery query = new SolrQuery(); + query.setQuery("subject_s:*abcd*"); + query.set(ShardParams.SHARDS_TOLERANT, "true"); + QueryResponse response = client.query(COLLECTION_NAME, query); + + query = new SolrQuery(); + query.setQuery("subject_s:*abc*"); + query.set(CommonParams.TIME_ALLOWED, 40); + query.set(ShardParams.SHARDS_TOLERANT, "true"); + response = client.query(COLLECTION_NAME, query); + assertTrue( + "Should have found 1 doc (shard_2) as timeallowed is 25ms found:" + + response.getResults().getNumFound(), + response.getResults().getNumFound() == 1); + + query = new SolrQuery(); + query.setQuery("subject_s:*abc*"); + query.set(ShardParams.SHARDS_TOLERANT, "true"); + response = client.query(COLLECTION_NAME, query); + assertTrue( + "Should have found few docs as timeallowed is unlimited ", + response.getResults().getNumFound() > 1); + } finally { + cluster.shutdown(); + } + } + + public void testTimeZone() { Review Comment: Appears unused, please remove. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org For additional commands, e-mail: issues-h...@solr.apache.org