dsmiley commented on code in PR #1926:
URL: https://github.com/apache/solr/pull/1926#discussion_r1522243305


##########
solr/core/src/java/org/apache/solr/cluster/maintenance/InactiveShardRemover.java:
##########
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cluster.maintenance;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.lang.invoke.MethodHandles;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+import org.apache.solr.api.ConfigurablePlugin;
+import org.apache.solr.client.solrj.SolrResponse;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.cloud.ClusterSingleton;
+import org.apache.solr.common.cloud.ClusterState;
+import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.util.ExecutorUtil;
+import org.apache.solr.common.util.SolrNamedThreadFactory;
+import org.apache.solr.core.CoreContainer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This Cluster Singleton can be configured to periodically find and remove 
{@link
+ * org.apache.solr.common.cloud.Slice.State#INACTIVE} Shards that are left 
behind after a Shard is
+ * split
+ */
+public class InactiveShardRemover
+    implements ClusterSingleton, 
ConfigurablePlugin<InactiveShardRemoverConfig> {
+
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  public static final String PLUGIN_NAME = ".inactive-shard-remover";
+
+  static class DeleteActor {
+
+    private final CoreContainer coreContainer;
+
+    DeleteActor(final CoreContainer coreContainer) {
+      this.coreContainer = coreContainer;
+    }
+
+    void delete(final Slice slice) {
+      CollectionAdminRequest.DeleteShard deleteRequest =
+          CollectionAdminRequest.deleteShard(slice.getCollection(), 
slice.getName());
+      try {
+        SolrResponse response =
+            
coreContainer.getZkController().getSolrCloudManager().request(deleteRequest);
+        if (response.getException() != null) {
+          throw response.getException();
+        }
+      } catch (Exception e) {
+        log.warn("An exception occurred when deleting an inactive shard", e);
+      }
+    }
+  }
+
+  private State state = State.STOPPED;
+
+  private final CoreContainer coreContainer;
+
+  private final DeleteActor deleteActor;
+
+  private ScheduledExecutorService executor;
+
+  private long scheduleIntervalSeconds;
+
+  private long ttlSeconds;
+
+  private int maxDeletesPerCycle;
+
+  /** Constructor invoked via Reflection */
+  public InactiveShardRemover(final CoreContainer cc) {
+    this(cc, new DeleteActor(cc));
+  }
+
+  public InactiveShardRemover(final CoreContainer cc, final DeleteActor 
deleteActor) {
+    this.coreContainer = cc;
+    this.deleteActor = deleteActor;
+  }
+
+  @Override
+  public void configure(final InactiveShardRemoverConfig cfg) {
+    Objects.requireNonNull(cfg, "config must be specified");
+    cfg.validate();
+    this.scheduleIntervalSeconds = cfg.scheduleIntervalSeconds;
+    this.maxDeletesPerCycle = cfg.maxDeletesPerCycle;
+    this.ttlSeconds = cfg.ttlSeconds;
+  }
+
+  @Override
+  public String getName() {
+    return PLUGIN_NAME;
+  }
+
+  @Override
+  public State getState() {
+    return state;
+  }
+
+  @Override
+  public void start() throws Exception {
+    state = State.STARTING;
+    executor = Executors.newScheduledThreadPool(1, new 
SolrNamedThreadFactory(PLUGIN_NAME));
+    executor.scheduleAtFixedRate(
+        this::deleteInactiveSlices,
+        scheduleIntervalSeconds,
+        scheduleIntervalSeconds,
+        TimeUnit.SECONDS);
+    state = State.RUNNING;
+  }
+
+  @Override
+  public void stop() {
+    if (state == State.RUNNING) {
+      state = State.STOPPING;
+      ExecutorUtil.shutdownNowAndAwaitTermination(executor);
+    }
+    state = State.STOPPED;
+  }
+
+  @VisibleForTesting
+  void deleteInactiveSlices() {
+    final ClusterState clusterState = 
coreContainer.getZkController().getClusterState();
+    Collection<Slice> inactiveSlices =
+        clusterState.getCollectionsMap().values().stream()

Review Comment:
   Such an innocent looking thing but `getCollectionsMap()` has been a CPU 
performance problem at the 10s of thousands of collections scale.  Fine for now 
but eventually it'd be nice to have an alternative.  For example, perhaps each 
node could tend to this inactive shard removing task for its own cores, 
ignoring all the others.  And/or maybe a shard split schedules this on a 
pertinent node.  Any way, we have something in-hand here that works, which 
addresses a gap since the autoscaling framework's removal in Solr 8.



##########
solr/core/src/java/org/apache/solr/cluster/maintenance/InactiveShardRemover.java:
##########
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cluster.maintenance;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.lang.invoke.MethodHandles;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+import org.apache.solr.api.ConfigurablePlugin;
+import org.apache.solr.client.solrj.SolrResponse;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.cloud.ClusterSingleton;
+import org.apache.solr.common.cloud.ClusterState;
+import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.util.ExecutorUtil;
+import org.apache.solr.common.util.SolrNamedThreadFactory;
+import org.apache.solr.core.CoreContainer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This Cluster Singleton can be configured to periodically find and remove 
{@link
+ * org.apache.solr.common.cloud.Slice.State#INACTIVE} Shards that are left 
behind after a Shard is
+ * split
+ */
+public class InactiveShardRemover
+    implements ClusterSingleton, 
ConfigurablePlugin<InactiveShardRemoverConfig> {
+
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  public static final String PLUGIN_NAME = ".inactive-shard-remover";
+
+  static class DeleteActor {
+
+    private final CoreContainer coreContainer;
+
+    DeleteActor(final CoreContainer coreContainer) {
+      this.coreContainer = coreContainer;
+    }
+
+    void delete(final Slice slice) {
+      CollectionAdminRequest.DeleteShard deleteRequest =
+          CollectionAdminRequest.deleteShard(slice.getCollection(), 
slice.getName());
+      try {
+        SolrResponse response =
+            
coreContainer.getZkController().getSolrCloudManager().request(deleteRequest);
+        if (response.getException() != null) {
+          throw response.getException();
+        }
+      } catch (Exception e) {
+        log.warn("An exception occurred when deleting an inactive shard", e);
+      }
+    }
+  }
+
+  private State state = State.STOPPED;
+
+  private final CoreContainer coreContainer;
+
+  private final DeleteActor deleteActor;
+
+  private ScheduledExecutorService executor;
+
+  private long scheduleIntervalSeconds;
+
+  private long ttlSeconds;
+
+  private int maxDeletesPerCycle;
+
+  /** Constructor invoked via Reflection */
+  public InactiveShardRemover(final CoreContainer cc) {
+    this(cc, new DeleteActor(cc));
+  }
+
+  public InactiveShardRemover(final CoreContainer cc, final DeleteActor 
deleteActor) {
+    this.coreContainer = cc;
+    this.deleteActor = deleteActor;
+  }
+
+  @Override
+  public void configure(final InactiveShardRemoverConfig cfg) {
+    Objects.requireNonNull(cfg, "config must be specified");
+    cfg.validate();
+    this.scheduleIntervalSeconds = cfg.scheduleIntervalSeconds;
+    this.maxDeletesPerCycle = cfg.maxDeletesPerCycle;
+    this.ttlSeconds = cfg.ttlSeconds;
+  }
+
+  @Override
+  public String getName() {
+    return PLUGIN_NAME;
+  }
+
+  @Override
+  public State getState() {
+    return state;
+  }
+
+  @Override
+  public void start() throws Exception {
+    state = State.STARTING;
+    executor = Executors.newScheduledThreadPool(1, new 
SolrNamedThreadFactory(PLUGIN_NAME));
+    executor.scheduleAtFixedRate(
+        this::deleteInactiveSlices,
+        scheduleIntervalSeconds,
+        scheduleIntervalSeconds,
+        TimeUnit.SECONDS);
+    state = State.RUNNING;
+  }
+
+  @Override
+  public void stop() {
+    if (state == State.RUNNING) {
+      state = State.STOPPING;
+      ExecutorUtil.shutdownNowAndAwaitTermination(executor);
+    }
+    state = State.STOPPED;
+  }
+
+  @VisibleForTesting
+  void deleteInactiveSlices() {
+    final ClusterState clusterState = 
coreContainer.getZkController().getClusterState();
+    Collection<Slice> inactiveSlices =
+        clusterState.getCollectionsMap().values().stream()
+            .flatMap(v -> collectInactiveSlices(v).stream())
+            .collect(Collectors.toSet());
+
+    if (log.isInfoEnabled()) {
+      log.info(
+          "Found {} inactive Shards to delete, {} will be deleted",
+          inactiveSlices.size(),
+          Math.min(inactiveSlices.size(), maxDeletesPerCycle));
+    }
+
+    
inactiveSlices.stream().limit(maxDeletesPerCycle).forEach(this::deleteShard);
+  }
+
+  private Collection<Slice> collectInactiveSlices(final DocCollection 
docCollection) {
+    final Collection<Slice> slices = new HashSet<>(docCollection.getSlices());
+    slices.removeAll(docCollection.getActiveSlices());
+    return slices.stream().filter(this::isExpired).collect(Collectors.toSet());
+  }
+
+  private void deleteShard(final Slice s) {
+    deleteActor.delete(s);
+  }
+
+  /**
+   * An Inactive Shard is expired if it has not undergone a state change in 
the period of time
+   * defined by {@link InactiveShardRemover#ttlSeconds}. If it is expired, it 
is eligible for
+   * removal.
+   */
+  private boolean isExpired(final Slice slice) {
+
+    final String collectionName = slice.getCollection();
+    final String sliceName = slice.getName();
+
+    if (slice.getState() != Slice.State.INACTIVE) {
+      return false;
+    }
+
+    final String lastChangeTimestamp = 
slice.getStr(ZkStateReader.STATE_TIMESTAMP_PROP);
+    if (lastChangeTimestamp == null || lastChangeTimestamp.isEmpty()) {
+      log.warn(
+          "Collection {} Shard {} has no last change timestamp and will not be 
deleted",
+          collectionName,
+          sliceName);
+      return false;
+    }
+
+    final long epochTimestamp;

Review Comment:
   I really wish time units were clearer here.  Is this Ns or Ms; just a 
variable name suffix would help.



##########
solr/core/src/java/org/apache/solr/cluster/maintenance/package-info.java:
##########
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Cluster Singleton plugins that can are used to perform maintenance tasks 
within the cluster. */

Review Comment:
   ```suggestion
   /** Cluster Singleton plugins that are used to perform maintenance tasks 
within the cluster. */
   ```



##########
solr/core/src/test/org/apache/solr/cluster/maintenance/InactiveShardRemoverTest.java:
##########
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cluster.maintenance;
+
+import static org.apache.solr.client.solrj.SolrRequest.METHOD.POST;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.V2Request;
+import org.apache.solr.client.solrj.request.beans.PluginMeta;
+import org.apache.solr.client.solrj.response.V2Response;
+import org.apache.solr.cloud.ShardTestUtil;
+import org.apache.solr.cloud.SolrCloudTestCase;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.util.TimeSource;
+import org.apache.solr.core.CoreContainer;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class InactiveShardRemoverTest extends SolrCloudTestCase {
+
+  @BeforeClass
+  public static void setupCluster() throws Exception {
+    configureCluster(1)
+        .addConfig(
+            "conf", 
TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
+        .configure();
+  }
+
+  @AfterClass
+  public static void tearDownCluster() throws Exception {
+    cluster.shutdown();
+  }
+
+  @Test
+  public void testDeleteInactiveShard() throws Exception {
+
+    addPlugin(new InactiveShardRemoverConfig(1, 0, 2));
+    try {
+      final String collectionName = "testDeleteInactiveShard";
+      createCollection(collectionName, 1);
+
+      final String sliceName =
+          new 
ArrayList<>(getCollectionState(collectionName).getSlices()).get(0).getName();
+      ShardTestUtil.setSliceState(cluster, collectionName, sliceName, 
Slice.State.INACTIVE);
+
+      waitForState(
+          "Waiting for inactive shard to be deleted",
+          collectionName,
+          clusterShape(0, 0),
+          5,
+          TimeUnit.SECONDS);
+    } finally {
+      removePlugin();
+    }
+  }
+
+  @Test
+  public void testTtl() throws Exception {
+
+    final int ttlSeconds = 1 + random().nextInt(5);
+    final TimeSource timeSource = 
cluster.getOpenOverseer().getSolrCloudManager().getTimeSource();

Review Comment:
   side comment:  I think TimeSource ought to be a global singleton static 
thing so it's clear how to get it from absolutely anywhere.  Such singletons 
can be evil but I think it's warranted for something so universal as tracking 
time.



##########
solr/core/src/test/org/apache/solr/cluster/maintenance/InactiveShardRemoverTest.java:
##########
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cluster.maintenance;
+
+import static org.apache.solr.client.solrj.SolrRequest.METHOD.POST;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.V2Request;
+import org.apache.solr.client.solrj.request.beans.PluginMeta;
+import org.apache.solr.client.solrj.response.V2Response;
+import org.apache.solr.cloud.ShardTestUtil;
+import org.apache.solr.cloud.SolrCloudTestCase;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.util.TimeSource;
+import org.apache.solr.core.CoreContainer;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class InactiveShardRemoverTest extends SolrCloudTestCase {
+
+  @BeforeClass
+  public static void setupCluster() throws Exception {
+    configureCluster(1)
+        .addConfig(
+            "conf", 
TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
+        .configure();
+  }
+
+  @AfterClass
+  public static void tearDownCluster() throws Exception {
+    cluster.shutdown();
+  }

Review Comment:
   not needed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org
For additional commands, e-mail: issues-h...@solr.apache.org

Reply via email to