epugh commented on code in PR #3904: URL: https://github.com/apache/solr/pull/3904#discussion_r2658952119
########## solr/core/src/test/org/apache/solr/cloud/DeleteCoreRemnantsOnCreateTest.java: ########## @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.cloud; + +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.Optional; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.CoreAdminRequest; +import org.apache.solr.client.solrj.response.json.JsonMapResponseParser; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Slice; +import org.apache.solr.core.CoreDescriptor; +import org.apache.solr.core.SolrCore; +import org.apache.solr.embedded.JettySolrRunner; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Solr occasionally gets into an inconsistent state with its cores lifecycle where remnant files + * are left on disk after various operations that delete a core. Examples include deleting a + * collection operation that doesn't properly finish, or maybe the Solr process unexpectedly gets + * killed. The system property "solr.cloud.delete.unknown.cores.enabled" is an expert setting that + * when enabled automatically deletes any remnant core data on disk when new cores are created that + * would otherwise fail due to the preexisting files. You should be cautious in enabling this + * feature, as it means that something isn't working well in your Solr setup. + */ +public class DeleteCoreRemnantsOnCreateTest extends SolrCloudTestCase { + private static final String DELETE_UNKNOWN_CORES_PROP = "solr.cloud.delete.unknown.cores.enabled"; + + @BeforeClass + public static void setupCluster() throws Exception { + configureCluster(1).addConfig("conf", configset("cloud-minimal")).configure(); + } + + @Before + public void resetProperty() { + System.clearProperty(DELETE_UNKNOWN_CORES_PROP); + } + + /** + * Shared setup for testing collection creation with remnants. Creates a collection, deletes it, + * and then leaves behind a remnant directory. + */ + private void setupCollectionRemnant(String collectionName) throws Exception { + List<JettySolrRunner> jettys = cluster.getJettySolrRunners(); + String primaryNode = jettys.getFirst().getNodeName(); + + CollectionAdminRequest.Create createRequest = + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1); + createRequest.setCreateNodeSet(primaryNode); + createRequest.process(cluster.getSolrClient()); + + waitForState( + "Expected collection to be fully active", + collectionName, + (n, c) -> SolrCloudTestCase.replicasForCollectionAreFullyActive(n, c, 1, 1)); + + Replica primaryReplica = getReplicaOnNode(collectionName, "shard1", primaryNode); + JettySolrRunner primaryJetty = cluster.getReplicaJetty(primaryReplica); + String originalCoreName = primaryReplica.getCoreName(); + Path remnantInstanceDir; + try (SolrCore core = primaryJetty.getCoreContainer().getCore(originalCoreName)) { + CoreDescriptor cd = core.getCoreDescriptor(); + remnantInstanceDir = cd.getInstanceDir(); + } + + CollectionAdminRequest.deleteCollection(collectionName).process(cluster.getSolrClient()); + waitForState("Expected collection deletion", collectionName, (n, c) -> c == null); + + // Simulate a core remnant still exists by creating the directory and core.properties + Files.createDirectories(remnantInstanceDir); + Files.writeString(remnantInstanceDir.resolve("core.properties"), "", StandardCharsets.UTF_8); + } + + /** + * Shared setup for testing replica addition with remnants. Creates a collection, then simulates a + * remnant directory on the single node that will impact the next addReplica command. + */ + private void setupReplicaRemnant(String collectionName) throws Exception { + List<JettySolrRunner> jettys = cluster.getJettySolrRunners(); + String primaryNode = jettys.getFirst().getNodeName(); + + CollectionAdminRequest.Create createRequest = + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1); + createRequest.setCreateNodeSet(primaryNode); + createRequest.process(cluster.getSolrClient()); + + waitForState( + "Expected collection to be fully active", + collectionName, + (n, c) -> SolrCloudTestCase.replicasForCollectionAreFullyActive(n, c, 1, 1)); + + int nextReplicaIndex = 3; // Yep, from 1 to 3 due to how we count in ZK and setup. + String expectedNewReplicaName = collectionName + "_shard1_replica_n" + nextReplicaIndex; + + // Simulate a core remnant on the single node adjacent to the existing replica instance path + Replica existing = getReplicaOnNode(collectionName, "shard1", primaryNode); + try (SolrCore core = + cluster.getReplicaJetty(existing).getCoreContainer().getCore(existing.getCoreName())) { + Path siblingDir = core.getInstancePath().getParent().resolve(expectedNewReplicaName); + Files.createDirectories(siblingDir); + Files.writeString( + siblingDir.resolve("core.properties"), + "name=" + + expectedNewReplicaName + + "_remnant\n" + + "collection=" + + collectionName + + "_remnant\n" + + "shard=shard1\n" + + "coreNodeName=core_node_remnant\n", + StandardCharsets.UTF_8); + } + } + + /** + * Shared setup for testing DeleteCore admin API with remnants. Creates a collection, deletes it, + * and then leaves behind a remnant core directory. + */ + private String setupCoreRemnantForUnloadCoreOperation(String collectionName) throws Exception { + List<JettySolrRunner> jettys = cluster.getJettySolrRunners(); + String primaryNode = jettys.getFirst().getNodeName(); + + CollectionAdminRequest.Create createRequest = + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1); + createRequest.setCreateNodeSet(primaryNode); + createRequest.process(cluster.getSolrClient()); + + waitForState( + "Expected collection to be fully active", + collectionName, + (n, c) -> SolrCloudTestCase.replicasForCollectionAreFullyActive(n, c, 1, 1)); + + Replica primaryReplica = getReplicaOnNode(collectionName, "shard1", primaryNode); + JettySolrRunner primaryJetty = cluster.getReplicaJetty(primaryReplica); + String originalCoreName = primaryReplica.getCoreName(); + Path remnantInstanceDir; + try (SolrCore core = primaryJetty.getCoreContainer().getCore(originalCoreName)) { + CoreDescriptor cd = core.getCoreDescriptor(); + remnantInstanceDir = cd.getInstanceDir(); + } + + CollectionAdminRequest.deleteCollection(collectionName).process(cluster.getSolrClient()); + waitForState("Expected collection deletion", collectionName, (n, c) -> c == null); + + // Simulate a core remnant still exists by creating the directory and core.properties + Files.createDirectories(remnantInstanceDir); + Files.writeString( + remnantInstanceDir.resolve("core.properties"), + "name=" + originalCoreName + "\n", + StandardCharsets.UTF_8); + + return originalCoreName; + } + + @Test + public void testCreateCollectionWithRemnantsFailsWithoutSetting() throws Exception { + assertNull( + "Property should not be set by default", System.getProperty(DELETE_UNKNOWN_CORES_PROP)); + + String collectionName = "coreRemnantCreateNoSetting"; + setupCollectionRemnant(collectionName); + + // Try to create the collection again - this demonstrates the behavior without the setting + // In typical environments, this might fail, but behavior depends on configuration + CollectionAdminRequest.Create recreateRequest = + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1); + List<JettySolrRunner> jettys = cluster.getJettySolrRunners(); + recreateRequest.setCreateNodeSet(jettys.getFirst().getNodeName()); + + // The request to create a collection SHOULD fail based on the remnant file, if it does not it + // means we've changed Solr's behavior when creating a core and + // remnants exist, and therefore we should rethink the utility of this setting. + try { + recreateRequest.process(cluster.getSolrClient()); + fail("This request to recreate the collection should have failed due to remnant files."); + } catch (Exception e) { + assertTrue( + "Verify the exception was due to core creation failed.", + e.getMessage().contains("Underlying core creation failed")); + } + } + + @Test + public void testCreateCollectionWithRemnantsWithSetting() throws Exception { + System.setProperty(DELETE_UNKNOWN_CORES_PROP, "true"); + + String collectionName = "coreRemnantCreateWithSetting"; + setupCollectionRemnant(collectionName); + + // With the setting enabled, collection creation should succeed despite remnants + CollectionAdminRequest.Create recreateRequest = + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1); + List<JettySolrRunner> jettys = cluster.getJettySolrRunners(); + recreateRequest.setCreateNodeSet(jettys.getFirst().getNodeName()); + recreateRequest.process(cluster.getSolrClient()); Review Comment: we have this everywhere... -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
