We lost our ZooKeeper data about a week ago because /queue-work became too large or corrupt. We were unable to access or remove the data under the ZK node /queue-work.
To get around that, we created a new ZK instance, repopulated it with the custom config used by the application and with the property files we keep in ZK, and restarted Solr. Now the leader restarts without issue and shows as the leader. The replicas fail to recreate, giving this error: 2022-02-21 09:34:23.020 INFO (recoveryExecutor-4-thread-1-processing-n:solr3:8983_solr x:Production_shard1_replica_n13 c:Production s:shard1 r:core_node14) [c:Production s:shard1 r:core_node14 x:Production_shard1_replica_n13] o.a.s.c.RecoveryStrategy Failed to connect leader http://solr1:8983/solr on recovery, try again Somehow solr1 is the leader but fails to respond correctly as the leader when accessed. I created a new collection on the cluster, and that collection replicates without issue, so I assume it's a problem with the collection settings. Below is the cluster status. { "responseHeader":{ "status":0, "QTime":1}, "cluster":{ "collections":{ "UP7Test":{ "pullReplicas":"0", "replicationFactor":"1", "shards":{"shard1":{ "range":"80000000-7fffffff", "state":"active", "replicas":{ "core_node2":{ "core":"UP7Test_shard1_replica_n1", "base_url":"http://solr1:8983/solr", "node_name":"solr1:8983_solr", "state":"active", "type":"NRT", "force_set_state":"false", "leader":"true"}, "core_node4":{ "core":"UP7Test_shard1_replica_n3", "base_url":"http://solr2:8983/solr", "node_name":"solr2:8983_solr", "state":"down", "type":"NRT", "force_set_state":"false"}}}}, "router":{"name":"compositeId"}, "maxShardsPerNode":"1", "autoAddReplicas":"false", "nrtReplicas":"1", "tlogReplicas":"0", "znodeVersion":29, "configName":"Production_config"}, "Production":{ "shards":{"shard1":{ "parent":null, "range":null, "state":"active", "replicas":{ "core_node29":{ "core":"Production_shard1_replica6", "base_url":"http://solr1:8983/solr", "node_name":"solr1:8983_solr", "state":"active", "type":"NRT", "force_set_state":"false", "leader":"true"}, "core_node10":{ "core":"Production_shard1_replica_n9",
"base_url":"http://solr2:8983/solr", "node_name":"solr2:8983_solr", "state":"down", "type":"NRT", "force_set_state":"false"}, "core_node14":{ "core":"Production_shard1_replica_n13", "base_url":"http://solr3:8983/solr", "node_name":"solr3:8983_solr", "state":"down", "type":"NRT", "force_set_state":"false"}}}}, "replicationFactor":"1", "router":{"name":"implicit"}, "maxShardsPerNode":"1", "autoAddReplicas":"false", "nrtReplicas":"1", "znodeVersion":124, "configName":"Production_config"}}, "live_nodes":["solr1:8983_solr", "solr3:8983_solr"]}} Has anyone seen this behavior before? Thanks :