From afa14b7195cf452e090100af1bb0bb4e7900a6fd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 2 May 2026 14:08:30 +0000 Subject: [PATCH 1/5] Initial plan From 2dc9dc6093b01cc71c7df80ada2b08a8014c8ebe Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 2 May 2026 14:12:28 +0000 Subject: [PATCH 2/5] Fix CI flakiness in LeaderElectionIntegrationTest.testSimpleSliceLeaderElection - Increase cluster.waitForActiveCollection timeout from 10s to 60s - Replace ad-hoc polling loop after expireZkSession with waitForState (waits until leader moves away from the expired-session node) - Replace Thread.sleep with waitForState for node rejoining live nodes - Replace final polling loop + assertEquals with waitForState (waits until original node becomes leader again) Agent-Logs-Url: https://github.com/epugh/solr/sessions/1eab1dea-7bf6-4911-93ff-03f3c6614cfd Co-authored-by: epugh <22395+epugh@users.noreply.github.com> --- .../cloud/LeaderElectionIntegrationTest.java | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java index 57c84aaf41b1..d9e039d9e3a2 100644 --- a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java @@ -61,7 +61,7 @@ public void testSimpleSliceLeaderElection() throws Exception { String collection = "collection1"; createCollection(collection); - cluster.waitForActiveCollection(collection, 10, TimeUnit.SECONDS, 2, 6); + cluster.waitForActiveCollection(collection, 60, TimeUnit.SECONDS, 2, 6); List stoppedRunners = new ArrayList<>(); for (int i = 0; i < 4; i++) { // who is the leader? @@ -107,15 +107,20 @@ public void testSimpleSliceLeaderElection() throws Exception { assertNotNull(jetty); cluster.expireZkSession(jetty); - for (int i = 0; i < 60; i++) { // wait till leader is changed - if (jetty != getRunner(getLeader(collection))) { - break; - } - Thread.sleep(100); - } - - // make sure we have waited long enough for the first leader to have come back - Thread.sleep(ZkTestServer.TICK_TIME * 2 + 100); + // Wait until leadership has moved away from the expired-session node + waitForState( + "Expected leader to move away after expiring zk session", + collection, + c -> { + var l = c.getLeader("shard1"); + return l != null && !jetty.getNodeName().equals(l.getNodeName()); + }); + + // Wait until the expired-session node is live again before stopping others + waitForState( + "Expected expired-session node to rejoin live nodes", + collection, + (liveNodes, c) -> liveNodes.contains(jetty.getNodeName())); // kill everyone but the first leader that should have reconnected by now for (JettySolrRunner jetty2 : cluster.getJettySolrRunners()) { @@ -124,18 +129,13 @@ public void testSimpleSliceLeaderElection() throws Exception { } } - for (int i = 0; i < 320; i++) { // wait till leader is changed - try { - if (jetty == getRunner(getLeader(collection))) { - break; - } - Thread.sleep(100); - } catch (Exception e) { - continue; - } - } - - assertEquals(jetty, getRunner(getLeader(collection))); + waitForState( + "Expected original node to become leader after others stopped", + collection, + c -> { + var l = c.getLeader("shard1"); + return l != null && jetty.getNodeName().equals(l.getNodeName()); + }); } private JettySolrRunner getRunner(String nodeName) { From 326454149d62bc1b6966c20738f6dd492fc55935 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 2 May 2026 14:28:03 +0000 Subject: [PATCH 3/5] Replace Thread.sleep poll loops with waitForState in ZkFailoverTest and ZkShardTermsRecoveryTest Agent-Logs-Url: https://github.com/epugh/solr/sessions/e666fe4d-9dd3-4036-90d5-2d08cbdec281 Co-authored-by: epugh <22395+epugh@users.noreply.github.com> --- .../org/apache/solr/cloud/ZkFailoverTest.java | 19 ++++++------------- .../solr/cloud/ZkShardTermsRecoveryTest.java | 10 ++++++++-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java index d97fdfe84bee..0cc9cc89db02 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java @@ -23,9 +23,7 @@ import org.apache.solr.client.solrj.request.SolrQuery; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrInputDocument; -import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.embedded.JettySolrRunner; -import org.apache.zookeeper.KeeperException; import org.junit.AfterClass; import org.junit.BeforeClass; import org.slf4j.Logger; @@ -60,12 +58,16 @@ public void testRestartZkWhenClusterDown() throws Exception { // This attempt will fail since it will time out after 1 second System.setProperty("solr.cloud.wait.for.zk.seconds", "1"); restartSolrAndZk(); - waitForLiveNodes(0); + waitForState( + "Timeout waiting for 0 live nodes", coll, (liveNodes, c) -> liveNodes.isEmpty()); // This attempt will succeed since there will be enough time to connect System.setProperty("solr.cloud.wait.for.zk.seconds", "20"); restartSolrAndZk(); - waitForLiveNodes(cluster.getJettySolrRunners().size()); + waitForState( + "Timeout waiting for all nodes to come up", + coll, + (liveNodes, c) -> liveNodes.size() == cluster.getJettySolrRunners().size()); waitForState("Timeout waiting for " + coll, coll, clusterShape(2, 2)); QueryResponse rsp = new QueryRequest(new SolrQuery("*:*")).process(cluster.getSolrClient(), coll); @@ -100,13 +102,4 @@ private void restartSolrAndZk() throws Exception { } } - private void waitForLiveNodes(int numNodes) throws InterruptedException, KeeperException { - ZkStateReader zkStateReader = cluster.getZkStateReader(); - for (int i = 0; i < 100; i++) { - zkStateReader.updateLiveNodes(); - if (zkStateReader.getClusterState().getLiveNodes().size() == numNodes) return; - Thread.sleep(200); - } - fail("Timeout waiting for number of live nodes = " + numNodes); - } } diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java index f52b0fce8db1..ac1e9177fd02 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java @@ -52,14 +52,20 @@ public static void setupCluster() throws Exception { CollectionAdminRequest.createCollection(COLLECTION, "conf", NUM_SHARDS, NUM_REPLICAS) .process(cluster.getSolrClient()) .getStatus()); - cluster.waitForActiveCollection(COLLECTION, 10, TimeUnit.SECONDS, 2, NUM_SHARDS * NUM_REPLICAS); + waitForState( + "Timeout waiting for collection to be active after creation", + COLLECTION, + clusterShape(NUM_SHARDS, NUM_SHARDS * NUM_REPLICAS)); } @Before public void waitForActiveState() throws Exception { CollectionAdminRequest.modifyCollection(COLLECTION, Map.of("readOnly", false)) .process(cluster.getSolrClient()); - cluster.waitForActiveCollection(COLLECTION, 10, TimeUnit.SECONDS, 2, NUM_SHARDS * NUM_REPLICAS); + waitForState( + "Timeout waiting for active collection", + COLLECTION, + clusterShape(NUM_SHARDS, NUM_SHARDS * NUM_REPLICAS)); } @Test From bf5d71c09ae80527c794c5f006ccc9637663d3dc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 2 May 2026 14:28:51 +0000 Subject: [PATCH 4/5] Replace waitForActiveCollection(60s) with waitForState(clusterShape) in LeaderElectionIntegrationTest Agent-Logs-Url: https://github.com/epugh/solr/sessions/e666fe4d-9dd3-4036-90d5-2d08cbdec281 Co-authored-by: epugh <22395+epugh@users.noreply.github.com> --- .../apache/solr/cloud/LeaderElectionIntegrationTest.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java index d9e039d9e3a2..2e190f60f89a 100644 --- a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java @@ -19,7 +19,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.concurrent.TimeUnit; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.common.cloud.ZkNodeProps; @@ -61,7 +60,10 @@ public void testSimpleSliceLeaderElection() throws Exception { String collection = "collection1"; createCollection(collection); - cluster.waitForActiveCollection(collection, 60, TimeUnit.SECONDS, 2, 6); + waitForState( + "Timeout waiting for collection to become active", + collection, + clusterShape(2, NUM_REPLICAS_OF_SHARD1 + 1)); List stoppedRunners = new ArrayList<>(); for (int i = 0; i < 4; i++) { // who is the leader? From bc5d06fa4773729b71650ab2e32f0c4ccb5fe3b1 Mon Sep 17 00:00:00 2001 From: Eric Pugh Date: Tue, 5 May 2026 10:23:58 -0400 Subject: [PATCH 5/5] fix precommit --- solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java index 0cc9cc89db02..36febd0495c8 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java @@ -58,8 +58,7 @@ public void testRestartZkWhenClusterDown() throws Exception { // This attempt will fail since it will time out after 1 second System.setProperty("solr.cloud.wait.for.zk.seconds", "1"); restartSolrAndZk(); - waitForState( - "Timeout waiting for 0 live nodes", coll, (liveNodes, c) -> liveNodes.isEmpty()); + waitForState("Timeout waiting for 0 live nodes", coll, (liveNodes, c) -> liveNodes.isEmpty()); // This attempt will succeed since there will be enough time to connect System.setProperty("solr.cloud.wait.for.zk.seconds", "20"); @@ -101,5 +100,4 @@ private void restartSolrAndZk() throws Exception { thread.join(); } } - }