From 0be278debf78d841beeb314e08f18621320bd3c8 Mon Sep 17 00:00:00 2001 From: Dmitry Kropachev Date: Sun, 1 Mar 2026 08:23:49 -0400 Subject: [PATCH 1/2] Fix flaky AdvancedShardAwarenessIT timeouts in CI Increase Awaitility timeout from 20s to 60s and raise the connection init query timeout from 5s (default) to 30s for tests that open many channels (66 per node). On resource-constrained CI runners, the burst of 528 concurrent connection attempts causes protocol OPTIONS handshakes to time out, preventing pools from filling within the original deadline. Fixes #820 --- .../oss/driver/core/pool/AdvancedShardAwarenessIT.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integration-tests/src/test/java/com/datastax/oss/driver/core/pool/AdvancedShardAwarenessIT.java b/integration-tests/src/test/java/com/datastax/oss/driver/core/pool/AdvancedShardAwarenessIT.java index aeda4a02c7b..91f0f9e77a6 100644 --- a/integration-tests/src/test/java/com/datastax/oss/driver/core/pool/AdvancedShardAwarenessIT.java +++ b/integration-tests/src/test/java/com/datastax/oss/driver/core/pool/AdvancedShardAwarenessIT.java @@ -156,6 +156,7 @@ public void should_see_mismatched_shard() { .withInt(DefaultDriverOption.ADVANCED_SHARD_AWARENESS_PORT_LOW, 10000) .withInt(DefaultDriverOption.ADVANCED_SHARD_AWARENESS_PORT_HIGH, 60000) .withInt(DefaultDriverOption.CONNECTION_POOL_LOCAL_SIZE, 66) + .withDuration(DefaultDriverOption.CONNECTION_INIT_QUERY_TIMEOUT, Duration.ofSeconds(30)) .build(); try (CqlSession session = CqlSession.builder() @@ -165,7 +166,7 @@ public void should_see_mismatched_shard() { .build()) { List allSessions = Collections.singletonList(session); Awaitility.await() - .atMost(20, TimeUnit.SECONDS) + .atMost(60, TimeUnit.SECONDS) .pollInterval(500, TimeUnit.MILLISECONDS) .until(() -> areAllPoolsFullyInitialized(allSessions, expectedChannelsPerNode)); List logsCopy = ImmutableList.copyOf(appender.list); @@ -217,6 +218,7 @@ public void should_not_struggle_to_fill_pools() { .withInt(DefaultDriverOption.CONNECTION_POOL_LOCAL_SIZE, expectedChannelsPerNode) .withDuration(DefaultDriverOption.RECONNECTION_BASE_DELAY, Duration.ofMillis(10)) .withDuration(DefaultDriverOption.RECONNECTION_MAX_DELAY, Duration.ofMillis(20)) + .withDuration(DefaultDriverOption.CONNECTION_INIT_QUERY_TIMEOUT, Duration.ofSeconds(30)) .build(); CqlSessionBuilder builder = CqlSession.builder() @@ -234,7 +236,7 @@ public void should_not_struggle_to_fill_pools() { CqlSession session4 = CompletableFutures.getUninterruptibly(stage4); ) { List allSessions = Arrays.asList(session1, session2, session3, session4); Awaitility.await() - .atMost(20, TimeUnit.SECONDS) + .atMost(60, TimeUnit.SECONDS) .pollInterval(500, TimeUnit.MILLISECONDS) .until(() -> areAllPoolsFullyInitialized(allSessions, expectedChannelsPerNode)); int tolerance = 2; // Sometimes socket ends up already in use From 4b6d4c4993f9b07a4f147d234a874cef3325f7ca Mon Sep 17 00:00:00 2001 From: Dmitry Kropachev Date: Sun, 1 Mar 2026 19:05:00 -0400 Subject: [PATCH 2/2] Add debug log dumping on AdvancedShardAwarenessIT failure When a test fails, dump captured driver logs (ChannelPool, Reconnection) and Scylla node logs from the CCM config directory. This provides visibility into why pool initialization times out in CI. Also adds CcmBridge.getConfigDirectory() to enable reading node log files. --- .../core/pool/AdvancedShardAwarenessIT.java | 72 +++++++++++++++++++ .../driver/api/testinfra/ccm/CcmBridge.java | 4 ++ 2 files changed, 76 insertions(+) diff --git a/integration-tests/src/test/java/com/datastax/oss/driver/core/pool/AdvancedShardAwarenessIT.java b/integration-tests/src/test/java/com/datastax/oss/driver/core/pool/AdvancedShardAwarenessIT.java index 91f0f9e77a6..50ffba90d9f 100644 --- a/integration-tests/src/test/java/com/datastax/oss/driver/core/pool/AdvancedShardAwarenessIT.java +++ b/integration-tests/src/test/java/com/datastax/oss/driver/core/pool/AdvancedShardAwarenessIT.java @@ -25,7 +25,11 @@ import com.tngtech.java.junit.dataprovider.DataProvider; import com.tngtech.java.junit.dataprovider.DataProviderRunner; import com.tngtech.java.junit.dataprovider.UseDataProvider; +import java.io.IOException; import java.net.InetSocketAddress; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; import java.time.Duration; import java.util.Arrays; import java.util.Collections; @@ -39,8 +43,11 @@ import org.junit.After; import org.junit.Before; import org.junit.ClassRule; +import org.junit.Rule; import org.junit.Test; import org.junit.experimental.categories.Category; +import org.junit.rules.TestWatcher; +import org.junit.runner.Description; import org.junit.runner.RunWith; import org.slf4j.LoggerFactory; @@ -65,6 +72,19 @@ public class AdvancedShardAwarenessIT { private final Pattern generalReconnectionPattern = Pattern.compile(".*Scheduling next reconnection in.*"); + private static final org.slf4j.Logger LOG = + LoggerFactory.getLogger(AdvancedShardAwarenessIT.class); + + @Rule + public TestWatcher logDumper = + new TestWatcher() { + @Override + protected void failed(Throwable e, Description description) { + dumpDriverLogs(description.getMethodName()); + dumpScyllaLogs(description.getMethodName()); + } + }; + @DataProvider public static Object[][] reuseAddressOption() { return new Object[][] {{true}, {false}}; @@ -358,4 +378,56 @@ private void assertMatchesAtMost(Pattern pattern, Integer times, List clusters = Files.newDirectoryStream(configDir)) { + for (Path cluster : clusters) { + if (!Files.isDirectory(cluster)) continue; + try (DirectoryStream nodes = Files.newDirectoryStream(cluster, "node*")) { + for (Path node : nodes) { + Path logsDir = node.resolve("logs"); + if (!Files.isDirectory(logsDir)) continue; + try (DirectoryStream logFiles = Files.newDirectoryStream(logsDir)) { + for (Path logFile : logFiles) { + if (!Files.isRegularFile(logFile)) continue; + LOG.error("--- {} ---", logFile); + try { + List lines = Files.readAllLines(logFile); + // Print last 200 lines to avoid flooding + int start = Math.max(0, lines.size() - 200); + if (start > 0) { + LOG.error("... ({} lines skipped, showing last 200) ...", start); + } + for (int i = start; i < lines.size(); i++) { + LOG.error("{}", lines.get(i)); + } + } catch (IOException readEx) { + LOG.error("Failed to read log file {}: {}", logFile, readEx.getMessage()); + } + } + } + } + } + } + } catch (IOException ex) { + LOG.error("Failed to read CCM logs from {}: {}", configDir, ex.getMessage()); + } + LOG.error("=== END SCYLLA LOGS for {} ===", testName); + } } diff --git a/test-infra/src/main/java/com/datastax/oss/driver/api/testinfra/ccm/CcmBridge.java b/test-infra/src/main/java/com/datastax/oss/driver/api/testinfra/ccm/CcmBridge.java index 81523b3701b..4cc3e0dda4e 100644 --- a/test-infra/src/main/java/com/datastax/oss/driver/api/testinfra/ccm/CcmBridge.java +++ b/test-infra/src/main/java/com/datastax/oss/driver/api/testinfra/ccm/CcmBridge.java @@ -312,6 +312,10 @@ public static Version getDistributionVersion() { return VERSION; } + public Path getConfigDirectory() { + return configDirectory; + } + public static Version getCassandraVersion() { if (isDistributionOf(BackendType.CASSANDRA)) { return VERSION;