From 2978b028170f6054e9ed05112efd87299e24999b Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 6 Jan 2026 17:07:49 -0800 Subject: [PATCH 01/26] CASSANALYTICS-34: Spark 4.0 Support Patch by Francisco Guerrero; reviewed by TBD for CASSANALYTICS-34 --- CHANGES.txt | 1 + analytics-sidecar-client-common/build.gradle | 5 +- .../request/JsonResponseBytesDecoder.java | 7 +- analytics-sidecar-client/build.gradle | 5 +- .../sidecar/client/RequestExecutor.java | 53 +++-- .../sidecar/client/StreamBuffer.java | 77 +++++++ .../sidecar/client/SidecarClientTest.java | 13 +- .../build.gradle | 30 ++- analytics-sidecar-vertx-client/build.gradle | 5 +- .../org/apache/cassandra/cdc/CdcBuilder.java | 1 - .../spark/bulkwriter/StreamSession.java | 4 +- .../cassandra/spark/data/DataLayer.java | 19 +- .../cassandra/spark/data/LocalDataLayer.java | 35 +-- .../spark/data/PartitionedDataLayer.java | 46 ++-- .../spark/sparksql/CassandraDataSource.java | 55 +++++ .../sparksql/CassandraInputPartition.java | 37 +++ .../CassandraPartitionReaderFactory.java | 69 ++++++ .../spark/sparksql/CassandraPartitioning.java | 39 ++++ .../spark/sparksql/CassandraScanBuilder.java | 149 ++++++++++++ .../spark/sparksql/CassandraTable.java | 69 ++++++ .../sparksql/CassandraTableProvider.java | 61 +++++ .../spark/sparksql/LocalDataSource.java | 43 ++++ .../sparksql/LocalPartitionSizeSource.java | 39 ++++ .../spark/sparksql/PartitionSizeIterator.java | 96 ++++++++ .../sparksql/PartitionSizeTableProvider.java | 171 ++++++++++++++ .../spark/sparksql/SparkRowIterator.java | 87 +++++++ .../cassandra/spark/SSTableReaderTests.java | 7 +- .../bulkwriter/SortedSSTableWriterTest.java | 2 +- .../bulkwriter/TokenRangeMappingUtils.java | 2 +- .../utils/BufferingInputStreamTests.java | 85 ++++--- .../cassandra/spark/PartitionSizeTests.java | 95 ++++++++ .../spark/bulkwriter/TestTaskContext.java | 218 ++++++++++++++++++ .../common/SidecarInstanceFactoryTest.java | 55 +++++ .../build.gradle | 7 +- 
.../data/converter/types/StringFeatures.java | 2 +- .../spark/utils/ScalaConversionUtils.java | 56 ----- .../cassandra/spark/utils/SparkTypeUtils.java | 55 ----- .../spark/utils/ScalaConversionUtils.java | 14 +- .../cassandra/spark/utils/SparkTypeUtils.java | 11 +- .../bridge/CassandraBridgeImplementation.java | 24 +- gradle.properties | 6 +- gradle/wrapper/gradle-wrapper.properties | 2 +- profiles/scala-2.13-spark-4-jdk-17.gradle | 28 +++ 43 files changed, 1573 insertions(+), 312 deletions(-) create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraDataSource.java create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraInputPartition.java create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraPartitionReaderFactory.java create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraPartitioning.java create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraScanBuilder.java create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTable.java create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTableProvider.java create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/LocalDataSource.java create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/LocalPartitionSizeSource.java create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/PartitionSizeIterator.java create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/PartitionSizeTableProvider.java create mode 100644 cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/SparkRowIterator.java create mode 100644 
cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/PartitionSizeTests.java create mode 100644 cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/bulkwriter/TestTaskContext.java create mode 100644 cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/common/SidecarInstanceFactoryTest.java delete mode 100644 cassandra-analytics-spark-converter/src/main/scala-2.11-spark-2/org/apache/cassandra/spark/utils/ScalaConversionUtils.java delete mode 100644 cassandra-analytics-spark-converter/src/main/scala-2.12-spark-2/org/apache/cassandra/spark/utils/SparkTypeUtils.java rename cassandra-analytics-spark-converter/src/main/{scala-2.12-spark-2 => scala-2.13-spark-4}/org/apache/cassandra/spark/utils/ScalaConversionUtils.java (75%) rename cassandra-analytics-spark-converter/src/main/{scala-2.11-spark-2 => scala-2.13-spark-4}/org/apache/cassandra/spark/utils/SparkTypeUtils.java (76%) create mode 100644 profiles/scala-2.13-spark-4-jdk-17.gradle diff --git a/CHANGES.txt b/CHANGES.txt index 4e5f27ed1..17e9b8583 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,6 @@ 0.5.0 ----- + * Spark 4.0 Support (CASSANALYTICS-34) * Make BulkWriterConfig extensible (CASSANALYTICS-168) 0.4.0 diff --git a/analytics-sidecar-client-common/build.gradle b/analytics-sidecar-client-common/build.gradle index e742f65ca..2133ef462 100644 --- a/analytics-sidecar-client-common/build.gradle +++ b/analytics-sidecar-client-common/build.gradle @@ -27,13 +27,10 @@ plugins { id 'maven-publish' } -if (propertyWithDefault("artifactType", null) == "common") -{ +if (propertyWithDefault("artifactType", null) == "common") { apply from: "$rootDir/gradle/common/publishing.gradle" } -sourceCompatibility = 1.8 - test { useJUnitPlatform() maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 diff --git a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/JsonResponseBytesDecoder.java 
b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/JsonResponseBytesDecoder.java index 209296412..6367baa23 100644 --- a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/JsonResponseBytesDecoder.java +++ b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/JsonResponseBytesDecoder.java @@ -25,13 +25,14 @@ /** * Decoder for json response body bytes + * * @param expected java type */ public class JsonResponseBytesDecoder implements ResponseBytesDecoder { private static final ObjectMapper MAPPER = new ObjectMapper() - // ignore all the properties that are not declared - .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + // ignore all the properties that are not declared + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); private final Class type; public JsonResponseBytesDecoder(Class type) @@ -42,7 +43,7 @@ public JsonResponseBytesDecoder(Class type) @Override public T decode(byte[] bytes) throws IOException { - if (bytes == null) + if (bytes == null || bytes.length == 0) { return null; } diff --git a/analytics-sidecar-client/build.gradle b/analytics-sidecar-client/build.gradle index 202c592d7..37c67cd06 100644 --- a/analytics-sidecar-client/build.gradle +++ b/analytics-sidecar-client/build.gradle @@ -29,13 +29,10 @@ plugins { id('maven-publish') } -if (propertyWithDefault("artifactType", null) == "common") -{ +if (propertyWithDefault("artifactType", null) == "common") { apply from: "$rootDir/gradle/common/publishing.gradle" } -sourceCompatibility = 1.8 - test { useJUnitPlatform() testLogging { diff --git a/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/RequestExecutor.java b/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/RequestExecutor.java index 15f92aebf..eb535ceff 100644 --- 
a/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/RequestExecutor.java +++ b/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/RequestExecutor.java @@ -47,7 +47,7 @@ public class RequestExecutor implements AutoCloseable protected final HttpClient httpClient; protected final ScheduledExecutorService singleThreadExecutorService; - protected RequestExecutor(HttpClient httpClient) + public RequestExecutor(HttpClient httpClient) { this.httpClient = requireNonNull(httpClient, "The httpClient is required"); this.singleThreadExecutorService = Executors.newSingleThreadScheduledExecutor(); @@ -159,6 +159,13 @@ public void streamRequest(RequestContext context, StreamConsumer streamConsumer) public void close() throws Exception { httpClient.close(); + + singleThreadExecutorService.shutdown(); + if (!singleThreadExecutorService.awaitTermination(1, TimeUnit.MINUTES)) + { + logger.warn("Executor service did not terminate within 1 minute"); + singleThreadExecutorService.shutdownNow(); + } } /** @@ -267,17 +274,17 @@ private void applyRetryPolicy(CompletableFuture future, Request request = context.request(); context.retryPolicy() .onResponse(future, request, response, throwable, attempt, retryOnNewHost, (nextAttempt, delay) -> { - String statusCode = response != null ? String.valueOf(response.statusCode()) : ""; - SidecarInstance nextInstance = iterator.hasNext() ? iterator.next() : sidecarInstance; - if (response == null || response.statusCode() != HttpResponseStatus.ACCEPTED.code()) - { - logger.warn("Retrying request on {} instance after {}ms. " + - "Failed on instance={}, attempt={}, statusCode={}", - nextInstance == sidecarInstance ? "same" : "next", delay, - sidecarInstance, attempt, statusCode, throwable); - } - schedule(delay, () -> executeWithRetries(future, iterator, nextInstance, context, nextAttempt)); - }); + String statusCode = response != null ? 
String.valueOf(response.statusCode()) : ""; + SidecarInstance nextInstance = iterator.hasNext() ? iterator.next() : sidecarInstance; + if (response == null || response.statusCode() != HttpResponseStatus.ACCEPTED.code()) + { + logger.warn("Retrying request on {} instance after {}ms. " + + "Failed on instance={}, attempt={}, statusCode={}", + nextInstance == sidecarInstance ? "same" : "next", delay, + sidecarInstance, attempt, statusCode, throwable); + } + schedule(delay, () -> executeWithRetries(future, iterator, nextInstance, context, nextAttempt)); + }); } /** @@ -306,17 +313,17 @@ private void applyRetryPolicy(CompletableFuture future, Request request = context.request(); context.retryPolicy() .onResponse(future, request, response, throwable, attempt, retryOnNewHost, (nextAttempt, delay) -> { - String statusCode = response != null ? String.valueOf(response.statusCode()) : ""; - SidecarInstance nextInstance = iterator.hasNext() ? iterator.next() : sidecarInstance; - if (response == null || response.statusCode() != HttpResponseStatus.ACCEPTED.code()) - { - logger.warn("Retrying stream on {} instance after {}ms. " + - "Failed on instance={}, attempt={}, statusCode={}", - nextInstance == sidecarInstance ? "same" : "next", delay, - sidecarInstance, attempt, statusCode, throwable); - } - schedule(delay, () -> streamWithRetries(future, consumer, iterator, nextInstance, context, nextAttempt)); - }); + String statusCode = response != null ? String.valueOf(response.statusCode()) : ""; + SidecarInstance nextInstance = iterator.hasNext() ? iterator.next() : sidecarInstance; + if (response == null || response.statusCode() != HttpResponseStatus.ACCEPTED.code()) + { + logger.warn("Retrying stream on {} instance after {}ms. " + + "Failed on instance={}, attempt={}, statusCode={}", + nextInstance == sidecarInstance ? 
"same" : "next", delay, + sidecarInstance, attempt, statusCode, throwable); + } + schedule(delay, () -> streamWithRetries(future, consumer, iterator, nextInstance, context, nextAttempt)); + }); } /** diff --git a/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/StreamBuffer.java b/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/StreamBuffer.java index 4664c0408..353bfafa7 100644 --- a/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/StreamBuffer.java +++ b/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/StreamBuffer.java @@ -82,6 +82,17 @@ static ByteArrayWrapper wrap(byte[] bytes) return new ByteArrayWrapper(bytes); } + /** + * Wraps a {@link ByteBuffer} into a {@link ByteBufferWrapper} + * + * @param buffer the underlying ByteBuffer + * @return the object wrapping ByteBuffer + */ + static ByteBufferWrapper wrap(ByteBuffer buffer) + { + return new ByteBufferWrapper(buffer); + } + /** * A {@link StreamBuffer} implementation that wraps a byte array */ @@ -123,4 +134,70 @@ public void release() { } } + + /** + * A {@link StreamBuffer} implementation that wraps a {@link ByteBuffer}. + * This implementation assumes single-threaded access and may modify the + * buffer's position and limit during operations. 
+ */ + class ByteBufferWrapper implements StreamBuffer + { + private final ByteBuffer buffer; + + private ByteBufferWrapper(ByteBuffer buffer) + { + this.buffer = buffer; + } + + @Override + public void copyBytes(int sourceOffset, ByteBuffer destination, int length) + { + int originalPosition = buffer.position(); + int originalLimit = buffer.limit(); + try + { + buffer.position(sourceOffset); + buffer.limit(sourceOffset + length); + destination.put(buffer); + destination.flip(); + } + finally + { + buffer.position(originalPosition); + buffer.limit(originalLimit); + } + } + + @Override + public void copyBytes(int sourceOffset, byte[] destination, int destinationIndex, int length) + { + int originalPosition = buffer.position(); + try + { + buffer.position(sourceOffset); + buffer.get(destination, destinationIndex, length); + } + finally + { + buffer.position(originalPosition); + } + } + + @Override + public byte getByte(int index) + { + return buffer.get(index); + } + + @Override + public int readableBytes() + { + return buffer.remaining(); + } + + @Override + public void release() + { + } + } } diff --git a/analytics-sidecar-client/src/testFixtures/java/org/apache/cassandra/sidecar/client/SidecarClientTest.java b/analytics-sidecar-client/src/testFixtures/java/org/apache/cassandra/sidecar/client/SidecarClientTest.java index ccf29c3a0..3d6a1a1f5 100644 --- a/analytics-sidecar-client/src/testFixtures/java/org/apache/cassandra/sidecar/client/SidecarClientTest.java +++ b/analytics-sidecar-client/src/testFixtures/java/org/apache/cassandra/sidecar/client/SidecarClientTest.java @@ -1303,13 +1303,12 @@ public void onError(Throwable throwable) assertThat(request3.getHeader("User-Agent")).isEqualTo("cassandra-sidecar-test/0.0.1"); assertThat(request3.getHeader("range")).isEqualTo("bytes=10-20"); - byte[] bytes = receivedBytes.stream() - .collect(ByteArrayOutputStream::new, - (outputStream, src) -> outputStream.write(src, 0, src.length), - (outputStream, src) -> { - }) - 
.toByteArray(); - assertThat(new String(bytes, StandardCharsets.UTF_8)).isEqualTo("TOC.txt\nSt"); + String actual = receivedBytes.stream() + .collect(ByteArrayOutputStream::new, + (outputStream, src) -> outputStream.write(src, 0, src.length), + (outputStream, src) -> { + }).toString(StandardCharsets.UTF_8); + assertThat(actual).isEqualTo("TOC.txt\nSt"); } } diff --git a/analytics-sidecar-vertx-client-shaded/build.gradle b/analytics-sidecar-vertx-client-shaded/build.gradle index 1d1bdeb19..f5414a6ea 100644 --- a/analytics-sidecar-vertx-client-shaded/build.gradle +++ b/analytics-sidecar-vertx-client-shaded/build.gradle @@ -33,12 +33,16 @@ plugins { id('java-library') id('maven-publish') id('signing') - id('com.gradleup.shadow') version '8.3.0' + id('com.gradleup.shadow') version '8.3.9' } -version project.version +java { + toolchain { + languageVersion = JavaLanguageVersion.of(11) + } +} -sourceCompatibility = 1.8 +version project.version configurations { all*.exclude(group: 'ch.qos.logback') @@ -69,6 +73,23 @@ tasks.named('test') { // Relocating a Package shadowJar { archiveClassifier.set('') + + // Exclude unsupported java versions from jar files with multi-release class support + exclude 'META-INF/versions/12/' + exclude 'META-INF/versions/13/' + exclude 'META-INF/versions/14/' + exclude 'META-INF/versions/15/' + exclude 'META-INF/versions/16/' + exclude 'META-INF/versions/17/' + exclude 'META-INF/versions/18/' + exclude 'META-INF/versions/19/' + exclude 'META-INF/versions/20/' + exclude 'META-INF/versions/21/' + exclude 'META-INF/versions/22/' + exclude 'META-INF/versions/23/' + exclude 'META-INF/versions/24/' + exclude 'META-INF/versions/25/' + // Our use of Jackson should be an implementation detail - shade everything so no matter what // version of Jackson is available in the classpath we don't break consumers of the client relocate 'org.apache.cassandra.sidecar.common', 'o.a.c.sidecar.client.shaded.common' @@ -107,8 +128,7 @@ ext { // 
analytics-sidecar-vertx-client-shaded_* as the artifact name archivesBaseName = "analytics-sidecar-vertx-client-all" -if (propertyWithDefault("artifactType", null) == "common") -{ +if (propertyWithDefault("artifactType", null) == "common") { publishing { publications { shadow(MavenPublication) { publication -> diff --git a/analytics-sidecar-vertx-client/build.gradle b/analytics-sidecar-vertx-client/build.gradle index f04aa7329..fa6a0d32f 100644 --- a/analytics-sidecar-vertx-client/build.gradle +++ b/analytics-sidecar-vertx-client/build.gradle @@ -25,10 +25,7 @@ plugins { id('maven-publish') } -sourceCompatibility = 1.8 - -if (propertyWithDefault("artifactType", null) == "common") -{ +if (propertyWithDefault("artifactType", null) == "common") { apply from: "$rootDir/gradle/common/publishing.gradle" } diff --git a/cassandra-analytics-cdc/src/main/java/org/apache/cassandra/cdc/CdcBuilder.java b/cassandra-analytics-cdc/src/main/java/org/apache/cassandra/cdc/CdcBuilder.java index d95315ed0..cd8b3ea9e 100644 --- a/cassandra-analytics-cdc/src/main/java/org/apache/cassandra/cdc/CdcBuilder.java +++ b/cassandra-analytics-cdc/src/main/java/org/apache/cassandra/cdc/CdcBuilder.java @@ -119,7 +119,6 @@ public CdcBuilder withExecutor(@NotNull ExecutorService executor) return withExecutor(AsyncExecutor.wrap(executor)); } - public CdcBuilder withExecutor(@NotNull AsyncExecutor asyncExecutor) { this.asyncExecutor = asyncExecutor; diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/StreamSession.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/StreamSession.java index 9a05dca63..40cbbdeaa 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/StreamSession.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/StreamSession.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import 
java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicReference; @@ -41,7 +42,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import o.a.c.sidecar.client.shaded.io.vertx.core.impl.ConcurrentHashSet; import org.apache.cassandra.bridge.SSTableDescriptor; import org.apache.cassandra.spark.bulkwriter.token.ReplicaAwareFailureHandler; import org.apache.cassandra.spark.bulkwriter.token.TokenRangeMapping; @@ -61,7 +61,7 @@ public abstract class StreamSession protected final SortedSSTableWriter sstableWriter; protected final ExecutorService executorService; - private final Set streamedFiles = new ConcurrentHashSet<>(); + private final Set streamedFiles = ConcurrentHashMap.newKeySet(); private final AtomicReference lastStreamFailure = new AtomicReference<>(); private volatile boolean isStreamFinalized = false; diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/DataLayer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/DataLayer.java index 394913acd..bb087dec5 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/DataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/DataLayer.java @@ -29,8 +29,6 @@ import java.util.concurrent.ExecutorService; import java.util.stream.Collectors; -import org.apache.commons.lang.StringUtils; - import org.apache.cassandra.bridge.BigNumberConfig; import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; @@ -318,7 +316,7 @@ public StreamScanner openPartitionSizeIterator(int partitionId) public Filter[] unsupportedPushDownFilters(Filter[] filters) { Set partitionKeys = cqlTable().partitionKeys().stream() - .map(key -> StringUtils.lowerCase(key.name())) + .map(key -> lowerCase(key.name())) .collect(Collectors.toSet()); List unsupportedFilters = new 
ArrayList<>(filters.length); @@ -326,9 +324,9 @@ public Filter[] unsupportedPushDownFilters(Filter[] filters) { if (filter instanceof EqualTo || filter instanceof In) { - String columnName = StringUtils.lowerCase(filter instanceof EqualTo - ? ((EqualTo) filter).attribute() - : ((In) filter).attribute()); + String columnName = lowerCase(filter instanceof EqualTo + ? ((EqualTo) filter).attribute() + : ((In) filter).attribute()); if (partitionKeys.contains(columnName)) { @@ -359,4 +357,13 @@ public Stats stats() { return Stats.DoNothingStats.INSTANCE; } + + static String lowerCase(String s) + { + if (s == null) + { + return s; + } + return s.toLowerCase(); + } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java index a6d44904d..64b57e1cd 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java @@ -35,6 +35,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.UUID; import java.util.concurrent.ExecutorService; @@ -43,9 +44,6 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang.builder.EqualsBuilder; -import org.apache.commons.lang.builder.HashCodeBuilder; import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; @@ -179,7 +177,7 @@ public static LocalDataLayer from(Map options) getOrThrow(options, lowerCaseKey("keyspace")), getOrThrow(options, lowerCaseKey("createStmt")), Arrays.stream(options.getOrDefault(lowerCaseKey("udts"), "").split("\n")) - .filter(StringUtils::isNotEmpty) + .filter(s -> !s.isEmpty()) .collect(Collectors.toSet()), 
SchemaFeatureSet.initializeFromOptions(options), getBoolean(options, lowerCaseKey("useBufferingInputStream"), getBoolean(options, lowerCaseKey("useSSTableInputStream"), false)), @@ -407,35 +405,20 @@ private static Stream listPath(Path path) @Override public int hashCode() { - return new HashCodeBuilder() - .append(cqlTable) - .append(paths) - .append(version()) - .toHashCode(); + return Objects.hash(cqlTable, Arrays.hashCode(paths), version()); } @Override - public boolean equals(Object other) + public boolean equals(Object o) { - if (other == null) + if (o == null || getClass() != o.getClass()) { return false; } - if (this == other) - { - return true; - } - if (this.getClass() != other.getClass()) - { - return false; - } - - LocalDataLayer that = (LocalDataLayer) other; - return new EqualsBuilder() - .append(this.cqlTable, that.cqlTable) - .append(this.paths, that.paths) - .append(this.version(), that.version()) - .isEquals(); + LocalDataLayer that = (LocalDataLayer) o; + return Objects.equals(cqlTable, that.cqlTable) + && Objects.deepEquals(paths, that.paths) + && Objects.equals(version(), that.version()); } // JDK Serialization diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/PartitionedDataLayer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/PartitionedDataLayer.java index 72ee3d7ce..235ae049a 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/PartitionedDataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/PartitionedDataLayer.java @@ -28,6 +28,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; @@ -38,8 +39,6 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.Range; -import 
org.apache.commons.lang.builder.EqualsBuilder; -import org.apache.commons.lang.builder.HashCodeBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -88,7 +87,7 @@ public enum AvailabilityHint } public static final Comparator AVAILABILITY_HINT_COMPARATOR = - Comparator.comparingInt((AvailabilityHint other) -> other.priority); + Comparator.comparingInt((AvailabilityHint other) -> other.priority); public static AvailabilityHint fromState(String status, String state) { @@ -209,16 +208,16 @@ public SparkRangeFilter sparkRangeFilter(int partitionId) LOGGER.error("Unable to find the sparkTokenRange for partitionId={} in reversePartitionMap={}", partitionId, reversePartitionMap); throw new IllegalStateException( - String.format("Unable to find sparkTokenRange for partitionId=%d in the reverse partition map", - partitionId)); + String.format("Unable to find sparkTokenRange for partitionId=%d in the reverse partition map", + partitionId)); } return SparkRangeFilter.create(RangeUtils.toTokenRange(sparkTokenRange)); } @Override public List partitionKeyFiltersInRange( - int partitionId, - List filters) throws NoMatchFoundException + int partitionId, + List filters) throws NoMatchFoundException { // We only need to worry about Partition key filters that overlap with this Spark workers token range SparkRangeFilter rangeFilter = sparkRangeFilter(partitionId); @@ -380,9 +379,9 @@ static Set rangesToReplicas(@NotNull ConsistencyLevel consist @NotNull Map, List> ranges) { return ranges.values().stream() - .flatMap(Collection::stream) - .filter(instance -> !consistencyLevel.isDCLocal || dataCenter == null || instance.dataCenter().equals(dataCenter)) - .collect(Collectors.toSet()); + .flatMap(Collection::stream) + .filter(instance -> !consistencyLevel.isDCLocal || dataCenter == null || instance.dataCenter().equals(dataCenter)) + .collect(Collectors.toSet()); } /** @@ -442,7 +441,7 @@ private static void validateConsistency(@NotNull ConsistencyLevel consistencyLev /** * 
Return a set of primary and backup CassandraInstances to satisfy the consistency level. - * + *

* NOTE: This method current assumes that each Spark token worker owns a single replica set. * * @param instances replicas that overlap with the Spark worker's token range @@ -472,7 +471,7 @@ static ReplicaSet splitReplicas(Collection instances, // multiple replica sets but for current implementation of the TokenPartitioner // it returns a single replica set per Spark worker/partition LOGGER.warn("Cannot use incremental repair awareness when Spark partition owns more than one replica set, " - + "performance will be degraded numRanges={}", ranges.size()); + + "performance will be degraded numRanges={}", ranges.size()); replicaSet.incrementalRepairPrimary = null; } @@ -547,30 +546,17 @@ public ReplicaSet addBackup(CassandraInstance instance) @Override public int hashCode() { - return new HashCodeBuilder() - .append(datacenter) - .toHashCode(); + return Objects.hashCode(datacenter); } @Override - public boolean equals(Object other) + public boolean equals(Object o) { - if (other == null) - { - return false; - } - if (this == other) - { - return true; - } - if (this.getClass() != other.getClass()) + if (o == null || getClass() != o.getClass()) { return false; } - - PartitionedDataLayer that = (PartitionedDataLayer) other; - return new EqualsBuilder() - .append(this.datacenter, that.datacenter) - .isEquals(); + PartitionedDataLayer that = (PartitionedDataLayer) o; + return Objects.equals(datacenter, that.datacenter); } } diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraDataSource.java b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraDataSource.java new file mode 100644 index 000000000..b025b6d85 --- /dev/null +++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraDataSource.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.spark.sparksql; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.cassandra.bridge.CassandraBridgeFactory; +import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.cassandra.spark.data.CassandraDataLayer; +import org.apache.cassandra.spark.data.CassandraDataSourceHelper; +import org.apache.cassandra.spark.data.ClientConfig; +import org.apache.cassandra.spark.data.DataLayer; + +public class CassandraDataSource extends CassandraTableProvider +{ + public CassandraDataSource() + { + CassandraBridgeFactory.validateBridges(CassandraVersion.implementedVersions()); + } + + @Override + public String shortName() + { + return "cassandraBulkRead"; + } + + @Override + public DataLayer getDataLayer(org.apache.spark.sql.util.CaseInsensitiveStringMap options) + { + return CassandraDataSourceHelper.getDataLayer(options, this::initializeDataLayer); + } + + @VisibleForTesting + protected void initializeDataLayer(CassandraDataLayer dataLayer, ClientConfig config) + { + dataLayer.initialize(config); + } +} diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraInputPartition.java 
b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraInputPartition.java new file mode 100644 index 000000000..ea76f9b76 --- /dev/null +++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraInputPartition.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.spark.sparksql; + +import org.apache.spark.sql.connector.read.InputPartition; + +class CassandraInputPartition implements InputPartition +{ + private final int partitionId; + + CassandraInputPartition(int partitionId) + { + this.partitionId = partitionId; + } + + public int getPartitionId() + { + return partitionId; + } +} diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraPartitionReaderFactory.java b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraPartitionReaderFactory.java new file mode 100644 index 000000000..b2a7df7ab --- /dev/null +++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraPartitionReaderFactory.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.cassandra.spark.sparksql;
+
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.spark.data.DataLayer;
+import org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter;
+import org.apache.spark.TaskContext;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.connector.read.InputPartition;
+import org.apache.spark.sql.connector.read.PartitionReader;
+import org.apache.spark.sql.connector.read.PartitionReaderFactory;
+import org.apache.spark.sql.types.StructType;
+
+/**
+ * {@link PartitionReaderFactory} that opens a {@link SparkRowIterator} for a given input partition.
+ */
+class CassandraPartitionReaderFactory implements PartitionReaderFactory
+{
+    private static final Logger LOGGER = LoggerFactory.getLogger(CassandraPartitionReaderFactory.class);
+    final DataLayer dataLayer;
+    final StructType requiredSchema;
+    final List<PartitionKeyFilter> partitionKeyFilters;
+
+    /**
+     * @param dataLayer           data layer handed to every reader created by this factory
+     * @param requiredSchema      schema handed to every reader; passed through as-is
+     * @param partitionKeyFilters partition key filters handed to every reader
+     */
+    CassandraPartitionReaderFactory(DataLayer dataLayer,
+                                    StructType requiredSchema,
+                                    List<PartitionKeyFilter> partitionKeyFilters)
+    {
+        this.dataLayer = dataLayer;
+        this.requiredSchema = requiredSchema;
+        this.partitionKeyFilters = partitionKeyFilters;
+    }
+
+    /**
+     * Creates the row reader for {@code partition}.
+     *
+     * @param partition the partition to read; its id is taken from {@link CassandraInputPartition#getPartitionId()}
+     *                  when it is a {@link CassandraInputPartition}, otherwise from the current {@link TaskContext}
+     * @return a new {@link SparkRowIterator} for the resolved partition id
+     */
+    @Override
+    public PartitionReader<InternalRow> createReader(InputPartition partition)
+    {
+        int partitionId;
+        if (partition instanceof CassandraInputPartition)
+        {
+            partitionId = ((CassandraInputPartition) partition).getPartitionId();
+        }
+        else
+        {
+            // Unexpected partition type: fall back to the id of the running task and leave a trace in the log
+            partitionId = TaskContext.getPartitionId();
+            LOGGER.warn("InputPartition is not of CassandraInputPartition type. "
+                        + "Using TaskContext to determine the partitionId type={}, partitionId={}",
+                        partition.getClass().getName(), partitionId);
+        }
+        return new SparkRowIterator(partitionId, dataLayer, requiredSchema, partitionKeyFilters);
+    }
+}
diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraPartitioning.java b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraPartitioning.java
new file mode 100644
index 000000000..e305f3654
--- /dev/null
+++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraPartitioning.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.spark.sparksql;
+
+import org.apache.cassandra.spark.data.DataLayer;
+import org.apache.spark.sql.connector.read.partitioning.Partitioning;
+
+/**
+ * {@link Partitioning} whose partition count is whatever the wrapped {@link DataLayer} reports.
+ */
+class CassandraPartitioning implements Partitioning
+{
+    final DataLayer dataLayer;
+
+    CassandraPartitioning(DataLayer dataLayer)
+    {
+        this.dataLayer = dataLayer;
+    }
+
+    /**
+     * @return {@link DataLayer#partitionCount()} of the wrapped data layer, queried on every call
+     */
+    @Override
+    public int numPartitions()
+    {
+        return this.dataLayer.partitionCount();
+    }
+}
diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraScanBuilder.java b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraScanBuilder.java
new file mode 100644
index 000000000..2f821398d
--- /dev/null
+++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraScanBuilder.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.spark.sparksql;
+
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import org.apache.cassandra.spark.data.CqlField;
+import org.apache.cassandra.spark.data.DataLayer;
+import org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter;
+import org.apache.cassandra.spark.utils.FilterUtils;
+import org.apache.spark.sql.connector.read.Batch;
+import org.apache.spark.sql.connector.read.InputPartition;
+import org.apache.spark.sql.connector.read.PartitionReaderFactory;
+import org.apache.spark.sql.connector.read.Scan;
+import org.apache.spark.sql.connector.read.ScanBuilder;
+import org.apache.spark.sql.connector.read.SupportsPushDownFilters;
+import org.apache.spark.sql.connector.read.SupportsPushDownRequiredColumns;
+import org.apache.spark.sql.connector.read.SupportsReportPartitioning;
+import org.apache.spark.sql.connector.read.partitioning.Partitioning;
+import org.apache.spark.sql.sources.Filter;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.util.CaseInsensitiveStringMap;
+
+/**
+ * Single object acting as {@link ScanBuilder}, {@link Scan} and {@link Batch} for a {@link DataLayer}:
+ * {@link #build()} and {@link #toBatch()} both return {@code this}. It records the pruned schema and the
+ * pushed-down filters, and turns pushed partition key filters into {@link PartitionKeyFilter}s for the readers.
+ */
+class CassandraScanBuilder implements ScanBuilder, Scan, Batch, SupportsPushDownFilters, SupportsPushDownRequiredColumns, SupportsReportPartitioning
+{
+    final DataLayer dataLayer;
+    final StructType schema;
+    final CaseInsensitiveStringMap options;
+    // Set by pruneColumns(); stays null until then
+    StructType requiredSchema = null;
+    Filter[] pushedFilters = new Filter[0];
+
+    CassandraScanBuilder(DataLayer dataLayer, StructType schema, CaseInsensitiveStringMap options)
+    {
+        this.dataLayer = dataLayer;
+        this.schema = schema;
+        this.options = options;
+    }
+
+    @Override
+    public Scan build()
+    {
+        return this;
+    }
+
+    @Override
+    public void pruneColumns(StructType requiredSchema)
+    {
+        this.requiredSchema = requiredSchema;
+    }
+
+    /**
+     * Splits {@code filters} into those the data layer cannot push down (returned to Spark)
+     * and the remainder, which is remembered as the pushed filters.
+     *
+     * @param filters candidate filters
+     * @return the filters reported by {@link DataLayer#unsupportedPushDownFilters(Filter[])}
+     */
+    @Override
+    public Filter[] pushFilters(Filter[] filters)
+    {
+        Filter[] unsupportedFilters = dataLayer.unsupportedPushDownFilters(filters);
+
+        List<Filter> supportedFilters = new ArrayList<>(Arrays.asList(filters));
+        supportedFilters.removeAll(Arrays.asList(unsupportedFilters));
+        pushedFilters = supportedFilters.toArray(new Filter[0]);
+
+        return unsupportedFilters;
+    }
+
+    @Override
+    public Filter[] pushedFilters()
+    {
+        return pushedFilters;
+    }
+
+    /**
+     * @return the pruned schema, or the full table schema when {@link #pruneColumns(StructType)}
+     *         has not been called, so that callers never receive {@code null}
+     */
+    @Override
+    public StructType readSchema()
+    {
+        return requiredSchema != null ? requiredSchema : schema;
+    }
+
+    @Override
+    public Batch toBatch()
+    {
+        return this;
+    }
+
+    /**
+     * @return one {@link CassandraInputPartition} per data layer partition, with ids {@code 0..partitionCount-1}
+     */
+    @Override
+    public InputPartition[] planInputPartitions()
+    {
+        return IntStream.range(0, dataLayer.partitionCount())
+                        .mapToObj(CassandraInputPartition::new)
+                        .toArray(InputPartition[]::new);
+    }
+
+    @Override
+    public PartitionReaderFactory createReaderFactory()
+    {
+        // requiredSchema is passed through unchanged (possibly null); SparkRowIterator accepts a nullable schema
+        return new CassandraPartitionReaderFactory(dataLayer, requiredSchema, buildPartitionKeyFilters());
+    }
+
+    @Override
+    public Partitioning outputPartitioning()
+    {
+        return new CassandraPartitioning(dataLayer);
+    }
+
+    /**
+     * Builds one {@link PartitionKeyFilter} per combination of partition key values found in the pushed filters.
+     *
+     * @return the filters, or an empty mutable list when the pushed filters yield no partition key values
+     */
+    private List<PartitionKeyFilter> buildPartitionKeyFilters()
+    {
+        List<String> partitionKeyColumnNames = dataLayer.cqlTable().partitionKeys().stream().map(CqlField::name).collect(Collectors.toList());
+        Map<String, List<String>> partitionKeyValues = FilterUtils.extractPartitionKeyValues(pushedFilters, new HashSet<>(partitionKeyColumnNames));
+        if (!partitionKeyValues.isEmpty())
+        {
+            // Values are looked up in partition key column order before the cartesian product is taken
+            List<List<String>> orderedValues = partitionKeyColumnNames.stream().map(partitionKeyValues::get).collect(Collectors.toList());
+            return FilterUtils.cartesianProduct(orderedValues).stream()
+                              .map(this::buildFilter)
+                              .collect(Collectors.toList());
+        }
+        else
+        {
+            return new ArrayList<>();
+        }
+    }
+
+    /**
+     * @param keys one value per partition key column
+     * @return the filter created from the key/value pair the bridge returns for {@code keys}
+     */
+    private PartitionKeyFilter buildFilter(List<String> keys)
+    {
+        AbstractMap.SimpleEntry<ByteBuffer, BigInteger> filterKey = dataLayer.bridge().getPartitionKey(dataLayer.cqlTable(), dataLayer.partitioner(), keys);
+        return PartitionKeyFilter.create(filterKey.getKey(), filterKey.getValue());
+    }
+}
diff --git
a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTable.java b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTable.java new file mode 100644 index 000000000..f78e4c5b5 --- /dev/null +++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTable.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.cassandra.spark.sparksql;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.spark.data.DataLayer;
+import org.apache.spark.sql.connector.catalog.SupportsRead;
+import org.apache.spark.sql.connector.catalog.Table;
+import org.apache.spark.sql.connector.catalog.TableCapability;
+import org.apache.spark.sql.connector.read.ScanBuilder;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.util.CaseInsensitiveStringMap;
+
+/**
+ * Readable Spark {@link Table} backed by a {@link DataLayer} and a fixed schema.
+ */
+class CassandraTable implements Table, SupportsRead
+{
+    private final DataLayer dataLayer;
+    private final StructType schema;
+
+    CassandraTable(DataLayer dataLayer, StructType schema)
+    {
+        this.dataLayer = dataLayer;
+        this.schema = schema;
+    }
+
+    /**
+     * @return {@code <keyspace>.<table>} taken from the data layer's CQL table
+     */
+    @Override
+    public String name()
+    {
+        return dataLayer.cqlTable().keyspace() + "." + dataLayer.cqlTable().table();
+    }
+
+    @Override
+    public StructType schema()
+    {
+        return schema;
+    }
+
+    /**
+     * @return a fresh mutable set holding {@code BATCH_READ} and {@code MICRO_BATCH_READ}
+     */
+    @Override
+    public Set<TableCapability> capabilities()
+    {
+        // NOTE(review): MICRO_BATCH_READ is advertised, but CassandraScanBuilder (the Scan created below)
+        // does not override toMicroBatchStream - confirm that streaming reads are really supported
+        return new HashSet<>(ImmutableList.of(TableCapability.BATCH_READ, TableCapability.MICRO_BATCH_READ));
+    }
+
+    @Override
+    public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options)
+    {
+        return new CassandraScanBuilder(dataLayer, schema, options);
+    }
+}
diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTableProvider.java b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTableProvider.java
new file mode 100644
index 000000000..a2e86b10f
--- /dev/null
+++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTableProvider.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.spark.sparksql;
+
+import java.util.Map;
+
+import org.apache.cassandra.spark.data.DataLayer;
+import org.apache.spark.sql.connector.catalog.Table;
+import org.apache.spark.sql.connector.catalog.TableProvider;
+import org.apache.spark.sql.connector.expressions.Transform;
+import org.apache.spark.sql.sources.DataSourceRegister;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.util.CaseInsensitiveStringMap;
+
+/**
+ * Base {@link TableProvider} for Cassandra tables. Subclasses supply the {@link DataLayer};
+ * this class caches it and derives the schema and the {@link CassandraTable} from it.
+ */
+public abstract class CassandraTableProvider implements TableProvider, DataSourceRegister
+{
+    // Lazily created by getDataLayerInternal(); no synchronization is applied to this field
+    private DataLayer dataLayer;
+
+    /**
+     * Creates the data layer for the given options.
+     *
+     * @param options data source options
+     * @return the data layer to read from
+     */
+    public abstract DataLayer getDataLayer(CaseInsensitiveStringMap options);
+
+    /**
+     * Returns the cached data layer, creating it through {@link #getDataLayer(CaseInsensitiveStringMap)} on first use.
+     * Once a data layer is cached, {@code options} passed on later calls are ignored.
+     *
+     * @param options options used only when no data layer has been cached yet
+     * @return the cached or newly created data layer
+     */
+    DataLayer getDataLayerInternal(CaseInsensitiveStringMap options)
+    {
+        DataLayer dataLayer = this.dataLayer;
+        if (dataLayer != null)
+        {
+            return dataLayer;
+        }
+        dataLayer = getDataLayer(options);
+        this.dataLayer = dataLayer;
+        return dataLayer;
+    }
+
+    @Override
+    public StructType inferSchema(CaseInsensitiveStringMap options)
+    {
+        return getDataLayerInternal(options).structType();
+    }
+
+    @Override
+    public Table getTable(StructType schema, Transform[] partitioning, Map<String, String> properties)
+    {
+        return new CassandraTable(getDataLayerInternal(new CaseInsensitiveStringMap(properties)), schema);
+    }
+}
diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/LocalDataSource.java
b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/LocalDataSource.java new file mode 100644 index 000000000..c9d14e868 --- /dev/null +++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/LocalDataSource.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.cassandra.spark.sparksql;
+
+import org.apache.cassandra.spark.data.DataLayer;
+import org.apache.cassandra.spark.data.LocalDataLayer;
+import org.apache.spark.sql.util.CaseInsensitiveStringMap;
+import org.jetbrains.annotations.NotNull;
+
+/**
+ * {@link CassandraTableProvider} registered under the short name {@code localsstabledatasource}
+ * whose data layer is built by {@link LocalDataLayer#from}.
+ */
+@SuppressWarnings("unused")
+public class LocalDataSource extends CassandraTableProvider
+{
+    /**
+     * @return the short name this data source is registered under
+     */
+    @Override
+    @NotNull
+    public String shortName()
+    {
+        return "localsstabledatasource";
+    }
+
+    /**
+     * @param options data source options, forwarded unchanged to {@link LocalDataLayer#from}
+     * @return the data layer built from {@code options}
+     */
+    @Override
+    @NotNull
+    public DataLayer getDataLayer(@NotNull CaseInsensitiveStringMap options)
+    {
+        DataLayer localLayer = LocalDataLayer.from(options);
+        return localLayer;
+    }
+}
diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/LocalPartitionSizeSource.java b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/LocalPartitionSizeSource.java
new file mode 100644
index 000000000..cf24f1ce7
--- /dev/null
+++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/LocalPartitionSizeSource.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.spark.sparksql;
+
+import org.apache.cassandra.spark.data.DataLayer;
+import org.apache.cassandra.spark.data.LocalDataLayer;
+import org.apache.spark.sql.util.CaseInsensitiveStringMap;
+
+/**
+ * {@link PartitionSizeTableProvider} registered under the short name {@code localpartitionsizesource}
+ * whose data layer is built by {@link LocalDataLayer#from}.
+ */
+public class LocalPartitionSizeSource extends PartitionSizeTableProvider
+{
+    /**
+     * @return the short name this data source is registered under
+     */
+    @Override
+    public String shortName()
+    {
+        return "localpartitionsizesource";
+    }
+
+    /**
+     * @param options data source options, forwarded unchanged to {@link LocalDataLayer#from}
+     * @return the data layer built from {@code options}
+     */
+    @Override
+    public DataLayer getDataLayer(CaseInsensitiveStringMap options)
+    {
+        DataLayer localLayer = LocalDataLayer.from(options);
+        return localLayer;
+    }
+}
diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/PartitionSizeIterator.java b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/PartitionSizeIterator.java
new file mode 100644
index 000000000..d78ed5fbe
--- /dev/null
+++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/PartitionSizeIterator.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.spark.sparksql;
+
+import java.io.IOException;
+
+import org.apache.cassandra.spark.data.CqlTable;
+import org.apache.cassandra.spark.data.DataLayer;
+import org.apache.cassandra.spark.data.converter.SparkSqlTypeConverter;
+import org.apache.cassandra.spark.reader.IndexEntry;
+import org.apache.cassandra.spark.reader.StreamScanner;
+import org.apache.cassandra.analytics.stats.Stats;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
+import org.apache.spark.sql.connector.read.PartitionReader;
+import org.jetbrains.annotations.NotNull;
+
+/**
+ * Wrapper iterator around IndexIterator to read all Index.db files and return SparkSQL
+ * rows containing all partition keys and the associated on-disk uncompressed and compressed sizes.
+ */
+public class PartitionSizeIterator implements PartitionReader<InternalRow>
+{
+    private final StreamScanner<IndexEntry> it;
+    private final CqlTable cqlTable;
+    private final int numPartitionKeys;
+    private final Stats stats;
+    // Taken in the constructor; both the "opened" and the "closed" stats are measured from this instant
+    private final long startTimeNanos;
+    // Row produced by the last successful next(); null before the first one
+    private GenericInternalRow curr = null;
+    private final SparkSqlTypeConverter sparkSqlTypeConverter;
+
+    /**
+     * Opens the partition size iterator of {@code dataLayer} for {@code partitionId} and reports
+     * the time that took to the data layer's stats.
+     *
+     * @param partitionId id of the partition to read
+     * @param dataLayer   data layer supplying the table, stats, scanner and type converter
+     */
+    public PartitionSizeIterator(int partitionId, @NotNull DataLayer dataLayer)
+    {
+        this.cqlTable = dataLayer.cqlTable();
+        this.numPartitionKeys = cqlTable.numPartitionKeys();
+        this.stats = dataLayer.stats();
+        this.startTimeNanos = System.nanoTime();
+        this.it = dataLayer.openPartitionSizeIterator(partitionId);
+        stats.openedPartitionSizeIterator(System.nanoTime() - startTimeNanos);
+        this.sparkSqlTypeConverter = dataLayer.typeConverter();
+    }
+
+    /**
+     * The expected schema is defined in {@link DataLayer#partitionSizeStructType()}.
+     * It consists of the Cassandra partition keys, appended with the columns "uncompressed" and "compressed".
+     *
+     * @return {@code true} when a new row is available through {@link #get()}, {@code false} when the scanner is exhausted
+     * @throws IOException if the underlying scanner fails
+     */
+    @Override
+    public boolean next() throws IOException
+    {
+        if (it.next())
+        {
+            it.advanceToNextColumn();
+
+            IndexEntry entry = it.data();
+            // Partition key columns first, then the two size columns
+            Object[] values = new Object[numPartitionKeys + 2];
+
+            CellIterator.readPartitionKey(sparkSqlTypeConverter, entry.getPartitionKey(), cqlTable, values, stats);
+            values[numPartitionKeys] = entry.getUncompressed();
+            values[numPartitionKeys + 1] = entry.getCompressed();
+
+            this.curr = new GenericInternalRow(values);
+            stats.emitIndexEntry(entry);
+
+            return true;
+        }
+
+        return false;
+    }
+
+    /**
+     * @return the row built by the last successful {@link #next()}, or {@code null} if there has been none
+     */
+    @Override
+    public InternalRow get()
+    {
+        return curr;
+    }
+
+    /**
+     * Closes the scanner and reports the time elapsed since construction to the stats.
+     *
+     * @throws IOException if closing the scanner fails
+     */
+    @Override
+    public void close() throws IOException
+    {
+        this.it.close();
+        stats.closedPartitionSizeIterator(System.nanoTime() - startTimeNanos);
+    }
+}
diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/PartitionSizeTableProvider.java b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/PartitionSizeTableProvider.java
new file mode 100644
index 000000000..2acc567f9
--- /dev/null
+++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/PartitionSizeTableProvider.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.spark.sparksql;
+
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.IntStream;
+
+import com.google.common.collect.ImmutableSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.spark.data.DataLayer;
+import org.apache.spark.TaskContext;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.connector.catalog.SupportsRead;
+import org.apache.spark.sql.connector.catalog.Table;
+import org.apache.spark.sql.connector.catalog.TableCapability;
+import org.apache.spark.sql.connector.expressions.Transform;
+import org.apache.spark.sql.connector.read.Batch;
+import org.apache.spark.sql.connector.read.InputPartition;
+import org.apache.spark.sql.connector.read.PartitionReader;
+import org.apache.spark.sql.connector.read.PartitionReaderFactory;
+import org.apache.spark.sql.connector.read.Scan;
+import org.apache.spark.sql.connector.read.ScanBuilder;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.util.CaseInsensitiveStringMap;
+
+/**
+ * Table provider exposing partition sizes: the schema comes from
+ * {@link DataLayer#partitionSizeStructType()} and tables are {@link PartitionSizeTable}s.
+ */
+public abstract class PartitionSizeTableProvider extends CassandraTableProvider
+{
+    public abstract DataLayer getDataLayer(CaseInsensitiveStringMap options);
+
+    @Override
+    public StructType inferSchema(CaseInsensitiveStringMap options)
+    {
+        return getDataLayerInternal(options).partitionSizeStructType();
+    }
+
+    @Override
+    public Table getTable(StructType schema, Transform[] partitioning, Map<String, String> properties)
+    {
+        return new PartitionSizeTable(getDataLayerInternal(new CaseInsensitiveStringMap(properties)), schema);
+    }
+}
+
+/**
+ * Batch-readable {@link Table} backed by a {@link DataLayer} and a fixed schema.
+ */
+class PartitionSizeTable implements Table, SupportsRead
+{
+    private final DataLayer dataLayer;
+    private final StructType schema;
+
+    PartitionSizeTable(DataLayer dataLayer, StructType schema)
+    {
+        this.dataLayer = dataLayer;
+        this.schema = schema;
+    }
+
+    /**
+     * @return {@code <keyspace>.<table>} taken from the data layer's CQL table
+     */
+    @Override
+    public String name()
+    {
+        return dataLayer.cqlTable().keyspace() + "." + dataLayer.cqlTable().table();
+    }
+
+    @Override
+    public StructType schema()
+    {
+        return schema;
+    }
+
+    @Override
+    public Set<TableCapability> capabilities()
+    {
+        return ImmutableSet.of(TableCapability.BATCH_READ);
+    }
+
+    @Override
+    public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options)
+    {
+        return new PartitionSizeScanBuilder(dataLayer, schema, options);
+    }
+}
+
+/**
+ * Single object acting as {@link ScanBuilder}, {@link Scan} and {@link Batch} for partition size reads;
+ * {@link #build()} and {@link #toBatch()} both return {@code this}.
+ */
+class PartitionSizeScanBuilder implements ScanBuilder, Scan, Batch
+{
+    final DataLayer dataLayer;
+    final StructType schema;
+    final CaseInsensitiveStringMap options;
+
+    PartitionSizeScanBuilder(DataLayer dataLayer, StructType schema, CaseInsensitiveStringMap options)
+    {
+        this.dataLayer = dataLayer;
+        this.schema = schema;
+        this.options = options;
+    }
+
+    @Override
+    public Scan build()
+    {
+        return this;
+    }
+
+    @Override
+    public StructType readSchema()
+    {
+        return schema;
+    }
+
+    @Override
+    public Batch toBatch()
+    {
+        return this;
+    }
+
+    /**
+     * @return one {@link CassandraInputPartition} per data layer partition, with ids {@code 0..partitionCount-1}
+     */
+    @Override
+    public InputPartition[] planInputPartitions()
+    {
+        return IntStream.range(0, this.dataLayer.partitionCount())
+                        .mapToObj(CassandraInputPartition::new)
+                        .toArray(InputPartition[]::new);
+    }
+
+    @Override
+    public PartitionReaderFactory createReaderFactory()
+    {
+        return new PartitionSizeReaderFactory(dataLayer);
+    }
+}
+
+/**
+ * {@link PartitionReaderFactory} that opens a {@link PartitionSizeIterator} for a given input partition.
+ */
+class PartitionSizeReaderFactory implements PartitionReaderFactory
+{
+    // Logger is bound to this class so that its warnings are attributed to the partition size reader
+    private static final Logger LOGGER = LoggerFactory.getLogger(PartitionSizeReaderFactory.class);
+    final DataLayer dataLayer;
+
+    PartitionSizeReaderFactory(DataLayer dataLayer)
+    {
+        this.dataLayer = dataLayer;
+    }
+
+    /**
+     * Creates the partition size reader for {@code partition}.
+     *
+     * @param partition the partition to read; its id is taken from {@link CassandraInputPartition#getPartitionId()}
+     *                  when it is a {@link CassandraInputPartition}, otherwise from the current {@link TaskContext}
+     * @return a new {@link PartitionSizeIterator} for the resolved partition id
+     */
+    @Override
+    public PartitionReader<InternalRow> createReader(InputPartition partition)
+    {
+        int partitionId;
+        if (partition instanceof CassandraInputPartition)
+        {
+            partitionId = ((CassandraInputPartition) partition).getPartitionId();
+        }
+        else
+        {
+            // Unexpected partition type: fall back to the id of the running task and leave a trace in the log
+            partitionId = TaskContext.getPartitionId();
+            LOGGER.warn("InputPartition is not of CassandraInputPartition type. Using TaskContext to determine the partitionId type={}, partitionId={}",
+                        partition.getClass().getName(), partitionId);
+        }
+        return new PartitionSizeIterator(partitionId, dataLayer);
+    }
+}
diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/SparkRowIterator.java b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/SparkRowIterator.java
new file mode 100644
index 000000000..d076b3423
--- /dev/null
+++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/SparkRowIterator.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.spark.sparksql;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.spark.config.SchemaFeature;
+import org.apache.cassandra.spark.data.DataLayer;
+import org.apache.cassandra.spark.sparksql.filters.PartitionKeyFilter;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
+import org.apache.spark.sql.connector.read.PartitionReader;
+import org.apache.spark.sql.types.StructType;
+import org.jetbrains.annotations.NotNull;
+import org.jetbrains.annotations.Nullable;
+
+/**
+ * Wrapper iterator around SparkCellIterator to normalize cells into Spark SQL rows
+ */
+public class SparkRowIterator extends AbstractSparkRowIterator implements PartitionReader<InternalRow>
+{
+    /**
+     * Test-only shortcut: no required schema and no partition key filters.
+     */
+    @VisibleForTesting
+    public SparkRowIterator(int partitionId, @NotNull DataLayer dataLayer)
+    {
+        this(partitionId, dataLayer, null, new ArrayList<>());
+    }
+
+    /**
+     * @param partitionId         id of the partition to read
+     * @param dataLayer           data layer to read from; also supplies the requested schema features
+     * @param requiredSchema      columns requested by the query, or {@code null} for no column filter
+     * @param partitionKeyFilters partition key filters forwarded to the parent iterator
+     */
+    public SparkRowIterator(int partitionId,
+                            @NotNull DataLayer dataLayer,
+                            @Nullable StructType requiredSchema,
+                            @NotNull List<PartitionKeyFilter> partitionKeyFilters)
+    {
+        super(
+            partitionId,
+            dataLayer,
+            requiredSchema,
+            partitionKeyFilters,
+            (builder) -> decorate(requiredSchema, builder, dataLayer.requestedFeatures())
+        );
+    }
+
+    /**
+     * Wraps {@code builder} with every schema feature whose field is part of the query.
+     *
+     * @param requiredSchema columns requested by the query, or {@code null} to apply every feature
+     * @param builder        row builder to decorate
+     * @param features       candidate features, applied in list order
+     * @return the decorated builder, or {@code builder} itself when no feature applies
+     */
+    // NOTE(review): RowBuilder is written here exactly as it appears in the patch text; if its declaration
+    // takes a type argument, it was lost in extraction and should be restored - confirm against RowBuilder
+    protected static RowBuilder decorate(@Nullable StructType requiredSchema,
+                                         RowBuilder builder,
+                                         List<SchemaFeature> features)
+    {
+        Set<String> fieldNames = requiredSchema == null ? null : new HashSet<>(Arrays.asList(requiredSchema.fieldNames()));
+        for (SchemaFeature feature : features)
+        {
+            // Only decorate when there is no column filter or when the field is requested in the query,
+            // otherwise we skip decoration
+            if (fieldNames == null || fieldNames.contains(feature.fieldName()))
+            {
+                builder = feature.decorate(builder);
+            }
+        }
+
+        return builder;
+    }
+
+    /**
+     * @param valueArray column values of one row
+     * @return a {@link GenericInternalRow} wrapping {@code valueArray}
+     */
+    public GenericInternalRow rowBuilder(Object[] valueArray)
+    {
+        return new GenericInternalRow(valueArray);
+    }
+}
diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/SSTableReaderTests.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/SSTableReaderTests.java index 53072e172..96248ad12 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/SSTableReaderTests.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/SSTableReaderTests.java @@ -29,17 +29,16 @@ import java.util.stream.Collectors; import com.google.common.util.concurrent.Uninterruptibles; -import org.apache.commons.lang.StringUtils; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; +import org.apache.cassandra.analytics.stats.Stats; import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.spark.data.BasicSupplier; import org.apache.cassandra.spark.data.CqlTable; import org.apache.cassandra.spark.data.FileType; import org.apache.cassandra.spark.reader.RowData; import org.apache.cassandra.spark.reader.StreamScanner; -import org.apache.cassandra.analytics.stats.Stats; import org.apache.cassandra.spark.sparksql.filters.SSTableTimeRangeFilter; import org.apache.cassandra.spark.utils.ByteBufferUtils; import org.apache.cassandra.spark.utils.TimeProvider; @@ -123,7 +122,7 @@ private void testTtlUsingConstantReferenceTimeHelper(CassandraBridge bridgeForTe ByteBuffer colBuf = rowData.getColumnName(); String colName =
ByteBufferUtils.string(ByteBufferUtils.readBytesWithShortLength(colBuf)); colBuf.get(); - if (StringUtils.isEmpty(colName)) + if (colName == null || colName.isEmpty()) { continue; } @@ -230,7 +229,7 @@ private void testSSTableFiltering(CassandraBridge bridgeForTest, ByteBuffer colBuf = rowData.getColumnName(); String colName = ByteBufferUtils.string(ByteBufferUtils.readBytesWithShortLength(colBuf)); colBuf.get(); - if (StringUtils.isEmpty(colName)) + if (colName == null || colName.isEmpty()) { continue; } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriterTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriterTest.java index 4bb3f7dcb..271c760d6 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriterTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriterTest.java @@ -36,7 +36,6 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; -import javax.validation.constraints.NotNull; import com.google.common.collect.ImmutableMap; import com.google.common.util.concurrent.Uninterruptibles; @@ -53,6 +52,7 @@ import org.apache.cassandra.spark.bulkwriter.token.TokenRangeMapping; import org.apache.cassandra.spark.common.Digest; import org.apache.cassandra.spark.utils.XXHash32DigestAlgorithm; +import org.jetbrains.annotations.NotNull; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatNoException; diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TokenRangeMappingUtils.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TokenRangeMappingUtils.java index e907f1051..64ee81b74 100644 --- 
a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TokenRangeMappingUtils.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TokenRangeMappingUtils.java @@ -26,7 +26,6 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import javax.validation.constraints.NotNull; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ImmutableMap; @@ -42,6 +41,7 @@ import org.apache.cassandra.spark.data.ReplicationFactor; import org.apache.cassandra.spark.data.partitioner.Partitioner; import org.apache.cassandra.spark.utils.RangeUtils; +import org.jetbrains.annotations.NotNull; public final class TokenRangeMappingUtils { diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java index fc41d7fb1..5fe7019bd 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java @@ -37,12 +37,11 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.common.util.concurrent.Uninterruptibles; import org.apache.commons.lang.ArrayUtils; -import org.apache.commons.lang.mutable.MutableInt; import org.junit.jupiter.api.Test; +import org.apache.cassandra.analytics.stats.Stats; import org.apache.cassandra.spark.data.FileType; import org.apache.cassandra.spark.data.SSTable; -import org.apache.cassandra.analytics.stats.Stats; import org.apache.cassandra.spark.utils.streaming.BufferingInputStream; import org.apache.cassandra.spark.utils.streaming.CassandraFileSource; import org.apache.cassandra.spark.utils.streaming.StreamBuffer; @@ -61,9 +60,9 @@ public class BufferingInputStreamTests { private static final ScheduledExecutorService 
SCHEDULER = Executors.newScheduledThreadPool(1); private static final ExecutorService EXECUTOR = - Executors.newFixedThreadPool(4, new ThreadFactoryBuilder().setNameFormat("sstable-tests-%d") - .setDaemon(true) - .build()); + Executors.newFixedThreadPool(4, new ThreadFactoryBuilder().setNameFormat("sstable-tests-%d") + .setDaemon(true) + .build()); static final int DEFAULT_CHUNK_SIZE = 8192; static final Stats STATS = Stats.DoNothingStats.INSTANCE; @@ -139,9 +138,9 @@ private void runMockedTest(int numRequests, int chunksPerRequest, long maxBuffer maxBufferSize, requestChunkSize, (start, end, consumer) -> { - requestCount.incrementAndGet(); - writeBuffers(consumer, randomBuffers(chunksPerRequest)); - }, null); + requestCount.incrementAndGet(); + writeBuffers(consumer, randomBuffers(chunksPerRequest)); + }, null); BufferingInputStream is = new BufferingInputStream<>(mockedClient, STATS.bufferingInputStreamStats()); readStreamFully(is); assertThat(requestCount.get()).isEqualTo(numRequests); @@ -161,18 +160,18 @@ public void testFailure() CassandraFileSource.DEFAULT_MAX_BUFFER_SIZE, CassandraFileSource.DEFAULT_CHUNK_BUFFER_SIZE, (start, end, consumer) -> { - if (count.incrementAndGet() > (numRequests / 2)) - { - // Halfway through throw random exception - EXECUTOR.submit(() -> consumer.onError(new RuntimeException("Something bad happened..."))); - } - else - { - writeBuffers(consumer, randomBuffers(chunksPerRequest)); - } - }, null); + if (count.incrementAndGet() > (numRequests / 2)) + { + // Halfway through throw random exception + EXECUTOR.submit(() -> consumer.onError(new RuntimeException("Something bad happened..."))); + } + else + { + writeBuffers(consumer, randomBuffers(chunksPerRequest)); + } + }, null); assertThatThrownBy(() -> readStreamFully(new BufferingInputStream<>(source, STATS.bufferingInputStreamStats()))) - .isInstanceOf(IOException.class); + .isInstanceOf(IOException.class); } @Test @@ -180,21 +179,21 @@ public void testTimeout() { long now = 
System.nanoTime(); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(900).toNanos())) - .isEqualTo(Duration.ofMillis(100).toNanos()); + .isEqualTo(Duration.ofMillis(100).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(1500).toNanos())) - .isEqualTo(Duration.ofMillis(-500).toNanos()); + .isEqualTo(Duration.ofMillis(-500).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(5).toNanos())) - .isEqualTo(Duration.ofMillis(995).toNanos()); + .isEqualTo(Duration.ofMillis(995).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(0).toNanos())) - .isEqualTo(Duration.ofMillis(1000).toNanos()); + .isEqualTo(Duration.ofMillis(1000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now + Duration.ofMillis(500).toNanos())) - .isEqualTo(Duration.ofMillis(1000).toNanos()); + .isEqualTo(Duration.ofMillis(1000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(60000), now, now - Duration.ofMillis(25000).toNanos())) - .isEqualTo(Duration.ofMillis(35000).toNanos()); + .isEqualTo(Duration.ofMillis(35000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(60000), now, now - Duration.ofMillis(65000).toNanos())) - .isEqualTo(Duration.ofMillis(-5000).toNanos()); + .isEqualTo(Duration.ofMillis(-5000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(60000), now, now - Duration.ofMillis(60000).toNanos())) - .isEqualTo(Duration.ofMillis(0).toNanos()); + .isEqualTo(Duration.ofMillis(0).toNanos()); } @Test @@ -212,15 +211,15 @@ public void testTimeoutShouldAccountForActivityTime() CassandraFileSource.DEFAULT_MAX_BUFFER_SIZE, CassandraFileSource.DEFAULT_CHUNK_BUFFER_SIZE, (start, end, consumer) -> { - // Only respond once so future requests will time out - if (count.incrementAndGet() == 1) - { - EXECUTOR.submit(() -> { - Uninterruptibles.sleepUninterruptibly(sleepTimeInMillis, 
TimeUnit.MILLISECONDS); - writeBuffers(consumer, randomBuffers(chunksPerRequest)); - }); - } - }, timeout); + // Only respond once so future requests will time out + if (count.incrementAndGet() == 1) + { + EXECUTOR.submit(() -> { + Uninterruptibles.sleepUninterruptibly(sleepTimeInMillis, TimeUnit.MILLISECONDS); + writeBuffers(consumer, randomBuffers(chunksPerRequest)); + }); + } + }, timeout); BufferingInputStream inputStream = new BufferingInputStream<>(source, STATS.bufferingInputStreamStats()); try { @@ -234,9 +233,9 @@ public void testTimeoutShouldAccountForActivityTime() long readAndTimeoutTotal = TimeUnit.NANOSECONDS.toMillis(inputStream.timeBlockedNanos()) + timeout.toMillis(); Duration clientTimeoutTotal = Duration.ofNanos(System.nanoTime() - startTime); assertThat(clientTimeoutTotal.toMillis()).isGreaterThanOrEqualTo(readAndTimeoutTotal) - .describedAs("Timeout didn't account for activity time. " - + "Took %dms should have taken at least %dms", - clientTimeoutTotal.toMillis(), readAndTimeoutTotal); + .describedAs("Timeout didn't account for activity time. 
" + + "Took %dms should have taken at least %dms", + clientTimeoutTotal.toMillis(), readAndTimeoutTotal); } @Test @@ -245,8 +244,8 @@ public void testSkipOnInit() throws IOException int size = 20971520; int chunkSize = 1024; int numChunks = 16; - MutableInt bytesRead = new MutableInt(0); - MutableInt count = new MutableInt(0); + AtomicInteger bytesRead = new AtomicInteger(0); + AtomicInteger count = new AtomicInteger(0); CassandraFileSource source = new CassandraFileSource() { @Override @@ -255,8 +254,8 @@ public void request(long start, long end, StreamConsumer consumer) assertThat(start).isNotEqualTo(0); int length = (int) (end - start + 1); consumer.onRead(randomBuffer(length)); - bytesRead.add(length); - count.increment(); + bytesRead.addAndGet(length); + count.incrementAndGet(); consumer.onEnd(); } diff --git a/cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/PartitionSizeTests.java b/cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/PartitionSizeTests.java new file mode 100644 index 000000000..d3a702ef5 --- /dev/null +++ b/cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/PartitionSizeTests.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.spark; + +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.cassandra.spark.data.VersionRunner; +import org.apache.cassandra.spark.utils.test.TestSchema; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +import static org.assertj.core.api.Assertions.assertThat; + +public class PartitionSizeTests extends VersionRunner +{ + @ParameterizedTest + @MethodSource("org.apache.cassandra.spark.data.VersionRunner#versions") + public void testReadingPartitionSize(CassandraVersion version) + { + TestUtils.runTest(version, (partitioner, dir, bridge) -> { + int numRows = Tester.DEFAULT_NUM_ROWS; + int numCols = 25; + TestSchema schema = TestSchema.builder(bridge) + .withPartitionKey("a", bridge.text()) + .withClusteringKey("b", bridge.aInt()) + .withColumn("c", bridge.aInt()) + .withColumn("d", bridge.text()).build(); + + Map sizes = new HashMap<>(numRows); + schema.writeSSTable(dir, bridge, partitioner, (writer) -> { + for (int i = 0; i < numRows; i++) + { + String key = UUID.randomUUID().toString(); + int size = 0; + for (int j = 0; j < numCols; j++) + { + String str = TestUtils.randomLowEntropyString(); + writer.write(key, j, i + j, str); + size += 4 + 4 + str.getBytes(StandardCharsets.UTF_8).length; + } + sizes.put(key, size); + } + }); + + Dataset ds = TestUtils.openLocalPartitionSizeSource(bridge, + partitioner, + dir, + schema.keyspace, + schema.createStatement, + version, + Collections.emptySet(), + null); + List rows = ds.collectAsList(); + assertThat(rows).hasSize(numRows); + for (Row row : rows) + { + 
String key = row.getString(0); + long uncompressed = row.getLong(1); + long compressed = row.getLong(2); + assertThat(sizes).containsKey(key); + long len = sizes.get(key); + assertThat(len).isLessThan(uncompressed); + assertThat(Math.abs(uncompressed - len)).isLessThan(500); // uncompressed size should be ~len size but with a fixed overhead + assertThat(compressed).isLessThan(uncompressed); + assertThat(compressed / (float) uncompressed).isLessThan(0.1f); + } + }); + } +} diff --git a/cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/bulkwriter/TestTaskContext.java b/cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/bulkwriter/TestTaskContext.java new file mode 100644 index 000000000..7106bb95b --- /dev/null +++ b/cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/bulkwriter/TestTaskContext.java @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.spark.bulkwriter; + +import java.io.Closeable; +import java.util.Properties; + +import org.apache.spark.TaskContext; +import org.apache.spark.executor.TaskMetrics; +import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.metrics.source.Source; +import org.apache.spark.resource.ResourceInformation; +import org.apache.spark.shuffle.FetchFailedException; +import org.apache.spark.util.AccumulatorV2; +import org.apache.spark.util.TaskCompletionListener; +import org.apache.spark.util.TaskFailureListener; +import scala.Function0; +import scala.Option; +import scala.collection.immutable.Map; +import scala.collection.immutable.Seq; + +public class TestTaskContext extends TaskContext +{ + @Override + public boolean isCompleted() + { + return false; + } + + @Override + public boolean isFailed() + { + return false; + } + + @Override + public boolean isInterrupted() + { + return false; + } + + @Deprecated + public boolean isRunningLocally() + { + return false; + } + + @Override + public TaskContext addTaskCompletionListener(TaskCompletionListener listener) + { + return null; + } + + @Override + public TaskContext addTaskFailureListener(TaskFailureListener listener) + { + return null; + } + + @Override + public int stageId() + { + return 0; + } + + @Override + public int stageAttemptNumber() + { + return 0; + } + + @Override + public int partitionId() + { + return 0; + } + + @Override + public int numPartitions() + { + return 0; + } + + @Override + public int attemptNumber() + { + return 0; + } + + @Override + public long taskAttemptId() + { + return 0; + } + + @Override + public String getLocalProperty(String key) + { + return null; + } + + @Override + public int cpus() + { + return 1; + } + + public Map resources() + { + return null; + } + + public java.util.Map resourcesJMap() + { + return null; + } + + @Override + public TaskMetrics taskMetrics() + { + return null; + } + + @Override + public Seq getMetricsSources(String 
sourceName) + { + return null; + } + + @Override + public void killTaskIfInterrupted() + { + } + + @Override + public Option getKillReason() + { + return null; + } + + @Override + public TaskMemoryManager taskMemoryManager() + { + return null; + } + + @Override + public void registerAccumulator(AccumulatorV2 accumulator) + { + } + + @Override + public void setFetchFailed(FetchFailedException fetchFailed) + { + } + + @Override + public void markInterrupted(String reason) + { + } + + @Override + public void markTaskFailed(Throwable error) + { + } + + @Override + public void markTaskCompleted(Option error) + { + } + + @Override + public Option fetchFailed() + { + return null; + } + + @Override + public Properties getLocalProperties() + { + return null; + } + + @Override + public boolean interruptible() + { + return false; + } + + @Override + public void pendingInterrupt(Option threadToInterrupt, String reason) + { + + } + + @Override + public T createResourceUninterruptibly(Function0 resourceBuilder) + { + return null; + } +} diff --git a/cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/common/SidecarInstanceFactoryTest.java b/cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/common/SidecarInstanceFactoryTest.java new file mode 100644 index 000000000..277175a7f --- /dev/null +++ b/cassandra-analytics-core/src/test/spark4/org/apache/cassandra/spark/common/SidecarInstanceFactoryTest.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.spark.common; + +import org.junit.jupiter.api.Test; + +import o.a.c.sidecar.client.shaded.client.SidecarInstance; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class SidecarInstanceFactoryTest +{ + @Test + void testCreateSidecarInstance() + { + assertThatThrownBy(() -> SidecarInstanceFactory.createFromString("", 9999)) + .isExactlyInstanceOf(IllegalArgumentException.class) + .hasMessage("Unable to create sidecar instance from empty input"); + + assertSidecarInstance(SidecarInstanceFactory.createFromString("localhost", 9999), + "localhost", 9999); + assertSidecarInstance(SidecarInstanceFactory.createFromString("[2024:a::1]", 9999), + "[2024:a::1]", 9999); + assertSidecarInstance(SidecarInstanceFactory.createFromString("localhost:8888", 9999), + "localhost", 8888); + assertSidecarInstance(SidecarInstanceFactory.createFromString("127.0.0.1:8888", 9999), + "127.0.0.1", 8888); + assertSidecarInstance(SidecarInstanceFactory.createFromString("[2024:a::1]:8888", 9999), + "[2024:a::1]", 8888); + } + + private void assertSidecarInstance(SidecarInstance sidecarInstance, String expectedHostname, int expectedPort) + { + assertThat(sidecarInstance.hostname()).isEqualTo(expectedHostname); + assertThat(sidecarInstance.port()).isEqualTo(expectedPort); + } +} diff --git a/cassandra-analytics-sidecar-client/build.gradle b/cassandra-analytics-sidecar-client/build.gradle index bedf2c042..cad4d9593 100644 --- 
a/cassandra-analytics-sidecar-client/build.gradle +++ b/cassandra-analytics-sidecar-client/build.gradle @@ -1,5 +1,3 @@ -import java.nio.file.Paths - /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -29,8 +27,7 @@ configurations { all*.exclude(group: 'log4j', module: 'log4j') } -if (propertyWithDefault("artifactType", null) == "common") -{ +if (propertyWithDefault("artifactType", null) == "common") { apply from: "$rootDir/gradle/common/publishing.gradle" } @@ -41,4 +38,4 @@ dependencies { implementation "org.slf4j:slf4j-api:${slf4jApiVersion}" api(project(path: ':analytics-sidecar-vertx-client-shaded', configuration: 'shadow')) -} \ No newline at end of file +} diff --git a/cassandra-analytics-spark-converter/src/main/java/org/apache/cassandra/spark/data/converter/types/StringFeatures.java b/cassandra-analytics-spark-converter/src/main/java/org/apache/cassandra/spark/data/converter/types/StringFeatures.java index cc1e9a9d8..a524cd19d 100644 --- a/cassandra-analytics-spark-converter/src/main/java/org/apache/cassandra/spark/data/converter/types/StringFeatures.java +++ b/cassandra-analytics-spark-converter/src/main/java/org/apache/cassandra/spark/data/converter/types/StringFeatures.java @@ -79,6 +79,6 @@ default int compareTo(Object first, Object second) { return STRING_COMPARATOR.compare(first.toString(), second.toString()); } - return ((UTF8String) first).compare((UTF8String) second); + return ((UTF8String) first).compareTo((UTF8String) second); } } diff --git a/cassandra-analytics-spark-converter/src/main/scala-2.11-spark-2/org/apache/cassandra/spark/utils/ScalaConversionUtils.java b/cassandra-analytics-spark-converter/src/main/scala-2.11-spark-2/org/apache/cassandra/spark/utils/ScalaConversionUtils.java deleted file mode 100644 index 5cfb85bc5..000000000 --- a/cassandra-analytics-spark-converter/src/main/scala-2.11-spark-2/org/apache/cassandra/spark/utils/ScalaConversionUtils.java +++ 
/dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.cassandra.spark.utils; - -import java.util.List; - -import scala.collection.JavaConversions; -import scala.collection.mutable.Seq; - -/** - * Compatibility layer for scala conversions - */ -public final class ScalaConversionUtils -{ - private ScalaConversionUtils() - { - throw new IllegalStateException(getClass() + " is static utility class and shall not be instantiated"); - } - - public static java.lang.Iterable asJavaIterable(scala.collection.Iterable iterable) - { - return JavaConversions.asJavaIterable(iterable); - } - - public static scala.collection.Iterator asScalaIterator(java.util.Iterator iterator) - { - return JavaConversions.asScalaIterator(iterator); - } - - public static java.util.Map mapAsJavaMap(scala.collection.Map map) - { - return JavaConversions.mapAsJavaMap(map); - } - - public static List mutableSeqAsJavaList(Seq seq) - { - return JavaConversions.mutableSeqAsJavaList(seq); - } -} diff --git a/cassandra-analytics-spark-converter/src/main/scala-2.12-spark-2/org/apache/cassandra/spark/utils/SparkTypeUtils.java 
b/cassandra-analytics-spark-converter/src/main/scala-2.12-spark-2/org/apache/cassandra/spark/utils/SparkTypeUtils.java deleted file mode 100644 index 4df97e31a..000000000 --- a/cassandra-analytics-spark-converter/src/main/scala-2.12-spark-2/org/apache/cassandra/spark/utils/SparkTypeUtils.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.cassandra.spark.utils; - -import java.util.Comparator; -import java.util.concurrent.TimeUnit; - -import com.google.common.primitives.Ints; - -import org.apache.cassandra.bridge.type.InternalDuration; -import org.apache.spark.unsafe.types.CalendarInterval; - -public final class SparkTypeUtils -{ - private SparkTypeUtils() - { - throw new IllegalStateException(getClass() + " is static utility class and shall not be instantiated"); - } - - public static final Comparator CALENDAR_INTERVAL_COMPARATOR = - Comparator.comparingInt(interval -> interval.months) - .thenComparingLong(interval -> interval.microseconds); - - public static CalendarInterval convertDuration(InternalDuration duration) - { - // Unfortunately, it loses precision when converting to the spark data type. 
- long micros = TimeUnit.NANOSECONDS.toMicros(duration.nanoseconds); - micros += duration.days * CalendarInterval.MICROS_PER_DAY; - return new CalendarInterval(duration.months, micros); - } - - public static InternalDuration convertDuration(CalendarInterval interval) - { - int days = Ints.checkedCast(interval.microseconds / CalendarInterval.MICROS_PER_DAY); - long microsRemain = interval.microseconds % CalendarInterval.MICROS_PER_DAY; - return new InternalDuration(interval.months, days, TimeUnit.MICROSECONDS.toNanos(microsRemain)); - } -} diff --git a/cassandra-analytics-spark-converter/src/main/scala-2.12-spark-2/org/apache/cassandra/spark/utils/ScalaConversionUtils.java b/cassandra-analytics-spark-converter/src/main/scala-2.13-spark-4/org/apache/cassandra/spark/utils/ScalaConversionUtils.java similarity index 75% rename from cassandra-analytics-spark-converter/src/main/scala-2.12-spark-2/org/apache/cassandra/spark/utils/ScalaConversionUtils.java rename to cassandra-analytics-spark-converter/src/main/scala-2.13-spark-4/org/apache/cassandra/spark/utils/ScalaConversionUtils.java index 23eabb3f5..e225e1ff8 100644 --- a/cassandra-analytics-spark-converter/src/main/scala-2.12-spark-2/org/apache/cassandra/spark/utils/ScalaConversionUtils.java +++ b/cassandra-analytics-spark-converter/src/main/scala-2.13-spark-4/org/apache/cassandra/spark/utils/ScalaConversionUtils.java @@ -21,8 +21,8 @@ import java.util.List; -import scala.collection.JavaConverters; import scala.collection.mutable.Seq; +import scala.jdk.javaapi.CollectionConverters; /** * Compatibility layer for scala conversions @@ -34,23 +34,23 @@ private ScalaConversionUtils() throw new IllegalStateException(getClass() + " is static utility class and shall not be instantiated"); } - public static java.lang.Iterable asJavaIterable(scala.collection.Iterable iterable) + public static Iterable asJavaIterable(scala.collection.Iterable iterable) { - return JavaConverters.asJavaIterable(iterable); + return 
CollectionConverters.asJava(iterable); } public static scala.collection.Iterator asScalaIterator(java.util.Iterator iterator) { - return JavaConverters.asScalaIterator(iterator); + return CollectionConverters.asScala(iterator); } - public static java.util.Map mapAsJavaMap(scala.collection.Map map) + public static java.util.Map mapAsJavaMap(scala.collection.Map map) { - return JavaConverters.mapAsJavaMap(map); + return CollectionConverters.asJava(map); } public static List mutableSeqAsJavaList(Seq seq) { - return JavaConverters.mutableSeqAsJavaList(seq); + return CollectionConverters.asJava(seq); } } diff --git a/cassandra-analytics-spark-converter/src/main/scala-2.11-spark-2/org/apache/cassandra/spark/utils/SparkTypeUtils.java b/cassandra-analytics-spark-converter/src/main/scala-2.13-spark-4/org/apache/cassandra/spark/utils/SparkTypeUtils.java similarity index 76% rename from cassandra-analytics-spark-converter/src/main/scala-2.11-spark-2/org/apache/cassandra/spark/utils/SparkTypeUtils.java rename to cassandra-analytics-spark-converter/src/main/scala-2.13-spark-4/org/apache/cassandra/spark/utils/SparkTypeUtils.java index 4df97e31a..4d200b355 100644 --- a/cassandra-analytics-spark-converter/src/main/scala-2.11-spark-2/org/apache/cassandra/spark/utils/SparkTypeUtils.java +++ b/cassandra-analytics-spark-converter/src/main/scala-2.13-spark-4/org/apache/cassandra/spark/utils/SparkTypeUtils.java @@ -22,8 +22,6 @@ import java.util.Comparator; import java.util.concurrent.TimeUnit; -import com.google.common.primitives.Ints; - import org.apache.cassandra.bridge.type.InternalDuration; import org.apache.spark.unsafe.types.CalendarInterval; @@ -36,20 +34,17 @@ private SparkTypeUtils() public static final Comparator CALENDAR_INTERVAL_COMPARATOR = Comparator.comparingInt(interval -> interval.months) + .thenComparingInt(interval -> interval.days) .thenComparingLong(interval -> interval.microseconds); public static CalendarInterval convertDuration(InternalDuration duration) { // 
Unfortunately, it loses precision when converting to the spark data type. - long micros = TimeUnit.NANOSECONDS.toMicros(duration.nanoseconds); - micros += duration.days * CalendarInterval.MICROS_PER_DAY; - return new CalendarInterval(duration.months, micros); + return new CalendarInterval(duration.months, duration.days, TimeUnit.NANOSECONDS.toMicros(duration.nanoseconds)); } public static InternalDuration convertDuration(CalendarInterval interval) { - int days = Ints.checkedCast(interval.microseconds / CalendarInterval.MICROS_PER_DAY); - long microsRemain = interval.microseconds % CalendarInterval.MICROS_PER_DAY; - return new InternalDuration(interval.months, days, TimeUnit.MICROSECONDS.toNanos(microsRemain)); + return new InternalDuration(interval.months, interval.days, TimeUnit.MICROSECONDS.toNanos(interval.microseconds)); } } diff --git a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java index 88d5c6df0..440f02eab 100644 --- a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java +++ b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java @@ -211,15 +211,15 @@ public StreamScanner getCompactionScanner(@NotNull CqlTable table, TableMetadata metadata = schemaBuilder.tableMetaData(); return new CompactionStreamScanner(metadata, partitioner, timeProvider, ssTables.openAll((ssTable, isRepairPrimary) -> { return org.apache.cassandra.spark.reader.SSTableReader.builder(metadata, ssTable) - .withSparkRangeFilter(sparkRangeFilter) - .withPartitionKeyFilters(partitionKeyFilters) - .withTimeRangeFilter(sstableTimeRangeFilter) - .withColumnFilter(columnFilter) - .withReadIndexOffset(readIndexOffset) - .withStats(stats) - .useIncrementalRepair(useIncrementalRepair) - .isRepairPrimary(isRepairPrimary) - .build(); + 
.withSparkRangeFilter(sparkRangeFilter) + .withPartitionKeyFilters(partitionKeyFilters) + .withTimeRangeFilter(sstableTimeRangeFilter) + .withColumnFilter(columnFilter) + .withReadIndexOffset(readIndexOffset) + .withStats(stats) + .useIncrementalRepair(useIncrementalRepair) + .isRepairPrimary(isRepairPrimary) + .build(); })); } @@ -475,9 +475,9 @@ public void readPartitionKeys(Partitioner partitioner, TimeProvider.DEFAULT, ssTables.openAll((ssTable, isRepairPrimary) -> org.apache.cassandra.spark.reader.SSTableReader.builder(metadata, ssTable) - .withPartitionKeyFilters(partitionKeyFilters1) - .withTimeRangeFilter(timeRangeFilter1) - .build()) + .withPartitionKeyFilters(partitionKeyFilters1) + .withTimeRangeFilter(timeRangeFilter1) + .build()) )) { @Override diff --git a/gradle.properties b/gradle.properties index d19e9a518..30e1aa3b9 100644 --- a/gradle.properties +++ b/gradle.properties @@ -20,7 +20,7 @@ version=0.5-SNAPSHOT snapshot=true description=Apache Cassandra Analytics -analyticsJDKLevel=11 +analyticsJDKLevel=17 cassandra40Version=4.0.17 cassandra50Version=5.0.5 sidecarVersion=0.2.0 @@ -30,8 +30,8 @@ assertjCoreVersion=3.24.2 quickTheoriesVersion=0.26 mockitoVersion=3.12.4 jnaVersion=5.9.0 -scala=2.12 -spark=3 +scala=2.13 +spark=4 kryoVersion=4.0.2 slf4jApiVersion=1.7.30 guavaVersion=16.0.1 diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index ca025c83a..d4081da47 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.14-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.14.3-bin.zip networkTimeout=10000 validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME diff --git a/profiles/scala-2.13-spark-4-jdk-17.gradle b/profiles/scala-2.13-spark-4-jdk-17.gradle new file mode 100644 index 
000000000..859cb6b2f --- /dev/null +++ b/profiles/scala-2.13-spark-4-jdk-17.gradle @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +ext { + jacksonVersion="2.18.2" + jacksonScalaModuleVersion="2.18.2" + scalaMajorVersion="2.13" + scalaVersion="2.13.16" + sparkGroupId="org.apache.spark" + sparkMajorVersion="4" + sparkVersion="4.0.1" +} From 5c04e4d319a3e9703ca62e8ad16a8747fe33d766 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Thu, 8 Jan 2026 14:47:52 -0800 Subject: [PATCH 02/26] Use Sets.newHashSet --- .../org/apache/cassandra/spark/sparksql/CassandraTable.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTable.java b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTable.java index f78e4c5b5..35790a96c 100644 --- a/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTable.java +++ b/cassandra-analytics-core/src/main/spark4/org/apache/cassandra/spark/sparksql/CassandraTable.java @@ -19,10 +19,9 @@ package org.apache.cassandra.spark.sparksql; -import java.util.HashSet; import java.util.Set; -import 
com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; import org.apache.cassandra.spark.data.DataLayer; import org.apache.spark.sql.connector.catalog.SupportsRead; @@ -58,7 +57,7 @@ public StructType schema() @Override public Set capabilities() { - return new HashSet<>(ImmutableList.of(TableCapability.BATCH_READ, TableCapability.MICRO_BATCH_READ)); + return Sets.newHashSet(TableCapability.BATCH_READ, TableCapability.MICRO_BATCH_READ); } @Override From 7aec5732286903ec50c4d2183f614636725c1fa1 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Thu, 8 Jan 2026 14:50:16 -0800 Subject: [PATCH 03/26] restore formatting --- .../bridge/CassandraBridgeImplementation.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java index 440f02eab..88d5c6df0 100644 --- a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java +++ b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java @@ -211,15 +211,15 @@ public StreamScanner getCompactionScanner(@NotNull CqlTable table, TableMetadata metadata = schemaBuilder.tableMetaData(); return new CompactionStreamScanner(metadata, partitioner, timeProvider, ssTables.openAll((ssTable, isRepairPrimary) -> { return org.apache.cassandra.spark.reader.SSTableReader.builder(metadata, ssTable) - .withSparkRangeFilter(sparkRangeFilter) - .withPartitionKeyFilters(partitionKeyFilters) - .withTimeRangeFilter(sstableTimeRangeFilter) - .withColumnFilter(columnFilter) - .withReadIndexOffset(readIndexOffset) - .withStats(stats) - .useIncrementalRepair(useIncrementalRepair) - .isRepairPrimary(isRepairPrimary) - .build(); + .withSparkRangeFilter(sparkRangeFilter) + 
.withPartitionKeyFilters(partitionKeyFilters) + .withTimeRangeFilter(sstableTimeRangeFilter) + .withColumnFilter(columnFilter) + .withReadIndexOffset(readIndexOffset) + .withStats(stats) + .useIncrementalRepair(useIncrementalRepair) + .isRepairPrimary(isRepairPrimary) + .build(); })); } @@ -475,9 +475,9 @@ public void readPartitionKeys(Partitioner partitioner, TimeProvider.DEFAULT, ssTables.openAll((ssTable, isRepairPrimary) -> org.apache.cassandra.spark.reader.SSTableReader.builder(metadata, ssTable) - .withPartitionKeyFilters(partitionKeyFilters1) - .withTimeRangeFilter(timeRangeFilter1) - .build()) + .withPartitionKeyFilters(partitionKeyFilters1) + .withTimeRangeFilter(timeRangeFilter1) + .build()) )) { @Override From bb5275fade053c8a21d7a09d706cb2ca7eafadae Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Thu, 8 Jan 2026 16:29:20 -0800 Subject: [PATCH 04/26] Remove toolchain --- analytics-sidecar-vertx-client-shaded/build.gradle | 6 ------ 1 file changed, 6 deletions(-) diff --git a/analytics-sidecar-vertx-client-shaded/build.gradle b/analytics-sidecar-vertx-client-shaded/build.gradle index f5414a6ea..028327caa 100644 --- a/analytics-sidecar-vertx-client-shaded/build.gradle +++ b/analytics-sidecar-vertx-client-shaded/build.gradle @@ -36,12 +36,6 @@ plugins { id('com.gradleup.shadow') version '8.3.9' } -java { - toolchain { - languageVersion = JavaLanguageVersion.of(11) - } -} - version project.version configurations { From 5b44ac80d13a36046cb0abf16fb0216e9ac1684d Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Fri, 9 Jan 2026 14:46:53 -0800 Subject: [PATCH 05/26] Add JDK17 flags to run tests and upgrade jacoco to support new java formats --- cassandra-analytics-cdc-codec/build.gradle | 5 ++ cassandra-analytics-cdc/build.gradle | 5 ++ cassandra-analytics-core/build.gradle | 52 ++++++++++++------- .../bulkwriter/DirectStreamSessionTest.java | 2 +- .../spark/bulkwriter/RecordWriterTest.java | 4 +- .../spark/data/ClientConfigTests.java | 2 +- 6 files 
changed, 46 insertions(+), 24 deletions(-) diff --git a/cassandra-analytics-cdc-codec/build.gradle b/cassandra-analytics-cdc-codec/build.gradle index 60b722089..f83224080 100644 --- a/cassandra-analytics-cdc-codec/build.gradle +++ b/cassandra-analytics-cdc-codec/build.gradle @@ -69,6 +69,11 @@ sourceSets { } test { + if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { + jvmArgs(['--add-opens', 'java.base/java.io=ALL-UNNAMED', + '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED']) + } + useJUnitPlatform() reports { def destDir = Paths.get(rootProject.rootDir.absolutePath, "build", "test-reports", "four-zero-types").toFile() diff --git a/cassandra-analytics-cdc/build.gradle b/cassandra-analytics-cdc/build.gradle index e413b242d..feffabd1a 100644 --- a/cassandra-analytics-cdc/build.gradle +++ b/cassandra-analytics-cdc/build.gradle @@ -165,6 +165,11 @@ def configureCdcTestTask = { Test task, String majorMinor = null -> task.maxParallelForks = cdcMaxParallelForks task.forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations + if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { + jvmArgs(['--add-opens', 'java.base/java.io=ALL-UNNAMED', + '--add-opens', 'java.base/java.nio=ALL-UNNAMED', + '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED']) + } // Make it so unit tests run on a Jar with Cassandra bridge implementations built in task.dependsOn(tasks.jar) task.classpath += files(jar.archiveFile) diff --git a/cassandra-analytics-core/build.gradle b/cassandra-analytics-core/build.gradle index 8df6b1399..4f9a536d5 100644 --- a/cassandra-analytics-core/build.gradle +++ b/cassandra-analytics-core/build.gradle @@ -159,26 +159,31 @@ jar { // (CI) or falls back to all supported versions (local dev). 
def configureCoreTestTask(Test task, String majorMinor = null) { task.doFirst { - if (JavaVersion.current().isJava11Compatible()) { - def JDK11_OPTIONS = ['-Djdk.attach.allowAttachSelf=true', - '--add-exports', 'java.base/jdk.internal.misc=ALL-UNNAMED', - '--add-exports', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-exports', 'java.base/sun.nio.ch=ALL-UNNAMED', - '--add-exports', 'java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.registry=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.server=ALL-UNNAMED', - '--add-exports', 'java.sql/java.sql=ALL-UNNAMED', - '--add-opens', 'java.base/java.lang.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.loader=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.reflect=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.math=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.util.jar=ALL-UNNAMED', - '--add-opens', 'jdk.management/com.sun.management.internal=ALL-UNNAMED'] - task.jvmArgs(JDK11_OPTIONS) - println("JVM arguments for $project.name are $allJvmArgs") + def JDK_OPTIONS = ['-Djdk.attach.allowAttachSelf=true', + '--add-exports', 'java.base/jdk.internal.misc=ALL-UNNAMED', + '--add-exports', 'java.base/jdk.internal.ref=ALL-UNNAMED', + '--add-exports', 'java.base/sun.nio.ch=ALL-UNNAMED', + '--add-exports', 'java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED', + '--add-exports', 'java.rmi/sun.rmi.registry=ALL-UNNAMED', + '--add-exports', 'java.rmi/sun.rmi.server=ALL-UNNAMED', + '--add-exports', 'java.sql/java.sql=ALL-UNNAMED', + '--add-opens', 'java.base/java.lang.module=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.loader=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.ref=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.reflect=ALL-UNNAMED', + '--add-opens', 
'java.base/jdk.internal.math=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.module=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.util.jar=ALL-UNNAMED', + '--add-opens', 'jdk.management/com.sun.management.internal=ALL-UNNAMED'] + + if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { + JDK_OPTIONS += ['--add-opens', 'java.base/java.io=ALL-UNNAMED', + '--add-opens', 'java.base/java.math=ALL-UNNAMED', + '--add-opens', 'java.base/java.nio=ALL-UNNAMED', + '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED'] } + task.jvmArgs(JDK_OPTIONS) + println("JVM arguments for $project.name are $allJvmArgs") } task.systemProperty "cassandra.analytics.bridges.sstable_format", System.getProperty("cassandra.analytics.bridges.sstable_format", "big") @@ -222,6 +227,13 @@ test { maxParallelForks = System.getenv('CORE_MAX_PARALLEL_FORKS')?.toInteger() ?: Math.max(Runtime.runtime.availableProcessors() * 2, 8) + if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { + jvmArgs(['--add-opens', 'java.base/java.io=ALL-UNNAMED', + '--add-opens', 'java.base/java.math=ALL-UNNAMED', + '--add-opens', 'java.base/java.nio=ALL-UNNAMED', + '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED']) + } + useJUnitPlatform { excludeTags 'Sequential' } @@ -261,7 +273,7 @@ rootProject.ext.cassandraVersionEnumMap.keySet().each { majorMinor -> /* Start: JaCoCo check */ jacoco { - toolVersion = '0.8.4' + toolVersion = '0.8.13' } jacocoTestReport { diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/DirectStreamSessionTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/DirectStreamSessionTest.java index 1c2d60182..f6fe8c5e8 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/DirectStreamSessionTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/DirectStreamSessionTest.java @@ -124,7 +124,7 @@ void 
testEmptyTokenRangeFails() replicaAwareFailureHandler(), null) ) .isInstanceOf(IllegalStateException.class) - .hasMessage("No replicas found for range (0‥0]"); + .hasMessageMatching("No replicas found for range \\(0(‥|..)0]"); } @Test diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/RecordWriterTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/RecordWriterTest.java index 8a2c17286..b1ce66e35 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/RecordWriterTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/RecordWriterTest.java @@ -394,10 +394,10 @@ void testWriteWithOutOfRangeTokenFails() (wc, path, dp, pid) -> new SortedSSTableWriter(tw, folder, digestAlgorithm, pid)); Iterator> data = generateData(5, Range.all(), false, false, false); String expectedErr = "java.lang.IllegalStateException: Received Token " + - "5765203080415074583 outside the expected ranges [(-9223372036854775808‥100000]]"; + "5765203080415074583 outside the expected ranges \\[\\(-9223372036854775808(‥|..)100000]]"; assertThatThrownBy(() -> rw.write(data)) .isInstanceOf(RuntimeException.class) - .hasMessage(expectedErr); + .hasMessageMatching(expectedErr); } @Test diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/ClientConfigTests.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/ClientConfigTests.java index b42e33d22..2d6a58966 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/ClientConfigTests.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/ClientConfigTests.java @@ -206,6 +206,6 @@ void testTimeRangeFilterValidatesRangeOrdering() assertThatThrownBy(() -> ClientConfig.create(options)) .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Invalid range: [2000‥1000]"); + 
.hasMessageMatching("Invalid range: \\[2000(‥|..)1000]"); } } From cd284b53296bb1bafe7701abd611814dad5f84a6 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Fri, 9 Jan 2026 14:56:31 -0800 Subject: [PATCH 06/26] Add another missing add-opens --- cassandra-analytics-core/build.gradle | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cassandra-analytics-core/build.gradle b/cassandra-analytics-core/build.gradle index 4f9a536d5..35353f3c1 100644 --- a/cassandra-analytics-core/build.gradle +++ b/cassandra-analytics-core/build.gradle @@ -180,7 +180,8 @@ def configureCoreTestTask(Test task, String majorMinor = null) { JDK_OPTIONS += ['--add-opens', 'java.base/java.io=ALL-UNNAMED', '--add-opens', 'java.base/java.math=ALL-UNNAMED', '--add-opens', 'java.base/java.nio=ALL-UNNAMED', - '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED'] + '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED', + '--add-opens', 'java.base/sun.util.calendar=ALL-UNNAMED'] } task.jvmArgs(JDK_OPTIONS) println("JVM arguments for $project.name are $allJvmArgs") From d39f0a67d4cf8a5519036288e802c8ff3a272e56 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Sun, 11 Jan 2026 09:58:59 -0800 Subject: [PATCH 07/26] Add JDK17 exports and add-opens --- .../build.gradle | 54 ++++++++++++------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/cassandra-analytics-integration-tests/build.gradle b/cassandra-analytics-integration-tests/build.gradle index 4604e8c51..8a363e312 100644 --- a/cassandra-analytics-integration-tests/build.gradle +++ b/cassandra-analytics-integration-tests/build.gradle @@ -125,26 +125,42 @@ def configureIntegrationTestTask = { Test task, String majorMinor = null -> showStandardStreams = false } - if (JavaVersion.current().isJava11Compatible()) { - def JDK11_OPTIONS = ['-Djdk.attach.allowAttachSelf=true', - '--add-exports', 'java.base/jdk.internal.misc=ALL-UNNAMED', - '--add-exports', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-exports', 
'java.base/sun.nio.ch=ALL-UNNAMED', - '--add-exports', 'java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.registry=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.server=ALL-UNNAMED', - '--add-exports', 'java.sql/java.sql=ALL-UNNAMED', - '--add-opens', 'java.base/java.lang.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.loader=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.reflect=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.math=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.util.jar=ALL-UNNAMED', - '--add-opens', 'jdk.management/com.sun.management.internal=ALL-UNNAMED'] - task.jvmArgs(JDK11_OPTIONS) - println("JVM arguments for $project.name are ${task.allJvmArgs}") + def JDK_OPTIONS = ['-Djdk.attach.allowAttachSelf=true', + '--add-exports', 'java.base/jdk.internal.misc=ALL-UNNAMED', + '--add-exports', 'java.base/jdk.internal.ref=ALL-UNNAMED', + '--add-exports', 'java.base/sun.nio.ch=ALL-UNNAMED', + '--add-exports', 'java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED', + '--add-exports', 'java.rmi/sun.rmi.registry=ALL-UNNAMED', + '--add-exports', 'java.rmi/sun.rmi.server=ALL-UNNAMED', + '--add-exports', 'java.sql/java.sql=ALL-UNNAMED', + '--add-opens', 'java.base/java.lang.module=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.loader=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.ref=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.reflect=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.math=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.module=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.util.jar=ALL-UNNAMED', + '--add-opens', 'jdk.management/com.sun.management.internal=ALL-UNNAMED'] + + if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { + JDK_OPTIONS += ['--add-exports', 
'java.rmi/sun.rmi.transport=ALL-UNNAMED', + '--add-exports', 'java.rmi/sun.rmi.transport.tcp=ALL-UNNAMED', + '--add-opens', 'java.base/java.io=ALL-UNNAMED', + '--add-opens', 'java.base/java.lang=ALL-UNNAMED', + '--add-opens', 'java.base/java.lang.invoke=ALL-UNNAMED', + '--add-opens', 'java.base/java.lang.reflect=ALL-UNNAMED', + '--add-opens', 'java.base/java.math=ALL-UNNAMED', + '--add-opens', 'java.base/java.nio=ALL-UNNAMED', + '--add-opens', 'java.base/java.util=ALL-UNNAMED', + '--add-opens', 'java.base/java.util.concurrent=ALL-UNNAMED', + '--add-opens', 'java.base/java.util.concurrent.atomic=ALL-UNNAMED', + '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED', + '--add-opens', 'java.base/sun.util.calendar=ALL-UNNAMED', + '--add-opens', 'java.rmi/sun.rmi.transport.tcp=ALL-UNNAMED', + ] } + task.jvmArgs(JDK_OPTIONS) + println("JVM arguments for $project.name are ${task.allJvmArgs}") // Some test classes are skipped entirely via assumeThat (e.g. MultipleTokens tests on C* 4.0). // When CI runs individual classes with --tests, a skipped @BeforeAll leaves zero discovered tests, From f20cd55b68ed5f23bef68a22b194e754efd9150a Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 13 Jan 2026 14:24:33 -0800 Subject: [PATCH 08/26] consolidate jvm options --- build.gradle | 33 ++++------- cassandra-analytics-cdc-codec/build.gradle | 5 +- cassandra-analytics-cdc/build.gradle | 6 +- .../cassandra/spark/utils/BuildInfo.java | 27 ++++++++- cassandra-analytics-core-example/build.gradle | 20 +------ cassandra-analytics-core/build.gradle | 36 ++---------- .../spark/bulkwriter/BulkSparkConf.java | 22 +++++++- .../cassandra/spark/utils/BuildInfoTest.java | 56 +++++++++++-------- .../build.gradle | 41 ++------------ .../cassandra/analytics/SparkTestUtils.java | 7 ++- cassandra-five-zero-bridge/build.gradle | 26 ++------- gradle/common/java11Options.gradle | 36 ++++++++++++ gradle/common/java17Options.gradle | 37 ++++++++++++ gradle/common/javaOptions.gradle | 26 +++++++++ 14 
files changed, 217 insertions(+), 161 deletions(-) create mode 100644 gradle/common/java11Options.gradle create mode 100644 gradle/common/java17Options.gradle create mode 100644 gradle/common/javaOptions.gradle diff --git a/build.gradle b/build.gradle index 065e903f8..258aba16b 100644 --- a/build.gradle +++ b/build.gradle @@ -109,7 +109,6 @@ apply(from: profile) apply(plugin: 'idea') - // Force checkstyle and rat to run before test tasks for faster feedback def codeCheckTasks = task("codeCheckTasks") @@ -190,6 +189,8 @@ subprojects { apply(plugin: 'java-library') apply(plugin: 'checkstyle') + apply from: "${project.rootDir}/gradle/common/javaOptions.gradle" + sourceCompatibility = "${project.rootProject.ext.jdkLabel}" targetCompatibility = "${project.rootProject.ext.jdkLabel}" @@ -259,28 +260,14 @@ subprojects { def heapDumpPath = "${project.rootProject.rootDir}/build/${project.name}/heapDumps" Files.createDirectories(Paths.get(heapDumpPath)) - if (JavaVersion.current().isJava11Compatible()) { - def JDK11_OPTIONS = ['-Djdk.attach.allowAttachSelf=true', - '--add-exports', 'java.base/jdk.internal.misc=ALL-UNNAMED', - '--add-exports', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-exports', 'java.base/sun.nio.ch=ALL-UNNAMED', - '--add-exports', 'java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.registry=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.server=ALL-UNNAMED', - '--add-exports', 'java.sql/java.sql=ALL-UNNAMED', - '--add-opens', 'java.base/java.lang.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.loader=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.reflect=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.math=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.util.jar=ALL-UNNAMED', - '--add-opens', 
'jdk.management/com.sun.management.internal=ALL-UNNAMED', - '-XX:+HeapDumpOnOutOfMemoryError', - "-XX:HeapDumpPath=${heapDumpPath}"] - jvmArgs(JDK11_OPTIONS) - println("JVM arguments for $project.name are $allJvmArgs") - } + def jdkOptions = project.ext.JDK_OPTIONS + + jdkOptions += [ + '-XX:+HeapDumpOnOutOfMemoryError', + "-XX:HeapDumpPath=${heapDumpPath}", + ] + jvmArgs(jdkOptions) + println("JVM arguments for $project.name are $allJvmArgs") } codeCheckTasks.dependsOn(tasks.withType(Checkstyle)) diff --git a/cassandra-analytics-cdc-codec/build.gradle b/cassandra-analytics-cdc-codec/build.gradle index f83224080..e8ac31877 100644 --- a/cassandra-analytics-cdc-codec/build.gradle +++ b/cassandra-analytics-cdc-codec/build.gradle @@ -68,10 +68,11 @@ sourceSets { } } +apply from: "${project.rootDir}/gradle/common/java17Options.gradle" + test { if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { - jvmArgs(['--add-opens', 'java.base/java.io=ALL-UNNAMED', - '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED']) + jvmArgs(project.ext.JDK17_OPTIONS) } useJUnitPlatform() diff --git a/cassandra-analytics-cdc/build.gradle b/cassandra-analytics-cdc/build.gradle index feffabd1a..74aa5a4c5 100644 --- a/cassandra-analytics-cdc/build.gradle +++ b/cassandra-analytics-cdc/build.gradle @@ -141,6 +141,8 @@ jar { } } +apply from: "${project.rootDir}/gradle/common/javaOptions.gradle" + // Assuming container w/8g ram, 2x3G == 6G + overhead; override via CDC_MAX_PARALLEL_FORKS for local dev def cdcMaxHeap = '3072m' def cdcMaxParallelForks = (System.getenv("CDC_MAX_PARALLEL_FORKS") ?: "2") as int @@ -166,9 +168,7 @@ def configureCdcTestTask = { Test task, String majorMinor = null -> task.forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { - jvmArgs(['--add-opens', 'java.base/java.io=ALL-UNNAMED', - '--add-opens', 'java.base/java.nio=ALL-UNNAMED', - 
'--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED']) + jvmArgs(project.ext.JDK17_OPTIONS) } // Make it so unit tests run on a Jar with Cassandra bridge implementations built in task.dependsOn(tasks.jar) diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/BuildInfo.java b/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/BuildInfo.java index 8e3ffd86a..6a4f5a138 100644 --- a/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/BuildInfo.java +++ b/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/BuildInfo.java @@ -66,6 +66,29 @@ static String getBuildVersion() * @return {@code true} if the java version is at least java 11, {@code false} otherwise */ public static boolean isAtLeastJava11(String version) + { + return isAtLeastJavaVersion(version, 11); + } + + /** + * Determine whether the provided version is at least java 17 + * + * @param version the java specification version + * @return {@code true} if the java version is at least java 17, {@code false} otherwise + */ + public static boolean isAtLeastJava17(String version) + { + return isAtLeastJavaVersion(version, 17); + } + + /** + * Determine whether the provided version is at least the minimum java version + * + * @param version the java specification version + * @param minimumVersion the minimum version required + * @return {@code true} if the java version is at least the minimum java version, {@code false} otherwise + */ + public static boolean isAtLeastJavaVersion(String version, int minimumVersion) { if (version == null) { @@ -73,11 +96,11 @@ public static boolean isAtLeastJava11(String version) } else if (version.contains(".")) { - return version.compareTo("11") >= 0; + return version.compareTo(minimumVersion + "") >= 0; } else { - return Integer.parseInt(version) >= 11; + return Integer.parseInt(version) >= minimumVersion; } } diff --git a/cassandra-analytics-core-example/build.gradle 
b/cassandra-analytics-core-example/build.gradle index b8ab3d402..ad0175172 100644 --- a/cassandra-analytics-core-example/build.gradle +++ b/cassandra-analytics-core-example/build.gradle @@ -33,6 +33,8 @@ dependencies { implementation(group: "${sparkGroupId}", name: "spark-sql_${scalaMajorVersion}", version: "${project.rootProject.sparkVersion}") } +apply from: "${project.rootDir}/gradle/common/javaOptions.gradle" + application { // Optionally allow to start with a different mainClass, rather than the default 'DirectWriteAndReadJob' // For example, @@ -40,22 +42,6 @@ application { // or // ./gradlew :cassandra-analytics-core-example:run --args='LocalS3WriteAndReadJob' mainClass = 'org.apache.cassandra.spark.example.JobSelector' - applicationDefaultJvmArgs = ["-Dfile.encoding=UTF-8", - "-Djdk.attach.allowAttachSelf=true", - "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED", - "--add-exports", "java.base/jdk.internal.ref=ALL-UNNAMED", - "--add-exports", "java.base/sun.nio.ch=ALL-UNNAMED", - "--add-exports", "java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED", - "--add-exports", "java.rmi/sun.rmi.registry=ALL-UNNAMED", - "--add-exports", "java.rmi/sun.rmi.server=ALL-UNNAMED", - "--add-exports", "java.sql/java.sql=ALL-UNNAMED", - "--add-opens", "java.base/java.lang.module=ALL-UNNAMED", - "--add-opens", "java.base/jdk.internal.loader=ALL-UNNAMED", - "--add-opens", "java.base/jdk.internal.ref=ALL-UNNAMED", - "--add-opens", "java.base/jdk.internal.reflect=ALL-UNNAMED", - "--add-opens", "java.base/jdk.internal.math=ALL-UNNAMED", - "--add-opens", "java.base/jdk.internal.module=ALL-UNNAMED", - "--add-opens", "java.base/jdk.internal.util.jar=ALL-UNNAMED", - "--add-opens", "jdk.management/com.sun.management.internal=ALL-UNNAMED"] + applicationDefaultJvmArgs = project.ext.JDK_OPTIONS } diff --git a/cassandra-analytics-core/build.gradle b/cassandra-analytics-core/build.gradle index 35353f3c1..dc9326732 100644 --- a/cassandra-analytics-core/build.gradle +++ 
b/cassandra-analytics-core/build.gradle @@ -25,11 +25,12 @@ plugins { id('maven-publish') } -if (propertyWithDefault("artifactType", null) == "spark") -{ +if (propertyWithDefault("artifactType", null) == "spark") { apply from: "$rootDir/gradle/common/publishing.gradle" } +apply from: "${project.rootDir}/gradle/common/javaOptions.gradle" + java { withJavadocJar() withSourcesJar() @@ -159,31 +160,7 @@ jar { // (CI) or falls back to all supported versions (local dev). def configureCoreTestTask(Test task, String majorMinor = null) { task.doFirst { - def JDK_OPTIONS = ['-Djdk.attach.allowAttachSelf=true', - '--add-exports', 'java.base/jdk.internal.misc=ALL-UNNAMED', - '--add-exports', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-exports', 'java.base/sun.nio.ch=ALL-UNNAMED', - '--add-exports', 'java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.registry=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.server=ALL-UNNAMED', - '--add-exports', 'java.sql/java.sql=ALL-UNNAMED', - '--add-opens', 'java.base/java.lang.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.loader=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.reflect=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.math=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.util.jar=ALL-UNNAMED', - '--add-opens', 'jdk.management/com.sun.management.internal=ALL-UNNAMED'] - - if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { - JDK_OPTIONS += ['--add-opens', 'java.base/java.io=ALL-UNNAMED', - '--add-opens', 'java.base/java.math=ALL-UNNAMED', - '--add-opens', 'java.base/java.nio=ALL-UNNAMED', - '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED', - '--add-opens', 'java.base/sun.util.calendar=ALL-UNNAMED'] - } - task.jvmArgs(JDK_OPTIONS) + task.jvmArgs(project.ext.JDK_OPTIONS) println("JVM arguments 
for $project.name are $allJvmArgs") } @@ -229,10 +206,7 @@ test { ?: Math.max(Runtime.runtime.availableProcessors() * 2, 8) if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { - jvmArgs(['--add-opens', 'java.base/java.io=ALL-UNNAMED', - '--add-opens', 'java.base/java.math=ALL-UNNAMED', - '--add-opens', 'java.base/java.nio=ALL-UNNAMED', - '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED']) + jvmArgs(project.ext.JDK17_OPTIONS) } useJUnitPlatform { diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java index 68f5636b3..21d9b8c58 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java @@ -75,6 +75,21 @@ public class BulkSparkConf implements Serializable + " --add-opens java.base/jdk.internal.util.jar=ALL-UNNAMED" + " --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED"; + public static final String JDK17_OPTIONS = " --add-exports java.rmi/sun.rmi.transport=ALL-UNNAMED" + + " --add-exports java.rmi/sun.rmi.transport.tcp=ALL-UNNAMED" + + " --add-opens java.base/java.io=ALL-UNNAMED" + + " --add-opens java.base/java.lang=ALL-UNNAMED" + + " --add-opens java.base/java.lang.invoke=ALL-UNNAMED" + + " --add-opens java.base/java.lang.reflect=ALL-UNNAMED" + + " --add-opens java.base/java.math=ALL-UNNAMED" + + " --add-opens java.base/java.nio=ALL-UNNAMED" + + " --add-opens java.base/java.util=ALL-UNNAMED" + + " --add-opens java.base/java.util.concurrent=ALL-UNNAMED" + + " --add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED" + + " --add-opens java.base/sun.nio.ch=ALL-UNNAMED" + + " --add-opens java.base/sun.util.calendar=ALL-UNNAMED" + + " --add-opens java.rmi/sun.rmi.transport.tcp=ALL-UNNAMED"; + public static final int 
DEFAULT_NUM_SPLITS = -1; public static final int DEFAULT_HTTP_CONNECTION_TIMEOUT = 100_000; public static final int DEFAULT_HTTP_RESPONSE_TIMEOUT = 100_000; @@ -666,7 +681,12 @@ protected List getDeprecatedSettingPrefixes() public static void setupSparkConf(SparkConf conf, boolean addKryoRegistrator) { String previousOptions = conf.get("spark.executor.extraJavaOptions", ""); - if (BuildInfo.isAtLeastJava11(BuildInfo.javaSpecificationVersion())) + + if (BuildInfo.isAtLeastJava17(BuildInfo.javaSpecificationVersion())) + { + conf.set("spark.executor.extraJavaOptions", previousOptions + JDK11_OPTIONS + JDK17_OPTIONS); + } + else if (BuildInfo.isAtLeastJava11(BuildInfo.javaSpecificationVersion())) { conf.set("spark.executor.extraJavaOptions", previousOptions + JDK11_OPTIONS); } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BuildInfoTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BuildInfoTest.java index 10681e739..ab860527a 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BuildInfoTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BuildInfoTest.java @@ -20,6 +20,8 @@ package org.apache.cassandra.spark.utils; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import static org.assertj.core.api.Assertions.assertThat; @@ -49,30 +51,38 @@ public void testJavaVersionReturnsAValue() } @Test - public void isAtLeastJava11() + public void testIsAtLeastJavaVersionWithNullInput() { assertThat(BuildInfo.isAtLeastJava11(null)).isFalse(); - assertThat(BuildInfo.isAtLeastJava11("0.9")).isFalse(); - assertThat(BuildInfo.isAtLeastJava11("1.1")).isFalse(); - assertThat(BuildInfo.isAtLeastJava11("1.2")).isFalse(); - assertThat(BuildInfo.isAtLeastJava11("1.3")).isFalse(); - assertThat(BuildInfo.isAtLeastJava11("1.4")).isFalse(); - 
assertThat(BuildInfo.isAtLeastJava11("1.5")).isFalse(); - assertThat(BuildInfo.isAtLeastJava11("1.6")).isFalse(); - assertThat(BuildInfo.isAtLeastJava11("1.7")).isFalse(); - assertThat(BuildInfo.isAtLeastJava11("1.8")).isFalse(); - assertThat(BuildInfo.isAtLeastJava11("9")).isFalse(); - assertThat(BuildInfo.isAtLeastJava11("10")).isFalse(); - assertThat(BuildInfo.isAtLeastJava11("11")).isTrue(); - assertThat(BuildInfo.isAtLeastJava11("12")).isTrue(); - assertThat(BuildInfo.isAtLeastJava11("13")).isTrue(); - assertThat(BuildInfo.isAtLeastJava11("14")).isTrue(); - assertThat(BuildInfo.isAtLeastJava11("15")).isTrue(); - assertThat(BuildInfo.isAtLeastJava11("16")).isTrue(); - assertThat(BuildInfo.isAtLeastJava11("17")).isTrue(); - assertThat(BuildInfo.isAtLeastJava11("18")).isTrue(); - assertThat(BuildInfo.isAtLeastJava11("19")).isTrue(); - assertThat(BuildInfo.isAtLeastJava11("20")).isTrue(); - assertThat(BuildInfo.isAtLeastJava11("21")).isTrue(); + assertThat(BuildInfo.isAtLeastJava17(null)).isFalse(); + } + + @ParameterizedTest(name = "{index} => Java version {0}") + @ValueSource(strings = { "0.9", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "9", "10" }) + public void isNotAtLeastJava11(String version) + { + assertThat(BuildInfo.isAtLeastJava11(version)).isFalse(); + } + + @ParameterizedTest(name = "{index} => Java version {0}") + @ValueSource(strings = { "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25" }) + public void isAtLeastJava11(String version) + { + assertThat(BuildInfo.isAtLeastJava11(version)).isTrue(); + } + + @ParameterizedTest(name = "{index} => Java version {0}") + @ValueSource(strings = { "0.9", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "9", "10", + "11", "12", "13", "14", "15", "16" }) + public void isNotAtLeastJava17(String version) + { + assertThat(BuildInfo.isAtLeastJava17(version)).isFalse(); + } + + @ParameterizedTest(name = "{index} => Java version {0}") + @ValueSource(strings = 
{ "17", "18", "19", "20", "21", "22", "23", "24", "25" }) + public void isAtLeastJava17(String version) + { + assertThat(BuildInfo.isAtLeastJava17(version)).isTrue(); } } diff --git a/cassandra-analytics-integration-tests/build.gradle b/cassandra-analytics-integration-tests/build.gradle index 8a363e312..0a01c84ea 100644 --- a/cassandra-analytics-integration-tests/build.gradle +++ b/cassandra-analytics-integration-tests/build.gradle @@ -27,11 +27,12 @@ plugins { id('maven-publish') } -if (propertyWithDefault("artifactType", null) == "spark") -{ +if (propertyWithDefault("artifactType", null) == "spark") { apply from: "$rootDir/gradle/common/publishing.gradle" } +apply from: "${project.rootDir}/gradle/common/javaOptions.gradle" + def integrationMaxHeapSize = System.getenv("INTEGRATION_MAX_HEAP_SIZE") ?: "3072M" println("Using ${integrationMaxHeapSize} maxHeapSize") @@ -125,41 +126,7 @@ def configureIntegrationTestTask = { Test task, String majorMinor = null -> showStandardStreams = false } - def JDK_OPTIONS = ['-Djdk.attach.allowAttachSelf=true', - '--add-exports', 'java.base/jdk.internal.misc=ALL-UNNAMED', - '--add-exports', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-exports', 'java.base/sun.nio.ch=ALL-UNNAMED', - '--add-exports', 'java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.registry=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.server=ALL-UNNAMED', - '--add-exports', 'java.sql/java.sql=ALL-UNNAMED', - '--add-opens', 'java.base/java.lang.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.loader=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.reflect=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.math=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.util.jar=ALL-UNNAMED', - '--add-opens', 'jdk.management/com.sun.management.internal=ALL-UNNAMED'] - 
- if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { - JDK_OPTIONS += ['--add-exports', 'java.rmi/sun.rmi.transport=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.transport.tcp=ALL-UNNAMED', - '--add-opens', 'java.base/java.io=ALL-UNNAMED', - '--add-opens', 'java.base/java.lang=ALL-UNNAMED', - '--add-opens', 'java.base/java.lang.invoke=ALL-UNNAMED', - '--add-opens', 'java.base/java.lang.reflect=ALL-UNNAMED', - '--add-opens', 'java.base/java.math=ALL-UNNAMED', - '--add-opens', 'java.base/java.nio=ALL-UNNAMED', - '--add-opens', 'java.base/java.util=ALL-UNNAMED', - '--add-opens', 'java.base/java.util.concurrent=ALL-UNNAMED', - '--add-opens', 'java.base/java.util.concurrent.atomic=ALL-UNNAMED', - '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED', - '--add-opens', 'java.base/sun.util.calendar=ALL-UNNAMED', - '--add-opens', 'java.rmi/sun.rmi.transport.tcp=ALL-UNNAMED', - ] - } - task.jvmArgs(JDK_OPTIONS) + task.jvmArgs(project.ext.JDK_OPTIONS) println("JVM arguments for $project.name are ${task.allJvmArgs}") // Some test classes are skipped entirely via assumeThat (e.g. MultipleTokens tests on C* 4.0). 
diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java index bc9cdb74b..6319e7f74 100644 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java @@ -32,7 +32,9 @@ import java.util.stream.IntStream; import java.util.stream.Stream; +import org.apache.commons.lang3.JavaVersion; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.SystemUtils; import org.apache.cassandra.distributed.api.ICluster; import org.apache.cassandra.distributed.api.IInstance; @@ -200,7 +202,10 @@ public SparkConf defaultSparkConf() .set("spark.cassandra_analytics.cassandra.version", "5.0.0") .set("spark.cassandra_analytics.sidecar.request.retries", "5") .set("spark.cassandra_analytics.sidecar.request.retries.delay.milliseconds", "500") - .set("spark.cassandra_analytics.sidecar.request.retries.max.delay.milliseconds", "500"); + .set("spark.cassandra_analytics.sidecar.request.retries.max.delay.milliseconds", "500") + .set("spark.driver.extraJavaOptions", "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.net=ALL-UNNAMED --add-opens java.base/jdk.internal.misc=ALL-UNNAMED") + .set("spark.executor.extraJavaOptions", "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.net=ALL-UNNAMED --add-opens java.base/jdk.internal.misc=ALL-UNNAMED"); + BulkSparkConf.setupSparkConf(sparkConf, true); KryoRegister.setup(sparkConf); return sparkConf; diff --git a/cassandra-five-zero-bridge/build.gradle b/cassandra-five-zero-bridge/build.gradle index 31882e138..9a785d3ff 100644 --- a/cassandra-five-zero-bridge/build.gradle +++ 
b/cassandra-five-zero-bridge/build.gradle @@ -23,6 +23,8 @@ plugins { id('java-library') } +apply from: "${project.rootDir}/gradle/common/javaOptions.gradle" + configurations { all*.exclude(group: 'org.slf4j', module: 'slf4j-log4j12') all*.exclude(group: 'log4j', module: 'log4j') @@ -107,26 +109,8 @@ test { // automatically run BIG and BTI tests for Cassandra 5.x bridge tasks.register('testBti', Test) { doFirst { - if (JavaVersion.current().isJava11Compatible()) { - def JDK11_OPTIONS = ['-Djdk.attach.allowAttachSelf=true', - '--add-exports', 'java.base/jdk.internal.misc=ALL-UNNAMED', - '--add-exports', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-exports', 'java.base/sun.nio.ch=ALL-UNNAMED', - '--add-exports', 'java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.registry=ALL-UNNAMED', - '--add-exports', 'java.rmi/sun.rmi.server=ALL-UNNAMED', - '--add-exports', 'java.sql/java.sql=ALL-UNNAMED', - '--add-opens', 'java.base/java.lang.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.loader=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.ref=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.reflect=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.math=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.module=ALL-UNNAMED', - '--add-opens', 'java.base/jdk.internal.util.jar=ALL-UNNAMED', - '--add-opens', 'jdk.management/com.sun.management.internal=ALL-UNNAMED'] - jvmArgs(JDK11_OPTIONS) - println("JVM arguments for $project.name are $allJvmArgs") - } + jvmArgs(project.ext.JDK_OPTIONS) + println("JVM arguments for $project.name are $allJvmArgs") } systemProperty "cassandra.analytics.bridges.sstable_format", "bti" @@ -146,4 +130,4 @@ tasks.register('testBti', Test) { check { dependsOn(tasks.testBti) -} \ No newline at end of file +} diff --git a/gradle/common/java11Options.gradle b/gradle/common/java11Options.gradle new file mode 100644 index 000000000..ed427c615 --- /dev/null +++ 
b/gradle/common/java11Options.gradle @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +project.ext.JDK11_OPTIONS = [ + '-Djdk.attach.allowAttachSelf=true', + '--add-exports', 'java.base/jdk.internal.misc=ALL-UNNAMED', + '--add-exports', 'java.base/jdk.internal.ref=ALL-UNNAMED', + '--add-exports', 'java.base/sun.nio.ch=ALL-UNNAMED', + '--add-exports', 'java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED', + '--add-exports', 'java.rmi/sun.rmi.registry=ALL-UNNAMED', + '--add-exports', 'java.rmi/sun.rmi.server=ALL-UNNAMED', + '--add-exports', 'java.sql/java.sql=ALL-UNNAMED', + '--add-opens', 'java.base/java.lang.module=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.loader=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.ref=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.reflect=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.math=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.module=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.util.jar=ALL-UNNAMED', + '--add-opens', 'jdk.management/com.sun.management.internal=ALL-UNNAMED', +] diff --git a/gradle/common/java17Options.gradle b/gradle/common/java17Options.gradle new file mode 100644 index 
000000000..d0c7bec11 --- /dev/null +++ b/gradle/common/java17Options.gradle @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +project.ext.JDK17_OPTIONS = [ + '--add-exports', 'java.rmi/sun.rmi.transport=ALL-UNNAMED', + '--add-exports', 'java.rmi/sun.rmi.transport.tcp=ALL-UNNAMED', + '--add-opens', 'java.base/java.io=ALL-UNNAMED', + '--add-opens', 'java.base/java.lang=ALL-UNNAMED', + '--add-opens', 'java.base/java.lang.invoke=ALL-UNNAMED', + '--add-opens', 'java.base/java.lang.reflect=ALL-UNNAMED', + '--add-opens', 'java.base/java.math=ALL-UNNAMED', + '--add-opens', 'java.base/java.net=ALL-UNNAMED', + '--add-opens', 'java.base/java.nio=ALL-UNNAMED', + '--add-opens', 'java.base/java.util=ALL-UNNAMED', + '--add-opens', 'java.base/java.util.concurrent=ALL-UNNAMED', + '--add-opens', 'java.base/java.util.concurrent.atomic=ALL-UNNAMED', + '--add-opens', 'java.base/jdk.internal.misc=ALL-UNNAMED', + '--add-opens', 'java.base/sun.nio.ch=ALL-UNNAMED', + '--add-opens', 'java.base/sun.util.calendar=ALL-UNNAMED', + '--add-opens', 'java.rmi/sun.rmi.transport.tcp=ALL-UNNAMED', +] diff --git a/gradle/common/javaOptions.gradle b/gradle/common/javaOptions.gradle new file mode 100644 index 000000000..434868bc0 
--- /dev/null +++ b/gradle/common/javaOptions.gradle @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +apply from: "${project.rootDir}/gradle/common/java11Options.gradle" +apply from: "${project.rootDir}/gradle/common/java17Options.gradle" + +project.ext.JDK_OPTIONS = project.ext.JDK11_OPTIONS + +if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { + project.ext.JDK_OPTIONS += project.ext.JDK17_OPTIONS +} From 2854ed2dfa9e5f2d677b9f9601c03be82e4ef570 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Sat, 23 May 2026 07:58:54 -0700 Subject: [PATCH 09/26] Fix circle --- .circleci/config.yml | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 40a172306..9f5fdf904 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -49,9 +49,6 @@ commands: type: string jdk: type: string - use_jdk11: - type: string - default: "false" sstable_format: type: string default: "big" @@ -69,7 +66,6 @@ commands: INTEGRATION_MAX_HEAP_SIZE: "3072m" CORE_MAX_PARALLEL_FORKS: 3 CORE_TEST_MAX_HEAP_SIZE: "2048m" - CASSANDRA_USE_JDK11: << parameters.use_jdk11 >> CASSANDRA_VERSION: "<< parameters.cassandra >>" command: | export GRADLE_OPTS="-Xmx2g
-Dorg.gradle.jvmargs=-Xmx2g" @@ -84,9 +80,6 @@ commands: type: string jdk: type: string - use_jdk11: - type: string - default: "false" cassandra: type: string description: Build and integration test against Spark << parameters.spark >> Scala << parameters.scala >> << parameters.jdk >> @@ -98,7 +91,6 @@ commands: JDK_VERSION: "<< parameters.jdk >>" INTEGRATION_MAX_PARALLEL_FORKS: 1 INTEGRATION_MAX_HEAP_SIZE: "3072M" - CASSANDRA_USE_JDK11: << parameters.use_jdk11 >> command: | export GRADLE_OPTS="-Xmx2g -Dorg.gradle.jvmargs=-Xmx2g" export DTEST_JAR="dtest-<< parameters.cassandra >>.jar" @@ -174,8 +166,10 @@ jobs: - checkout - run: name: Build dependencies for jdk11 builds + environment: + CASSANDRA_USE_JDK11: "true" command: | - CASSANDRA_USE_JDK11=true ./scripts/build-dependencies.sh + ./scripts/build-dependencies.sh ./gradlew --no-daemon --max-workers=2 codeCheckTasks - persist_to_workspace: root: dependencies @@ -196,7 +190,6 @@ jobs: spark: "3" scala: "2.12" jdk: "11" - use_jdk11: "true" sstable_format: "big" cassandra: "4.0" @@ -227,7 +220,6 @@ jobs: spark: "3" scala: "2.12" jdk: "11" - use_jdk11: "true" sstable_format: "big" cassandra: "4.1" @@ -260,7 +252,6 @@ jobs: spark: "3" scala: "2.12" jdk: "11" - use_jdk11: "true" cassandra: "4.0.17" - store_artifacts: @@ -297,7 +288,6 @@ jobs: spark: "3" scala: "2.12" jdk: "11" - use_jdk11: "true" cassandra: "4.1.4" - store_artifacts: @@ -332,7 +322,6 @@ jobs: spark: "3" scala: "2.13" jdk: "11" - use_jdk11: "true" sstable_format: "bti" cassandra: "5.0" @@ -365,7 +354,6 @@ jobs: spark: "3" scala: "2.13" jdk: "11" - use_jdk11: "true" cassandra: "5.0.5" - store_artifacts: From f5d8bf53dd2ecb0ef2610e898be5113f58f1e71c Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 11:41:30 -0700 Subject: [PATCH 10/26] Circle CI --- .circleci/config.yml | 236 +++++++++++++++---------------------------- 1 file changed, 83 insertions(+), 153 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9f5fdf904..187391799 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -177,39 +177,25
@@ jobs: - "*.jar" - "org/**/*" - spark3-2_12-jdk11-big-c40: - docker: - - image: cimg/openjdk:11.0 - resource_class: large - steps: - - install_common - - checkout - - attach_workspace: - at: dependencies - - run_build: - spark: "3" - scala: "2.12" - jdk: "11" - sstable_format: "big" - cassandra: "4.0" - - - store_artifacts: - when: always - path: build/test-reports - destination: test-reports - - - store_artifacts: - when: always - path: build/reports - destination: reports - - - store_test_results: - when: always - path: build/test-reports - - spark3-2_12-jdk11-big-c41: + # Single parameterized unit-test job. The workflow invokes it via `matrix:` + # multiple times, once per "compatibility group" (e.g. C4.x + Scala 2.12 + BIG, + # C5.0 + Scala 2.13 + BTI). Splitting into several small matrices is cleaner + # than one big matrix with a long `exclude:` list, because CircleCI's + # `exclude:` requires every parameter to be listed in every entry. + unit-test: + parameters: + spark: + type: string + scala: + type: string + jdk: + type: string + cassandra: + type: string + sstable_format: + type: string docker: - - image: cimg/openjdk:11.0 + - image: cimg/openjdk:<< parameters.jdk >>.0 resource_class: large steps: - install_common @@ -217,11 +203,11 @@ jobs: - attach_workspace: at: dependencies - run_build: - spark: "3" - scala: "2.12" - jdk: "11" - sstable_format: "big" - cassandra: "4.1" + spark: "<< parameters.spark >>" + scala: "<< parameters.scala >>" + jdk: "<< parameters.jdk >>" + sstable_format: "<< parameters.sstable_format >>" + cassandra: "<< parameters.cassandra >>" - store_artifacts: when: always @@ -237,46 +223,21 @@ jobs: when: always path: build/test-reports - int-c4-spark3-2_12-jdk11: - parallelism: 8 - docker: - - image: cimg/openjdk:11.0 - resource_class: large - steps: - - setup_remote_docker - - install_common - - checkout - - attach_workspace: - at: dependencies - - run_integration: - spark: "3" - scala: "2.12" - jdk: "11" - cassandra: "4.0.17" - - - store_artifacts: - when: always - path: build/test-reports - destination: test-reports - - - store_artifacts: - when: always
- path: build/reports - destination: reports - - - store_artifacts: - when: always - path: cassandra-analytics-integration-tests - destination: int-tests-misc - - - store_test_results: - when: always - path: build/aggregated-test-reports - - int-c41-spark3-2_12-jdk11: + # Single parameterized integration-test job, invoked once per Cassandra/Scala + # compatibility group from the workflow's `matrix:` block. + integration-test: + parameters: + spark: + type: string + scala: + type: string + jdk: + type: string + cassandra: + type: string parallelism: 8 docker: - - image: cimg/openjdk:11.0 + - image: cimg/openjdk:<< parameters.jdk >>.0 resource_class: large steps: - setup_remote_docker @@ -285,10 +246,10 @@ jobs: - attach_workspace: at: dependencies - run_integration: - spark: "3" - scala: "2.12" - jdk: "11" - cassandra: "4.1.4" + spark: "<< parameters.spark >>" + scala: "<< parameters.scala >>" + jdk: "<< parameters.jdk >>" + cassandra: "<< parameters.cassandra >>" - store_artifacts: when: always @@ -309,91 +270,60 @@ jobs: when: always path: build/aggregated-test-reports - spark3-2_13-jdk11-bti-c50: - docker: - - image: cimg/openjdk:11.0 - resource_class: large - steps: - - install_common - - checkout - - attach_workspace: - at: dependencies - - run_build: - spark: "3" - scala: "2.13" - jdk: "11" - sstable_format: "bti" - cassandra: "5.0" - - - store_artifacts: - when: always - path: build/test-reports - destination: test-reports - - - store_artifacts: - when: always - path: build/reports - destination: reports - - - store_test_results: - when: always - path: build/test-reports - - int-c5-spark3-2_13-jdk11: - parallelism: 8 - docker: - - image: cimg/openjdk:11.0 - resource_class: large - steps: - - setup_remote_docker - - install_common - - checkout - - attach_workspace: - at: dependencies - - run_integration: - spark: "3" - scala: "2.13" - jdk: "11" - cassandra: "5.0.5" - - - store_artifacts: - when: always - path: build/test-reports - destination: test-reports - - - store_artifacts: - when: always - path: build/reports - destination: reports - - -
store_test_results: - when: always - path: build/aggregated-test-reports - workflows: version: 2 build-and-test: jobs: - build-deps-jdk11 - # Unit tests: split by Cassandra version to reduce per-process memory pressure - - spark3-2_12-jdk11-big-c40: - requires: - - build-deps-jdk11 - - spark3-2_12-jdk11-big-c41: - requires: - - build-deps-jdk11 - - spark3-2_13-jdk11-bti-c50: + # ---- Unit tests ---- + # Cassandra 4.x: Scala 2.12 + BIG sstable format + - unit-test: requires: - build-deps-jdk11 - - # Integration tests - - int-c4-spark3-2_12-jdk11: + matrix: + alias: unit-c4x + parameters: + spark: ["3"] + scala: ["2.12"] + jdk: ["11"] + sstable_format: ["big"] + cassandra: ["4.0", "4.1"] + + # Cassandra 5.0: Scala 2.13 + BTI sstable format + - unit-test: requires: - build-deps-jdk11 - - int-c41-spark3-2_12-jdk11: + matrix: + alias: unit-c5 + parameters: + spark: ["3"] + scala: ["2.13"] + jdk: ["11"] + sstable_format: ["bti"] + cassandra: ["5.0"] + + # ---- Integration tests ---- + # Cassandra 4.x: Scala 2.12, full patch versions for dtest jar lookup + - integration-test: requires: - build-deps-jdk11 - - int-c5-spark3-2_13-jdk11: + matrix: + alias: int-c4x + parameters: + spark: ["3"] + scala: ["2.12"] + jdk: ["11"] + cassandra: ["4.0.17", "4.1.4"] + + # Cassandra 5.0: Scala 2.13 + - integration-test: requires: - build-deps-jdk11 + matrix: + alias: int-c5 + parameters: + spark: ["3"] + scala: ["2.13"] + jdk: ["11"] + cassandra: ["5.0.5"] From 19026fda6b7e016699e78c018e4ae44b84f9d096 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 12:00:21 -0700 Subject: [PATCH 11/26] circle --- .circleci/config.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 187391799..d00b5613b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -165,9 +165,10 @@ jobs: - install_common - checkout - run: - name: Build dependencies for jdk11 builds + name: Build dependencies for 
JDK11 builds environment: CASSANDRA_USE_JDK11: "true" + JDK_VERSION: "11" command: | ./scripts/build-dependencies.sh ./gradlew --no-daemon --max-workers=2 codeCheckTasks @@ -182,7 +183,7 @@ jobs: # C5.0 + Scala 2.13 + BTI). Splitting into several small matrices is cleaner # than one big matrix with a long `exclude:` list, because CircleCI's # `exclude:` requires every parameter to be listed in every entry. - unit-test: + unit: parameters: spark: type: string @@ -225,7 +226,7 @@ jobs: # Single parameterized integration-test job, invoked once per Cassandra/Scala # compatibility group from the workflow's `matrix:` block. - integration-test: + int-test: parameters: spark: type: string @@ -278,7 +279,7 @@ workflows: # ---- Unit tests ---- # Cassandra 4.x: Scala 2.12 + BIG sstable format - - unit-test: + - unit: requires: - build-deps-jdk11 matrix: @@ -291,7 +292,7 @@ workflows: cassandra: ["4.0", "4.1"] # Cassandra 5.0: Scala 2.13 + BTI sstable format - - unit-test: + - unit: requires: - build-deps-jdk11 matrix: @@ -305,7 +306,7 @@ workflows: # ---- Integration tests ---- # Cassandra 4.x: Scala 2.12, full patch versions for dtest jar lookup - - integration-test: + - int-test: requires: - build-deps-jdk11 matrix: @@ -317,7 +318,7 @@ workflows: cassandra: ["4.0.17", "4.1.4"] # Cassandra 5.0: Scala 2.13 - - integration-test: + - int-test: requires: - build-deps-jdk11 matrix: From c426adbdb41d2a7efb112539517d3ff51ded8802 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 13:32:37 -0700 Subject: [PATCH 12/26] Fix deps build --- .circleci/config.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index d00b5613b..bfd2b2438 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -168,7 +168,10 @@ jobs: name: Build dependencies for JDK11 builds environment: CASSANDRA_USE_JDK11: "true" + # Set JDK_VERSION, SPARK_VERSION, and SCALA_VERSION for codeCheckTasks JDK_VERSION: "11" + SPARK_VERSION: "3" + 
SCALA_VERSION: "2.12" command: | ./scripts/build-dependencies.sh ./gradlew --no-daemon --max-workers=2 codeCheckTasks From 6f64d4a0c392fec4fbfa4c3491d0d2172caccd0b Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 13:47:46 -0700 Subject: [PATCH 13/26] Add new pipelines --- .circleci/config.yml | 51 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index bfd2b2438..8ef9bc2b7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -181,6 +181,31 @@ jobs: - "*.jar" - "org/**/*" + # JDK17 dependency build for the Spark 4 / Scala 2.13 matrix. + # Workspace paths overlap with build-deps-jdk11, so any downstream job must + # `requires:` only ONE of these two — never both — to avoid jar collisions. + build-deps-jdk17: + docker: + - image: cimg/openjdk:17.0 + resource_class: large + steps: + - install_common + - checkout + - run: + name: Build dependencies for JDK17 builds + environment: + JDK_VERSION: "17" + SPARK_VERSION: "4" + SCALA_VERSION: "2.13" + command: | + ./scripts/build-dependencies.sh + ./gradlew --no-daemon --max-workers=2 codeCheckTasks + - persist_to_workspace: + root: dependencies + paths: + - "*.jar" + - "org/**/*" + # Single parameterized unit-test job. The workflow invokes it via `matrix:` # multiple times, once per "compatibility group" (e.g. C4.x + Scala 2.12 + BIG, # C5.0 + Scala 2.13 + BTI). 
Splitting into several small matrices is cleaner @@ -279,6 +304,7 @@ workflows: build-and-test: jobs: - build-deps-jdk11 + - build-deps-jdk17 # ---- Unit tests ---- # Cassandra 4.x: Scala 2.12 + BIG sstable format @@ -307,6 +333,19 @@ workflows: sstable_format: ["bti"] cassandra: ["5.0"] + # Cassandra 5.0 on Spark 4 / Scala 2.13 / JDK 17 + BTI sstable format + - unit: + requires: + - build-deps-jdk17 + matrix: + alias: unit-c5-spark4 + parameters: + spark: ["4"] + scala: ["2.13"] + jdk: ["17"] + sstable_format: ["bti"] + cassandra: ["5.0"] + # ---- Integration tests ---- # Cassandra 4.x: Scala 2.12, full patch versions for dtest jar lookup - int-test: @@ -331,3 +370,15 @@ workflows: scala: ["2.13"] jdk: ["11"] cassandra: ["5.0.5"] + + # Cassandra 5.0 on Spark 4 / Scala 2.13 / JDK 17 + - int-test: + requires: + - build-deps-jdk17 + matrix: + alias: int-c5-spark4 + parameters: + spark: ["4"] + scala: ["2.13"] + jdk: ["17"] + cassandra: ["5.0.5"] From c2ffa5a13d270b0724f3c6d28fa534e8e445df95 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 14:13:55 -0700 Subject: [PATCH 14/26] More fixes --- .circleci/config.yml | 3 +++ cassandra-analytics-cdc-codec/build.gradle | 6 ++---- cassandra-analytics-cdc/build.gradle | 4 +--- cassandra-analytics-core/build.gradle | 4 +--- .../apache/cassandra/spark/bulkwriter/BulkSparkConf.java | 2 ++ .../org/apache/cassandra/analytics/SparkTestUtils.java | 8 ++++---- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8ef9bc2b7..0836888fa 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -197,6 +197,9 @@ jobs: JDK_VERSION: "17" SPARK_VERSION: "4" SCALA_VERSION: "2.13" + # JDK17 only targets Cassandra 5.0+; skip 4.0 / 4.1 dtest jar builds + # (build-dtest-jars.sh reads this var to filter CANDIDATE_BRANCHES). 
+ BRANCHES: "cassandra-5.0" command: | ./scripts/build-dependencies.sh ./gradlew --no-daemon --max-workers=2 codeCheckTasks diff --git a/cassandra-analytics-cdc-codec/build.gradle b/cassandra-analytics-cdc-codec/build.gradle index e8ac31877..8ed8489b8 100644 --- a/cassandra-analytics-cdc-codec/build.gradle +++ b/cassandra-analytics-cdc-codec/build.gradle @@ -68,12 +68,10 @@ sourceSets { } } -apply from: "${project.rootDir}/gradle/common/java17Options.gradle" +apply from: "${project.rootDir}/gradle/common/javaOptions.gradle" test { - if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { - jvmArgs(project.ext.JDK17_OPTIONS) - } + jvmArgs(project.ext.JDK_OPTIONS) useJUnitPlatform() reports { diff --git a/cassandra-analytics-cdc/build.gradle b/cassandra-analytics-cdc/build.gradle index 74aa5a4c5..b335ebe87 100644 --- a/cassandra-analytics-cdc/build.gradle +++ b/cassandra-analytics-cdc/build.gradle @@ -166,10 +166,8 @@ def configureCdcTestTask = { Test task, String majorMinor = null -> task.maxHeapSize = cdcMaxHeap task.maxParallelForks = cdcMaxParallelForks task.forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations + task.jvmArgs(project.ext.JDK_OPTIONS) - if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { - jvmArgs(project.ext.JDK17_OPTIONS) - } // Make it so unit tests run on a Jar with Cassandra bridge implementations built in task.dependsOn(tasks.jar) task.classpath += files(jar.archiveFile) diff --git a/cassandra-analytics-core/build.gradle b/cassandra-analytics-core/build.gradle index dc9326732..ca70ba2ae 100644 --- a/cassandra-analytics-core/build.gradle +++ b/cassandra-analytics-core/build.gradle @@ -205,9 +205,7 @@ test { maxParallelForks = System.getenv('CORE_MAX_PARALLEL_FORKS')?.toInteger() ?: Math.max(Runtime.runtime.availableProcessors() * 2, 8) - if (JavaVersion.current().isCompatibleWith(JavaVersion.VERSION_17)) { - jvmArgs(project.ext.JDK17_OPTIONS) - } + 
jvmArgs(project.ext.JDK_OPTIONS) useJUnitPlatform { excludeTags 'Sequential' diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java index 21d9b8c58..c50389f77 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java @@ -82,10 +82,12 @@ public class BulkSparkConf implements Serializable + " --add-opens java.base/java.lang.invoke=ALL-UNNAMED" + " --add-opens java.base/java.lang.reflect=ALL-UNNAMED" + " --add-opens java.base/java.math=ALL-UNNAMED" + + " --add-opens java.base/java.net=ALL-UNNAMED" + " --add-opens java.base/java.nio=ALL-UNNAMED" + " --add-opens java.base/java.util=ALL-UNNAMED" + " --add-opens java.base/java.util.concurrent=ALL-UNNAMED" + " --add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED" + + " --add-opens java.base/jdk.internal.misc=ALL-UNNAMED" + " --add-opens java.base/sun.nio.ch=ALL-UNNAMED" + " --add-opens java.base/sun.util.calendar=ALL-UNNAMED" + " --add-opens java.rmi/sun.rmi.transport.tcp=ALL-UNNAMED"; diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java index 6319e7f74..fac52699f 100644 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java @@ -32,9 +32,7 @@ import java.util.stream.IntStream; import java.util.stream.Stream; -import org.apache.commons.lang3.JavaVersion; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.SystemUtils; import 
org.apache.cassandra.distributed.api.ICluster; import org.apache.cassandra.distributed.api.IInstance; @@ -54,6 +52,8 @@ import org.apache.spark.sql.SparkSession; import org.jetbrains.annotations.NotNull; +import static org.apache.cassandra.spark.bulkwriter.BulkSparkConf.JDK11_OPTIONS; +import static org.apache.cassandra.spark.bulkwriter.BulkSparkConf.JDK17_OPTIONS; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.catchThrowable; @@ -203,8 +203,8 @@ public SparkConf defaultSparkConf() .set("spark.cassandra_analytics.sidecar.request.retries", "5") .set("spark.cassandra_analytics.sidecar.request.retries.delay.milliseconds", "500") .set("spark.cassandra_analytics.sidecar.request.retries.max.delay.milliseconds", "500") - .set("spark.driver.extraJavaOptions", "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.net=ALL-UNNAMED --add-opens java.base/jdk.internal.misc=ALL-UNNAMED") - .set("spark.executor.extraJavaOptions", "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.net=ALL-UNNAMED --add-opens java.base/jdk.internal.misc=ALL-UNNAMED"); + .set("spark.driver.extraJavaOptions", JDK11_OPTIONS + JDK17_OPTIONS) + .set("spark.executor.extraJavaOptions", JDK11_OPTIONS + JDK17_OPTIONS); BulkSparkConf.setupSparkConf(sparkConf, true); KryoRegister.setup(sparkConf); From a5a1296598486511a265bc4af84007f398e1125e Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 15:27:06 -0700 Subject: [PATCH 15/26] Fix BufferingInputStreamTests --- .../apache/cassandra/sidecar/common/data/RestoreJobStatus.java | 1 + .../cassandra/spark/utils/BufferingInputStreamTests.java | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/RestoreJobStatus.java 
b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/RestoreJobStatus.java index 9074ccb5f..7ecf9c7f1 100644 --- a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/RestoreJobStatus.java +++ b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/RestoreJobStatus.java @@ -45,6 +45,7 @@ public enum RestoreJobStatus /** * @deprecated replaced by {@link #ABORTED} */ + @Deprecated FAILED, /** * The external controller aborts the RestoreJob due to some failure, e.g. consistency not satisfied, timeout, etc. diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java index 5fe7019bd..b7c305241 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java @@ -36,7 +36,6 @@ import com.google.common.collect.ImmutableList; import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.common.util.concurrent.Uninterruptibles; -import org.apache.commons.lang.ArrayUtils; import org.junit.jupiter.api.Test; import org.apache.cassandra.analytics.stats.Stats; @@ -415,7 +414,7 @@ public long chunkBufferSize() buffer.limit(remainingReadBytes); read = stream2.read(buffer); // read remaining bytes assertThat(read).isEqualTo(remainingReadBytes); - assertThat(ArrayUtils.subarray(buffer.array(), 0, remainingReadBytes)).isEqualTo(returnedBuffers.get(1)); + assertThat(buffer.array()).startsWith(returnedBuffers.get(1)); } } } From ebcbebac7fd9599e418a0fc65fa1779148a36a68 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 15:39:16 -0700 Subject: [PATCH 16/26] More fixes --- .../cassandra/spark/reader/IndexOffsetTests.java | 12 
++++++------ .../cassandra/spark/reader/SSTableReaderTests.java | 5 ++--- .../cassandra/spark/reader/IndexOffsetTests.java | 12 ++++++------ 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/IndexOffsetTests.java b/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/IndexOffsetTests.java index 4d55d81d7..b6c9853ab 100644 --- a/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/IndexOffsetTests.java +++ b/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/IndexOffsetTests.java @@ -23,12 +23,12 @@ import java.math.BigInteger; import java.nio.ByteBuffer; import java.util.Collection; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Stream; import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.Multimap; -import org.apache.commons.lang.mutable.MutableInt; -import org.apache.commons.lang.mutable.MutableLong; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -119,8 +119,8 @@ public void testReadIndexOffsets(int numPartitions, int numRowsPerPartition) LOGGER.info("Testing index offsets numKeys={} sparkPartitions={} partitioner={} enableCompression={}", numKeys, ranges.size(), partitioner.name(), enableCompression); - MutableInt skippedPartitions = new MutableInt(0); - MutableLong skippedDataOffsets = new MutableLong(0); + AtomicInteger skippedPartitions = new AtomicInteger(0); + AtomicLong skippedDataOffsets = new AtomicLong(0); int[][] counts = new int[numPartitions][numRowsPerPartition]; for (TokenRange range : ranges) { @@ -130,12 +130,12 @@ public void testReadIndexOffsets(int numPartitions, int numRowsPerPartition) { public void skippedPartition(ByteBuffer key, BigInteger token) { - 
skippedPartitions.add(1); + skippedPartitions.addAndGet(1); } public void skippedDataDbStartOffset(long length) { - skippedDataOffsets.add(length); + skippedDataOffsets.addAndGet(length); } }) .build(); diff --git a/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SSTableReaderTests.java b/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SSTableReaderTests.java index fd35c2ca0..18821a2b8 100644 --- a/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SSTableReaderTests.java +++ b/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SSTableReaderTests.java @@ -45,7 +45,6 @@ import java.util.stream.Stream; import com.google.common.collect.ImmutableMap; -import org.apache.commons.lang.StringUtils; import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -562,7 +561,7 @@ public void testOpenCompactionScanner() colBuf.get(); String colName = ByteBufferUtils.string(ByteBufferUtils.readBytesWithShortLength(colBuf)); colBuf.get(); - if (StringUtils.isEmpty(colName)) + if (colName == null || colName.isEmpty()) { continue; } @@ -942,7 +941,7 @@ public void skippedRepairedSSTable(SSTable ssTable, long repairedAt) colBuf.get(); String colName = ByteBufferUtils.string(ByteBufferUtils.readBytesWithShortLength(colBuf)); colBuf.get(); - if (StringUtils.isEmpty(colName)) + if (colName == null || colName.isEmpty()) { continue; } diff --git a/cassandra-four-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/IndexOffsetTests.java b/cassandra-four-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/IndexOffsetTests.java index b6d7db8c2..84fe91933 100644 --- a/cassandra-four-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/IndexOffsetTests.java +++ b/cassandra-four-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/IndexOffsetTests.java @@ -23,11 +23,11 @@ import java.math.BigInteger; import java.nio.ByteBuffer; 
import java.util.Collection; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.Multimap; -import org.apache.commons.lang.mutable.MutableInt; -import org.apache.commons.lang.mutable.MutableLong; import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -111,8 +111,8 @@ public void testReadIndexOffsets() LOGGER.info("Testing index offsets numKeys={} sparkPartitions={} partitioner={} enableCompression={}", numKeys, ranges.size(), partitioner.name(), enableCompression); - MutableInt skippedPartitions = new MutableInt(0); - MutableLong skippedDataOffsets = new MutableLong(0); + AtomicInteger skippedPartitions = new AtomicInteger(0); + AtomicLong skippedDataOffsets = new AtomicLong(0); int[] counts = new int[numKeys]; for (TokenRange range : ranges) { @@ -122,12 +122,12 @@ public void testReadIndexOffsets() { public void skippedPartition(ByteBuffer key, BigInteger token) { - skippedPartitions.add(1); + skippedPartitions.addAndGet(1); } public void skippedDataDbStartOffset(long length) { - skippedDataOffsets.add(length); + skippedDataOffsets.addAndGet(length); } }) .build(); From 9789285563ba21eac4c5a2183d45487b8662593e Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 15:43:17 -0700 Subject: [PATCH 17/26] Run codeCheckTasks first --- .circleci/config.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0836888fa..51fb3ee6d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -173,8 +173,8 @@ jobs: SPARK_VERSION: "3" SCALA_VERSION: "2.12" command: | - ./scripts/build-dependencies.sh ./gradlew --no-daemon --max-workers=2 codeCheckTasks + ./scripts/build-dependencies.sh - persist_to_workspace: root: dependencies paths: @@ -194,6 +194,7 @@ jobs: - run: name: Build dependencies for JDK17 builds 
environment: + # Set JDK_VERSION, SPARK_VERSION, and SCALA_VERSION for codeCheckTasks JDK_VERSION: "17" SPARK_VERSION: "4" SCALA_VERSION: "2.13" @@ -201,8 +202,8 @@ jobs: # (build-dtest-jars.sh reads this var to filter CANDIDATE_BRANCHES). BRANCHES: "cassandra-5.0" command: | - ./scripts/build-dependencies.sh ./gradlew --no-daemon --max-workers=2 codeCheckTasks + ./scripts/build-dependencies.sh - persist_to_workspace: root: dependencies paths: From bba7d109c1230e5a414c749d29fc2f478a93e832 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 15:50:42 -0700 Subject: [PATCH 18/26] More fixes --- .../utils/BufferingInputStreamTests.java | 74 +++++++++---------- .../spark/reader/SSTableReaderTests.java | 5 +- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java index b7c305241..42f3159ae 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java @@ -59,9 +59,9 @@ public class BufferingInputStreamTests { private static final ScheduledExecutorService SCHEDULER = Executors.newScheduledThreadPool(1); private static final ExecutorService EXECUTOR = - Executors.newFixedThreadPool(4, new ThreadFactoryBuilder().setNameFormat("sstable-tests-%d") - .setDaemon(true) - .build()); + Executors.newFixedThreadPool(4, new ThreadFactoryBuilder().setNameFormat("sstable-tests-%d") + .setDaemon(true) + .build()); static final int DEFAULT_CHUNK_SIZE = 8192; static final Stats STATS = Stats.DoNothingStats.INSTANCE; @@ -137,9 +137,9 @@ private void runMockedTest(int numRequests, int chunksPerRequest, long maxBuffer maxBufferSize, requestChunkSize, (start, end, consumer) -> { - 
requestCount.incrementAndGet(); - writeBuffers(consumer, randomBuffers(chunksPerRequest)); - }, null); + requestCount.incrementAndGet(); + writeBuffers(consumer, randomBuffers(chunksPerRequest)); + }, null); BufferingInputStream is = new BufferingInputStream<>(mockedClient, STATS.bufferingInputStreamStats()); readStreamFully(is); assertThat(requestCount.get()).isEqualTo(numRequests); @@ -159,18 +159,18 @@ public void testFailure() CassandraFileSource.DEFAULT_MAX_BUFFER_SIZE, CassandraFileSource.DEFAULT_CHUNK_BUFFER_SIZE, (start, end, consumer) -> { - if (count.incrementAndGet() > (numRequests / 2)) - { - // Halfway through throw random exception - EXECUTOR.submit(() -> consumer.onError(new RuntimeException("Something bad happened..."))); - } - else - { - writeBuffers(consumer, randomBuffers(chunksPerRequest)); - } - }, null); + if (count.incrementAndGet() > (numRequests / 2)) + { + // Halfway through throw random exception + EXECUTOR.submit(() -> consumer.onError(new RuntimeException("Something bad happened..."))); + } + else + { + writeBuffers(consumer, randomBuffers(chunksPerRequest)); + } + }, null); assertThatThrownBy(() -> readStreamFully(new BufferingInputStream<>(source, STATS.bufferingInputStreamStats()))) - .isInstanceOf(IOException.class); + .isInstanceOf(IOException.class); } @Test @@ -178,21 +178,21 @@ public void testTimeout() { long now = System.nanoTime(); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(900).toNanos())) - .isEqualTo(Duration.ofMillis(100).toNanos()); + .isEqualTo(Duration.ofMillis(100).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(1500).toNanos())) - .isEqualTo(Duration.ofMillis(-500).toNanos()); + .isEqualTo(Duration.ofMillis(-500).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(5).toNanos())) - .isEqualTo(Duration.ofMillis(995).toNanos()); + .isEqualTo(Duration.ofMillis(995).toNanos()); 
assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(0).toNanos())) - .isEqualTo(Duration.ofMillis(1000).toNanos()); + .isEqualTo(Duration.ofMillis(1000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now + Duration.ofMillis(500).toNanos())) - .isEqualTo(Duration.ofMillis(1000).toNanos()); + .isEqualTo(Duration.ofMillis(1000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(60000), now, now - Duration.ofMillis(25000).toNanos())) - .isEqualTo(Duration.ofMillis(35000).toNanos()); + .isEqualTo(Duration.ofMillis(35000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(60000), now, now - Duration.ofMillis(65000).toNanos())) - .isEqualTo(Duration.ofMillis(-5000).toNanos()); + .isEqualTo(Duration.ofMillis(-5000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(60000), now, now - Duration.ofMillis(60000).toNanos())) - .isEqualTo(Duration.ofMillis(0).toNanos()); + .isEqualTo(Duration.ofMillis(0).toNanos()); } @Test @@ -210,15 +210,15 @@ public void testTimeoutShouldAccountForActivityTime() CassandraFileSource.DEFAULT_MAX_BUFFER_SIZE, CassandraFileSource.DEFAULT_CHUNK_BUFFER_SIZE, (start, end, consumer) -> { - // Only respond once so future requests will time out - if (count.incrementAndGet() == 1) - { - EXECUTOR.submit(() -> { - Uninterruptibles.sleepUninterruptibly(sleepTimeInMillis, TimeUnit.MILLISECONDS); - writeBuffers(consumer, randomBuffers(chunksPerRequest)); - }); - } - }, timeout); + // Only respond once so future requests will time out + if (count.incrementAndGet() == 1) + { + EXECUTOR.submit(() -> { + Uninterruptibles.sleepUninterruptibly(sleepTimeInMillis, TimeUnit.MILLISECONDS); + writeBuffers(consumer, randomBuffers(chunksPerRequest)); + }); + } + }, timeout); BufferingInputStream inputStream = new BufferingInputStream<>(source, STATS.bufferingInputStreamStats()); try { @@ -232,9 +232,9 @@ public void testTimeoutShouldAccountForActivityTime() long readAndTimeoutTotal = 
TimeUnit.NANOSECONDS.toMillis(inputStream.timeBlockedNanos()) + timeout.toMillis(); Duration clientTimeoutTotal = Duration.ofNanos(System.nanoTime() - startTime); assertThat(clientTimeoutTotal.toMillis()).isGreaterThanOrEqualTo(readAndTimeoutTotal) - .describedAs("Timeout didn't account for activity time. " - + "Took %dms should have taken at least %dms", - clientTimeoutTotal.toMillis(), readAndTimeoutTotal); + .describedAs("Timeout didn't account for activity time. " + + "Took %dms should have taken at least %dms", + clientTimeoutTotal.toMillis(), readAndTimeoutTotal); } @Test diff --git a/cassandra-four-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SSTableReaderTests.java b/cassandra-four-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SSTableReaderTests.java index d3b4ac383..1f8c7ef69 100644 --- a/cassandra-four-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SSTableReaderTests.java +++ b/cassandra-four-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SSTableReaderTests.java @@ -46,7 +46,6 @@ import java.util.stream.Stream; import com.google.common.collect.ImmutableMap; -import org.apache.commons.lang.StringUtils; import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -536,7 +535,7 @@ public void testOpenCompactionScanner() colBuf.get(); String colName = ByteBufferUtils.string(ByteBufferUtils.readBytesWithShortLength(colBuf)); colBuf.get(); - if (StringUtils.isEmpty(colName)) + if (colName == null || colName.isEmpty()) { continue; } @@ -916,7 +915,7 @@ public void skippedRepairedSSTable(SSTable ssTable, long repairedAt) colBuf.get(); String colName = ByteBufferUtils.string(ByteBufferUtils.readBytesWithShortLength(colBuf)); colBuf.get(); - if (StringUtils.isEmpty(colName)) + if (colName == null || colName.isEmpty()) { continue; } From 1d655ea2cd14e038b0671482a6f439edef4ad110 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 15:51:01 -0700 Subject: 
[PATCH 19/26] reformat --- .../utils/BufferingInputStreamTests.java | 74 +++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java index 42f3159ae..b7c305241 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/BufferingInputStreamTests.java @@ -59,9 +59,9 @@ public class BufferingInputStreamTests { private static final ScheduledExecutorService SCHEDULER = Executors.newScheduledThreadPool(1); private static final ExecutorService EXECUTOR = - Executors.newFixedThreadPool(4, new ThreadFactoryBuilder().setNameFormat("sstable-tests-%d") - .setDaemon(true) - .build()); + Executors.newFixedThreadPool(4, new ThreadFactoryBuilder().setNameFormat("sstable-tests-%d") + .setDaemon(true) + .build()); static final int DEFAULT_CHUNK_SIZE = 8192; static final Stats STATS = Stats.DoNothingStats.INSTANCE; @@ -137,9 +137,9 @@ private void runMockedTest(int numRequests, int chunksPerRequest, long maxBuffer maxBufferSize, requestChunkSize, (start, end, consumer) -> { - requestCount.incrementAndGet(); - writeBuffers(consumer, randomBuffers(chunksPerRequest)); - }, null); + requestCount.incrementAndGet(); + writeBuffers(consumer, randomBuffers(chunksPerRequest)); + }, null); BufferingInputStream is = new BufferingInputStream<>(mockedClient, STATS.bufferingInputStreamStats()); readStreamFully(is); assertThat(requestCount.get()).isEqualTo(numRequests); @@ -159,18 +159,18 @@ public void testFailure() CassandraFileSource.DEFAULT_MAX_BUFFER_SIZE, CassandraFileSource.DEFAULT_CHUNK_BUFFER_SIZE, (start, end, consumer) -> { - if (count.incrementAndGet() > (numRequests / 2)) - { - // Halfway through throw random exception - 
EXECUTOR.submit(() -> consumer.onError(new RuntimeException("Something bad happened..."))); - } - else - { - writeBuffers(consumer, randomBuffers(chunksPerRequest)); - } - }, null); + if (count.incrementAndGet() > (numRequests / 2)) + { + // Halfway through throw random exception + EXECUTOR.submit(() -> consumer.onError(new RuntimeException("Something bad happened..."))); + } + else + { + writeBuffers(consumer, randomBuffers(chunksPerRequest)); + } + }, null); assertThatThrownBy(() -> readStreamFully(new BufferingInputStream<>(source, STATS.bufferingInputStreamStats()))) - .isInstanceOf(IOException.class); + .isInstanceOf(IOException.class); } @Test @@ -178,21 +178,21 @@ public void testTimeout() { long now = System.nanoTime(); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(900).toNanos())) - .isEqualTo(Duration.ofMillis(100).toNanos()); + .isEqualTo(Duration.ofMillis(100).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(1500).toNanos())) - .isEqualTo(Duration.ofMillis(-500).toNanos()); + .isEqualTo(Duration.ofMillis(-500).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(5).toNanos())) - .isEqualTo(Duration.ofMillis(995).toNanos()); + .isEqualTo(Duration.ofMillis(995).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now - Duration.ofMillis(0).toNanos())) - .isEqualTo(Duration.ofMillis(1000).toNanos()); + .isEqualTo(Duration.ofMillis(1000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(1000), now, now + Duration.ofMillis(500).toNanos())) - .isEqualTo(Duration.ofMillis(1000).toNanos()); + .isEqualTo(Duration.ofMillis(1000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(60000), now, now - Duration.ofMillis(25000).toNanos())) - .isEqualTo(Duration.ofMillis(35000).toNanos()); + .isEqualTo(Duration.ofMillis(35000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(60000), now, now - 
Duration.ofMillis(65000).toNanos())) - .isEqualTo(Duration.ofMillis(-5000).toNanos()); + .isEqualTo(Duration.ofMillis(-5000).toNanos()); assertThat(timeoutLeftNanos(Duration.ofMillis(60000), now, now - Duration.ofMillis(60000).toNanos())) - .isEqualTo(Duration.ofMillis(0).toNanos()); + .isEqualTo(Duration.ofMillis(0).toNanos()); } @Test @@ -210,15 +210,15 @@ public void testTimeoutShouldAccountForActivityTime() CassandraFileSource.DEFAULT_MAX_BUFFER_SIZE, CassandraFileSource.DEFAULT_CHUNK_BUFFER_SIZE, (start, end, consumer) -> { - // Only respond once so future requests will time out - if (count.incrementAndGet() == 1) - { - EXECUTOR.submit(() -> { - Uninterruptibles.sleepUninterruptibly(sleepTimeInMillis, TimeUnit.MILLISECONDS); - writeBuffers(consumer, randomBuffers(chunksPerRequest)); - }); - } - }, timeout); + // Only respond once so future requests will time out + if (count.incrementAndGet() == 1) + { + EXECUTOR.submit(() -> { + Uninterruptibles.sleepUninterruptibly(sleepTimeInMillis, TimeUnit.MILLISECONDS); + writeBuffers(consumer, randomBuffers(chunksPerRequest)); + }); + } + }, timeout); BufferingInputStream inputStream = new BufferingInputStream<>(source, STATS.bufferingInputStreamStats()); try { @@ -232,9 +232,9 @@ public void testTimeoutShouldAccountForActivityTime() long readAndTimeoutTotal = TimeUnit.NANOSECONDS.toMillis(inputStream.timeBlockedNanos()) + timeout.toMillis(); Duration clientTimeoutTotal = Duration.ofNanos(System.nanoTime() - startTime); assertThat(clientTimeoutTotal.toMillis()).isGreaterThanOrEqualTo(readAndTimeoutTotal) - .describedAs("Timeout didn't account for activity time. " - + "Took %dms should have taken at least %dms", - clientTimeoutTotal.toMillis(), readAndTimeoutTotal); + .describedAs("Timeout didn't account for activity time. 
" + + "Took %dms should have taken at least %dms", + clientTimeoutTotal.toMillis(), readAndTimeoutTotal); } @Test From 16e9ff7c537cdf6acbb4e2e00775b2e69b60a7d1 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 15:55:23 -0700 Subject: [PATCH 20/26] codeCheckTasks depends on cassandra-all jars --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 51fb3ee6d..2b38d046d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -173,8 +173,8 @@ jobs: SPARK_VERSION: "3" SCALA_VERSION: "2.12" command: | - ./gradlew --no-daemon --max-workers=2 codeCheckTasks ./scripts/build-dependencies.sh + ./gradlew --no-daemon --max-workers=2 codeCheckTasks - persist_to_workspace: root: dependencies paths: @@ -202,8 +202,8 @@ jobs: # (build-dtest-jars.sh reads this var to filter CANDIDATE_BRANCHES). BRANCHES: "cassandra-5.0" command: | - ./gradlew --no-daemon --max-workers=2 codeCheckTasks ./scripts/build-dependencies.sh + ./gradlew --no-daemon --max-workers=2 codeCheckTasks - persist_to_workspace: root: dependencies paths: From 9bea5d12dfcdd1e644955af3b1818cfcd39734c2 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 16:32:46 -0700 Subject: [PATCH 21/26] Also set spark.driver.host --- .../test/java/org/apache/cassandra/analytics/SparkTestUtils.java | 1 + 1 file changed, 1 insertion(+) diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java index fac52699f..0e3f8ef40 100644 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SparkTestUtils.java @@ -197,6 +197,7 @@ public SparkConf defaultSparkConf() // the quoted identifiers tests where we 
test mixed case .set("spark.sql.caseSensitive", "True") .set("spark.ui.enabled", "false") + .set("spark.driver.host", "127.0.0.1") .set("spark.driver.bindAddress", "127.0.0.1") .set("spark.master", "local[8,4]") .set("spark.cassandra_analytics.cassandra.version", "5.0.0") From 9e2e1985247987cea31b5622d6ed2daaeff0b1d5 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 16:47:16 -0700 Subject: [PATCH 22/26] Fix unit test org.apache.cassandra.spark.bulkwriter.BulkSparkConfTest#ensureSetupSparkConfAddsPerformsNecessaryTasks --- .../cassandra/spark/bulkwriter/BulkSparkConfTest.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java index ab0e663b1..5271e00ef 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java @@ -127,7 +127,12 @@ void ensureSetupSparkConfAddsPerformsNecessaryTasks() BulkSparkConf.setupSparkConf(sparkConf, true); assertThat(sparkConf.get("spark.kryo.registrator", "")) .isEqualTo("," + SbwKryoRegistrator.class.getName()); - if (BuildInfo.isAtLeastJava11(BuildInfo.javaSpecificationVersion())) + if (BuildInfo.isAtLeastJava17(BuildInfo.javaSpecificationVersion())) + { + assertThat(sparkConf.get("spark.executor.extraJavaOptions", "")) + .isEqualTo(BulkSparkConf.JDK11_OPTIONS + BulkSparkConf.JDK17_OPTIONS); + } + else if (BuildInfo.isAtLeastJava11(BuildInfo.javaSpecificationVersion())) { assertThat(sparkConf.get("spark.executor.extraJavaOptions", "")) .isEqualTo(BulkSparkConf.JDK11_OPTIONS); From b9f9711d9e82f876cb37775e5755e96e67309b09 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 17:11:11 -0700 Subject: [PATCH 23/26] Fix 
org.apache.cassandra.analytics.BulkWriteDataTypesTest#testType --- .../analytics/BulkWriteDataTypesTest.java | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteDataTypesTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteDataTypesTest.java index 34928792b..49f43c302 100644 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteDataTypesTest.java +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteDataTypesTest.java @@ -85,11 +85,13 @@ void testType(String tableName, TypeTestSetup typeTestSetup) Dataset df = generateDataset(spark, typeTestSetup); QualifiedName table = new QualifiedName(TEST_KEYSPACE, tableName); - if (typeTestSetup.expectedFailureMessage != null) + if (typeTestSetup.expectedFailureMessages != null) { assertThatException() .isThrownBy(() -> bulkWriterDataFrameWriter(df, table).save()) - .withMessageContaining(typeTestSetup.expectedFailureMessage); + .extracting(Throwable::getMessage) + .asString() + .containsAnyOf(typeTestSetup.expectedFailureMessages); } else { @@ -227,7 +229,10 @@ static TypeTestSetup simpleDurationSchemaSetup() Arrays.asList(IntegerType, CalendarIntervalType), Arrays.asList(INTEGER_MAPPER, DURATION_MAPPER), "CREATE TABLE %s (id int, took duration, PRIMARY KEY (id))", - "Cannot save interval data type into external storage."); + "Cannot save interval data type into external storage", + // Error message changed in Cassandra 5.0 + "datasource doesn't support the column `took` of the type \"INTERVAL\"." 
+ ); } static TypeTestSetup integersAndStringsSchemaSetup() @@ -467,7 +472,7 @@ static class TypeTestSetup final List columnTypes; final List> valueFunction; final String createTableSchema; - final String expectedFailureMessage; + final String[] expectedFailureMessages; final int numRows = 10_000; Function columnMapperValidation = columns -> String.format(String.join(":", Collections.nCopies(columns.length, "%s")), columns); @@ -492,7 +497,7 @@ static class TypeTestSetup this.columnTypes = columnTypes; this.valueFunction = valueFunction; this.createTableSchema = createTableSchema; - this.expectedFailureMessage = null; + this.expectedFailureMessages = null; } TypeTestSetup(String tableName, @@ -500,14 +505,14 @@ static class TypeTestSetup List columnTypes, List> valueFunction, String createTableSchema, - String expectedFailureMessage) + String... expectedFailureMessages) { this.tableName = tableName; this.columns = columns; this.columnTypes = columnTypes; this.valueFunction = valueFunction; this.createTableSchema = createTableSchema; - this.expectedFailureMessage = expectedFailureMessage; + this.expectedFailureMessages = expectedFailureMessages; } } } From df2c1b1d58fed566e5e01567c7361d74f39a505b Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 18:03:01 -0700 Subject: [PATCH 24/26] Fix org.apache.cassandra.cdc.avro.AvroLogicalTypesTest#testTimeMicros and org.apache.cassandra.cdc.avro.AvroLogicalTypesTest#testTimestampMicros --- .../apache/cassandra/cdc/avro/AvroLogicalTypesTest.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cassandra-analytics-cdc-codec/src/test/java/org/apache/cassandra/cdc/avro/AvroLogicalTypesTest.java b/cassandra-analytics-cdc-codec/src/test/java/org/apache/cassandra/cdc/avro/AvroLogicalTypesTest.java index 420f12c34..c55b62bcf 100644 --- a/cassandra-analytics-cdc-codec/src/test/java/org/apache/cassandra/cdc/avro/AvroLogicalTypesTest.java +++ 
b/cassandra-analytics-cdc-codec/src/test/java/org/apache/cassandra/cdc/avro/AvroLogicalTypesTest.java @@ -25,6 +25,7 @@ import java.time.Instant; import java.time.LocalDate; import java.time.LocalTime; +import java.time.temporal.ChronoUnit; import java.util.UUID; import org.apache.avro.AvroRuntimeException; @@ -93,13 +94,17 @@ public void testDate() throws IOException @Test public void testTimeMicros() throws IOException { - testLogicalTypeDataWriteAndReadBack("time_micros", LocalTime.now()); + // In https://bugs.openjdk.org/browse/JDK-8242504 the precision of clock was enhanced to + // nanoseconds. However, for Cassandra we only need microseconds + testLogicalTypeDataWriteAndReadBack("time_micros", LocalTime.now().truncatedTo(ChronoUnit.MICROS)); } @Test public void testTimestampMicros() throws IOException { - testLogicalTypeDataWriteAndReadBack("timestamp_micros", Instant.now()); + // In https://bugs.openjdk.org/browse/JDK-8242504 the precision of clock was enhanced to + // nanoseconds. 
However, for Cassandra we only need microseconds + testLogicalTypeDataWriteAndReadBack("timestamp_micros", Instant.now().truncatedTo(ChronoUnit.MICROS)); } private void testLogicalTypeDataWriteAndReadBack(String typeName, T testData) throws IOException From dd4193e7b1bc3edda03e7a0a6785aaa0ee7f2dcc Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Tue, 26 May 2026 20:50:04 -0700 Subject: [PATCH 25/26] Update GH actions --- .github/workflows/test.yaml | 169 ++++++++++++++++++++++++++++-------- 1 file changed, 131 insertions(+), 38 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index cc6629016..0a136c089 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,8 +26,8 @@ on: workflow_dispatch: jobs: - build: - name: Compile and build + build-jdk11: + name: Compile and build (JDK 11) runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -38,43 +38,107 @@ jobs: java-version: 11 - run: | sudo apt-get update - + + if [ -f /etc/ssl/certs/java/cacerts/cacerts ]; then + sudo mv /etc/ssl/certs/java/cacerts/ /etc/ssl/certs/java/cacerts-old + sudo mv /etc/ssl/certs/java/cacerts-old/cacerts /etc/ssl/certs/java/ + sudo rmdir /etc/ssl/certs/java/cacerts-old + fi + + apt-get download ant ant-optional + sudo dpkg --force-all -i ant*.deb + rm ant*.deb + + sudo bash -c 'for i in {2..20}; do echo 127.0.0.${i} localhost${i} >> /etc/hosts; done' + + for i in {2..20} + do + sudo ip addr add "127.0.0.${i}" dev lo + sudo route add -host "127.0.0.${i}" dev lo; + done + + export JDK_VERSION="11" + export SPARK_VERSION="3" + export SCALA_VERSION="2.12" + export CASSANDRA_USE_JDK11="true" + + ./scripts/build-dependencies.sh + + ./gradlew --no-daemon --max-workers=2 codeCheckTasks + - name: Cache Maven repository + uses: actions/cache@v4 + with: + path: ~/.m2 + key: maven-repo-jdk11-${{ github.sha }} + - name: Cache workspace + id: cache-build-save + uses: actions/cache/save@v4 + with: + path: ${{ github.workspace }} + 
key: build-jdk11-${{ github.sha }} + + # JDK17 dependency build for the Spark 4 / Scala 2.13 / Cassandra 5.0 matrix. + # Produces a distinct workspace cache (build-jdk17-...) so downstream test jobs + # must restore from exactly one of build-jdk11 / build-jdk17 — mixing them + # would clobber dependency jars. + build-jdk17: + name: Compile and build (JDK 17) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: 17 + - run: | + sudo apt-get update + if [ -f /etc/ssl/certs/java/cacerts/cacerts ]; then sudo mv /etc/ssl/certs/java/cacerts/ /etc/ssl/certs/java/cacerts-old sudo mv /etc/ssl/certs/java/cacerts-old/cacerts /etc/ssl/certs/java/ sudo rmdir /etc/ssl/certs/java/cacerts-old fi - + apt-get download ant ant-optional sudo dpkg --force-all -i ant*.deb rm ant*.deb - + sudo bash -c 'for i in {2..20}; do echo 127.0.0.${i} localhost${i} >> /etc/hosts; done' - + for i in {2..20} do sudo ip addr add "127.0.0.${i}" dev lo sudo route add -host "127.0.0.${i}" dev lo; done - - CASSANDRA_USE_JDK11=true ./scripts/build-dependencies.sh - + + export JDK_VERSION="17" + export SPARK_VERSION="4" + export SCALA_VERSION="2.13" + # JDK17 only targets Cassandra 5.0+; skip 4.0 / 4.1 dtest jar builds + # (build-dtest-jars.sh reads this var to filter CANDIDATE_BRANCHES). 
+ export BRANCHES="cassandra-5.0" + + ./scripts/build-dependencies.sh + ./gradlew codeCheckTasks - name: Cache Maven repository uses: actions/cache@v4 with: path: ~/.m2 - key: maven-repo-${{ github.sha }} + key: maven-repo-jdk17-${{ github.sha }} - name: Cache workspace id: cache-build-save uses: actions/cache/save@v4 with: path: ${{ github.workspace }} - key: build-${{ github.sha }} + key: build-jdk17-${{ github.sha }} unit-test: - name: Unit test - Scala ${{ matrix.scala }} ${{ matrix.sstable-format }} C${{ matrix.cassandra }} - needs: build + name: Unit test - Scala ${{ matrix.scala }} ${{ matrix.sstable-format }} C${{ matrix.cassandra }} Spark${{ matrix.spark }} JDK${{ matrix.jdk }} + # Each matrix entry only consumes one of these caches (selected by matrix.jdk), + # but `needs:` cannot be matrix-conditional so we wait on both builds. + needs: [build-jdk11, build-jdk17] runs-on: ubuntu-latest strategy: matrix: @@ -82,19 +146,30 @@ jobs: - scala: '2.13' sstable-format: 'bti' cassandra: '5.0' + jdk: '11' + spark: '3' - scala: '2.12' sstable-format: 'big' cassandra: '4.1' + jdk: '11' + spark: '3' - scala: '2.12' sstable-format: 'big' cassandra: '4.0' + jdk: '11' + spark: '3' + - scala: '2.13' + sstable-format: 'bti' + cassandra: '5.0' + jdk: '17' + spark: '4' fail-fast: false steps: - name: Setup JDK uses: actions/setup-java@v4 with: distribution: 'temurin' - java-version: 11 + java-version: ${{ matrix.jdk }} - run: | sudo bash -c 'for i in {2..20}; do echo 127.0.0.${i} localhost${i} >> /etc/hosts; done' for i in {2..20} @@ -106,54 +181,70 @@ jobs: uses: actions/cache@v4 with: path: ~/.m2 - key: maven-repo-${{ github.sha }} + key: maven-repo-jdk${{ matrix.jdk }}-${{ github.sha }} - name: Cache workspace id: cache-build-restore uses: actions/cache/restore@v4 with: path: ${{ github.workspace }} - key: build-${{ github.sha }} + key: build-jdk${{ matrix.jdk }}-${{ github.sha }} - run: | - export SPARK_VERSION="3" + export SPARK_VERSION="${{ matrix.spark }}" export 
SCALA_VERSION="${{ matrix.scala }}" - export JDK_VERSION="11" + export JDK_VERSION="${{ matrix.jdk }}" export INTEGRATION_MAX_PARALLEL_FORKS=1 export INTEGRATION_MAX_HEAP_SIZE="3072M" - export CASSANDRA_USE_JDK11=true export CASSANDRA_VERSION="${{ matrix.cassandra }}" + if [ "${{ matrix.jdk }}" = "11" ]; then + export CASSANDRA_USE_JDK11=true + fi ./gradlew --stacktrace clean assemble check -x cassandra-analytics-integration-tests:test -Dcassandra.analytics.bridges.sstable_format=${{ matrix.sstable-format }} integration-test: name: Integration test - ${{ matrix.config }} (${{ matrix.job_index }}) - needs: build + # Each matrix entry only consumes one of these caches (selected by matrix.jdk), + # but `needs:` cannot be matrix-conditional so we wait on both builds. + needs: [build-jdk11, build-jdk17] runs-on: ubuntu-latest strategy: - # GitHub Actions generate a cross-product of 'config' × 'job_index' (3 × 5 = 15 jobs). + # GitHub Actions generates a cross-product of 'config' × 'job_index' (4 × 5 = 20 jobs). # The 'include' entries don't add new combinations — they augment existing ones - # by matching on 'config' and injecting 'scala' and 'cassandra' into each match. - # To add a new version: add one entry to 'config' and one to 'include'. + # by matching on 'config' and injecting 'scala', 'cassandra', 'jdk', and 'spark' + # into each match. To add a new version: add one entry to 'config' and one to + # 'include'. 
matrix: - config: ['s2.13-c5.0.5', 's2.12-c4.1.4', 's2.12-c4.0.17'] + config: ['s2.13-c5.0.5', 's2.12-c4.1.4', 's2.12-c4.0.17', 's2.13-c5.0.5-spark4'] job_index: [0, 1, 2, 3, 4] job_total: [5] include: - config: 's2.13-c5.0.5' scala: '2.13' cassandra: '5.0.5' + jdk: '11' + spark: '3' - config: 's2.12-c4.1.4' scala: '2.12' cassandra: '4.1.4' + jdk: '11' + spark: '3' - config: 's2.12-c4.0.17' scala: '2.12' cassandra: '4.0.17' + jdk: '11' + spark: '3' + - config: 's2.13-c5.0.5-spark4' + scala: '2.13' + cassandra: '5.0.5' + jdk: '17' + spark: '4' fail-fast: false steps: - name: Setup JDK uses: actions/setup-java@v4 with: distribution: 'temurin' - java-version: 11 + java-version: ${{ matrix.jdk }} - run: | sudo bash -c 'for i in {2..20}; do echo 127.0.0.${i} localhost${i} >> /etc/hosts; done' for i in {2..20} @@ -165,26 +256,28 @@ jobs: uses: actions/cache@v4 with: path: ~/.m2 - key: maven-repo-${{ github.sha }} + key: maven-repo-jdk${{ matrix.jdk }}-${{ github.sha }} - name: Cache workspace id: cache-build-restore uses: actions/cache/restore@v4 with: path: ${{ github.workspace }} - key: build-${{ github.sha }} + key: build-jdk${{ matrix.jdk }}-${{ github.sha }} - run: | - export SPARK_VERSION="3" + export SPARK_VERSION="${{ matrix.spark }}" export SCALA_VERSION="${{ matrix.scala }}" - export JDK_VERSION="11" + export JDK_VERSION="${{ matrix.jdk }}" export INTEGRATION_MAX_PARALLEL_FORKS=1 export INTEGRATION_MAX_HEAP_SIZE="3072M" - export CASSANDRA_USE_JDK11=true - + if [ "${{ matrix.jdk }}" = "11" ]; then + export CASSANDRA_USE_JDK11=true + fi + export DTEST_JAR="dtest-${{ matrix.cassandra }}.jar" export CASSANDRA_VERSION=$(echo ${{ matrix.cassandra }} | cut -d'.' -f 1,2) - + ./gradlew --stacktrace clean assemble - + cd cassandra-analytics-integration-tests/src/test/java # Shuffle test classes using the commit SHA as seed for reproducible randomization, # then shard across runners via round-robin on the shuffled order. 
@@ -192,7 +285,7 @@ jobs: | python3 -c "import random,sys; lines=sys.stdin.read().splitlines(); random.seed('$GITHUB_SHA'); random.shuffle(lines); print('\n'.join(lines))" \ | awk 'NR % ${{ matrix.job_total }} == ${{ matrix.job_index }}') cd ../../../.. - + EXIT_STATUS=0 # Execution of "gradle test --test $TEST_NAME" returns non-zero exit code when commend did not run any test # (e.g. when all tests are ignored). Currently there is no option to change Gradle behaviour. @@ -219,13 +312,13 @@ jobs: echo "Skipping test $TEST_NAME" fi done; - + tar czf integration-tests.tar.gz test-reports/* - + exit $EXIT_STATUS - name: Publish test results uses: actions/upload-artifact@v4 if: (!cancelled()) with: - name: integration-tests-${{ matrix.scala }}-${{ matrix.cassandra }}-${{ matrix.job_index }} - path: integration-tests.tar.gz \ No newline at end of file + name: integration-tests-${{ matrix.config }}-${{ matrix.job_index }} + path: integration-tests.tar.gz From 78fd2c2542c550cc758a5e25436e3e85df12e903 Mon Sep 17 00:00:00 2001 From: Francisco Guerrero Date: Thu, 28 May 2026 11:45:36 -0700 Subject: [PATCH 26/26] Fix org.apache.cassandra.spark.endtoend.SchemaTests#testMultipleSSTableCompacted test --- .../cassandra/spark/endtoend/SchemaTests.java | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/endtoend/SchemaTests.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/endtoend/SchemaTests.java index 08ab169fe..f2d452adb 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/endtoend/SchemaTests.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/endtoend/SchemaTests.java @@ -19,11 +19,12 @@ package org.apache.cassandra.spark.endtoend; +import java.math.BigInteger; import java.util.HashMap; import java.util.Map; import java.util.UUID; import java.util.concurrent.TimeUnit; -import 
java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -43,6 +44,8 @@ import org.apache.cassandra.spark.utils.RandomUtils; import org.apache.cassandra.spark.utils.test.TestSchema; import org.apache.spark.sql.Row; +import org.apache.spark.sql.functions; +import org.apache.spark.sql.types.DataTypes; import static org.assertj.core.api.Assertions.assertThat; import static org.quicktheories.QuickTheory.qt; @@ -284,7 +287,11 @@ public void testMultipleSSTableCompacted(CassandraVersion version) .withClusteringKey("c", bridge.text()) .withColumn("d", bridge.text()) .withColumn("e", bridge.bigint()); - AtomicLong total = new AtomicLong(0); + // AtomicLong is not sufficient here, using BigInteger instead. The aggregation + // in the withCheck block will overflow and this will throw an exception in Spark 4. + // In earlier versions of Spark the long value would just overflow and the aggregated + // value was incorrect. 
+ AtomicReference total = new AtomicReference<>(BigInteger.ZERO); Map rows = new HashMap<>(Tester.DEFAULT_NUM_ROWS); Tester.builder(schemaBuilder) // Don't write random data @@ -319,7 +326,7 @@ public void testMultipleSSTableCompacted(CassandraVersion version) TestSchema.TestRow newTestRow = testRow.copy("e", RandomUtils.RANDOM.nextLong()) .copy("d", UUID.randomUUID().toString().substring(0, 10)); rows.put(testRow.getUUID("a"), newTestRow); - total.addAndGet(newTestRow.getLong("e")); + total.updateAndGet(t -> t.add(BigInteger.valueOf(newTestRow.getLong("e")))); writer.write(newTestRow.allValues()); } }) @@ -328,10 +335,17 @@ public void testMultipleSSTableCompacted(CassandraVersion version) .withReadListener(actualRow -> assertThat(actualRow).isEqualTo(rows.get(actualRow.getUUID("a")))) .withReadListener(actualRow -> assertThat(actualRow.getLong("e")).isEqualTo(rows.get(actualRow.getUUID("a")).getLong("e"))) // Verify Spark aggregations match expected - .withCheck(dataset -> assertThat(dataset.groupBy().sum("e").first().getLong(0)).isEqualTo(total.get())) + .withCheck(dataset -> { + BigInteger sum = dataset.agg(functions.sum( + functions.col("e").cast(DataTypes.createDecimalType(38, 0)))) + .first() + .getDecimal(0) + .toBigInteger(); + assertThat(sum).isEqualTo(total.get()); + }) .withCheck(dataset -> assertThat(dataset.groupBy().count().first().getLong(0)).isEqualTo(rows.size())) .withReset(() -> { - total.set(0); + total.set(BigInteger.ZERO); rows.clear(); }) .run(bridge.getVersion());