8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,8 @@ All notable changes to this project will be documented in this file.
- opensearch-dashboards: Add `3.4.0` ([#1392]).
- testing-tools: build testing tools subimages in workflow ([#1366]).
- kafka: Add `4.1.1` ([#1395]).
- spark: Add `4.1.1` ([#1402]).
- spark-connect-client: Add `4.1.1` ([#1402]).

### Changed

@@ -26,13 +28,18 @@ All notable changes to this project will be documented in this file.
- trino: Backport Kafka offset handling to 477 ([#1373]).
- ubi: Bumped ubi9 and ubi10 hashes ([#1386]).
- vector: Bumped from 0.49.0 to 0.52.0 ([#1387]).
- spark: Use one Dockerfile per major product version ([#1402]).
  Remove all HBase dependencies from the Spark 4 image.
  Pull logging dependencies with `mvn` instead of `curl`, so they no longer have to be maintained manually in the Nexus `packages` repository.

### Removed

- opensearch: Remove the `performance-analyzer` plugin from the OpenSearch image ([#1357]).
- superset: Remove 4.0.2 and 4.1.2 ([#1394]).
- kafka: Remove `3.7.2` and `4.1.0` ([#1395]).
- opa: remove 1.4.2 ([#1396]).
- spark: Remove `3.5.6` and `4.0.1` ([#1402]).
- spark-connect-client: Remove `3.5.6` and `4.0.1` ([#1402]).

### Fixed

@@ -60,6 +67,7 @@ All notable changes to this project will be documented in this file.
[#1394]: https://github.com/stackabletech/docker-images/pull/1394
[#1395]: https://github.com/stackabletech/docker-images/pull/1395
[#1396]: https://github.com/stackabletech/docker-images/pull/1396
[#1402]: https://github.com/stackabletech/docker-images/pull/1402

## [25.11.0] - 2025-11-07

17 changes: 5 additions & 12 deletions spark-connect-client/boil-config.toml
@@ -1,20 +1,13 @@
[versions."3.5.6".local-images]
spark-k8s = "3.5.6"
java-base = "17"

[versions."3.5.6".build-arguments]
python-version = "3.11"

[versions."3.5.7".local-images]
spark-k8s = "3.5.7"
java-base = "17"

[versions."3.5.7".build-arguments]
python-version = "3.11"

[versions."4.0.1".local-images]
spark-k8s = "4.0.1"
java-base = "17"
[versions."4.1.1".local-images]
spark-k8s = "4.1.1"
java-base = "21"

[versions."4.0.1".build-arguments]
python-version = "3.11"
[versions."4.1.1".build-arguments]
python-version = "3.12"
158 changes: 20 additions & 138 deletions spark-k8s/Dockerfile → spark-k8s/Dockerfile.3
@@ -33,106 +33,7 @@ EOF

# hbase-connectors-builder: Build the Spark HBase connector and copy
# required JARs into /stackable/spark/jars
FROM local-image/java-devel AS hbase-connectors-builder

ARG PRODUCT_VERSION
ARG RELEASE_VERSION
ARG HADOOP_HADOOP_VERSION
# Reassign the arg to `HADOOP_VERSION` for better readability.
ENV HADOOP_VERSION=${HADOOP_HADOOP_VERSION}
ARG HBASE_VERSION
ARG HBASE_CONNECTOR_VERSION
ARG STACKABLE_USER_UID

WORKDIR /stackable

# Copy the pom.xml file from the patched Spark source code to read the
# versions used by Spark. The pom.xml defines child modules which are
# not required and not copied, therefore mvn must be called with the
# parameter --non-recursive.
COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-source-builder \
/stackable/src/spark-k8s/patchable-work/worktree/${PRODUCT_VERSION}/pom.xml \
spark/

# Patch the hbase-connectors source code
WORKDIR /stackable

COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/hbase-connectors/stackable/patches/patchable.toml /stackable/src/spark-k8s/hbase-connectors/stackable/patches/patchable.toml
COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/hbase-connectors/stackable/patches/${HBASE_CONNECTOR_VERSION} /stackable/src/spark-k8s/hbase-connectors/stackable/patches/${HBASE_CONNECTOR_VERSION}

RUN <<EOF

# IMPORTANT: HBase connectors don't support Spark 4 yet, so we skip the build.
# Watch this PR for updates: https://github.com/apache/hbase-connectors/pull/130
if [[ "${PRODUCT_VERSION}" == 4* ]]; then
# Create this empty directory so that following COPY layers succeed.
mkdir -p /stackable/spark/jars
# Create a dummy tarball to satisfy the build process for Spark 3.
touch hbase-connector-${HBASE_CONNECTOR_VERSION}-stackable${RELEASE_VERSION}-src.tar.gz
exit 0
fi

cd "$(/stackable/patchable --images-repo-root=src checkout spark-k8s/hbase-connectors ${HBASE_CONNECTOR_VERSION})/spark"

NEW_VERSION="${HBASE_CONNECTOR_VERSION}-stackable${RELEASE_VERSION}"

mvn versions:set -DnewVersion=$NEW_VERSION

# Create snapshot of the source code including custom patches
tar -czf /stackable/hbase-connector-${HBASE_CONNECTOR_VERSION}-stackable${RELEASE_VERSION}-src.tar.gz .

# Building the hbase-connectors with JDK 17 is not yet supported, see
# https://github.com/apache/hbase-connectors/pull/132.
# As there are no JDK profiles, access to the non-public elements must
# be enabled with --add-opens, see https://openjdk.org/jeps/403 and
# https://openjdk.org/jeps/261#Breaking-encapsulation.
export JDK_JAVA_OPTIONS="\
--add-opens java.base/java.lang=ALL-UNNAMED \
--add-opens java.base/java.util=ALL-UNNAMED"

# Get the Scala version used by Spark
SCALA_VERSION=$(grep "scala.version" /stackable/spark/pom.xml | head -n1 | awk -F '[<>]' '{print $3}')

# Get the Scala binary version used by Spark
SCALA_BINARY_VERSION=$(grep "scala.binary.version" /stackable/spark/pom.xml | head -n1 | awk -F '[<>]' '{print $3}')
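
# Sketch of an alternative (not used by this build): because only the parent
# pom.xml was copied, the same properties could be read via the standard
# maven-help-plugin instead of grep/awk, for example:
#   mvn --non-recursive --file /stackable/spark/pom.xml \
#     help:evaluate -Dexpression=scala.binary.version -q -DforceStdout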

# Build the Spark HBase connector
# Skip the tests because the MiniHBaseCluster does not get ready for
# whatever reason:
# Caused by: java.lang.RuntimeException: Master not active after 30000ms
# at org.apache.hadoop.hbase.util.JVMClusterUtil.waitForEvent(JVMClusterUtil.java:221)
# at org.apache.hadoop.hbase.util.JVMClusterUtil.startup(JVMClusterUtil.java:177)
# at org.apache.hadoop.hbase.LocalHBaseCluster.startup(LocalHBaseCluster.java:407)
# at org.apache.hadoop.hbase.MiniHBaseCluster.init(MiniHBaseCluster.java:250)
mvn \
--batch-mode \
--no-transfer-progress \
--define spark.version="${PRODUCT_VERSION}" \
--define scala.version="${SCALA_VERSION}" \
--define scala.binary.version="${SCALA_BINARY_VERSION}" \
--define hadoop-three.version="${HADOOP_VERSION}" \
--define hbase.version="${HBASE_VERSION}" \
--define skipTests \
--define maven.test.skip=true \
clean package

mkdir -p /stackable/spark/jars
ln -s "$(pwd)/hbase-spark/target/hbase-spark-${HBASE_CONNECTOR_VERSION}-stackable${RELEASE_VERSION}.jar" /stackable/spark/jars/hbase-spark-${HBASE_CONNECTOR_VERSION}-stackable${RELEASE_VERSION}.jar

cd /stackable/spark/jars

# Download log4j-slf4j-impl-x.x.x.jar containing the StaticLoggerBinder
# which is required by the connector.
# Spark contains only log4j-slf4j2-impl-x.x.x.jar but not
# log4j-slf4j-impl-x.x.x.jar. It is okay to have both JARs in the
# classpath as long as they have the same version.
mvn --non-recursive --file /stackable/spark/pom.xml \
dependency:copy \
-Dartifact=org.apache.logging.log4j:log4j-slf4j-impl:'${log4j.version}' \
-DoutputDirectory=./jars
chmod g=u /stackable/hbase-connector-${HBASE_CONNECTOR_VERSION}-stackable${RELEASE_VERSION}-src.tar.gz .
EOF

FROM local-image/spark-k8s/hbase-connectors AS hbase-connectors-builder
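# The connector is now built in the separate local-image/spark-k8s/hbase-connectors
# image; this stage only re-exports its artifacts (the connector JARs and the
# patched-source tarball) so the downstream COPY layers keep working.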

# spark-builder: Build Spark into /stackable/spark-${PRODUCT_VERSION}/dist,
# download additional JARs and perform checks
@@ -173,26 +74,11 @@ RUN <<EOF
MAVEN_BIN="/usr/bin/mvn"
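# Memory settings recommended in Spark's "Building Spark" documentation; the
# Scala compilation needs a large thread stack and code cache to finish cleanly.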
export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g"

case "${PRODUCT_VERSION}" in
4*)
# The Spark 4 script has a --connect option which is not available in Spark 3.
# This option is required to build Spark Connect.
# Also this option breaks the Spark 3 build so we ensure it's only provided here.
./dev/make-distribution.sh \
--mvn "${MAVEN_BIN}" \
--connect \
-Dhadoop.version="${HADOOP_VERSION}-stackable${RELEASE_VERSION}" \
-DskipTests \
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver
;;
*)
./dev/make-distribution.sh \
--mvn "${MAVEN_BIN}" \
-Dhadoop.version="${HADOOP_VERSION}-stackable${RELEASE_VERSION}" \
-DskipTests \
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver
;;
esac
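# This Dockerfile now builds only Spark 3, so the Spark-4-only --connect flag
# (required there to build Spark Connect, but breaking the Spark 3 build) is
# no longer passed; Spark 4 is covered by its own per-major-version Dockerfile.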
./dev/make-distribution.sh \
--mvn "${MAVEN_BIN}" \
-Dhadoop.version="${HADOOP_VERSION}-stackable${RELEASE_VERSION}" \
-DskipTests \
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver

sed -i "s/${NEW_VERSION}/${ORIGINAL_VERSION}/g" assembly/target/bom.json
EOF
@@ -206,18 +92,9 @@ RUN <<EOF
mkdir -p dist/connect
cd dist/connect

case "${PRODUCT_VERSION}" in
4*)
cp "/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/sql/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}.jar" .
cp "/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/sql/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}.jar" .
cp "/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/sql/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}.jar" .
;;
*)
cp "/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/connector/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}.jar" .
cp "/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/connector/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}.jar" .
cp "/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/connector/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}.jar" .
;;
esac
cp "/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/connector/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}.jar" .
cp "/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/connector/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}.jar" .
cp "/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/connector/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}.jar" .

# This link is needed by the operator and is kept for backwards compatibility.
# TODO: remove it at some time in the future.
@@ -272,12 +149,17 @@ WORKDIR /stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/dist/ext

RUN <<EOF
# Download jackson-dataformat-xml, stax2-api, and woodstox-core which are required for logging.
curl --fail https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML_VERSION}.jar \
-o /stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/dist/extra-jars/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML_VERSION}.jar
curl --fail https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API_VERSION}.jar \
-o /stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/dist/extra-jars/stax2-api-${STAX2_API_VERSION}.jar
curl --fail https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE_VERSION}.jar \
-o /stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/dist/extra-jars/woodstox-core-${WOODSTOX_CORE_VERSION}.jar
mvn dependency:get -Dartifact=com.fasterxml.jackson.dataformat:jackson-dataformat-xml:${JACKSON_DATAFORMAT_XML_VERSION}
cp /root/.m2/repository/com/fasterxml/jackson/dataformat/jackson-dataformat-xml/${JACKSON_DATAFORMAT_XML_VERSION}/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML_VERSION}.jar \
/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/dist/extra-jars/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML_VERSION}.jar

mvn dependency:get -Dartifact=org.codehaus.woodstox:stax2-api:${STAX2_API_VERSION}
cp /root/.m2/repository/org/codehaus/woodstox/stax2-api/${STAX2_API_VERSION}/stax2-api-${STAX2_API_VERSION}.jar \
/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/dist/extra-jars/stax2-api-${STAX2_API_VERSION}.jar

mvn dependency:get -Dartifact=com.fasterxml.woodstox:woodstox-core:${WOODSTOX_CORE_VERSION}
cp /root/.m2/repository/com/fasterxml/woodstox/woodstox-core/${WOODSTOX_CORE_VERSION}/woodstox-core-${WOODSTOX_CORE_VERSION}.jar \
/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/dist/extra-jars/woodstox-core-${WOODSTOX_CORE_VERSION}.jar
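
# A possible one-step variant (sketch only; assumes dependency:copy can run
# without a project POM in this directory, otherwise keep the get+cp pairs above):
#   mvn dependency:copy \
#     -Dartifact=com.fasterxml.jackson.dataformat:jackson-dataformat-xml:${JACKSON_DATAFORMAT_XML_VERSION} \
#     -DoutputDirectory=/stackable/spark-${PRODUCT_VERSION}-stackable${RELEASE_VERSION}/dist/extra-jars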

# Get the correct `tini` binary for our architecture.
curl --fail "https://repo.stackable.tech/repository/packages/tini/tini-${TINI_VERSION}-${TARGETARCH}" \