From 9c8d1c2849bec8a0f815875170519579eb1ebad2 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 17 Mar 2026 15:51:36 -0700 Subject: [PATCH 01/10] Drop R install we don't need (see run-in-container Co-authored-by: Holden Karau --- dev/spark-test-image/docs/Dockerfile | 9 --------- 1 file changed, 9 deletions(-) diff --git a/dev/spark-test-image/docs/Dockerfile b/dev/spark-test-image/docs/Dockerfile index f3778a342eb54..09ae0460d2932 100644 --- a/dev/spark-test-image/docs/Dockerfile +++ b/dev/spark-test-image/docs/Dockerfile @@ -70,15 +70,6 @@ RUN apt-get update && apt-get install -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# See more in SPARK-39959, roxygen2 < 7.2.1 -RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', 'rmarkdown', 'testthat', 'remotes'), repos='https://cloud.r-project.org/')" && \ - Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='https://cloud.r-project.org')" && \ - Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" && \ - Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" - -# See more in SPARK-39735 -ENV R_LIBS_SITE="/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" - # Setup virtual environment ENV VIRTUAL_ENV=/opt/spark-venv RUN python3.12 -m venv $VIRTUAL_ENV From 4c0ed1ea69d9b340ef5ef4adf5a1645c5b31ce27 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 18 Mar 2026 16:13:36 -0700 Subject: [PATCH 02/10] Update the docs build step so that we don't stomp on user .* files with root permissions, fix we use Python 3.12 not 3.11 anymore, handle amd64/arm64 builds Co-authored-by: Holden Karau --- dev/spark-test-image-util/docs/build-docs | 7 ++++++- dev/spark-test-image-util/docs/run-in-container | 8 +++++--- docs/Gemfile.lock | 14 ++++---------- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/dev/spark-test-image-util/docs/build-docs 
b/dev/spark-test-image-util/docs/build-docs index ca59769f24231..6fcd4b5b541cd 100755 --- a/dev/spark-test-image-util/docs/build-docs +++ b/dev/spark-test-image-util/docs/build-docs @@ -17,6 +17,8 @@ # limitations under the License. # +set -e + if ! [ -x "$(command -v docker)" ]; then echo "Error: Docker is not installed." >&2 exit 1 @@ -41,10 +43,12 @@ build/sbt -Phive -Pkinesis-asl clean unidoc package docker buildx build \ --cache-from type=registry,ref="${DOCKER_CACHE_IMG}" \ --tag "${IMG_URL}" "${FWDIR}" \ - --file "${SPARK_HOME}/dev/spark-test-image/docs/Dockerfile" + --file "${SPARK_HOME}/dev/spark-test-image/docs/Dockerfile" \ + --load # 3.Build docs on container: `error docs`, `scala doc`, `python doc`, `sql doc`. docker run \ + --user "$(id -u):$(id -g)" \ --mount type=bind,source="${SPARK_HOME}",target="${DOCKER_MOUNT_SPARK_HOME}" \ --interactive --tty "${IMG_URL}" \ /bin/bash -c "sh ${BUILD_DOCS_SCRIPT_PATH}" @@ -58,6 +62,7 @@ if [[ "$SKIP_RDOC" != "1" ]]; then # and when writing to subsequent files, will throw an error as: # `! [EACCES] Failed to copy '/usr/local/lib/R/site-library/pkgdown/BS5/assets/katex-auto.js' # to '/__w/spark/spark/R/pkg/docs/katex-auto.js': permission denied` + echo "Building final docs *OUTSIDE* of container." export SKIP_ERRORDOC=1 export SKIP_SCALADOC=1 export SKIP_PYTHONDOC=1 diff --git a/dev/spark-test-image-util/docs/run-in-container b/dev/spark-test-image-util/docs/run-in-container index 3bfb3c5f651dd..537c8483be5c5 100644 --- a/dev/spark-test-image-util/docs/run-in-container +++ b/dev/spark-test-image-util/docs/run-in-container @@ -16,7 +16,9 @@ # # 1.Set env variable. 
-export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-arm64 +set -ex +(ls /usr/lib/jvm/java-17-openjdk-arm64 && ln -s /usr/lib/jvm/java-17-openjdk-arm64 /usr/lib/jvm/java-17-openjdk) || (ls /usr/lib/jvm/java-17-openjdk-amd64 && ln -s /usr/lib/jvm/java-17-openjdk-amd64 /usr/lib/jvm/java-17-openjdk) +export JAVA_HOME=/usr/lib/jvm/java-17-openjdk export PATH=$JAVA_HOME/bin:$PATH export SPARK_DOCS_IS_BUILT_ON_HOST=1 # We expect to compile the R document on the host. @@ -28,8 +30,8 @@ cd /__w/spark/spark/docs bundle install # 3.Build docs, includes: `error docs`, `scala doc`, `python doc`, `sql doc`, excludes: `r doc`. -# We need this link to make sure `python3` points to `python3.11` which contains the prerequisite packages. -ln -s "$(which python3.11)" "/usr/local/bin/python3" +# We need this link to make sure `python3` points to `python3.12` which contains the prerequisite packages. +ln -s "$(which python3.12)" "/usr/local/bin/python3" # Build docs first with SKIP_API to ensure they are buildable without requiring any # language docs to be built beforehand. 
diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 70a51382190c8..a1f9b0f07086d 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -4,7 +4,6 @@ GEM addressable (2.8.7) public_suffix (>= 2.0.2, < 7.0) base64 (0.3.0) - bigdecimal (3.2.2) colorator (1.1.0) concurrent-ruby (1.3.5) csv (3.3.5) @@ -14,9 +13,6 @@ GEM eventmachine (1.2.7) ffi (1.17.2) forwardable-extended (2.6.0) - google-protobuf (4.31.1) - bigdecimal - rake (>= 13) http_parser.rb (0.8.0) i18n (1.14.7) concurrent-ruby (~> 1.0) @@ -41,8 +37,8 @@ GEM webrick (~> 1.7) jekyll-redirect-from (0.16.0) jekyll (>= 3.3, < 5.0) - jekyll-sass-converter (3.1.0) - sass-embedded (~> 1.75) + jekyll-sass-converter (2.2.0) + sassc (> 2.0.1, < 3.0) jekyll-watch (2.2.1) listen (~> 3.0) json (2.12.2) @@ -58,16 +54,14 @@ GEM pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (6.0.2) - rake (13.3.0) rb-fsevent (0.11.2) rb-inotify (0.11.1) ffi (~> 1.0) rexml (3.4.4) rouge (4.5.2) safe_yaml (1.0.5) - sass-embedded (1.89.2) - google-protobuf (~> 4.31) - rake (>= 13) + sassc (2.4.0) + ffi (~> 1.9) terminal-table (3.0.2) unicode-display_width (>= 1.1.1, < 3) unicode-display_width (2.6.0) From 81c8a011f42b6a35df0d95d9eade207f1a039a60 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 20 Mar 2026 13:08:46 -0700 Subject: [PATCH 03/10] More changes to run as user rather than root so we don't screw up the bind mount permissions Co-authored-by: Holden Karau --- dev/spark-test-image-util/docs/run-in-container | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dev/spark-test-image-util/docs/run-in-container b/dev/spark-test-image-util/docs/run-in-container index 537c8483be5c5..d7f91c0dfa61b 100644 --- a/dev/spark-test-image-util/docs/run-in-container +++ b/dev/spark-test-image-util/docs/run-in-container @@ -17,9 +17,9 @@ # 1.Set env variable. 
set -ex -(ls /usr/lib/jvm/java-17-openjdk-arm64 && ln -s /usr/lib/jvm/java-17-openjdk-arm64 /usr/lib/jvm/java-17-openjdk) || (ls /usr/lib/jvm/java-17-openjdk-amd64 && ln -s /usr/lib/jvm/java-17-openjdk-amd64 /usr/lib/jvm/java-17-openjdk) -export JAVA_HOME=/usr/lib/jvm/java-17-openjdk -export PATH=$JAVA_HOME/bin:$PATH +(ls /usr/lib/jvm/java-17-openjdk-arm64 && JAVA_HOME=/usr/lib/jvm/java-17-openjdk-arm64) || (ls /usr/lib/jvm/java-17-openjdk-amd64 && JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64) +export JAVA_HOME +export PATH=~/.bin:$JAVA_HOME/bin:$PATH export SPARK_DOCS_IS_BUILT_ON_HOST=1 # We expect to compile the R document on the host. export SKIP_RDOC=1 @@ -31,7 +31,8 @@ bundle install # 3.Build docs, includes: `error docs`, `scala doc`, `python doc`, `sql doc`, excludes: `r doc`. # We need this link to make sure `python3` points to `python3.12` which contains the prerequisite packages. -ln -s "$(which python3.12)" "/usr/local/bin/python3" +mkdir -p ~/.bin +ln -s "$(which python3.12)" "~/.bin/python3" # Build docs first with SKIP_API to ensure they are buildable without requiring any # language docs to be built beforehand. From e06e0ace74c0f960575d646499325bb5d16a08ea Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 23 Mar 2026 16:11:48 -0700 Subject: [PATCH 04/10] Clarify that R docs are built on the host which is a little weird but out of scope for fixing that here today lets just get the current system working again on x86. Co-authored-by: Holden Karau --- docs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/README.md b/docs/README.md index 1235efe91812b..e101d8d26ffd7 100644 --- a/docs/README.md +++ b/docs/README.md @@ -136,6 +136,6 @@ Note: Before running it, you need to have `docker` installed. $ dev/spark-test-image-util/docs/build-docs ``` -It will generate all documents on the `container` and `host`. +It will generate all documents in the `container`, except for the R docs, which are built on the `host`.
Especially when there are conflicts between the libraries required by Python development environment and the libraries required by generating Python docs environment, this is a good choice. From 37ddacc70317929df775e891c16222385a93d9c8 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 23 Mar 2026 16:30:07 -0700 Subject: [PATCH 05/10] Use .sh for highlighting happy times. --- .../docs/{run-in-container => run-in-container.sh} | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) rename dev/spark-test-image-util/docs/{run-in-container => run-in-container.sh} (99%) diff --git a/dev/spark-test-image-util/docs/run-in-container b/dev/spark-test-image-util/docs/run-in-container.sh similarity index 99% rename from dev/spark-test-image-util/docs/run-in-container rename to dev/spark-test-image-util/docs/run-in-container.sh index d7f91c0dfa61b..29678378b5e95 100644 --- a/dev/spark-test-image-util/docs/run-in-container +++ b/dev/spark-test-image-util/docs/run-in-container.sh @@ -23,6 +23,8 @@ export PATH=~/.bin:$JAVA_HOME/bin:$PATH export SPARK_DOCS_IS_BUILT_ON_HOST=1 # We expect to compile the R document on the host. export SKIP_RDOC=1 +mkdir -p ~/.bin +mkdir -p ~/.gem # 2.Install bundler. gem install bundler -v 2.4.22 @@ -31,7 +33,6 @@ bundle install # 3.Build docs, includes: `error docs`, `scala doc`, `python doc`, `sql doc`, excludes: `r doc`. # We need this link to make sure `python3` points to `python3.12` which contains the prerequisite packages. -mkdir -p ~/.bin ln -s "$(which python3.12)" "~/.bin/python3" # Build docs first with SKIP_API to ensure they are buildable without requiring any From c5415da525690612a30fd9d2b1bf225ff1d2c88f Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 23 Mar 2026 16:30:26 -0700 Subject: [PATCH 06/10] Update to call the new script. 
--- dev/spark-test-image-util/docs/build-docs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image-util/docs/build-docs b/dev/spark-test-image-util/docs/build-docs index 6fcd4b5b541cd..c743175cc3f58 100755 --- a/dev/spark-test-image-util/docs/build-docs +++ b/dev/spark-test-image-util/docs/build-docs @@ -31,7 +31,7 @@ IMG_TAG=$(date +%s) IMG_NAME="${REPOSITORY}:${IMG_TAG}" IMG_URL="$REPO_OWNER/$IMG_NAME" DOCKER_MOUNT_SPARK_HOME="/__w/spark/spark" -BUILD_DOCS_SCRIPT_PATH="${DOCKER_MOUNT_SPARK_HOME}/dev/spark-test-image-util/docs/run-in-container" +BUILD_DOCS_SCRIPT_PATH="${DOCKER_MOUNT_SPARK_HOME}/dev/spark-test-image-util/docs/run-in-container.sh" FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)" SPARK_HOME="$(cd "`dirname "${BASH_SOURCE[0]}"`"/../../..; pwd)" From 647be9258ef08394cb49f68839f3cc6522010ce9 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 23 Mar 2026 16:31:04 -0700 Subject: [PATCH 07/10] Install bundler into local dir rather than system dir. --- dev/spark-test-image-util/docs/run-in-container.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/spark-test-image-util/docs/run-in-container.sh b/dev/spark-test-image-util/docs/run-in-container.sh index 29678378b5e95..0178cfc978a64 100644 --- a/dev/spark-test-image-util/docs/run-in-container.sh +++ b/dev/spark-test-image-util/docs/run-in-container.sh @@ -27,7 +27,7 @@ mkdir -p ~/.bin mkdir -p ~/.gem # 2.Install bundler. 
-gem install bundler -v 2.4.22 +gem install bundler -v 2.4.22 --install-dir ~/.gem --bindir ~/.bin cd /__w/spark/spark/docs bundle install From 610db9bf33453faed2c6d31217733009ab93e0f6 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 23 Mar 2026 16:52:35 -0700 Subject: [PATCH 08/10] Fix lack of ~ expansion and also fix JAVA_HOME settings to use case/switch on uname -m --- dev/spark-test-image-util/docs/run-in-container.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/dev/spark-test-image-util/docs/run-in-container.sh b/dev/spark-test-image-util/docs/run-in-container.sh index 0178cfc978a64..48f26a52250f3 100644 --- a/dev/spark-test-image-util/docs/run-in-container.sh +++ b/dev/spark-test-image-util/docs/run-in-container.sh @@ -1,3 +1,4 @@ +#!/bin/bash # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with @@ -17,9 +18,14 @@ # 1.Set env variable. set -ex -(ls /usr/lib/jvm/java-17-openjdk-arm64 && JAVA_HOME=/usr/lib/jvm/java-17-openjdk-arm64) || (ls /usr/lib/jvm/java-17-openjdk-amd64 && JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64) -export JAVA_HOME -export PATH=~/.bin:$JAVA_HOME/bin:$PATH +_arch="$(uname -m)" +case "$_arch" in + "aarch64") export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-arm64 ;; + "x86_64") export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 ;; + *) echo "Unexpected arch $_arch picking first java-17-openjdk in /usr/lib/jvm"; + export JAVA_HOME=$(ls -d /usr/lib/jvm/java-17-openjdk-* | head -n 1);; +esac +export PATH="$HOME/.bin:$JAVA_HOME/bin:$PATH" export SPARK_DOCS_IS_BUILT_ON_HOST=1 # We expect to compile the R document on the host. export SKIP_RDOC=1 @@ -33,7 +39,7 @@ bundle install # 3.Build docs, includes: `error docs`, `scala doc`, `python doc`, `sql doc`, excludes: `r doc`. # We need this link to make sure `python3` points to `python3.12` which contains the prerequisite packages.
-ln -s "$(which python3.12)" "~/.bin/python3" +ln -s "$(which python3.12)" ~/.bin/python3 # Build docs first with SKIP_API to ensure they are buildable without requiring any # language docs to be built beforehand. From 844cba9cc500f220bedc371f2919b6925370024f Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 23 Mar 2026 18:37:16 -0700 Subject: [PATCH 09/10] Rather than symlink a Python bin randomly, activate the virtualenv constructed in the Dockerfile. --- dev/spark-test-image-util/docs/run-in-container.sh | 4 ++-- dev/spark-test-image/docs/Dockerfile | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dev/spark-test-image-util/docs/run-in-container.sh b/dev/spark-test-image-util/docs/run-in-container.sh index 48f26a52250f3..4342d904ec7ba 100644 --- a/dev/spark-test-image-util/docs/run-in-container.sh +++ b/dev/spark-test-image-util/docs/run-in-container.sh @@ -38,8 +38,8 @@ cd /__w/spark/spark/docs bundle install # 3.Build docs, includes: `error docs`, `scala doc`, `python doc`, `sql doc`, excludes: `r doc`. -# We need this link to make sure `python3` points to `python3.12` which contains the prerequisite packages. -ln -s "$(which python3.12)" ~/.bin/python3 +# Activate the venv with mkdocs and friends. +. $VIRTUAL_ENV/bin/activate # Build docs first with SKIP_API to ensure they are buildable without requiring any # language docs to be built beforehand. diff --git a/dev/spark-test-image/docs/Dockerfile b/dev/spark-test-image/docs/Dockerfile index 09ae0460d2932..68168365ee639 100644 --- a/dev/spark-test-image/docs/Dockerfile +++ b/dev/spark-test-image/docs/Dockerfile @@ -78,7 +78,8 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH" # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 # See 'ipython_genutils' in SPARK-38517 # See 'docutils<0.18.0' in SPARK-39421 -RUN python3.12 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe \ +RUN .
$VIRTUAL_ENV/bin/activate \ && python3.12 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe \ ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.22' 'pyarrow>=23.0.0' 'pandas==2.3.3' 'plotly>=4.8' 'docutils<0.18.0' \ 'flake8==3.9.0' 'mypy==1.19.1' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==26.3.1' \ 'pandas-stubs==1.2.0.53' 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.5' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ From c927118b365a601be09471217719111a8a220e52 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 23 Mar 2026 18:37:37 -0700 Subject: [PATCH 10/10] Add remotes to the install instructions since it's now needed by devtools install. --- docs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/README.md b/docs/README.md index e101d8d26ffd7..ce228998e095e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -60,7 +60,7 @@ To generate the Python or R API docs, you'll also need to [install Pandoc](https If you'd like to generate R API documentation, install these libraries: ```sh -$ sudo Rscript -e 'install.packages(c("knitr", "devtools", "testthat", "rmarkdown"), repos="https://cloud.r-project.org/")' +$ sudo Rscript -e 'install.packages(c("knitr", "devtools", "testthat", "rmarkdown", "remotes"), repos="https://cloud.r-project.org/")' $ sudo Rscript -e 'devtools::install_version("roxygen2", version = "7.1.2", repos="https://cloud.r-project.org/")' $ sudo Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" $ sudo Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"