diff --git a/dev/spark-test-image-util/docs/build-docs b/dev/spark-test-image-util/docs/build-docs index ca59769f24231..145834831454a 100755 --- a/dev/spark-test-image-util/docs/build-docs +++ b/dev/spark-test-image-util/docs/build-docs @@ -17,6 +17,8 @@ # limitations under the License. # +set -e + if ! [ -x "$(command -v docker)" ]; then echo "Error: Docker is not installed." >&2 exit 1 @@ -29,7 +31,7 @@ IMG_TAG=$(date +%s) IMG_NAME="${REPOSITORY}:${IMG_TAG}" IMG_URL="$REPO_OWNER/$IMG_NAME" DOCKER_MOUNT_SPARK_HOME="/__w/spark/spark" -BUILD_DOCS_SCRIPT_PATH="${DOCKER_MOUNT_SPARK_HOME}/dev/spark-test-image-util/docs/run-in-container" +BUILD_DOCS_SCRIPT_PATH="${DOCKER_MOUNT_SPARK_HOME}/dev/spark-test-image-util/docs/run-in-container.sh" FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)" SPARK_HOME="$(cd "`dirname "${BASH_SOURCE[0]}"`"/../../..; pwd)" @@ -41,10 +43,12 @@ build/sbt -Phive -Pkinesis-asl clean unidoc package docker buildx build \ --cache-from type=registry,ref="${DOCKER_CACHE_IMG}" \ --tag "${IMG_URL}" "${FWDIR}" \ - --file "${SPARK_HOME}/dev/spark-test-image/docs/Dockerfile" + --file "${SPARK_HOME}/dev/spark-test-image/docs/Dockerfile" \ + --load # 3.Build docs on container: `error docs`, `scala doc`, `python doc`, `sql doc`. docker run \ + --user "$(id -u):$(id -g)" \ --mount type=bind,source="${SPARK_HOME}",target="${DOCKER_MOUNT_SPARK_HOME}" \ --interactive --tty "${IMG_URL}" \ /bin/bash -c "sh ${BUILD_DOCS_SCRIPT_PATH}" @@ -58,11 +62,13 @@ if [[ "$SKIP_RDOC" != "1" ]]; then # and when writing to subsequent files, will throw an error as: # `! [EACCES] Failed to copy '/usr/local/lib/R/site-library/pkgdown/BS5/assets/katex-auto.js' # to '/__w/spark/spark/R/pkg/docs/katex-auto.js': permission denied` + echo "Building final docs *OUTSIDE* of container." 
export SKIP_ERRORDOC=1 export SKIP_SCALADOC=1 export SKIP_PYTHONDOC=1 export SKIP_SQLDOC=1 cd docs + bundle install bundle exec jekyll build fi diff --git a/dev/spark-test-image-util/docs/run-in-container b/dev/spark-test-image-util/docs/run-in-container.sh similarity index 68% rename from dev/spark-test-image-util/docs/run-in-container rename to dev/spark-test-image-util/docs/run-in-container.sh index 3bfb3c5f651dd..4342d904ec7ba 100644 --- a/dev/spark-test-image-util/docs/run-in-container +++ b/dev/spark-test-image-util/docs/run-in-container.sh @@ -1,3 +1,4 @@ +#!/bin/bash # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with @@ -16,20 +17,29 @@ # # 1.Set env variable. -export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-arm64 -export PATH=$JAVA_HOME/bin:$PATH +set -ex +_arch="$(uname -m)" +case "$_arch" in + "aarch64") export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-arm64 ;; + "x86_64") export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 ;; + *) echo "Unexpected arch $_arch picking first java-17-openjdk in /usr/lib/jvm"; + export JAVA_HOME=$(ls /usr/lib/jvm/java-17-openjdk-* | head -n 1);; +esac +export PATH="$HOME/.bin:$JAVA_HOME/bin:$PATH" export SPARK_DOCS_IS_BUILT_ON_HOST=1 # We expect to compile the R document on the host. export SKIP_RDOC=1 +mkdir -p ~/.bin +mkdir -p ~/.gem # 2.Install bundler. -gem install bundler -v 2.4.22 +gem install bundler -v 2.4.22 --install-dir ~/.gem --bindir ~/.bin cd /__w/spark/spark/docs bundle install # 3.Build docs, includes: `error docs`, `scala doc`, `python doc`, `sql doc`, excludes: `r doc`. -# We need this link to make sure `python3` points to `python3.11` which contains the prerequisite packages. -ln -s "$(which python3.11)" "/usr/local/bin/python3" +# Activate the venv with mkdocs and friends. +. 
$VIRTUAL_ENV/bin/activate # Build docs first with SKIP_API to ensure they are buildable without requiring any # language docs to be built beforehand. diff --git a/dev/spark-test-image/docs/Dockerfile b/dev/spark-test-image/docs/Dockerfile index 5009023fe677e..5369629beca7f 100644 --- a/dev/spark-test-image/docs/Dockerfile +++ b/dev/spark-test-image/docs/Dockerfile @@ -70,15 +70,6 @@ RUN apt-get update && apt-get install -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# See more in SPARK-39959, roxygen2 < 7.2.1 -RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', 'rmarkdown', 'testthat', 'remotes'), repos='https://cloud.r-project.org/')" && \ - Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='https://cloud.r-project.org')" && \ - Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" && \ - Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" - -# See more in SPARK-39735 -ENV R_LIBS_SITE="/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" - # Setup virtual environment ENV VIRTUAL_ENV=/opt/spark-venv RUN python3.12 -m venv $VIRTUAL_ENV @@ -87,7 +78,8 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH" # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 # See 'ipython_genutils' in SPARK-38517 # See 'docutils<0.18.0' in SPARK-39421 -RUN python3.12 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe \ +RUN . 
$VIRTUAL_ENV/bin/activate \ + && python3.12 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe \ ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.22' 'pyarrow>=23.0.0' 'pandas==2.3.3' 'plotly>=4.8' 'docutils<0.18.0' \ 'flake8==3.9.0' 'mypy==1.19.1' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'ruff==0.14.8' \ 'pandas-stubs==1.2.0.53' 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.5' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 70a51382190c8..a1f9b0f07086d 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -4,7 +4,6 @@ GEM addressable (2.8.7) public_suffix (>= 2.0.2, < 7.0) base64 (0.3.0) - bigdecimal (3.2.2) colorator (1.1.0) concurrent-ruby (1.3.5) csv (3.3.5) @@ -14,9 +13,6 @@ GEM eventmachine (1.2.7) ffi (1.17.2) forwardable-extended (2.6.0) - google-protobuf (4.31.1) - bigdecimal - rake (>= 13) http_parser.rb (0.8.0) i18n (1.14.7) concurrent-ruby (~> 1.0) @@ -41,8 +37,8 @@ GEM webrick (~> 1.7) jekyll-redirect-from (0.16.0) jekyll (>= 3.3, < 5.0) - jekyll-sass-converter (3.1.0) - sass-embedded (~> 1.75) + jekyll-sass-converter (2.2.0) + sassc (> 2.0.1, < 3.0) jekyll-watch (2.2.1) listen (~> 3.0) json (2.12.2) @@ -58,16 +54,14 @@ GEM pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (6.0.2) - rake (13.3.0) rb-fsevent (0.11.2) rb-inotify (0.11.1) ffi (~> 1.0) rexml (3.4.4) rouge (4.5.2) safe_yaml (1.0.5) - sass-embedded (1.89.2) - google-protobuf (~> 4.31) - rake (>= 13) + sassc (2.4.0) + ffi (~> 1.9) terminal-table (3.0.2) unicode-display_width (>= 1.1.1, < 3) unicode-display_width (2.6.0) diff --git a/docs/README.md b/docs/README.md index 1235efe91812b..b36e3e29d8a20 100644 --- a/docs/README.md +++ b/docs/README.md @@ -60,7 +60,8 @@ To generate the Python or R API docs, you'll also need to [install Pandoc](https If you'd like to generate R API documentation, install these 
libraries: ```sh -$ sudo Rscript -e 'install.packages(c("knitr", "devtools", "testthat", "rmarkdown"), repos="https://cloud.r-project.org/")' +$ sudo Rscript -e 'install.packages(c("knitr", "devtools", "testthat", "rmarkdown", "remotes"), repos="https://cloud.r-project.org/")' +$ sudo Rscript -e 'remotes::install_version("markdown", version="1.12", repos="https://cloud.r-project.org")' $ sudo Rscript -e 'devtools::install_version("roxygen2", version = "7.1.2", repos="https://cloud.r-project.org/")' $ sudo Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" $ sudo Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" @@ -136,6 +137,6 @@ Note: Before running it, you need to have `docker` installed. $ dev/spark-test-image-util/docs/build-docs ``` -It will generate all documents on the `container` and `host`. +It will generate all documents in the `container`, except for the R docs, which are generated on the `host`. Especially when there are conflicts between the libraries required by Python development environment and the libraries required by generating Python docs environment, this is a good choice.