From 9659620924e44b23bfb65c3ce8dc6a5762baa69f Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Fri, 8 May 2026 00:44:05 +0900 Subject: [PATCH 01/10] [SPARK-56525][INFRA] Run apt-get update before installing R dependencies ### What changes were proposed in this pull request? Add `apt-get update` before `apt-get install` for R-related dev libraries to avoid stale package index causing 404 errors. ### Why are the changes needed? The `apt-get install` for R dev dependencies (libtiff5-dev, libharfbuzz-dev, etc.) is in a separate RUN layer from the earlier `apt-get update`, so when the package index becomes stale (packages are superseded on the Ubuntu archive), the install fails with 404. ### Does this PR introduce *any* user-facing change? No. ### How was this patch tested? CI. ### Was this patch authored or co-authored using generative AI tooling? No. --- dev/infra/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 83176aec80c53..6f4acdf05ba1b 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -55,7 +55,7 @@ RUN $APT_INSTALL r-base libcurl4-openssl-dev qpdf libssl-dev zlib1g-dev RUN Rscript -e "install.packages(c('remotes', 'knitr', 'markdown', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'roxygen2', 'xml2'), repos='https://cloud.r-project.org/')" # See more in SPARK-39959, roxygen2 < 7.2.1 -RUN apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \ +RUN apt-get update && apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \ libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev \ libtiff5-dev libjpeg-dev RUN Rscript -e "install.packages(c('remotes'), repos='https://cloud.r-project.org/')" From b70f85e295ca3d96855bb90c024bccb3adad1cfe Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sat, 9 May 2026 06:15:39 +0000 Subject: [PATCH 02/10] [INFRA] Fix docker build for dev/infra by pinning scipy<1.10 and pythran for PyPy 3.8 compatibility --- dev/infra/Dockerfile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 6f4acdf05ba1b..83055db7a4ea6 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -31,7 +31,7 @@ RUN apt-get update RUN $APT_INSTALL software-properties-common git libxml2-dev pkg-config curl wget openjdk-8-jdk libpython3-dev python3-pip python3-setuptools python3.8 python3.9 RUN update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java -RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 +RUN curl -sS https://bootstrap.pypa.io/pip/3.9/get-pip.py | python3.9 RUN add-apt-repository ppa:pypy/ppa RUN apt update @@ -43,7 +43,7 @@ RUN mkdir -p /usr/local/pypy/pypy3.8 && \ ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3.8 && \ ln -sf /usr/local/pypy/pypy3.8/bin/pypy /usr/local/bin/pypy3 -RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3 +RUN curl -sS https://bootstrap.pypa.io/pip/3.8/get-pip.py | pypy3 RUN $APT_INSTALL gnupg ca-certificates pandoc RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' >> /etc/apt/sources.list @@ -64,7 +64,10 @@ RUN Rscript -e "remotes::install_version('roxygen2', version='7.2.0', repos='htt # See more in SPARK-39735 ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" -RUN pypy3 -m pip install numpy 'pandas<=2.0.3' scipy coverage matplotlib +RUN printf 'pythran==0.12.2\nbeniget==0.4.1\ngast==0.5.3\n' > /tmp/pypy-constraints.txt && \ + PIP_CONSTRAINT=/tmp/pypy-constraints.txt pypy3 -m pip install 'scipy<1.10' && \ + SETUPTOOLS_USE_DISTUTILS=stdlib PIP_CONSTRAINT=/tmp/pypy-constraints.txt pypy3 -m pip install numpy 'pandas<=2.0.3' coverage matplotlib && \ + rm /tmp/pypy-constraints.txt RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' # Add Python deps for Spark Connect. From be69e7f4e4cd01757bc4f6255133d63fbb88b31d Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sat, 9 May 2026 12:56:07 +0000 Subject: [PATCH 03/10] Pin beniget==0.4.1 and limit pyproject-metadata<0.9.0 --- dev/infra/Dockerfile | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 83055db7a4ea6..3b2ae71d4a53d 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -27,16 +27,13 @@ ENV DEBCONF_NONINTERACTIVE_SEEN true ARG APT_INSTALL="apt-get install --no-install-recommends -y" RUN apt-get clean -RUN apt-get update -RUN $APT_INSTALL software-properties-common git libxml2-dev pkg-config curl wget openjdk-8-jdk libpython3-dev python3-pip python3-setuptools python3.8 python3.9 +RUN apt-get update && $APT_INSTALL software-properties-common git libxml2-dev pkg-config curl wget openjdk-8-jdk libpython3-dev python3-pip python3-setuptools python3.8 python3.9 RUN update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java RUN curl -sS https://bootstrap.pypa.io/pip/3.9/get-pip.py | python3.9 RUN add-apt-repository ppa:pypy/ppa -RUN apt update -RUN $APT_INSTALL gfortran libopenblas-dev liblapack-dev -RUN $APT_INSTALL build-essential +RUN apt-get update && $APT_INSTALL gfortran libopenblas-dev liblapack-dev build-essential RUN mkdir -p /usr/local/pypy/pypy3.8 && \ curl -sqL https://downloads.python.org/pypy/pypy3.8-v7.3.11-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.8 --strip-components=1 && \ @@ -45,13 +42,12 @@ RUN mkdir -p /usr/local/pypy/pypy3.8 && \ RUN curl -sS https://bootstrap.pypa.io/pip/3.8/get-pip.py | pypy3 -RUN $APT_INSTALL gnupg ca-certificates pandoc +RUN apt-get update && $APT_INSTALL gnupg ca-certificates pandoc RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' >> /etc/apt/sources.list RUN gpg --keyserver hkps://keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 RUN gpg -a --export E084DAB9 | apt-key add - RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' -RUN apt update -RUN $APT_INSTALL r-base libcurl4-openssl-dev qpdf libssl-dev zlib1g-dev +RUN apt-get update && $APT_INSTALL r-base libcurl4-openssl-dev qpdf libssl-dev zlib1g-dev RUN Rscript -e "install.packages(c('remotes', 'knitr', 'markdown', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'roxygen2', 'xml2'), repos='https://cloud.r-project.org/')" # See more in SPARK-39959, roxygen2 < 7.2.1 @@ -64,9 +60,9 @@ RUN Rscript -e "remotes::install_version('roxygen2', version='7.2.0', repos='htt # See more in SPARK-39735 ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" -RUN printf 'pythran==0.12.2\nbeniget==0.4.1\ngast==0.5.3\n' > /tmp/pypy-constraints.txt && \ - PIP_CONSTRAINT=/tmp/pypy-constraints.txt pypy3 -m pip install 'scipy<1.10' && \ - SETUPTOOLS_USE_DISTUTILS=stdlib PIP_CONSTRAINT=/tmp/pypy-constraints.txt pypy3 -m pip install numpy 'pandas<=2.0.3' coverage matplotlib && \ +RUN printf 'beniget==0.4.1\npyproject-metadata<0.9.0\n' > /tmp/pypy-constraints.txt && \ + PIP_CONSTRAINT=/tmp/pypy-constraints.txt pypy3 -m pip install numpy scipy coverage matplotlib && \ + SETUPTOOLS_USE_DISTUTILS=stdlib pypy3 -m pip install 'pandas<=2.0.3' && \ rm /tmp/pypy-constraints.txt RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' From 2f1dc51715c5d957a1f78c2ec978c2553061c255 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sat, 9 May 2026 22:41:54 +0000 Subject: [PATCH 04/10] Update FULL_REFRESH_DATE --- dev/infra/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 3b2ae71d4a53d..43fe4129d2c36 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -19,7 +19,7 @@ # See also in https://hub.docker.com/_/ubuntu FROM ubuntu:focal-20221019 -ENV FULL_REFRESH_DATE 20221118 +ENV FULL_REFRESH_DATE 20260510 ENV DEBIAN_FRONTEND noninteractive ENV DEBCONF_NONINTERACTIVE_SEEN true @@ -47,7 +47,7 @@ RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' >> /et RUN gpg --keyserver hkps://keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 RUN gpg -a --export E084DAB9 | apt-key add - RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' -RUN apt-get update && $APT_INSTALL r-base libcurl4-openssl-dev qpdf libssl-dev zlib1g-dev +RUN apt-get update && $APT_INSTALL r-base libcurl4-openssl-dev qpdf libssl-dev zlib1g-dev libuv1-dev RUN Rscript -e "install.packages(c('remotes', 'knitr', 'markdown', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'roxygen2', 'xml2'), repos='https://cloud.r-project.org/')" # See more in SPARK-39959, roxygen2 < 7.2.1 @@ -64,7 +64,7 @@ RUN printf 'beniget==0.4.1\npyproject-metadata<0.9.0\n' > /tmp/pypy-constraints. PIP_CONSTRAINT=/tmp/pypy-constraints.txt pypy3 -m pip install numpy scipy coverage matplotlib && \ SETUPTOOLS_USE_DISTUTILS=stdlib pypy3 -m pip install 'pandas<=2.0.3' && \ rm /tmp/pypy-constraints.txt -RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' +RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting 'plotly<6.0' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' # Add Python deps for Spark Connect. RUN python3.9 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' From 976e0a20a9fd03abe68098d4ae87dc4e761f8270 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sun, 10 May 2026 08:42:41 +0000 Subject: [PATCH 05/10] Pin pyproject-metadata==0.8.1 --- dev/infra/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 43fe4129d2c36..d9c04eba9b266 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -60,7 +60,7 @@ RUN Rscript -e "remotes::install_version('roxygen2', version='7.2.0', repos='htt # See more in SPARK-39735 ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" -RUN printf 'beniget==0.4.1\npyproject-metadata<0.9.0\n' > /tmp/pypy-constraints.txt && \ +RUN printf 'beniget==0.4.1\npyproject-metadata==0.8.1\n' > /tmp/pypy-constraints.txt && \ PIP_CONSTRAINT=/tmp/pypy-constraints.txt pypy3 -m pip install numpy scipy coverage matplotlib && \ SETUPTOOLS_USE_DISTUTILS=stdlib pypy3 -m pip install 'pandas<=2.0.3' && \ rm /tmp/pypy-constraints.txt From 542b7eadcc9d0775d360d867e06c0fdd18d6c85e Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sun, 10 May 2026 15:11:18 +0000 Subject: [PATCH 06/10] Skip mypy for pydantic and sqlalchemy --- python/mypy.ini | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/mypy.ini b/python/mypy.ini index 3443af9a86503..44d3f73e4da53 100644 --- a/python/mypy.ini +++ b/python/mypy.ini @@ -169,3 +169,9 @@ ignore_missing_imports = True ; Ignore errors for proto generated code [mypy-pyspark.sql.connect.proto.*, pyspark.sql.connect.proto] ignore_errors = True + +[mypy-pydantic.*] +follow_imports = skip + +[mypy-sqlalchemy.*] +follow_imports = skip From c84bfa8016befcc4a111eedbd8454d6fe33f105f Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sun, 10 May 2026 15:50:17 +0000 Subject: [PATCH 07/10] fix(sparkr): Skip primitive functions in cleanClosure Primitive functions (e.g., min, max, sum) do not have environments and attempting to set one via environment<- has no effect. Since R 4.4.0, this operation emits a deprecation warning, which causes test failures when running with options(warn = 2). Add is.primitive() guards in both processClosure and cleanClosure so that primitive functions are handled without attempting to access or modify their environment. --- R/pkg/R/utils.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index ca8f8defdfdec..b9302585a03ce 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -546,6 +546,11 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) { error = function(e) { FALSE })) { obj <- get(nodeChar, envir = func.env, inherits = FALSE) if (is.function(obj)) { + if (is.primitive(obj)) { + # Primitive functions have no closure to clean. + assign(nodeChar, obj, envir = newEnv) + break + } # If the node is a function call. funcList <- mget(nodeChar, envir = checkedFuncs, inherits = F, ifnotfound = list(list(NULL)))[[1]] @@ -592,7 +597,7 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) { # return value # a new version of func that has a correct environment (closure). cleanClosure <- function(func, checkedFuncs = new.env()) { - if (is.function(func)) { + if (is.function(func) && !is.primitive(func)) { newEnv <- new.env(parent = .GlobalEnv) func.body <- body(func) oldEnv <- environment(func) From ae042e4b622090b012f5d9a23866dce00cce7fdf Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Wed, 13 May 2026 05:31:34 +0000 Subject: [PATCH 08/10] Fix R/run-tests.sh for "Lost braces" in Rd files due to R 4.4+ stricter checkRd --- R/run-tests.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/run-tests.sh b/R/run-tests.sh index 90a60eda03871..20442ca89117d 100755 --- a/R/run-tests.sh +++ b/R/run-tests.sh @@ -58,10 +58,11 @@ if [[ $FAILED != 0 || $NUM_TEST_WARNING != 0 ]]; then echo -en "\033[0m" # No color exit -1 else - # We have 2 NOTEs: for RoxygenNote and one in Jenkins only "No repository set" + # We have 3 NOTEs: for RoxygenNote, one in Jenkins only "No repository set", + # and "Lost braces" in Rd files due to R 4.4+ stricter checkRd # For non-latest version branches, one WARNING for package version - if [[ ($NUM_CRAN_WARNING != 0 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 2) && - ($HAS_PACKAGE_VERSION_WARN != 1 || $NUM_CRAN_WARNING != 1 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 1) ]]; then + if [[ ($NUM_CRAN_WARNING != 0 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 3) && + ($HAS_PACKAGE_VERSION_WARN != 1 || $NUM_CRAN_WARNING != 1 || $NUM_CRAN_ERROR != 0 || $NUM_CRAN_NOTES -gt 2) ]]; then cat $CRAN_CHECK_LOG_FILE echo -en "\033[31m" # Red echo "Had CRAN check errors; see logs." From 67cae39225055cb84af53286dcdbc119ec22acd0 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Wed, 13 May 2026 05:33:02 +0000 Subject: [PATCH 09/10] Fix to pass dev/lint-python --- dev/infra/Dockerfile | 2 +- python/pyspark/ml/tests/typing/test_feature.yml | 8 ++++---- .../pyspark/sql/tests/typing/test_functions.yml | 16 ++++++++-------- python/pyspark/sql/utils.py | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index d9c04eba9b266..561c6a31fefcf 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -64,7 +64,7 @@ RUN printf 'beniget==0.4.1\npyproject-metadata==0.8.1\n' > /tmp/pypy-constraints PIP_CONSTRAINT=/tmp/pypy-constraints.txt pypy3 -m pip install numpy scipy coverage matplotlib && \ SETUPTOOLS_USE_DISTUTILS=stdlib pypy3 -m pip install 'pandas<=2.0.3' && \ rm /tmp/pypy-constraints.txt -RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting 'plotly<6.0' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' +RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting 'plotly<6.0' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'Flask==1.1.2' # Add Python deps for Spark Connect. RUN python3.9 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' diff --git a/python/pyspark/ml/tests/typing/test_feature.yml b/python/pyspark/ml/tests/typing/test_feature.yml index 0d1034a44df66..9c9242cf3cd48 100644 --- a/python/pyspark/ml/tests/typing/test_feature.yml +++ b/python/pyspark/ml/tests/typing/test_feature.yml @@ -47,9 +47,9 @@ out: | main:14: error: No overload variant of "StringIndexer" matches argument types "str", "List[str]" [call-overload] main:14: note: Possible overload variants: - main:14: note: def StringIndexer(self, *, inputCol: Optional[str] = ..., outputCol: Optional[str] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer - main:14: note: def StringIndexer(self, *, inputCols: Optional[List[str]] = ..., outputCols: Optional[List[str]] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer + main:14: note: def __init__(self, *, inputCol: Optional[str] = ..., outputCol: Optional[str] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer + main:14: note: def __init__(self, *, inputCols: Optional[List[str]] = ..., outputCols: Optional[List[str]] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer main:15: error: No overload variant of "StringIndexer" matches argument types "List[str]", "str" [call-overload] main:15: note: Possible overload variants: - main:15: note: def StringIndexer(self, *, inputCol: Optional[str] = ..., outputCol: Optional[str] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer - main:15: note: def StringIndexer(self, *, inputCols: Optional[List[str]] = ..., outputCols: Optional[List[str]] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer + main:15: note: def __init__(self, *, inputCol: Optional[str] = ..., outputCol: Optional[str] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer + main:15: note: def __init__(self, *, inputCols: Optional[List[str]] = ..., outputCols: Optional[List[str]] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer diff --git a/python/pyspark/sql/tests/typing/test_functions.yml b/python/pyspark/sql/tests/typing/test_functions.yml index 6c80420bf0a3b..c540f508b39dd 100644 --- a/python/pyspark/sql/tests/typing/test_functions.yml +++ b/python/pyspark/sql/tests/typing/test_functions.yml @@ -70,32 +70,32 @@ main:29: error: No overload variant of "array" matches argument types "List[Column]", "List[Column]" [call-overload] main:29: note: Possible overload variants: main:29: note: def array(*cols: Union[Column, str]) -> Column - main:29: note: def [ColumnOrName_] array(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:29: note: def [ColumnOrName_] array(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:30: error: No overload variant of "create_map" matches argument types "List[Column]", "List[Column]" [call-overload] main:30: note: Possible overload variants: main:30: note: def create_map(*cols: Union[Column, str]) -> Column - main:30: note: def [ColumnOrName_] create_map(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:30: note: def [ColumnOrName_] create_map(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:31: error: No overload variant of "map_concat" matches argument types "List[Column]", "List[Column]" [call-overload] main:31: note: Possible overload variants: main:31: note: def map_concat(*cols: Union[Column, str]) -> Column - main:31: note: def [ColumnOrName_] map_concat(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:31: note: def [ColumnOrName_] map_concat(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:32: error: No overload variant of "struct" matches argument types "List[str]", "List[str]" [call-overload] main:32: note: Possible overload variants: main:32: note: def struct(*cols: Union[Column, str]) -> Column - main:32: note: def [ColumnOrName_] struct(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:32: note: def [ColumnOrName_] struct(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:33: error: No overload variant of "array" matches argument types "List[str]", "List[str]" [call-overload] main:33: note: Possible overload variants: main:33: note: def array(*cols: Union[Column, str]) -> Column - main:33: note: def [ColumnOrName_] array(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:33: note: def [ColumnOrName_] array(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:34: error: No overload variant of "create_map" matches argument types "List[str]", "List[str]" [call-overload] main:34: note: Possible overload variants: main:34: note: def create_map(*cols: Union[Column, str]) -> Column - main:34: note: def [ColumnOrName_] create_map(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:34: note: def [ColumnOrName_] create_map(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:35: error: No overload variant of "map_concat" matches argument types "List[str]", "List[str]" [call-overload] main:35: note: Possible overload variants: main:35: note: def map_concat(*cols: Union[Column, str]) -> Column - main:35: note: def [ColumnOrName_] map_concat(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:35: note: def [ColumnOrName_] map_concat(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column main:36: error: No overload variant of "struct" matches argument types "List[str]", "List[str]" [call-overload] main:36: note: Possible overload variants: main:36: note: def struct(*cols: Union[Column, str]) -> Column - main:36: note: def [ColumnOrName_] struct(Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column + main:36: note: def [ColumnOrName_] struct(__cols, Union[List[ColumnOrName_], Tuple[ColumnOrName_, ...]]) -> Column diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 02b67d96f1701..3915ab83bc4e1 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -50,7 +50,7 @@ from pyspark.sql.window import Window from pyspark.pandas._typing import IndexOpsLike, SeriesOrIndex -has_numpy = False +has_numpy: bool = False try: import numpy as np # noqa: F401 From 458389ddc4fefceaff841683d1c16adce4512d40 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Wed, 13 May 2026 12:12:56 +0000 Subject: [PATCH 10/10] fix(infra): Pin Werkzeug and ragg for CI compatibility Pin Werkzeug==2.1.2 in Dockerfile to maintain compatibility with markupsafe==2.0.1 used in the workflow lint step. Pin ragg==1.2.5 in the workflow before pkgdown installation because ragg 1.5.x requires libwebp which is not available in the Docker image, and its configure script fails to find freetype2 headers. --- .github/workflows/build_and_test.yml | 1 + dev/infra/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0ef8eefba4a4c..7420a79e12204 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -709,6 +709,7 @@ jobs: apt-get update -y apt-get install -y ruby ruby-dev Rscript -e "install.packages(c('remotes', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')" + Rscript -e "remotes::install_version('ragg', version='1.2.5', repos='https://cloud.r-project.org')" Rscript -e "remotes::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" Rscript -e "remotes::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" gem install bundler -v 2.4.22 diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 561c6a31fefcf..b2d5246260240 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -64,7 +64,7 @@ RUN printf 'beniget==0.4.1\npyproject-metadata==0.8.1\n' > /tmp/pypy-constraints PIP_CONSTRAINT=/tmp/pypy-constraints.txt pypy3 -m pip install numpy scipy coverage matplotlib && \ SETUPTOOLS_USE_DISTUTILS=stdlib pypy3 -m pip install 'pandas<=2.0.3' && \ rm /tmp/pypy-constraints.txt -RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting 'plotly<6.0' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'Flask==1.1.2' +RUN python3.9 -m pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' scipy unittest-xml-reporting 'plotly<6.0' 'mlflow>=2.3.1' coverage 'matplotlib==3.7.2' openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*' 'Flask==1.1.2' 'Werkzeug==2.1.2' # Add Python deps for Spark Connect. RUN python3.9 -m pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4'