diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c4c4f04188d..fb46b2eda09 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -337,6 +337,7 @@ repos:
           ?^ci/scripts/python_sdist_build\.sh$|
           ?^ci/scripts/python_sdist_test\.sh$|
+          ?^ci/scripts/python_test_type_annotations\.sh$|
           ?^ci/scripts/python_wheel_unix_test\.sh$|
           ?^ci/scripts/r_build\.sh$|
           ?^ci/scripts/r_revdepcheck\.sh$|
           ?^ci/scripts/release_test\.sh$|
@@ -379,6 +380,7 @@ repos:
         # TODO: Remove this when we fix all lint failures
         files: >-
           (
+          ?^ci/scripts/python_test_type_annotations\.sh$|
           ?^dev/release/05-binary-upload\.sh$|
           ?^dev/release/binary-recover\.sh$|
           ?^dev/release/post-03-binary\.sh$|
diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh
new file mode 100755
index 00000000000..d3eed879f91
--- /dev/null
+++ b/ci/scripts/python_test_type_annotations.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+pyarrow_dir=${1}
+
+if [ "${PYARROW_TEST_ANNOTATIONS:-}" == "ON" ]; then
+  if [ -n "${ARROW_PYTHON_VENV:-}" ]; then
+    # shellcheck source=/dev/null
+    . "${ARROW_PYTHON_VENV}/bin/activate"
+  fi
+
+  # Install library stubs. Note some libraries contain their own type hints so they need to be installed.
+  pip install fsspec pandas-stubs scipy-stubs types-cffi types-psutil types-requests types-python-dateutil
+
+  # Install type checkers
+  pip install mypy pyright ty
+
+  # Run type checkers
+  pushd "${pyarrow_dir}"
+  mypy
+  pyright
+  ty check
+  popd
+else
+  echo "Skipping type annotation tests"
+fi
diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh
index bd61154430e..3ad03d94bee 100755
--- a/ci/scripts/python_wheel_macos_build.sh
+++ b/ci/scripts/python_wheel_macos_build.sh
@@ -177,6 +177,8 @@ export CMAKE_PREFIX_PATH=${build_dir}/install
 export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION}
 
 pushd ${source_dir}/python
+# Install libcst for build-time stub docstring extraction
+python -m pip install libcst
 python setup.py bdist_wheel
 popd
 
diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py
index 84fcaba42e6..7d41b1b7385 100644
--- a/ci/scripts/python_wheel_validate_contents.py
+++ b/ci/scripts/python_wheel_validate_contents.py
@@ -34,7 +34,8 @@ def validate_wheel(path):
     ]
     assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}"
     print(f"The wheel: {wheels[0]} seems valid.")
+    # TODO(GH-32609): Validate some docstrings were generated and added.
 
def main(): parser = argparse.ArgumentParser() diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index b4b7fed99fd..957bd348042 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -132,6 +132,9 @@ set CMAKE_PREFIX_PATH=C:\arrow-dist pushd C:\arrow\python +@REM Install libcst for build-time stub docstring extraction +%PYTHON_CMD% -m pip install libcst + @REM Build wheel %PYTHON_CMD% setup.py bdist_wheel || exit /B 1 diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh index a3fbeb3c0b3..b5052a8e86c 100755 --- a/ci/scripts/python_wheel_xlinux_build.sh +++ b/ci/scripts/python_wheel_xlinux_build.sh @@ -167,6 +167,8 @@ export ARROW_HOME=/tmp/arrow-dist export CMAKE_PREFIX_PATH=/tmp/arrow-dist pushd /arrow/python +# Install libcst for build-time stub docstring extraction +python -m pip install libcst python setup.py bdist_wheel echo "=== Strip symbols from wheel ===" diff --git a/compose.yaml b/compose.yaml index 31bc5c81b95..ecfd1e403ad 100644 --- a/compose.yaml +++ b/compose.yaml @@ -1531,13 +1531,15 @@ services: BUILD_DOCS_CPP: "ON" BUILD_DOCS_PYTHON: "ON" PYTEST_ARGS: "--doctest-modules --doctest-cython" + PYARROW_TEST_ANNOTATIONS: "ON" volumes: *conda-volumes command: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 && - /arrow/ci/scripts/python_test.sh /arrow"] + /arrow/ci/scripts/python_test.sh /arrow && + /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] conda-python-dask: # Possible $DASK parameters: diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py new file mode 100644 index 00000000000..5ca42a06a70 --- /dev/null +++ b/dev/update_stub_docstrings.py @@ -0,0 +1,233 @@ +# Licensed to the Apache Software Foundation 
(ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Extract docstrings from pyarrow runtime and insert them into stub files. + +Usage (from python/ directory with pyarrow built): + python ../dev/update_stub_docstrings.py pyarrow-stubs +""" + +import argparse +import importlib +import inspect +import shutil +import sys +from pathlib import Path +from textwrap import indent + +import libcst +from libcst import matchers as m + + +def _resolve_object(module, path): + """Resolve an object by dotted path from a module.""" + if not path: + return module, None, module.__name__ + + parts = path.split(".") + parent = None + obj = module + + for part in parts: + parent = obj + try: + obj = getattr(obj, part) + except AttributeError: + try: + obj = vars(parent).get(part) + if obj is not None: + continue + except TypeError: + pass + return None, None, None + + return obj, parent, getattr(obj, "__name__", parts[-1]) + + +def _get_docstring(name, module, indentation): + """Extract and format a docstring for insertion into a stub file.""" + obj, parent, obj_name = _resolve_object(module, name) + if obj is None: + print(f"{name} not found in {module.__name__}") + return None + + docstring = inspect.getdoc(obj) + if not docstring: + return None + + # Remove signature prefix + parent_name = 
getattr(parent, "__name__", None) if parent else None + if docstring.startswith(obj_name) or ( + parent_name and docstring.startswith(f"{parent_name}.{obj_name}") + ): + docstring = "\n".join(docstring.splitlines()[2:]) + + # Skip empty docstrings + if not docstring.strip(): + return None + + prefix = " " * indentation + return '"""\n' + indent(docstring + '\n"""', prefix) + + +class DocstringInserter(libcst.CSTTransformer): + """CST transformer that inserts docstrings into stub file nodes.""" + + def __init__(self, module, namespace): + self.module = module + self.base_namespace = namespace + self.stack = [] + self.indentation = 0 + + def _full_name(self): + name = ".".join(self.stack) + return f"{self.base_namespace}.{name}" if self.base_namespace else name + + def leave_Module(self, original_node, updated_node): + new_body = [] + clone_matcher = m.SimpleStatementLine( + body=[m.Assign(value=m.Call(func=m.Name(value="_clone_signature"))), + m.ZeroOrMore()] + ) + for stmt in updated_node.body: + new_body.append(stmt) + if m.matches(stmt, clone_matcher): + name = stmt.body[0].targets[0].target.value + if self.base_namespace: + name = f"{self.base_namespace}.{name}" + docstring = _get_docstring(name, self.module, 0) + if docstring: + new_body.append(libcst.SimpleStatementLine( + body=[libcst.Expr(value=libcst.SimpleString(docstring))])) + return updated_node.with_changes(body=new_body) + + def visit_ClassDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_ClassDef(self, original_node, updated_node): + name = self._full_name() + docstring = _get_docstring(name, self.module, self.indentation) + + if docstring: + ellipsis_class = m.ClassDef(body=m.IndentedBlock(body=[ + m.SimpleStatementLine(body=[ + m.Expr(m.Ellipsis()), m.ZeroOrMore()]), m.ZeroOrMore()])) + func_class = m.ClassDef(body=m.IndentedBlock( + body=[m.FunctionDef(), m.ZeroOrMore()])) + + if m.matches(updated_node, ellipsis_class): + updated_node = 
updated_node.deep_replace( + updated_node.body.body[0].body[0].value, + libcst.SimpleString(value=docstring)) + elif m.matches(updated_node, func_class): + docstring_stmt = libcst.SimpleStatementLine( + body=[libcst.Expr(value=libcst.SimpleString(value=docstring))]) + updated_node = updated_node.with_changes( + body=updated_node.body.with_changes( + body=[docstring_stmt] + list(updated_node.body.body))) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + def visit_FunctionDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_FunctionDef(self, original_node, updated_node): + name = self._full_name() + ellipsis_func = m.FunctionDef( + body=m.SimpleStatementSuite(body=[m.Expr(m.Ellipsis())])) + + if m.matches(original_node, ellipsis_func): + docstring = _get_docstring(name, self.module, self.indentation) + if docstring: + docstring_stmt = libcst.SimpleStatementLine( + body=[libcst.Expr(value=libcst.SimpleString(value=docstring))]) + updated_node = updated_node.with_changes( + body=libcst.IndentedBlock(body=[docstring_stmt])) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + +LIB_MODULES = {"array", "builder", "compat", "config", "device", "error", "io", + "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor", "_types"} + + +def add_docstrings_to_stubs(stubs_dir): + """Update all stub files in stubs_dir with docstrings from pyarrow runtime.""" + stubs_dir = Path(stubs_dir) + print(f"Updating stub docstrings in: {stubs_dir}") + + pyarrow = importlib.import_module("pyarrow") + + for stub_file in stubs_dir.rglob('*.pyi'): + if stub_file.name == "_stubs_typing.pyi": + continue + + module_name = stub_file.stem + if module_name in LIB_MODULES: + namespace = "lib" + elif stub_file.parent.name in ("parquet", "interchange"): + namespace = f"{stub_file.parent.name}.{module_name}" + elif module_name == "__init__": + namespace = "" + else: + namespace = module_name + + print(f" {stub_file.name} -> 
{namespace or '(root)'}") + tree = libcst.parse_module(stub_file.read_text()) + modified = tree.visit(DocstringInserter(pyarrow, namespace)) + stub_file.write_text(modified.code) + + +def copy_stubs(src_dir, dest_dir): + """Copy .pyi files from src_dir to dest_dir.""" + src_dir, dest_dir = Path(src_dir), Path(dest_dir) + if not src_dir.exists(): + return + + print(f"Copying stubs: {src_dir} -> {dest_dir}") + for src in src_dir.rglob('*.pyi'): + dest = dest_dir / src.relative_to(src_dir) + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dest) + + +def update_stubs_for_build(stubs_dir, build_lib): + """Entry point for setup.py: update docstrings and copy stubs to build dir.""" + stubs_dir, build_lib = Path(stubs_dir), Path(build_lib) + + sys.path.insert(0, str(build_lib)) + try: + add_docstrings_to_stubs(stubs_dir) + copy_stubs(stubs_dir / "pyarrow", build_lib / "pyarrow") + finally: + sys.path.pop(0) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("stubs_dir", type=Path, help="Path to pyarrow-stubs folder") + args = parser.parse_args() + + sys.path.insert(0, ".") + add_docstrings_to_stubs(args.stubs_dir.resolve()) diff --git a/docs/source/developers/python/development.rst b/docs/source/developers/python/development.rst index d03b2439b10..596715b9217 100644 --- a/docs/source/developers/python/development.rst +++ b/docs/source/developers/python/development.rst @@ -101,6 +101,74 @@ The test groups currently include: * ``s3``: Tests for Amazon S3 * ``tensorflow``: Tests that involve TensorFlow +Type Checking +============= + +PyArrow provides type stubs (``*.pyi`` files) for static type checking. These +stubs are located in the ``pyarrow-stubs/`` directory and are automatically +included in the distributed wheel packages. + +Running Type Checkers +--------------------- + +We support multiple type checkers. Their configurations are in +``pyproject.toml``. 
+
+**mypy**
+
+To run mypy on the PyArrow type stubs:
+
+.. code-block::
+
+   $ cd arrow/python
+   $ mypy
+
+The mypy configuration is in the ``[tool.mypy]`` section of ``pyproject.toml``.
+
+**pyright**
+
+To run pyright:
+
+.. code-block::
+
+   $ cd arrow/python
+   $ pyright
+
+The pyright configuration is in the ``[tool.pyright]`` section of ``pyproject.toml``.
+
+**ty**
+
+To run ty (note: currently only partially configured):
+
+.. code-block::
+
+   $ cd arrow/python
+   $ ty check
+
+Maintaining Type Stubs
+----------------------
+
+Type stubs for PyArrow are maintained in the ``pyarrow-stubs/``
+directory. These stubs mirror the structure of the main ``pyarrow/`` package.
+
+When adding or modifying public APIs:
+
+1. **Update the corresponding stub file** (``.pyi``) in ``pyarrow-stubs/``
+   to reflect the new or changed function/class signatures.
+
+2. **Include type annotations** where possible. For Cython modules or
+   dynamically generated APIs such as compute kernels, add the corresponding
+   stub in ``pyarrow-stubs/``.
+
+3. **Run type checkers** to ensure the stubs are correct and complete.
+
+The stub files are automatically copied into the built wheel during the build
+process and will be included when users install PyArrow, enabling type checking
+in downstream projects and for users' IDEs.
+
+The ``py.typed`` marker file in the ``pyarrow/`` directory indicates to type
+checkers that PyArrow supports type checking according to :pep:`561`.
+
 Doctest
 =======
 
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
index ed7012e4b70..2840ba74128 100644
--- a/python/MANIFEST.in
+++ b/python/MANIFEST.in
@@ -4,6 +4,7 @@ include ../NOTICE.txt
 
 global-include CMakeLists.txt
 graft pyarrow
+graft pyarrow-stubs
 graft cmake_modules
 
 global-exclude *.so
diff --git a/python/pyarrow-stubs/pyarrow/__init__.pyi b/python/pyarrow-stubs/pyarrow/__init__.pyi
new file mode 100644
index 00000000000..ccec8d5abc0
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/__init__.pyi
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Type stubs for PyArrow.
+
+This is a placeholder stub file.
+Complete type annotations will be added in subsequent PRs.
+"""
+
+from typing import Any
+
+# TODO(GH-48970): remove this module-level __getattr__ before release;
+# it makes type checkers treat every missing attribute as Any.
+# https://github.com/apache/arrow/issues/48970
+def __getattr__(name: str) -> Any: ...
diff --git a/python/pyarrow/py.typed b/python/pyarrow/py.typed
new file mode 100644
index 00000000000..13a83393a91
--- /dev/null
+++ b/python/pyarrow/py.typed
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 899144d418d..2816b69d803 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -18,6 +18,8 @@
 [build-system]
 requires = [
     "cython >= 3.1",
+    # Needed for build-time stub docstring extraction
+    "libcst>=1.8.6",
     "numpy>=1.25",
     # configuring setuptools_scm in pyproject.toml requires
     # versions released after 2022
@@ -84,11 +86,11 @@ zip-safe=false
 include-package-data=true
 
 [tool.setuptools.packages.find]
-include = ["pyarrow"]
+include = ["pyarrow", "pyarrow.*"]
 namespaces = false
 
 [tool.setuptools.package-data]
-pyarrow = ["*.pxd", "*.pyx", "includes/*.pxd"]
+pyarrow = ["*.pxd", "*.pyi", "*.pyx", "includes/*.pxd", "py.typed"]
 
 [tool.setuptools_scm]
 root = '..'
@@ -96,3 +98,39 @@ version_file = 'pyarrow/_generated_version.py'
 version_scheme = 'guess-next-dev'
 git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"'
 fallback_version = '24.0.0a0'
+
+# TODO: Enable type checking once stubs are merged
+[tool.mypy]
+files = ["pyarrow-stubs"]
+mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs"
+exclude = [
+    "^pyarrow/",
+    "^benchmarks/",
+    "^examples/",
+    "^scripts/",
+]
+
+# TODO: Enable type checking once stubs are merged
+[tool.pyright]
+pythonPlatform = "All"
+pythonVersion = "3.10"
+include = ["pyarrow-stubs"]
+exclude = [
+    "pyarrow",
+    "benchmarks",
+    "examples",
+    "scripts",
+    "build",
+]
+stubPath = "pyarrow-stubs"
+typeCheckingMode = "basic"
+
+# TODO: Enable type checking once stubs are merged
+[tool.ty.src]
+include = ["pyarrow-stubs"]
+exclude = [
+    "pyarrow",
+    "benchmarks",
+    "examples",
+    "scripts",
+]
diff --git a/python/setup.py b/python/setup.py
index a27bd3baefd..7623bba5147 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -121,8 +121,30 @@ def build_extensions(self):
 
     def run(self):
         self._run_cmake()
+        self._update_stubs()
         _build_ext.run(self)
 
+    def _update_stubs(self):
+        """Update stub docstrings and copy to build directory."""
+        stubs_dir = pjoin(setup_dir, 'pyarrow-stubs')
+        if not os.path.exists(stubs_dir):
+            return
+
+        build_cmd = self.get_finalized_command('build')
+        build_lib = os.path.abspath(build_cmd.build_lib)
+
+        # Import here to avoid a hard dependency on the dev script. This is
+        # best-effort: any ImportError (missing script, missing libcst, or
+        # pyarrow not importable from build_lib) skips stubs; report why.
+        sys.path.insert(0, pjoin(setup_dir, '..', 'dev'))
+        try:
+            from update_stub_docstrings import update_stubs_for_build
+            update_stubs_for_build(stubs_dir, build_lib)
+        except ImportError as exc:
+            print(f"-- Skipping stubs ({exc})")
+        finally:
+            sys.path.pop(0)
+
     # adapted from cmake_build_ext in dynd-python
     # github.com/libdynd/dynd-python
 