diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py
index 23b2cbeb..11e740d0 100644
--- a/xrspatial/geotiff/__init__.py
+++ b/xrspatial/geotiff/__init__.py
@@ -13,6 +13,8 @@
 """
 from __future__ import annotations
 
+import warnings
+
 import numpy as np
 import xarray as xr
 
@@ -1399,7 +1401,8 @@ def read_geotiff_gpu(source: str, *,
                      overview_level: int | None = None,
                      name: str | None = None,
                      chunks: int | tuple | None = None,
-                     max_pixels: int | None = None) -> xr.DataArray:
+                     max_pixels: int | None = None,
+                     gpu: str = 'auto') -> xr.DataArray:
     """Read a GeoTIFF with GPU-accelerated decompression via Numba CUDA.
 
     Decompresses all tiles in parallel on the GPU and returns a
@@ -1425,12 +1428,38 @@ def read_geotiff_gpu(source: str, *,
     max_pixels : int or None
         Maximum allowed pixel count (width * height * samples).
         None uses the default (~1 billion).
+    gpu : {'auto', 'strict'}, default 'auto'
+        Behaviour when any GPU decode stage raises an exception.
+
+        The GPU pipeline has two stages: first ``gpu_decode_tiles_from_file``
+        (GDS-style direct read), then ``gpu_decode_tiles`` over CPU-mmap
+        extracted tile bytes. Both stages still run on the GPU. The CPU
+        fallback (``read_to_array`` + ``cupy.asarray``) only fires after
+        both GPU stages have failed.
+
+        - ``'auto'``: each GPU-stage failure emits a ``RuntimeWarning``
+          naming the failed stage, exception type and message, then falls
+          through to the next stage (CPU mmap re-decode for the first
+          failure, full CPU decode + GPU transfer for the second). This
+          preserves backward-compatible behaviour while making GPU
+          regressions visible.
+        - ``'strict'``: re-raise the original exception from either stage
+          so GPU bugs surface immediately. Useful in tests and CI for the
+          GPU fast path.
+
+        Stripped layouts and sparse-tile files route directly to the CPU
+        reader before either GPU decode stage runs, so the ``gpu`` kwarg
+        does not affect them. A failure inside the subsequent
+        ``cupy.asarray(...)`` upload propagates unchanged in both modes.
 
     Returns
     -------
     xr.DataArray
         CuPy-backed DataArray on GPU device.
     """
+    if gpu not in ('auto', 'strict'):
+        raise ValueError(
+            f"gpu must be 'auto' or 'strict', got {gpu!r}")
     try:
         import cupy
     except ImportError:
@@ -1537,8 +1566,16 @@ def read_geotiff_gpu(source: str, *,
             compression, predictor, file_dtype, samples,
             byte_order=header.byte_order,
         )
-    except Exception:
-        pass
+    except Exception as e:
+        if gpu == 'strict':
+            raise
+        warnings.warn(
+            f"read_geotiff_gpu: GPU decode failed in the direct-read stage "
+            f"({type(e).__name__}: {e}); retrying GPU decode on mmap tiles.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+        arr_gpu = None
 
     if arr_gpu is None:
         # Fallback: extract tiles via CPU mmap, then GPU decode
@@ -1560,8 +1597,15 @@ def read_geotiff_gpu(source: str, *,
                 compression, predictor, file_dtype, samples,
                 byte_order=header.byte_order,
             )
-        except (ValueError, Exception):
-            # Unsupported compression -- fall back to CPU then transfer
+        except Exception as e:
+            if gpu == 'strict':
+                raise
+            warnings.warn(
+                f"read_geotiff_gpu: GPU decode failed in the mmap-tile stage "
+                f"({type(e).__name__}: {e}); falling back to CPU.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
             arr_cpu, _ = read_to_array(source, overview_level=overview_level)
             arr_gpu = cupy.asarray(arr_cpu)
 
diff --git a/xrspatial/geotiff/tests/test_gpu_strict_fallback_1516.py b/xrspatial/geotiff/tests/test_gpu_strict_fallback_1516.py
new file mode 100644
index 00000000..6b06c4a2
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_gpu_strict_fallback_1516.py
@@ -0,0 +1,267 @@
+"""Regression tests for issue #1516.
+
+``read_geotiff_gpu`` previously wrapped the GPU decode in a too-broad
+``try/except Exception: pass`` that silently swallowed any failure and
+fell through to the CPU path. Real GPU regressions (#1508 was an
+``AttributeError``) lived undetected because the user-visible result
+was still numerically correct.
+
+The fix:
+
+1. Default ``gpu='auto'`` still falls back to CPU, but emits a
+   ``RuntimeWarning`` reporting the original exception type and
+   message so failures are visible.
+2. New ``gpu='strict'`` mode re-raises instead of falling back, so
+   tests and CI for the GPU fast path see real errors.
+
+These tests monkeypatch ``gpu_decode_tiles_from_file`` to raise a
+synthetic exception. They do not require a real GPU because we stub
+``cupy`` at the ``sys.modules`` level when it is not already
+available; ``cupy.asarray`` is only called in the CPU-fallback branch
+and is satisfied by a thin numpy-backed shim.
+"""
+from __future__ import annotations
+
+import importlib
+import sys
+import types
+
+import numpy as np
+import pytest
+
+from .conftest import make_minimal_tiff
+
+
+_CUPY_ORIG_SENTINEL = object()
+_cupy_saved = _CUPY_ORIG_SENTINEL
+_cupy_cuda_saved = _CUPY_ORIG_SENTINEL
+
+
+def _cuda_actually_available() -> bool:
+    """Return True only if cupy + CUDA are usable on this host.
+
+    cupy may be importable on a machine without a working CUDA runtime
+    (no driver, no device, ROCm-only, etc.). The CPU-fallback branch in
+    ``read_geotiff_gpu`` calls ``cupy.asarray`` which would then fail at
+    allocation time. Treat that case the same as cupy-not-installed.
+    """
+    try:
+        import cupy
+    except ImportError:
+        return False
+    try:
+        return bool(cupy.cuda.is_available())
+    except Exception:
+        return False
+
+
+def _ensure_cupy_stub() -> bool:
+    """Install a numpy-backed ``cupy`` shim if real cupy isn't usable.
+
+    Replaces ``sys.modules['cupy']`` whenever cupy is missing OR cupy is
+    installed but CUDA isn't available. The original module (if any) is
+    saved so :func:`_restore_cupy` can put it back.
+    """
+    global _cupy_saved, _cupy_cuda_saved
+
+    if _cuda_actually_available():
+        return False
+
+    _cupy_saved = sys.modules.get('cupy', _CUPY_ORIG_SENTINEL)
+    _cupy_cuda_saved = sys.modules.get('cupy.cuda', _CUPY_ORIG_SENTINEL)
+
+    stub = types.ModuleType('cupy')
+    stub.ndarray = np.ndarray
+    stub.asarray = np.asarray
+
+    cuda_mod = types.ModuleType('cupy.cuda')
+    cuda_mod.is_available = lambda: False
+    stub.cuda = cuda_mod
+
+    sys.modules['cupy'] = stub
+    sys.modules['cupy.cuda'] = cuda_mod
+    return True
+
+
+def _restore_cupy() -> None:
+    """Undo :func:`_ensure_cupy_stub`."""
+    global _cupy_saved, _cupy_cuda_saved
+    for name, saved in (
+        ('cupy', _cupy_saved),
+        ('cupy.cuda', _cupy_cuda_saved),
+    ):
+        if saved is _CUPY_ORIG_SENTINEL:
+            sys.modules.pop(name, None)
+        else:
+            sys.modules[name] = saved
+    _cupy_saved = _CUPY_ORIG_SENTINEL
+    _cupy_cuda_saved = _CUPY_ORIG_SENTINEL
+    importlib.invalidate_caches()
+
+
+@pytest.fixture
+def tiled_tiff_path(tmp_path):
+    """A small tiled TIFF on disk that exercises the GPU tile path."""
+    data = np.arange(64, dtype=np.float32).reshape(8, 8)
+    raw = make_minimal_tiff(
+        8, 8, np.dtype('float32'),
+        pixel_data=data,
+        tiled=True,
+        tile_size=4,
+    )
+    path = tmp_path / "strict_fallback_1516.tif"
+    path.write_bytes(raw)
+    return str(path), data
+
+
+def _patch_gpu_decode_to_raise(monkeypatch, exc):
+    """Replace ``gpu_decode_tiles_from_file`` with one that raises ``exc``."""
+    from xrspatial.geotiff import _gpu_decode
+
+    def _boom(*args, **kwargs):
+        raise exc
+
+    monkeypatch.setattr(
+        _gpu_decode, 'gpu_decode_tiles_from_file', _boom, raising=True,
+    )
+
+
+def _patch_both_gpu_stages_to_raise(monkeypatch, exc):
+    """Make both GPU decode stages raise ``exc`` to exercise the second handler."""
+    from xrspatial.geotiff import _gpu_decode
+
+    def _boom(*args, **kwargs):
+        raise exc
+
+    monkeypatch.setattr(
+        _gpu_decode, 'gpu_decode_tiles_from_file', _boom, raising=True,
+    )
+    monkeypatch.setattr(
+        _gpu_decode, 'gpu_decode_tiles', _boom, raising=True,
+    )
+
+
+def test_default_mode_warns_on_gpu_failure(tiled_tiff_path, monkeypatch):
+    """Default ``gpu='auto'`` warns and falls back to the CPU result."""
+    inserted_stub = _ensure_cupy_stub()
+    try:
+        from xrspatial.geotiff import read_geotiff_gpu
+
+        path, expected = tiled_tiff_path
+
+        synthetic = RuntimeError("simulated GPU failure")
+        _patch_gpu_decode_to_raise(monkeypatch, synthetic)
+
+        with pytest.warns(RuntimeWarning, match="GPU decode failed"):
+            result = read_geotiff_gpu(path)
+
+        # Fallback returned the CPU-decoded data. Real cupy arrays expose
+        # ``.get()`` to copy back to host; the numpy stub returns a
+        # plain ndarray.
+        out = result.data
+        if hasattr(out, 'get'):
+            out = out.get()
+        np.testing.assert_array_equal(np.asarray(out), expected)
+    finally:
+        if inserted_stub:
+            _restore_cupy()
+
+
+def test_strict_mode_reraises(tiled_tiff_path, monkeypatch):
+    """``gpu='strict'`` re-raises the original GPU exception."""
+    inserted_stub = _ensure_cupy_stub()
+    try:
+        from xrspatial.geotiff import read_geotiff_gpu
+
+        path, _ = tiled_tiff_path
+
+        synthetic = RuntimeError("simulated GPU failure")
+        _patch_gpu_decode_to_raise(monkeypatch, synthetic)
+
+        with pytest.raises(RuntimeError, match="simulated GPU failure"):
+            read_geotiff_gpu(path, gpu='strict')
+    finally:
+        if inserted_stub:
+            _restore_cupy()
+
+
+def test_strict_mode_reraises_second_stage(tiled_tiff_path, monkeypatch):
+    """``gpu='strict'`` re-raises if the second-stage GPU decode fails too.
+
+    Regression for the case where ``gpu_decode_tiles_from_file`` and the
+    follow-up ``gpu_decode_tiles`` both fail. Previously the second
+    failure was caught by an unconditional ``except (ValueError, Exception)``
+    that fell back to CPU regardless of mode.
+    """
+    inserted_stub = _ensure_cupy_stub()
+    try:
+        from xrspatial.geotiff import read_geotiff_gpu
+
+        path, _ = tiled_tiff_path
+
+        synthetic = RuntimeError("simulated second-stage GPU failure")
+        _patch_both_gpu_stages_to_raise(monkeypatch, synthetic)
+
+        with pytest.raises(RuntimeError,
+                           match="simulated second-stage GPU failure"):
+            read_geotiff_gpu(path, gpu='strict')
+    finally:
+        if inserted_stub:
+            _restore_cupy()
+
+
+def test_default_mode_warns_on_second_stage_failure(tiled_tiff_path, monkeypatch):
+    """``gpu='auto'`` warns once per stage failure and falls back to CPU.
+
+    Both GPU decode stages are forced to raise, so the user sees two
+    distinct ``RuntimeWarning`` records (one per stage) before the CPU
+    fallback fires. Asserting the exact count guards against a
+    regression where one of the two handlers stops warning.
+    """
+    import warnings as _warnings
+
+    inserted_stub = _ensure_cupy_stub()
+    try:
+        from xrspatial.geotiff import read_geotiff_gpu
+
+        path, expected = tiled_tiff_path
+
+        synthetic = RuntimeError("simulated second-stage GPU failure")
+        _patch_both_gpu_stages_to_raise(monkeypatch, synthetic)
+
+        with _warnings.catch_warnings(record=True) as records:
+            _warnings.simplefilter("always")
+            result = read_geotiff_gpu(path)
+
+        gpu_warnings = [
+            w for w in records
+            if issubclass(w.category, RuntimeWarning)
+            and "GPU decode failed" in str(w.message)
+        ]
+        assert len(gpu_warnings) == 2, (
+            f"expected one warning per GPU stage; got {len(gpu_warnings)}: "
+            f"{[str(w.message) for w in gpu_warnings]}"
+        )
+
+        out = result.data
+        if hasattr(out, 'get'):
+            out = out.get()
+        np.testing.assert_array_equal(np.asarray(out), expected)
+    finally:
+        if inserted_stub:
+            _restore_cupy()
+
+
+def test_invalid_gpu_kwarg_rejected(tiled_tiff_path):
+    """An unknown ``gpu=`` value raises ``ValueError`` with a clear message."""
+    inserted_stub = _ensure_cupy_stub()
+    try:
+        from xrspatial.geotiff import read_geotiff_gpu
+
+        path, _ = tiled_tiff_path
+
+        with pytest.raises(ValueError, match="gpu must be 'auto' or 'strict'"):
+            read_geotiff_gpu(path, gpu='loose')
+    finally:
+        if inserted_stub:
+            _restore_cupy()