diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 18688a3d..fffd1869 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -3225,6 +3225,19 @@ def read_vrt(source: str, *, dtype=None, Otherwise ``attrs['crs']`` stays unset and ``attrs['crs_wkt']`` carries the original WKT. The source GeoTransform is preserved as a rasterio-style 6-tuple in ``attrs['transform']``. + + Source-path containment (issue #1671): every ```` in + the VRT must resolve (after canonicalising ``..`` segments and + symlinks) to a path under the VRT's own directory. Absolute paths + pointing elsewhere are rejected with ``ValueError`` by default. + Operators that legitimately need to mosaic files from outside the + VRT directory can opt in by setting the + ``XRSPATIAL_VRT_ALLOWED_ROOTS`` environment variable to a + ``os.pathsep``-separated list of trusted directory roots; sources + resolving under any listed root are then accepted. A + ``relativeToVRT='1'`` source that escapes the VRT directory (e.g. + ``../../etc/passwd`` or a symlink to a file outside the directory) + is rejected regardless of the allowlist. """ from ._reader import _coerce_path from ._vrt import read_vrt as _read_vrt_internal diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index 419fe9f9..a9d35458 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -56,6 +56,44 @@ def _codec_decode_exceptions() -> tuple[type[BaseException], ...]: _CODEC_DECODE_EXCEPTIONS = _codec_decode_exceptions() +# Environment variable name used to opt in to absolute source paths +# outside the VRT directory. ``os.pathsep``-separated list of +# directory roots (``:`` on POSIX, ``;`` on Windows). Empty entries +# are ignored. See ``parse_vrt`` for the containment policy. +_ALLOWED_ROOTS_ENV = 'XRSPATIAL_VRT_ALLOWED_ROOTS' + + +def _allowed_source_roots() -> list[str]: + """Return the operator-supplied allowlist of trusted source roots. + + Returns the canonical ``realpath`` of each entry so the containment + check can compare against the resolved source path directly. Empty + entries (from stray ``os.pathsep`` separators) are dropped. + """ + raw = os.environ.get(_ALLOWED_ROOTS_ENV, '') + roots = [] + for entry in raw.split(os.pathsep): + entry = entry.strip() + if not entry: + continue + roots.append(os.path.realpath(entry)) + return roots + + +def _path_is_under(path: str, root: str) -> bool: + """Return True when ``path`` lives under ``root``. + + Both inputs must already be canonical (``os.path.realpath`` applied). + Uses ``os.path.commonpath`` for robustness against prefix-collisions + (``/foo`` vs ``/foobar``) which a naive ``startswith`` check misses. + """ + try: + return os.path.commonpath([path, root]) == root + except ValueError: + # Different drives on Windows, or one of the paths is empty. + return False + + def _xml_text(value) -> str: """Escape *value* for safe inclusion as XML element text. @@ -170,12 +208,26 @@ def parse_vrt(xml_str: str, vrt_dir: str = '.') -> VRTDataset: Returns ------- VRTDataset + + Raises + ------ + ValueError + When a ``SourceFilename`` resolves outside ``vrt_dir`` and + outside every entry in ``XRSPATIAL_VRT_ALLOWED_ROOTS``. See + :func:`read_vrt` for the full containment policy. """ # ``safe_fromstring`` refuses DOCTYPE declarations so a crafted VRT # cannot trigger XML entity expansion (billion-laughs) attacks # against the reader. See issue #1579. root = safe_fromstring(xml_str) + # Pre-compute the trusted roots once per parse. ``vrt_dir`` is + # always trusted; the allowlist from ``XRSPATIAL_VRT_ALLOWED_ROOTS`` + # is only consulted for absolute sources (``relativeToVRT='0'``). + # See issue #1671 for the path-traversal hardening. + vrt_root = os.path.realpath(vrt_dir) + allowed_roots = _allowed_source_roots() + width = int(root.get('rasterXSize', 0)) height = int(root.get('rasterYSize', 0)) @@ -225,9 +277,37 @@ def parse_vrt(xml_str: str, vrt_dir: str = '.') -> VRTDataset: relative.get('relativeToVRT', '0') == '1') if is_relative and not os.path.isabs(filename): filename = os.path.join(vrt_dir, filename) - # Canonicalize to prevent path traversal (e.g. ../) + # Canonicalize so ``..`` segments and symlinks both resolve + # to their target before the containment check below. filename = os.path.realpath(filename) + # Containment policy (issue #1671): + # + # * ``relativeToVRT='1'`` declares the source lives under the + # VRT directory. Honour that intent even when the allowlist + # would otherwise cover the resolved path -- a relative + # source that escapes ``vrt_dir`` is always an attack + # primitive ('..' segments, symlink swap, etc.). + # * Absolute sources are accepted when they resolve under + # ``vrt_dir`` (matches the writer's ``relative=False`` round + # trip) or under any explicit ``XRSPATIAL_VRT_ALLOWED_ROOTS`` + # entry. + # + # In every other case raise ``ValueError`` so a crafted VRT + # cannot read arbitrary files the process has access to. + if is_relative: + trusted = [vrt_root] + else: + trusted = [vrt_root, *allowed_roots] + if not any(_path_is_under(filename, r) for r in trusted): + raise ValueError( + f"VRT SourceFilename {filename!r} resolves outside " + f"the VRT directory ({vrt_root!r}) and is not " + f"covered by any {_ALLOWED_ROOTS_ENV} entry " + f"({allowed_roots!r}). Refusing to read; see issue " + f"#1671." + ) + src_band = int(_text(src_elem, 'SourceBand') or '1') src_rect_elem = src_elem.find('SrcRect') diff --git a/xrspatial/geotiff/tests/test_features.py b/xrspatial/geotiff/tests/test_features.py index 66c7c6c2..19863785 100644 --- a/xrspatial/geotiff/tests/test_features.py +++ b/xrspatial/geotiff/tests/test_features.py @@ -796,10 +796,14 @@ def test_read_vrt_function(self, tmp_path): assert da.name == 'mosaic' np.testing.assert_array_equal(da.values, arr) - def test_vrt_parser(self): + def test_vrt_parser(self, tmp_path): """VRT XML parser extracts all fields correctly.""" from xrspatial.geotiff._vrt import parse_vrt + # Use a path under tmp_path so the issue #1671 containment check + # accepts the source. The test exercises field-extraction, not + # the on-disk readability of the source file. + src_path = str(tmp_path / 'tile.tif') xml = ( '\n' ' EPSG:32610\n' @@ -807,7 +811,7 @@ def test_vrt_parser(self): ' \n' ' 0\n' ' \n' - ' /data/tile.tif\n' + f' {src_path}\n' ' 1\n' ' \n' ' \n' @@ -815,7 +819,7 @@ def test_vrt_parser(self): ' \n' '\n' ) - vrt = parse_vrt(xml) + vrt = parse_vrt(xml, str(tmp_path)) assert vrt.width == 100 assert vrt.height == 200 assert vrt.crs_wkt == 'EPSG:32610' @@ -825,7 +829,7 @@ def test_vrt_parser(self): assert vrt.bands[0].nodata == 0.0 assert len(vrt.bands[0].sources) == 1 src = vrt.bands[0].sources[0] - assert src.filename == os.path.realpath('/data/tile.tif') + assert src.filename == os.path.realpath(src_path) assert src.src_rect.x_off == 10 def test_vrt_float64_fractional_nodata_masked(self, tmp_path): diff --git a/xrspatial/geotiff/tests/test_security.py b/xrspatial/geotiff/tests/test_security.py index 67954537..a9379dfd 100644 --- a/xrspatial/geotiff/tests/test_security.py +++ b/xrspatial/geotiff/tests/test_security.py @@ -300,11 +300,18 @@ def test_open_geotiff_vrt_max_pixels(self, tmp_path): # --------------------------------------------------------------------------- # Cat 5: VRT path traversal +# +# Tightened in issue #1671: ``parse_vrt`` no longer accepts source paths +# that resolve outside the VRT directory (or any explicit allowlist +# entry). The realpath call by itself only normalised ``..`` segments; +# it did not enforce containment, so a crafted VRT could still hand +# ``read_to_array`` an arbitrary path. # --------------------------------------------------------------------------- class TestVRTPathTraversal: - def test_relative_path_canonicalized(self, tmp_path): - """Relative paths in VRT SourceFilename are canonicalized.""" + def test_relative_path_traversal_rejected(self, tmp_path): + """Relative paths in VRT SourceFilename that escape the VRT + directory are rejected, not silently canonicalised.""" from xrspatial.geotiff._vrt import parse_vrt vrt_xml = ''' @@ -321,15 +328,8 @@ def test_relative_path_canonicalized(self, tmp_path): vrt_dir = str(tmp_path / "subdir") os.makedirs(vrt_dir) - vrt = parse_vrt(vrt_xml, vrt_dir) - source_path = vrt.bands[0].sources[0].filename - - # After canonicalization, the path should NOT contain ".." - assert ".." not in source_path - # It should be an absolute path - assert os.path.isabs(source_path) - # Verify it was resolved through realpath - assert source_path == os.path.realpath(source_path) + with pytest.raises(ValueError, match="outside the VRT directory"): + parse_vrt(vrt_xml, vrt_dir) def test_normal_relative_path_still_works(self, tmp_path): """Normal relative paths without traversal still resolve correctly.""" @@ -353,8 +353,9 @@ def test_normal_relative_path_still_works(self, tmp_path): expected = os.path.realpath(os.path.join(vrt_dir, "data", "tile.tif")) assert source_path == expected - def test_absolute_path_also_canonicalized(self, tmp_path): - """Absolute paths in VRT are also canonicalized.""" + def test_absolute_path_outside_vrt_dir_rejected(self, tmp_path): + """Absolute paths pointing outside the VRT directory are rejected + by default (issue #1671).""" from xrspatial.geotiff._vrt import parse_vrt vrt_xml = ''' @@ -368,11 +369,8 @@ def test_absolute_path_also_canonicalized(self, tmp_path): ''' - vrt = parse_vrt(vrt_xml, str(tmp_path)) - source_path = vrt.bands[0].sources[0].filename - - assert ".." not in source_path - assert source_path == os.path.realpath("/tmp/../tmp/test.tif") + with pytest.raises(ValueError, match="outside the VRT directory"): + parse_vrt(vrt_xml, str(tmp_path)) # --------------------------------------------------------------------------- diff --git a/xrspatial/geotiff/tests/test_vrt_path_containment_1671.py b/xrspatial/geotiff/tests/test_vrt_path_containment_1671.py new file mode 100644 index 00000000..74f472c3 --- /dev/null +++ b/xrspatial/geotiff/tests/test_vrt_path_containment_1671.py @@ -0,0 +1,302 @@ +"""Regression tests for issue #1671: VRT path-traversal containment. + +The previous fix for path traversal (#1185) called ``os.path.realpath`` +on every ``SourceFilename`` but did not enforce that the resolved path +lived under the VRT directory. A crafted VRT could therefore read any +file the process had access to: ``../../etc/passwd``, a symlink that +escapes ``vrt_dir``, or an absolute path anywhere on disk. + +The new behaviour rejects: + +* relative sources (``relativeToVRT='1'``) that resolve outside the VRT + directory +* absolute sources (``relativeToVRT='0'``) that resolve outside both + the VRT directory and any allowlisted root + +Operators that legitimately need cross-directory reads opt in via the +``XRSPATIAL_VRT_ALLOWED_ROOTS`` environment variable +(``os.pathsep``-separated list of directory paths). +""" +from __future__ import annotations + +import os +import uuid + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import to_geotiff +from xrspatial.geotiff._vrt import parse_vrt, read_vrt as _read_vrt_internal + + +def _unique_dir(tmp_path, label: str) -> str: + """Return a sub-path under ``tmp_path`` carrying the issue id + a + uuid so parallel test workers cannot collide on the same name.""" + d = tmp_path / f"vrt_1671_{label}_{uuid.uuid4().hex[:8]}" + d.mkdir() + return str(d) + + +def _write_minimal_tif(path: str) -> None: + """Write a 4x4 float32 GeoTIFF the VRT can reference.""" + arr = np.arange(16, dtype=np.float32).reshape(4, 4) + y = np.linspace(1.0, 0.0, 4) + x = np.linspace(0.0, 1.0, 4) + da = xr.DataArray( + arr, dims=['y', 'x'], + coords={'y': y, 'x': x}, + attrs={'crs': 4326}, + ) + to_geotiff(da, path, compression='none') + + +def _build_vrt(vrt_path: str, source_filename: str, relative: str) -> None: + """Write a 4x4 single-band VRT pointing at *source_filename*. + + ``relative`` is the literal value of the ``relativeToVRT`` attribute + ('0' or '1'). + """ + xml = ( + '\n' + ' 0, 1, 0, 0, 0, -1\n' + ' \n' + ' \n' + f' ' + f'{source_filename}\n' + ' 1\n' + ' \n' + ' \n' + ' \n' + ' \n' + '\n' + ) + with open(vrt_path, 'w') as f: + f.write(xml) + + +@pytest.fixture(autouse=True) +def _clear_allowlist_env(monkeypatch): + """Make sure no stray XRSPATIAL_VRT_ALLOWED_ROOTS leaks across tests.""" + monkeypatch.delenv('XRSPATIAL_VRT_ALLOWED_ROOTS', raising=False) + + +# --------------------------------------------------------------------------- +# Relative-source traversal +# --------------------------------------------------------------------------- + + +def test_relative_source_with_dotdot_traversal_rejected(tmp_path): + """A relative source resolving outside ``vrt_dir`` raises ValueError. + + ``parse_vrt`` is the resolution layer; the rejection must happen there + so the dangerous path never reaches ``read_to_array``. + """ + vrt_dir = _unique_dir(tmp_path, "trav_rel") + xml = ( + '\n' + ' \n' + ' \n' + ' ' + '../../../../../etc/passwd\n' + ' 1\n' + ' \n' + ' \n' + ' \n' + ' \n' + '\n' + ) + with pytest.raises(ValueError, match="outside the VRT directory"): + parse_vrt(xml, vrt_dir) + + +def test_relative_source_symlink_traversal_rejected(tmp_path): + """A symlink under ``vrt_dir`` that points outside still gets rejected. + + The check uses ``realpath`` so a symlink-based escape is caught the + same way ``../`` is. + """ + vrt_dir = _unique_dir(tmp_path, "trav_sym") + outside_dir = _unique_dir(tmp_path, "trav_sym_outside") + outside_target = os.path.join(outside_dir, 'secret.tif') + _write_minimal_tif(outside_target) + + # Plant a symlink inside vrt_dir that points to the outside file. + # ``os.symlink`` can fail on Windows CI (requires Developer Mode or + # admin privileges) and on some filesystems, so guard it and skip + # rather than fail the suite on those platforms. + sym = os.path.join(vrt_dir, 'inside.tif') + try: + os.symlink(outside_target, sym) + except (OSError, NotImplementedError) as e: + pytest.skip(f"symlink not supported in this environment: {e}") + + vrt_path = os.path.join(vrt_dir, 'mosaic.vrt') + _build_vrt(vrt_path, 'inside.tif', relative='1') + + with pytest.raises(ValueError, match="outside the VRT directory"): + _read_vrt_internal(vrt_path) + + +# --------------------------------------------------------------------------- +# Absolute-source rejection by default +# --------------------------------------------------------------------------- + + +def test_absolute_source_outside_vrt_dir_rejected(tmp_path): + """A SourceFilename pointing at an absolute path outside ``vrt_dir`` + is rejected by default.""" + vrt_dir = _unique_dir(tmp_path, "abs_outside") + outside_dir = _unique_dir(tmp_path, "abs_outside_target") + outside_tif = os.path.join(outside_dir, 'data.tif') + _write_minimal_tif(outside_tif) + + vrt_path = os.path.join(vrt_dir, 'mosaic.vrt') + _build_vrt(vrt_path, outside_tif, relative='0') + + with pytest.raises(ValueError, match="outside the VRT directory"): + _read_vrt_internal(vrt_path) + + +def test_absolute_source_inside_vrt_dir_ok(tmp_path): + """An absolute path that happens to resolve under ``vrt_dir`` passes. + + Mirrors the writer's ``relative=False`` round-trip case: the on-disk + VRT carries the absolute path of a TIFF that still lives next to it. + """ + vrt_dir = _unique_dir(tmp_path, "abs_inside") + tif_path = os.path.join(vrt_dir, 'data.tif') + _write_minimal_tif(tif_path) + + vrt_path = os.path.join(vrt_dir, 'mosaic.vrt') + _build_vrt(vrt_path, tif_path, relative='0') + + arr, _ = _read_vrt_internal(vrt_path) + assert arr.shape == (4, 4) + + +# --------------------------------------------------------------------------- +# Allowlist opt-in via XRSPATIAL_VRT_ALLOWED_ROOTS +# --------------------------------------------------------------------------- + + +def test_absolute_source_allowlisted_root_passes(tmp_path, monkeypatch): + """Setting XRSPATIAL_VRT_ALLOWED_ROOTS allows cross-directory reads.""" + vrt_dir = _unique_dir(tmp_path, "allow_vrt") + outside_dir = _unique_dir(tmp_path, "allow_data") + outside_tif = os.path.join(outside_dir, 'data.tif') + _write_minimal_tif(outside_tif) + + vrt_path = os.path.join(vrt_dir, 'mosaic.vrt') + _build_vrt(vrt_path, outside_tif, relative='0') + + monkeypatch.setenv('XRSPATIAL_VRT_ALLOWED_ROOTS', outside_dir) + arr, _ = _read_vrt_internal(vrt_path) + assert arr.shape == (4, 4) + + +def test_allowlist_supports_multiple_roots(tmp_path, monkeypatch): + """An ``os.pathsep``-separated list permits sources under any listed root.""" + vrt_dir = _unique_dir(tmp_path, "multi_vrt") + dir_a = _unique_dir(tmp_path, "multi_a") + dir_b = _unique_dir(tmp_path, "multi_b") + tif_b = os.path.join(dir_b, 'data.tif') + _write_minimal_tif(tif_b) + + vrt_path = os.path.join(vrt_dir, 'mosaic.vrt') + _build_vrt(vrt_path, tif_b, relative='0') + + monkeypatch.setenv( + 'XRSPATIAL_VRT_ALLOWED_ROOTS', os.pathsep.join([dir_a, dir_b])) + arr, _ = _read_vrt_internal(vrt_path) + assert arr.shape == (4, 4) + + +def test_allowlist_does_not_cover_traversal_via_relative_source( + tmp_path, monkeypatch, +): + """A relative source that escapes ``vrt_dir`` is rejected even if + the resolved path happens to land under an allowlisted root. + + Relative paths declare intent to stay inside the VRT directory. + Honouring that intent prevents an attacker from chaining an + allowlist entry into a relative-source traversal. + """ + vrt_dir = _unique_dir(tmp_path, "rel_with_allow") + outside_dir = _unique_dir(tmp_path, "rel_with_allow_target") + outside_tif = os.path.join(outside_dir, 'data.tif') + _write_minimal_tif(outside_tif) + + # Build a relative path from vrt_dir to outside_tif. + rel = os.path.relpath(outside_tif, vrt_dir) + + vrt_path = os.path.join(vrt_dir, 'mosaic.vrt') + _build_vrt(vrt_path, rel, relative='1') + + monkeypatch.setenv('XRSPATIAL_VRT_ALLOWED_ROOTS', outside_dir) + with pytest.raises(ValueError, match="outside the VRT directory"): + _read_vrt_internal(vrt_path) + + +def test_allowlist_empty_entries_ignored(tmp_path, monkeypatch): + """Empty entries in the allowlist (from stray separators) are skipped.""" + vrt_dir = _unique_dir(tmp_path, "empty_entry_vrt") + outside_dir = _unique_dir(tmp_path, "empty_entry_data") + outside_tif = os.path.join(outside_dir, 'data.tif') + _write_minimal_tif(outside_tif) + + vrt_path = os.path.join(vrt_dir, 'mosaic.vrt') + _build_vrt(vrt_path, outside_tif, relative='0') + + # Leading/trailing/embedded empty entries should not crash the parser + # or accidentally grant access to ``/``. Build the value with + # ``os.pathsep`` so the test stays cross-platform (``:`` on POSIX, + # ``;`` on Windows). + sep = os.pathsep + value = f"{sep}{outside_dir}{sep}{sep}" + monkeypatch.setenv('XRSPATIAL_VRT_ALLOWED_ROOTS', value) + arr, _ = _read_vrt_internal(vrt_path) + assert arr.shape == (4, 4) + + +# --------------------------------------------------------------------------- +# Happy-path regression: existing relative-source under vrt_dir still works +# --------------------------------------------------------------------------- + + +def test_normal_relative_source_under_vrt_dir(tmp_path): + """A plain relative source under the VRT directory still reads fine.""" + vrt_dir = _unique_dir(tmp_path, "happy") + tif_path = os.path.join(vrt_dir, 'data.tif') + _write_minimal_tif(tif_path) + + vrt_path = os.path.join(vrt_dir, 'mosaic.vrt') + _build_vrt(vrt_path, 'data.tif', relative='1') + + arr, _ = _read_vrt_internal(vrt_path) + assert arr.shape == (4, 4) + + +def test_error_message_names_rejected_path(tmp_path): + """The ValueError mentions the offending path so operators can diagnose.""" + vrt_dir = _unique_dir(tmp_path, "msg_check") + xml = ( + '\n' + ' \n' + ' \n' + ' ' + '../../etc/shadow\n' + ' 1\n' + ' \n' + ' \n' + ' \n' + ' \n' + '\n' + ) + with pytest.raises(ValueError) as excinfo: + parse_vrt(xml, vrt_dir) + msg = str(excinfo.value) + # The rejected resolved path should appear somewhere in the message. + assert 'shadow' in msg + # And the trusted root should also be cited. + assert os.path.realpath(vrt_dir) in msg