Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 117 additions & 24 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,25 +179,50 @@ def _coords_to_transform(da: xr.DataArray) -> GeoTransform | None:
)


def _read_geo_info(source: str, *, overview_level: int | None = None):
def _read_geo_info(source, *, overview_level: int | None = None):
"""Read only the geographic metadata and image dimensions from a GeoTIFF.

Returns (geo_info, height, width, dtype, n_bands) without reading pixel
data. Uses mmap for header-only access -- O(1) memory regardless of file
size.
data. Uses mmap for header-only access on string paths; for file-like
inputs it reads the bytes directly. O(1) memory regardless of file size
when a path is supplied.

Parameters
----------
source : str or binary file-like
Path or any object with ``read``/``seek``.
overview_level : int or None
Overview IFD index (0 = full resolution).
"""
from ._dtypes import tiff_dtype_to_numpy
from ._geotags import extract_geo_info
from ._header import parse_all_ifds, parse_header
from ._reader import _coerce_path, _is_file_like

with open(source, 'rb') as f:
import mmap
data = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
source = _coerce_path(source)
if _is_file_like(source):
# File-like: read its full bytes; we don't try to mmap arbitrary
# buffers because they may not back a real file descriptor.
try:
cur = source.tell()
except (OSError, AttributeError):
cur = 0
source.seek(0)
data = source.read()
try:
source.seek(cur)
except (OSError, AttributeError):
pass
close_data = False
elif isinstance(source, str):
with open(source, 'rb') as f:
import mmap
data = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
close_data = True
else:
raise TypeError(
"source must be a str path or binary file-like, "
f"got {type(source).__name__}")
try:
header = parse_header(data)
ifds = parse_all_ifds(data, header)
Expand All @@ -213,7 +238,8 @@ def _read_geo_info(source: str, *, overview_level: int | None = None):
n_bands = ifd.samples_per_pixel if ifd.samples_per_pixel > 1 else 0
return geo_info, ifd.height, ifd.width, file_dtype, n_bands
finally:
data.close()
if close_data:
data.close()


def _extent_to_window(transform, file_height, file_width,
Expand Down Expand Up @@ -245,7 +271,7 @@ def _extent_to_window(transform, file_height, file_width,



def open_geotiff(source: str, *, dtype=None, window=None,
def open_geotiff(source, *, dtype=None, window=None,
overview_level: int | None = None,
band: int | None = None,
name: str | None = None,
Expand All @@ -264,8 +290,11 @@ def open_geotiff(source: str, *, dtype=None, window=None,

Parameters
----------
source : str
File path, HTTP URL, or cloud URI (s3://, gs://, az://).
source : str or binary file-like
File path, HTTP URL, cloud URI (s3://, gs://, az://), or a
binary file-like object (e.g. ``io.BytesIO``) with read+seek.
VRT, dask-chunked, GPU, and remote-URL paths require a string;
in-memory file-like buffers go through the eager numpy reader.
dtype : str, numpy.dtype, or None
Cast the result to this dtype after reading. None keeps the
file's native dtype. Float-to-int casts raise ValueError to
Expand Down Expand Up @@ -315,12 +344,29 @@ def open_geotiff(source: str, *, dtype=None, window=None,
is lossy in a way users rarely intend; cast explicitly after read if
you need it).
"""
# VRT files
if source.lower().endswith('.vrt'):
from ._reader import _coerce_path

source = _coerce_path(source)

# VRT files (string paths only -- VRT XML references other files on disk)
if isinstance(source, str) and source.lower().endswith('.vrt'):
return read_vrt(source, dtype=dtype, window=window, band=band,
name=name, chunks=chunks, gpu=gpu,
max_pixels=max_pixels)
Comment on lines +351 to 355

# File-like buffers don't support the GPU or dask code paths because
# those re-open the source by path from worker tasks or device-side
# readers. Reject early with a clear message.
if not isinstance(source, str):
if gpu:
raise ValueError(
"gpu=True is not supported for file-like sources. "
"Pass a path string instead.")
if chunks is not None:
raise ValueError(
"chunks=... (dask) is not supported for file-like sources. "
"Pass a path string instead.")

# GPU path
if gpu:
return read_geotiff_gpu(source, dtype=dtype,
Expand Down Expand Up @@ -358,9 +404,11 @@ def open_geotiff(source: str, *, dtype=None, window=None,
coords = {'y': full_y, 'x': full_x}

if name is None:
# Derive from source path
import os
name = os.path.splitext(os.path.basename(source))[0]
# Derive from source path. File-like buffers don't have a path,
# so leave name unset rather than fabricating one.
if isinstance(source, str):
import os
name = os.path.splitext(os.path.basename(source))[0]
Comment on lines +409 to +411

attrs = {}
if geo_info.crs_epsg is not None:
Expand Down Expand Up @@ -579,7 +627,7 @@ def _merge_friendly_extra_tags(extra_tags_list, attrs: dict) -> list | None:
return existing or None


def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *,
def to_geotiff(data: xr.DataArray | np.ndarray, path, *,
crs: int | str | None = None,
nodata=None,
compression: str = 'zstd',
Expand Down Expand Up @@ -612,8 +660,11 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *,
----------
data : xr.DataArray or np.ndarray
2D raster data.
path : str
Output file path.
path : str or binary file-like
Output file path, or any object exposing a ``write`` method
(e.g. ``io.BytesIO``). When a file-like is passed, the encoded
TIFF bytes are written to that object once assembly completes;
dask-backed data is materialised eagerly instead of being streamed.
``cog=True``, GPU writes, and ``.vrt`` outputs require a string path.
crs : int, str, or None
EPSG code (int), WKT string, or PROJ string. If None and data
is a DataArray, tries to read from attrs ('crs' for EPSG,
Expand Down Expand Up @@ -659,6 +710,10 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *,
rasters whose tile-row exceeds this budget are split into
horizontal segments. Ignored for numpy / CuPy / COG paths.
"""
from ._reader import _coerce_path

path = _coerce_path(path)

# Up-front validation: catch bad compression names before they reach
# any of the deeper write paths (streaming, GPU, VRT, COG) where the
# error surfaces from _compression_tag with a less obvious traceback.
Expand All @@ -668,6 +723,21 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *,
f"Unknown compression {compression!r}. "
f"Valid options: {list(_VALID_COMPRESSIONS)}.")

# File-like (BytesIO etc.) destinations: the streaming, GPU, COG, and
# VRT writers all need a real filesystem path (atomic rename, overview
# passes, sidecar writes). Reject those combos up front so the user
# gets a clear error instead of a deep traceback.
Comment thread
brendancol marked this conversation as resolved.
_path_is_file_like = (not isinstance(path, str)) and hasattr(path, 'write')
if _path_is_file_like:
if cog:
raise ValueError(
"cog=True is not supported for file-like destinations. "
"Pass a string path or write to BytesIO without cog=True.")
elif not isinstance(path, str):
raise TypeError(
f"path must be a str or a binary file-like with a write() "
f"method, got {type(path).__name__}")
Comment on lines +736 to +739

# tile_size only applies to tiled output; warn if the caller passed a
# non-default size alongside strip mode (it would otherwise be silently
# ignored).
Expand All @@ -680,8 +750,9 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *,
stacklevel=2,
)

# VRT tiled output
if path.lower().endswith('.vrt'):
# VRT tiled output (string paths only -- VRT writes a real .vrt file
# plus per-tile GeoTIFFs to a directory)
if isinstance(path, str) and path.lower().endswith('.vrt'):
if cog:
raise ValueError(
"cog=True is not compatible with VRT output. "
Expand All @@ -701,6 +772,15 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *,

# Auto-detect GPU data and dispatch to write_geotiff_gpu
use_gpu = gpu if gpu is not None else _is_gpu_data(data)
if use_gpu and _path_is_file_like:
# write_geotiff_gpu's nvCOMP path materialises tile parts and then
# calls _write_bytes(path), which would write at the buffer's
# current cursor without truncating. More importantly, the GPU
# path was never tested with file-like destinations; refuse rather
# than silently produce something untested.
raise ValueError(
"gpu=True is not supported for file-like destinations. "
"Pass a string path (or set gpu=False).")
if use_gpu:
try:
write_geotiff_gpu(data, path, crs=crs, nodata=nodata,
Expand Down Expand Up @@ -785,8 +865,10 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path: str, *,

# Dask-backed: stream tiles to avoid materialising the full array.
# COG requires overviews from the full array, so it falls through
# to the eager path.
if hasattr(raw, 'dask') and not cog:
# to the eager path. Streaming write needs a real filesystem path
# (it builds a temp file then atomic-renames); for file-like
# destinations we materialise eagerly and assemble in-memory.
if hasattr(raw, 'dask') and not cog and not _path_is_file_like:
dask_arr = raw
# Handle band-first dimension order (band, y, x) -> (y, x, band)
if raw.ndim == 3 and data.dims[0] in ('band', 'bands', 'channel'):
Expand Down Expand Up @@ -1127,12 +1209,16 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512,
"""
import dask.array as da

from ._reader import _coerce_path

source = _coerce_path(source)

# ``read_geotiff`` already routes ``.vrt`` to ``read_vrt`` before
# reaching here, so this branch is only hit when ``read_geotiff_dask``
# is called directly with a VRT path. Keep it as a defensive fallback
# rather than letting the windowed-read path try to parse VRT XML as
# TIFF bytes. ``read_vrt`` is the single source of truth for VRT.
if source.lower().endswith('.vrt'):
if isinstance(source, str) and source.lower().endswith('.vrt'):
return read_vrt(source, dtype=dtype, name=name, chunks=chunks)

# Metadata-only read: O(1) memory via mmap, no pixel decompression
Expand Down Expand Up @@ -1306,12 +1392,16 @@ def read_geotiff_gpu(source: str, *,
"cupy is required for GPU reads. "
"Install it with: pip install cupy-cuda12x")

from ._reader import _FileSource, _check_dimensions, MAX_PIXELS_DEFAULT
from ._reader import (
_FileSource, _check_dimensions, MAX_PIXELS_DEFAULT, _coerce_path,
)
from ._header import parse_header, parse_all_ifds, validate_tile_layout
from ._dtypes import tiff_dtype_to_numpy
from ._geotags import extract_geo_info
from ._gpu_decode import gpu_decode_tiles

source = _coerce_path(source)

if max_pixels is None:
max_pixels = MAX_PIXELS_DEFAULT

Expand Down Expand Up @@ -1676,8 +1766,11 @@ def read_vrt(source: str, *, dtype=None, window=None,
the original WKT. The source GeoTransform is preserved as a
rasterio-style 6-tuple in ``attrs['transform']``.
"""
from ._reader import _coerce_path
from ._vrt import read_vrt as _read_vrt_internal

source = _coerce_path(source)

arr, vrt = _read_vrt_internal(source, window=window, band=band,
max_pixels=max_pixels)

Expand Down
Loading
Loading