diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv index 4bebbdc8..54e2d306 100644 --- a/.claude/sweep-accuracy-state.csv +++ b/.claude/sweep-accuracy-state.csv @@ -12,6 +12,7 @@ emerging_hotspots,2026-04-30,,MEDIUM,2;3,MEDIUM: threshold_90 uses int() (trunca fire,2026-04-30,,,,All ops per-pixel (no accumulation/stencil/projected distance). NaN handled via x!=x; CUDA bounds use strict <; rdnbr and ros divisions guarded; CPU/GPU/dask paths algorithmically identical. No accuracy issues found. flood,2026-04-30,,MEDIUM,2;5,"MEDIUM (not fixed): dask backend preserves float32 input dtype while numpy promotes to float64 in flood_depth and curve_number_runoff; DataArray inputs for curve_number, mannings_n bypass scalar > 0 (and CN <= 100) range validation, silently producing NaN/garbage." focal,2026-03-30T13:00:00Z,1092,,, +geotiff,2026-05-06,1500,HIGH,3,"Pass 3 (2026-05-06): HIGH fixed PR #1500 - predictor=3 un-transpose always wrote MSB at byte index bps-1 so big-endian float TIFFs decoded to garbage; now byte-order aware; GPU output also gets a final byteswap for BE files. (Pass 3 also opened #1499 for predictor=2 sample-wise decode but it was a duplicate of already-merged #1498 -- closed.) | Pass 2 (2026-05-05) PR #1498: predictor=2 was byte-wise at sample stride, multi-byte integer files written by libtiff/GDAL silently decoded to wrong values; now sample-wise per TIFF spec on CPU and GPU. | Pass 1 (2026-04-23) PR #1247: HIGH fixed CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: BigTIFF writers emit LONG8 strip/tile offsets (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py." geotiff,2026-05-06,1498,HIGH,1;5,HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py. 2026-05-05 deep pass: HIGH fixed in #1498 -- predictor=2 (horizontal differencing) did byte-wise differencing instead of sample-wise; uint8 was correct but uint16/int16/uint32/int32 silently corrupted pixel values both on read of GDAL/libtiff/tifffile/rasterio-written files and on write (their readers saw garbage). CPU + GPU + writer all reworked to dispatch per-dtype kernels at sample resolution with stride=samples. LOW deferred: ModelTiepointTag with multiple tiepoints uses only the first 6 doubles (no warning); rare in practice but silently produces wrong georeferencing for control-point-georeferenced files. 2026-05-06 fourth pass: HIGH fixed in fix-sparse-cog-tile branch -- sparse COG tiles/strips (TileByteCounts/StripByteCounts == 0 from GDAL SPARSE_OK=TRUE) crashed local mmap reads ("incomplete or truncated stream"), returned uninitialised np.empty memory in the COG-HTTP path, and broke GPU reads. Fix: detect sparse blocks, pre-fill the result with the file's nodata sentinel (0 if unset), skip those entries in the decode loop. New tests cover tiled+stripped, with/without nodata, raw and accessor reads, plus GPU. Re-confirmed clean: PackBits decode against GDAL output, planar=2 + predictor=2 + uint16/uint32, non-tile-aligned dask streaming write, WhiteIsZero (PI=0) inversion. Deferred non-accuracy: tiled JPEG TIFFs from GDAL fail because reader does not inject JPEGTables (tag 347) into per-tile streams (interop crash, not numerical) and Orientation tag (274) is silently ignored. No CRIT findings. glcm,2026-05-01,1408,HIGH,2,"angle=None averaged NaN as 0, masking no-valid-pairs as zero texture; fixed via nanmean-style averaging" hillshade,2026-04-10T12:00:00Z,,,,"Horn's method correct. All backends consistent. NaN propagation correct. float32 adequate for [0,1] output." diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cc2b4ec..2d3805ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ ----------- +### Unreleased + +#### Bug fixes and improvements +- Decode TIFF predictor=3 un-transpose by file byte order so big-endian floating-point TIFFs read back exactly + + ### Version 0.9.9 - 2026-05-05 #### Bug fixes and improvements diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index dcb382e1..3fad38e6 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -1401,6 +1401,7 @@ def read_geotiff_gpu(source: str, *, source, offsets, byte_counts, tw, th, width, height, compression, predictor, file_dtype, samples, + byte_order=header.byte_order, ) except Exception: pass @@ -1423,6 +1424,7 @@ def read_geotiff_gpu(source: str, *, compressed_tiles, tw, th, width, height, compression, predictor, file_dtype, samples, + byte_order=header.byte_order, ) except (ValueError, Exception): # Unsupported compression -- fall back to CPU then transfer diff --git a/xrspatial/geotiff/_compression.py b/xrspatial/geotiff/_compression.py index ef604ed1..a8f88752 100644 --- a/xrspatial/geotiff/_compression.py +++ b/xrspatial/geotiff/_compression.py @@ -576,10 +576,18 @@ def predictor_encode(data: np.ndarray, width: int, height: int, # Decode: undo differencing, then un-transpose (lane b -> native byte bps-1-b). @ngjit -def _fp_predictor_decode_row(row_data, width, bps): +def _fp_predictor_decode_row(row_data, width, bps, big_endian): """Undo floating-point predictor for one row (in-place). - row_data: uint8 array of length width * bps + row_data: uint8 array of length width * bps. + + Per TIFF Tech Note 3, lane 0 contains the most significant byte of + each sample regardless of the file's byte order. After + un-transposing, the MSB has to land at the byte position that + corresponds to "most significant" *in the file's byte order*: index 0 + for big-endian files and index ``bps-1`` for little-endian files. + The previous implementation always wrote MSB at index ``bps-1``, + which scrambled big-endian predictor=3 reads. """ n = width * bps @@ -587,26 +595,35 @@ def _fp_predictor_decode_row(row_data, width, bps): for i in range(1, n): row_data[i] = np.uint8((np.int32(row_data[i]) + np.int32(row_data[i - 1])) & 0xFF) - # Step 2: un-transpose bytes back to native sample order + # Step 2: un-transpose bytes back to the file's native sample order tmp = np.empty(n, dtype=np.uint8) - for sample in range(width): - for b in range(bps): - tmp[sample * bps + b] = row_data[(bps - 1 - b) * width + sample] + if big_endian: + # MSB lane (lane 0) -> byte index 0 in each output sample. + for sample in range(width): + for b in range(bps): + tmp[sample * bps + b] = row_data[b * width + sample] + else: + # MSB lane -> byte index ``bps-1`` (the high-order byte in LE). + for sample in range(width): + for b in range(bps): + tmp[sample * bps + b] = row_data[(bps - 1 - b) * width + sample] for i in range(n): row_data[i] = tmp[i] @ngjit -def _fp_predictor_decode_rows(data, width, height, bps): +def _fp_predictor_decode_rows(data, width, height, bps, big_endian): """Dispatch per-row decode from Numba, avoiding Python loop overhead.""" row_len = width * bps for row in range(height): start = row * row_len - _fp_predictor_decode_row(data[start:start + row_len], width, bps) + _fp_predictor_decode_row( + data[start:start + row_len], width, bps, big_endian) def fp_predictor_decode(data: np.ndarray, width: int, height: int, - bytes_per_sample: int) -> np.ndarray: + bytes_per_sample: int, + big_endian: bool = False) -> np.ndarray: """Undo floating-point predictor (predictor=3). Parameters @@ -617,6 +634,10 @@ def fp_predictor_decode(data: np.ndarray, width: int, height: int, Tile/strip dimensions. bytes_per_sample : int Bytes per sample (e.g. 4 for float32, 8 for float64). + big_endian : bool + Whether the source file is big-endian (BOM ``MM``). The + un-transpose puts the MSB lane at byte index 0 of each output + sample for BE files, and at byte index ``bps-1`` for LE files. Returns ------- @@ -624,7 +645,8 @@ def fp_predictor_decode(data: np.ndarray, width: int, height: int, Corrected array. """ buf = np.ascontiguousarray(data) - _fp_predictor_decode_rows(buf, width, height, bytes_per_sample) + _fp_predictor_decode_rows(buf, width, height, bytes_per_sample, + big_endian) return buf diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index 0598a6f9..4cb5fe75 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -700,11 +700,14 @@ def _predictor_decode_kernel_u64(view, width, height, samples_per_pixel): @cuda.jit -def _fp_predictor_decode_kernel(data, tmp, width, height, bps): +def _fp_predictor_decode_kernel(data, tmp, width, height, bps, big_endian): """Undo floating-point predictor (predictor=3), one thread per row. data: flat uint8 device array tmp: scratch buffer, same size as data + big_endian: when True, place the MSB lane at byte index 0 of each + output sample (file is big-endian); when False, place it at + byte index ``bps-1`` (file is little-endian). """ row = cuda.grid(1) if row >= height: @@ -719,10 +722,17 @@ def _fp_predictor_decode_kernel(data, tmp, width, height, bps): data[idx] = numba_uint8( (numba_int32(data[idx]) + numba_int32(data[idx - 1])) & 0xFF) - # Step 2: un-transpose byte lanes (MSB-first) back to native order - for sample in range(width): - for b in range(bps): - tmp[start + sample * bps + b] = data[start + (bps - 1 - b) * width + sample] + # Step 2: un-transpose byte lanes back to the file's native sample + # order. Lane 0 always contains the MSB byte (TIFF Tech Note 3); the + # MSB lands at byte index 0 (BE) or bps-1 (LE) of each output sample. + if big_endian: + for sample in range(width): + for b in range(bps): + tmp[start + sample * bps + b] = data[start + b * width + sample] + else: + for sample in range(width): + for b in range(bps): + tmp[start + sample * bps + b] = data[start + (bps - 1 - b) * width + sample] # Copy back for i in range(row_len): @@ -1315,6 +1325,7 @@ def gpu_decode_tiles_from_file( predictor: int, dtype: np.dtype, samples: int = 1, + byte_order: str = '<', ): """Decode tiles from a file, using GDS if available. @@ -1345,7 +1356,8 @@ def gpu_decode_tiles_from_file( return _apply_predictor_and_assemble( d_decomp, d_decomp_offsets, len(d_tiles), tile_width, tile_height, image_width, image_height, - predictor, dtype, samples, tile_bytes) + predictor, dtype, samples, tile_bytes, + byte_order=byte_order) # GDS read succeeded but nvCOMP can't decompress on GPU, # or it's LZW/deflate. Copy tiles to host and use normal path. @@ -1357,7 +1369,8 @@ def gpu_decode_tiles_from_file( return gpu_decode_tiles( compressed_tiles, tile_width, tile_height, - image_width, image_height, compression, predictor, dtype, samples) + image_width, image_height, compression, predictor, dtype, samples, + byte_order=byte_order) def _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, compression): @@ -1493,11 +1506,13 @@ def _gpu_predictor2_encode(d_decomp, tile_width, total_rows, dtype, samples): def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, tile_width, tile_height, image_width, image_height, - predictor, dtype, samples, tile_bytes): + predictor, dtype, samples, tile_bytes, + byte_order: str = '<'): """Apply predictor decode and tile assembly on GPU.""" import cupy bytes_per_pixel = dtype.itemsize * samples + big_endian = (byte_order == '>') if predictor == 2: total_rows = n_tiles * tile_height @@ -1512,7 +1527,8 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, bpg = math.ceil(total_rows / tpb) d_tmp = cupy.empty_like(d_decomp) _fp_predictor_decode_kernel[bpg, tpb]( - d_decomp, d_tmp, tile_width * samples, total_rows, dtype.itemsize) + d_decomp, d_tmp, tile_width * samples, total_rows, + dtype.itemsize, big_endian) cuda.synchronize() tiles_across = math.ceil(image_width / tile_width) @@ -1532,10 +1548,15 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, cuda.synchronize() if samples > 1: - return d_output.view(dtype=cupy.dtype(dtype)).reshape( + out = d_output.view(dtype=cupy.dtype(dtype)).reshape( image_height, image_width, samples) - return d_output.view(dtype=cupy.dtype(dtype)).reshape( - image_height, image_width) + else: + out = d_output.view(dtype=cupy.dtype(dtype)).reshape( + image_height, image_width) + if big_endian and dtype.itemsize > 1: + # See gpu_decode_tiles for why BE samples need a final byteswap. + out = out.byteswap() + return out def gpu_decode_tiles( @@ -1548,6 +1569,7 @@ def gpu_decode_tiles( predictor: int, dtype: np.dtype, samples: int = 1, + byte_order: str = '<', ): """Decode and assemble TIFF tiles entirely on GPU. @@ -1759,7 +1781,8 @@ def gpu_decode_tiles( bpg = math.ceil(total_rows / tpb) d_tmp = cupy.empty_like(d_decomp) _fp_predictor_decode_kernel[bpg, tpb]( - d_decomp, d_tmp, tile_width * samples, total_rows, dtype.itemsize) + d_decomp, d_tmp, tile_width * samples, total_rows, + dtype.itemsize, byte_order == '>') cuda.synchronize() # Assemble tiles into output image on GPU @@ -1781,10 +1804,18 @@ def gpu_decode_tiles( # Reshape to image if samples > 1: - return d_output.view(dtype=cupy.dtype(dtype)).reshape( + out = d_output.view(dtype=cupy.dtype(dtype)).reshape( image_height, image_width, samples) - return d_output.view(dtype=cupy.dtype(dtype)).reshape( - image_height, image_width) + else: + out = d_output.view(dtype=cupy.dtype(dtype)).reshape( + image_height, image_width) + # The decoded byte stream is in the file's byte order; cupy view + # interprets it as native (always little-endian on supported GPUs), + # so big-endian samples that are wider than a byte must be swapped + # back to native before the values mean anything. + if byte_order == '>' and dtype.itemsize > 1: + out = out.byteswap() + return out # --------------------------------------------------------------------------- diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index aacf89a7..a7ce2359 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -358,9 +358,9 @@ def _apply_predictor(chunk: np.ndarray, pred: int, width: int, Predictor=3 (floating-point) byte-swizzles each row into ``bytes_per_sample`` interleaved lanes of length ``width * samples``, - per TIFF Technical Note 3. Passing ``bytes_per_sample * samples`` as - the lane count (the pre-fix behaviour) swizzles over the wrong lane - count and scrambles multi-band pixel values. + per TIFF Technical Note 3. The un-transpose stage has to put the + MSB lane at the file's high-order byte position, which differs for + big- vs little-endian files; ``byte_order`` carries that. """ if pred == 2: return predictor_decode(chunk, width, height, @@ -368,7 +368,8 @@ def _apply_predictor(chunk: np.ndarray, pred: int, width: int, byte_order=byte_order) elif pred == 3: return fp_predictor_decode(chunk, width * samples, height, - bytes_per_sample) + bytes_per_sample, + big_endian=(byte_order == '>')) return chunk diff --git a/xrspatial/geotiff/tests/test_predictor3_big_endian.py b/xrspatial/geotiff/tests/test_predictor3_big_endian.py new file mode 100644 index 00000000..aec4c093 --- /dev/null +++ b/xrspatial/geotiff/tests/test_predictor3_big_endian.py @@ -0,0 +1,92 @@ +"""Predictor=3 big-endian round-trip tests. + +The floating-point predictor (TIFF Tech Note 3) byte-swizzles each row +into ``bytes_per_sample`` lanes, MSB-first. The decoder un-transposes +those lanes back to the file's native byte order; the byte position of +the MSB differs between BE (index 0) and LE (index ``bps-1``). + +Before the fix the un-transpose always wrote MSB at index ``bps-1``, so +big-endian predictor=3 files decoded to garbage values even though they +came back as a clean ``float32``/``float64`` array (no error, just wrong +numbers - the worst kind of bug for raster data). +""" +from __future__ import annotations + +import numpy as np +import pytest + +from xrspatial.geotiff._reader import read_to_array + +tifffile = pytest.importorskip("tifffile") +pytest.importorskip("imagecodecs") # tifffile needs it for predictor=3 + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_big_endian_predictor3_round_trip(tmp_path, dtype): + """xrspatial reads a big-endian predictor=3 file exactly.""" + arr = np.array( + [ + [1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [1.5, 2.5, 3.5, 4.5], + [10.0, 20.0, 30.0, 40.0], + ], + dtype=dtype, + ) + path = tmp_path / f"be_pred3_{np.dtype(dtype).name}.tif" + tifffile.imwrite( + str(path), arr, byteorder=">", predictor=3, compression="deflate") + + out, _ = read_to_array(str(path)) + np.testing.assert_array_equal(out, arr) + + +def test_little_endian_predictor3_still_round_trips(tmp_path): + """Sanity: the BE fix did not break the LE path.""" + arr = np.linspace(-100.0, 100.0, 64, dtype=np.float32).reshape(8, 8) + path = tmp_path / "le_pred3_f32.tif" + tifffile.imwrite( + str(path), arr, byteorder="<", predictor=3, compression="deflate") + + out, _ = read_to_array(str(path)) + np.testing.assert_array_equal(out, arr) + + +def test_big_endian_predictor3_tiled(tmp_path): + """BE predictor=3 with tiled layout, multiple tiles per row.""" + rng = np.random.RandomState(20260506) + arr = rng.standard_normal((32, 48)).astype(np.float32) + path = tmp_path / "be_pred3_tiled.tif" + tifffile.imwrite( + str(path), arr, byteorder=">", predictor=3, + compression="deflate", tile=(16, 16)) + + out, _ = read_to_array(str(path)) + np.testing.assert_array_equal(out, arr) + + +def test_big_endian_predictor3_gpu(tmp_path): + """The GPU decode path also handles big-endian predictor=3.""" + cupy = pytest.importorskip("cupy") + if not cupy.cuda.is_available(): + pytest.skip("CUDA not available") + + from xrspatial.geotiff import open_geotiff + + rng = np.random.RandomState(20260506) + arr = rng.standard_normal((32, 48)).astype(np.float32) + path = tmp_path / "be_pred3_gpu.tif" + tifffile.imwrite( + str(path), arr, byteorder=">", predictor=3, + compression="deflate", tile=(16, 16)) + + cpu = open_geotiff(str(path)).values + np.testing.assert_array_equal(cpu, arr) + + gpu_da = open_geotiff(str(path), gpu=True) + gpu_arr = gpu_da.data + if hasattr(gpu_arr, "get"): + gpu_arr = gpu_arr.get() + else: + gpu_arr = np.asarray(gpu_arr) + np.testing.assert_array_equal(gpu_arr, arr)