From f1f8494702c937a5eef69bca664c1e32261728de Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 6 May 2026 06:02:57 -0700 Subject: [PATCH 1/3] Decode TIFF predictor=3 un-transpose by file byte order so big-endian floating-point TIFFs read exactly The floating-point predictor (TIFF Tech Note 3) byte-swizzles each row into bytes_per_sample lanes with the most significant byte in lane 0. The decoder un-transposed those lanes assuming the output was always little-endian: it placed the MSB at byte index bps-1 of each output sample. Big-endian predictor=3 files therefore decoded into samples with the bytes in the wrong order; the result still came back as a clean float32/float64 array, just with garbage values. The fix threads the file's byte order into the un-transpose step on both CPU and GPU paths. For big-endian files the MSB now lands at byte index 0 of each output sample. The little-endian path is unchanged. The GPU output also needed a final byteswap when the file was big-endian: the GPU decode keeps bytes in the file's order through predictor decode and tile assembly, but cupy interprets the resulting view as native (little-endian) byte order. New tests (test_predictor3_big_endian.py) decode tifffile/imagecodecs- encoded big-endian predictor=3 float32 and float64 files (strip and tiled layouts, plus a GPU parity check) and confirm the existing little-endian round-trip still works. --- CHANGELOG.md | 6 ++ xrspatial/geotiff/__init__.py | 2 + xrspatial/geotiff/_compression.py | 42 +++++++-- xrspatial/geotiff/_gpu_decode.py | 63 +++++++++---- xrspatial/geotiff/_reader.py | 14 +-- .../tests/test_predictor3_big_endian.py | 92 +++++++++++++++++++ 6 files changed, 187 insertions(+), 32 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_predictor3_big_endian.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cc2b4ec..2d3805ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ ----------- +### Unreleased + +#### Bug fixes and improvements +- Decode TIFF predictor=3 un-transpose by file byte order so big-endian floating-point TIFFs read back exactly + + ### Version 0.9.9 - 2026-05-05 #### Bug fixes and improvements diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 95b1ce15..c741ac20 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -1394,6 +1394,7 @@ def read_geotiff_gpu(source: str, *, source, offsets, byte_counts, tw, th, width, height, compression, predictor, file_dtype, samples, + byte_order=header.byte_order, ) except Exception: pass @@ -1416,6 +1417,7 @@ def read_geotiff_gpu(source: str, *, compressed_tiles, tw, th, width, height, compression, predictor, file_dtype, samples, + byte_order=header.byte_order, ) except (ValueError, Exception): # Unsupported compression -- fall back to CPU then transfer diff --git a/xrspatial/geotiff/_compression.py b/xrspatial/geotiff/_compression.py index 087ebbe9..19a43a49 100644 --- a/xrspatial/geotiff/_compression.py +++ b/xrspatial/geotiff/_compression.py @@ -425,10 +425,18 @@ def predictor_encode(data: np.ndarray, width: int, height: int, # Decode: undo differencing, then un-transpose (lane b -> native byte bps-1-b). @ngjit -def _fp_predictor_decode_row(row_data, width, bps): +def _fp_predictor_decode_row(row_data, width, bps, big_endian): """Undo floating-point predictor for one row (in-place). - row_data: uint8 array of length width * bps + row_data: uint8 array of length width * bps. + + Per TIFF Tech Note 3, lane 0 contains the most significant byte of + each sample regardless of the file's byte order. After + un-transposing, the MSB has to land at the byte position that + corresponds to "most significant" *in the file's byte order*: index 0 + for big-endian files and index ``bps-1`` for little-endian files. + The previous implementation always wrote MSB at index ``bps-1``, + which scrambled big-endian predictor=3 reads. """ n = width * bps @@ -436,26 +444,35 @@ def _fp_predictor_decode_row(row_data, width, bps): for i in range(1, n): row_data[i] = np.uint8((np.int32(row_data[i]) + np.int32(row_data[i - 1])) & 0xFF) - # Step 2: un-transpose bytes back to native sample order + # Step 2: un-transpose bytes back to the file's native sample order tmp = np.empty(n, dtype=np.uint8) - for sample in range(width): - for b in range(bps): - tmp[sample * bps + b] = row_data[(bps - 1 - b) * width + sample] + if big_endian: + # MSB lane (lane 0) -> byte index 0 in each output sample. + for sample in range(width): + for b in range(bps): + tmp[sample * bps + b] = row_data[b * width + sample] + else: + # MSB lane -> byte index ``bps-1`` (the high-order byte in LE). + for sample in range(width): + for b in range(bps): + tmp[sample * bps + b] = row_data[(bps - 1 - b) * width + sample] for i in range(n): row_data[i] = tmp[i] @ngjit -def _fp_predictor_decode_rows(data, width, height, bps): +def _fp_predictor_decode_rows(data, width, height, bps, big_endian): """Dispatch per-row decode from Numba, avoiding Python loop overhead.""" row_len = width * bps for row in range(height): start = row * row_len - _fp_predictor_decode_row(data[start:start + row_len], width, bps) + _fp_predictor_decode_row( + data[start:start + row_len], width, bps, big_endian) def fp_predictor_decode(data: np.ndarray, width: int, height: int, - bytes_per_sample: int) -> np.ndarray: + bytes_per_sample: int, + big_endian: bool = False) -> np.ndarray: """Undo floating-point predictor (predictor=3). Parameters @@ -466,6 +483,10 @@ def fp_predictor_decode(data: np.ndarray, width: int, height: int, Tile/strip dimensions. bytes_per_sample : int Bytes per sample (e.g. 4 for float32, 8 for float64). + big_endian : bool + Whether the source file is big-endian (BOM ``MM``). The + un-transpose puts the MSB lane at byte index 0 of each output + sample for BE files, and at byte index ``bps-1`` for LE files. Returns ------- @@ -473,7 +494,8 @@ def fp_predictor_decode(data: np.ndarray, width: int, height: int, Corrected array. """ buf = np.ascontiguousarray(data) - _fp_predictor_decode_rows(buf, width, height, bytes_per_sample) + _fp_predictor_decode_rows(buf, width, height, bytes_per_sample, + big_endian) return buf diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index 67c36829..5305b83b 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -644,11 +644,14 @@ def _predictor_decode_kernel(data, width, height, bytes_per_sample): @cuda.jit -def _fp_predictor_decode_kernel(data, tmp, width, height, bps): +def _fp_predictor_decode_kernel(data, tmp, width, height, bps, big_endian): """Undo floating-point predictor (predictor=3), one thread per row. data: flat uint8 device array tmp: scratch buffer, same size as data + big_endian: when True, place the MSB lane at byte index 0 of each + output sample (file is big-endian); when False, place it at + byte index ``bps-1`` (file is little-endian). """ row = cuda.grid(1) if row >= height: @@ -663,10 +666,17 @@ def _fp_predictor_decode_kernel(data, tmp, width, height, bps): data[idx] = numba_uint8( (numba_int32(data[idx]) + numba_int32(data[idx - 1])) & 0xFF) - # Step 2: un-transpose byte lanes (MSB-first) back to native order - for sample in range(width): - for b in range(bps): - tmp[start + sample * bps + b] = data[start + (bps - 1 - b) * width + sample] + # Step 2: un-transpose byte lanes back to the file's native sample + # order. Lane 0 always contains the MSB byte (TIFF Tech Note 3); the + # MSB lands at byte index 0 (BE) or bps-1 (LE) of each output sample. + if big_endian: + for sample in range(width): + for b in range(bps): + tmp[start + sample * bps + b] = data[start + b * width + sample] + else: + for sample in range(width): + for b in range(bps): + tmp[start + sample * bps + b] = data[start + (bps - 1 - b) * width + sample] # Copy back for i in range(row_len): @@ -1259,6 +1269,7 @@ def gpu_decode_tiles_from_file( predictor: int, dtype: np.dtype, samples: int = 1, + byte_order: str = '<', ): """Decode tiles from a file, using GDS if available. @@ -1289,7 +1300,8 @@ def gpu_decode_tiles_from_file( return _apply_predictor_and_assemble( d_decomp, d_decomp_offsets, len(d_tiles), tile_width, tile_height, image_width, image_height, - predictor, dtype, samples, tile_bytes) + predictor, dtype, samples, tile_bytes, + byte_order=byte_order) # GDS read succeeded but nvCOMP can't decompress on GPU, # or it's LZW/deflate. Copy tiles to host and use normal path. @@ -1301,7 +1313,8 @@ def gpu_decode_tiles_from_file( return gpu_decode_tiles( compressed_tiles, tile_width, tile_height, - image_width, image_height, compression, predictor, dtype, samples) + image_width, image_height, compression, predictor, dtype, samples, + byte_order=byte_order) def _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, compression): @@ -1373,11 +1386,13 @@ class _NvcompDecompOpts(ctypes.Structure): def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, tile_width, tile_height, image_width, image_height, - predictor, dtype, samples, tile_bytes): + predictor, dtype, samples, tile_bytes, + byte_order: str = '<'): """Apply predictor decode and tile assembly on GPU.""" import cupy bytes_per_pixel = dtype.itemsize * samples + big_endian = (byte_order == '>') if predictor == 2: total_rows = n_tiles * tile_height @@ -1395,7 +1410,8 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, bpg = math.ceil(total_rows / tpb) d_tmp = cupy.empty_like(d_decomp) _fp_predictor_decode_kernel[bpg, tpb]( - d_decomp, d_tmp, tile_width * samples, total_rows, dtype.itemsize) + d_decomp, d_tmp, tile_width * samples, total_rows, + dtype.itemsize, big_endian) cuda.synchronize() tiles_across = math.ceil(image_width / tile_width) @@ -1415,10 +1431,15 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, cuda.synchronize() if samples > 1: - return d_output.view(dtype=cupy.dtype(dtype)).reshape( + out = d_output.view(dtype=cupy.dtype(dtype)).reshape( image_height, image_width, samples) - return d_output.view(dtype=cupy.dtype(dtype)).reshape( - image_height, image_width) + else: + out = d_output.view(dtype=cupy.dtype(dtype)).reshape( + image_height, image_width) + if big_endian and dtype.itemsize > 1: + # See gpu_decode_tiles for why BE samples need a final byteswap. + out = out.byteswap() + return out def gpu_decode_tiles( @@ -1431,6 +1452,7 @@ def gpu_decode_tiles( predictor: int, dtype: np.dtype, samples: int = 1, + byte_order: str = '<', ): """Decode and assemble TIFF tiles entirely on GPU. @@ -1647,7 +1669,8 @@ def gpu_decode_tiles( bpg = math.ceil(total_rows / tpb) d_tmp = cupy.empty_like(d_decomp) _fp_predictor_decode_kernel[bpg, tpb]( - d_decomp, d_tmp, tile_width * samples, total_rows, dtype.itemsize) + d_decomp, d_tmp, tile_width * samples, total_rows, + dtype.itemsize, byte_order == '>') cuda.synchronize() # Assemble tiles into output image on GPU @@ -1669,10 +1692,18 @@ def gpu_decode_tiles( # Reshape to image if samples > 1: - return d_output.view(dtype=cupy.dtype(dtype)).reshape( + out = d_output.view(dtype=cupy.dtype(dtype)).reshape( image_height, image_width, samples) - return d_output.view(dtype=cupy.dtype(dtype)).reshape( - image_height, image_width) + else: + out = d_output.view(dtype=cupy.dtype(dtype)).reshape( + image_height, image_width) + # The decoded byte stream is in the file's byte order; cupy view + # interprets it as native (always little-endian on supported GPUs), + # so big-endian samples that are wider than a byte must be swapped + # back to native before the values mean anything. + if byte_order == '>' and dtype.itemsize > 1: + out = out.byteswap() + return out # --------------------------------------------------------------------------- diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index feb59574..04504b1b 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -341,7 +341,7 @@ def _open_source(source: str): def _apply_predictor(chunk: np.ndarray, pred: int, width: int, height: int, bytes_per_sample: int, - samples: int = 1) -> np.ndarray: + samples: int = 1, byte_order: str = '<') -> np.ndarray: """Apply the appropriate predictor decode to decompressed data. ``width``, ``height``, ``bytes_per_sample``, and ``samples`` describe @@ -353,16 +353,17 @@ def _apply_predictor(chunk: np.ndarray, pred: int, width: int, Predictor=3 (floating-point) byte-swizzles each row into ``bytes_per_sample`` interleaved lanes of length ``width * samples``, - per TIFF Technical Note 3. Passing ``bytes_per_sample * samples`` as - the lane count (the pre-fix behaviour) swizzles over the wrong lane - count and scrambles multi-band pixel values. + per TIFF Technical Note 3. The un-transpose stage has to put the + MSB lane at the file's high-order byte position, which differs for + big- vs little-endian files; ``byte_order`` carries that. """ if pred == 2: return predictor_decode(chunk, width, height, bytes_per_sample * samples) elif pred == 3: return fp_predictor_decode(chunk, width * samples, height, - bytes_per_sample) + bytes_per_sample, + big_endian=(byte_order == '>')) return chunk @@ -413,7 +414,8 @@ def _decode_strip_or_tile(data_slice, compression, width, height, samples, if not chunk.flags.writeable: chunk = chunk.copy() chunk = _apply_predictor(chunk, pred, width, height, - bytes_per_sample, samples=samples) + bytes_per_sample, samples=samples, + byte_order=byte_order) if is_sub_byte: pixels = unpack_bits(chunk, bps, pixel_count) diff --git a/xrspatial/geotiff/tests/test_predictor3_big_endian.py b/xrspatial/geotiff/tests/test_predictor3_big_endian.py new file mode 100644 index 00000000..aec4c093 --- /dev/null +++ b/xrspatial/geotiff/tests/test_predictor3_big_endian.py @@ -0,0 +1,92 @@ +"""Predictor=3 big-endian round-trip tests. + +The floating-point predictor (TIFF Tech Note 3) byte-swizzles each row +into ``bytes_per_sample`` lanes, MSB-first. The decoder un-transposes +those lanes back to the file's native byte order; the byte position of +the MSB differs between BE (index 0) and LE (index ``bps-1``). + +Before the fix the un-transpose always wrote MSB at index ``bps-1``, so +big-endian predictor=3 files decoded to garbage values even though they +came back as a clean ``float32``/``float64`` array (no error, just wrong +numbers - the worst kind of bug for raster data). +""" +from __future__ import annotations + +import numpy as np +import pytest + +from xrspatial.geotiff._reader import read_to_array + +tifffile = pytest.importorskip("tifffile") +pytest.importorskip("imagecodecs") # tifffile needs it for predictor=3 + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_big_endian_predictor3_round_trip(tmp_path, dtype): + """xrspatial reads a big-endian predictor=3 file exactly.""" + arr = np.array( + [ + [1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [1.5, 2.5, 3.5, 4.5], + [10.0, 20.0, 30.0, 40.0], + ], + dtype=dtype, + ) + path = tmp_path / f"be_pred3_{np.dtype(dtype).name}.tif" + tifffile.imwrite( + str(path), arr, byteorder=">", predictor=3, compression="deflate") + + out, _ = read_to_array(str(path)) + np.testing.assert_array_equal(out, arr) + + +def test_little_endian_predictor3_still_round_trips(tmp_path): + """Sanity: the BE fix did not break the LE path.""" + arr = np.linspace(-100.0, 100.0, 64, dtype=np.float32).reshape(8, 8) + path = tmp_path / "le_pred3_f32.tif" + tifffile.imwrite( + str(path), arr, byteorder="<", predictor=3, compression="deflate") + + out, _ = read_to_array(str(path)) + np.testing.assert_array_equal(out, arr) + + +def test_big_endian_predictor3_tiled(tmp_path): + """BE predictor=3 with tiled layout, multiple tiles per row.""" + rng = np.random.RandomState(20260506) + arr = rng.standard_normal((32, 48)).astype(np.float32) + path = tmp_path / "be_pred3_tiled.tif" + tifffile.imwrite( + str(path), arr, byteorder=">", predictor=3, + compression="deflate", tile=(16, 16)) + + out, _ = read_to_array(str(path)) + np.testing.assert_array_equal(out, arr) + + +def test_big_endian_predictor3_gpu(tmp_path): + """The GPU decode path also handles big-endian predictor=3.""" + cupy = pytest.importorskip("cupy") + if not cupy.cuda.is_available(): + pytest.skip("CUDA not available") + + from xrspatial.geotiff import open_geotiff + + rng = np.random.RandomState(20260506) + arr = rng.standard_normal((32, 48)).astype(np.float32) + path = tmp_path / "be_pred3_gpu.tif" + tifffile.imwrite( + str(path), arr, byteorder=">", predictor=3, + compression="deflate", tile=(16, 16)) + + cpu = open_geotiff(str(path)).values + np.testing.assert_array_equal(cpu, arr) + + gpu_da = open_geotiff(str(path), gpu=True) + gpu_arr = gpu_da.data + if hasattr(gpu_arr, "get"): + gpu_arr = gpu_arr.get() + else: + gpu_arr = np.asarray(gpu_arr) + np.testing.assert_array_equal(gpu_arr, arr) From 9180fb3462607a094ba25a2c949d70ea0c396041 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 6 May 2026 06:03:45 -0700 Subject: [PATCH 2/3] Record geotiff predictor sweep findings in accuracy state --- .claude/sweep-accuracy-state.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv index f4b17e1e..0af0a12a 100644 --- a/.claude/sweep-accuracy-state.csv +++ b/.claude/sweep-accuracy-state.csv @@ -12,7 +12,7 @@ emerging_hotspots,2026-04-30,,MEDIUM,2;3,MEDIUM: threshold_90 uses int() (trunca fire,2026-04-30,,,,All ops per-pixel (no accumulation/stencil/projected distance). NaN handled via x!=x; CUDA bounds use strict <; rdnbr and ros divisions guarded; CPU/GPU/dask paths algorithmically identical. No accuracy issues found. flood,2026-04-30,,MEDIUM,2;5,"MEDIUM (not fixed): dask backend preserves float32 input dtype while numpy promotes to float64 in flood_depth and curve_number_runoff; DataArray inputs for curve_number, mannings_n bypass scalar > 0 (and CN <= 100) range validation, silently producing NaN/garbage." focal,2026-03-30T13:00:00Z,1092,,, -geotiff,2026-04-23,1247,HIGH,1;3;4;5,HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py. +geotiff,2026-05-06,1499;1500,CRITICAL,1;3,"Pass 3 (2026-05-06): CRITICAL fixed PR #1499 - predictor=2 was byte-wise at sample stride, loses carries inside multi-byte samples; libtiff/GDAL-written uint16/int16/uint32/int32 deflate files silently decoded to wrong values. Now sample-wise per TIFF spec, with file-byte-order dtype view; per-channel for chunky multi-band; CPU and GPU. HIGH fixed PR #1500 - predictor=3 un-transpose always wrote MSB at byte index bps-1 so big-endian float TIFFs decoded to garbage; now byte-order aware; GPU output also gets a final byteswap for BE files. | HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py." glcm,2026-05-01,1408,HIGH,2,"angle=None averaged NaN as 0, masking no-valid-pairs as zero texture; fixed via nanmean-style averaging" hillshade,2026-04-10T12:00:00Z,,,,"Horn's method correct. All backends consistent. NaN propagation correct. float32 adequate for [0,1] output." hydro,2026-04-30,,LOW,1,Only LOW: twi log(0)=-inf if fa=0 (out-of-contract); MFD weighted sum no Kahan (negligible). No CRIT/HIGH issues. From c0657cfc8a41a22fa5db151b74895db76c3a3664 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 6 May 2026 06:06:25 -0700 Subject: [PATCH 3/3] Correct sweep state: #1499 was duplicate of #1498, only #1500 stands The third-pass agent worked from a stale local main and re-derived the predictor=2 sample-wise fix that already shipped in #1498. That PR has been closed as a duplicate. Only the predictor=3 big-endian fix in #1500 is a genuine new finding. --- .claude/sweep-accuracy-state.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv index 0af0a12a..81ec6b4d 100644 --- a/.claude/sweep-accuracy-state.csv +++ b/.claude/sweep-accuracy-state.csv @@ -12,7 +12,7 @@ emerging_hotspots,2026-04-30,,MEDIUM,2;3,MEDIUM: threshold_90 uses int() (trunca fire,2026-04-30,,,,All ops per-pixel (no accumulation/stencil/projected distance). NaN handled via x!=x; CUDA bounds use strict <; rdnbr and ros divisions guarded; CPU/GPU/dask paths algorithmically identical. No accuracy issues found. flood,2026-04-30,,MEDIUM,2;5,"MEDIUM (not fixed): dask backend preserves float32 input dtype while numpy promotes to float64 in flood_depth and curve_number_runoff; DataArray inputs for curve_number, mannings_n bypass scalar > 0 (and CN <= 100) range validation, silently producing NaN/garbage." focal,2026-03-30T13:00:00Z,1092,,, -geotiff,2026-05-06,1499;1500,CRITICAL,1;3,"Pass 3 (2026-05-06): CRITICAL fixed PR #1499 - predictor=2 was byte-wise at sample stride, loses carries inside multi-byte samples; libtiff/GDAL-written uint16/int16/uint32/int32 deflate files silently decoded to wrong values. Now sample-wise per TIFF spec, with file-byte-order dtype view; per-channel for chunky multi-band; CPU and GPU. HIGH fixed PR #1500 - predictor=3 un-transpose always wrote MSB at byte index bps-1 so big-endian float TIFFs decoded to garbage; now byte-order aware; GPU output also gets a final byteswap for BE files. | HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py." +geotiff,2026-05-06,1500,HIGH,3,"Pass 3 (2026-05-06): HIGH fixed PR #1500 - predictor=3 un-transpose always wrote MSB at byte index bps-1 so big-endian float TIFFs decoded to garbage; now byte-order aware; GPU output also gets a final byteswap for BE files. (Pass 3 also opened #1499 for predictor=2 sample-wise decode but it was a duplicate of already-merged #1498 -- closed.) | Pass 2 (2026-05-05) PR #1498: predictor=2 was byte-wise at sample stride, multi-byte integer files written by libtiff/GDAL silently decoded to wrong values; now sample-wise per TIFF spec on CPU and GPU. | Pass 1 (2026-04-23) PR #1247: HIGH fixed CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: BigTIFF writers emit LONG8 strip/tile offsets (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py." glcm,2026-05-01,1408,HIGH,2,"angle=None averaged NaN as 0, masking no-valid-pairs as zero texture; fixed via nanmean-style averaging" hillshade,2026-04-10T12:00:00Z,,,,"Horn's method correct. All backends consistent. NaN propagation correct. float32 adequate for [0,1] output." hydro,2026-04-30,,LOW,1,Only LOW: twi log(0)=-inf if fa=0 (out-of-contract); MFD weighted sum no Kahan (negligible). No CRIT/HIGH issues.