diff --git a/.claude/sweep-accuracy-state.csv b/.claude/sweep-accuracy-state.csv
index 4bebbdc8..54e2d306 100644
--- a/.claude/sweep-accuracy-state.csv
+++ b/.claude/sweep-accuracy-state.csv
@@ -12,6 +12,7 @@ emerging_hotspots,2026-04-30,,MEDIUM,2;3,MEDIUM: threshold_90 uses int() (trunca
 fire,2026-04-30,,,,All ops per-pixel (no accumulation/stencil/projected distance). NaN handled via x!=x; CUDA bounds use strict <; rdnbr and ros divisions guarded; CPU/GPU/dask paths algorithmically identical. No accuracy issues found.
 flood,2026-04-30,,MEDIUM,2;5,"MEDIUM (not fixed): dask backend preserves float32 input dtype while numpy promotes to float64 in flood_depth and curve_number_runoff; DataArray inputs for curve_number, mannings_n bypass scalar > 0 (and CN <= 100) range validation, silently producing NaN/garbage."
 focal,2026-03-30T13:00:00Z,1092,,,
+geotiff,2026-05-06,1500,HIGH,3,"Pass 3 (2026-05-06): HIGH fixed PR #1500 - predictor=3 un-transpose always wrote MSB at byte index bps-1 so big-endian float TIFFs decoded to garbage; now byte-order aware; GPU output also gets a final byteswap for BE files. (Pass 3 also opened #1499 for predictor=2 sample-wise decode but it was a duplicate of already-merged #1498 -- closed.) | Pass 2 (2026-05-05) PR #1498: predictor=2 was byte-wise at sample stride, multi-byte integer files written by libtiff/GDAL silently decoded to wrong values; now sample-wise per TIFF spec on CPU and GPU. | Pass 1 (2026-04-23) PR #1247: HIGH fixed CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: BigTIFF writers emit LONG8 strip/tile offsets (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py."
 geotiff,2026-05-06,1498,HIGH,1;5,HIGH fixed: CPU fp_predictor_decode wrong byte-lane layout for multi-sample predictor=3 (GPU was correct). MEDIUMs also fixed on same PR: eager and streaming writers emit LONG8 strip/tile offsets in BigTIFF output (were LONG); VRT read honors AREA_OR_POINT=Point; VRT nodata cast uses source dtype instead of float32. LOW fixed: duplicate LERC codec block removed from _compression.py. 2026-05-05 deep pass: HIGH fixed in #1498 -- predictor=2 (horizontal differencing) did byte-wise differencing instead of sample-wise; uint8 was correct but uint16/int16/uint32/int32 silently corrupted pixel values both on read of GDAL/libtiff/tifffile/rasterio-written files and on write (their readers saw garbage). CPU + GPU + writer all reworked to dispatch per-dtype kernels at sample resolution with stride=samples. LOW deferred: ModelTiepointTag with multiple tiepoints uses only the first 6 doubles (no warning); rare in practice but silently produces wrong georeferencing for control-point-georeferenced files. 2026-05-06 fourth pass: HIGH fixed in fix-sparse-cog-tile branch -- sparse COG tiles/strips (TileByteCounts/StripByteCounts == 0 from GDAL SPARSE_OK=TRUE) crashed local mmap reads ("incomplete or truncated stream"), returned uninitialised np.empty memory in the COG-HTTP path, and broke GPU reads. Fix: detect sparse blocks, pre-fill the result with the file's nodata sentinel (0 if unset), skip those entries in the decode loop. New tests cover tiled+stripped, with/without nodata, raw and accessor reads, plus GPU. Re-confirmed clean: PackBits decode against GDAL output, planar=2 + predictor=2 + uint16/uint32, non-tile-aligned dask streaming write, WhiteIsZero (PI=0) inversion. Deferred non-accuracy: tiled JPEG TIFFs from GDAL fail because reader does not inject JPEGTables (tag 347) into per-tile streams (interop crash, not numerical) and Orientation tag (274) is silently ignored. No CRIT findings.
 glcm,2026-05-01,1408,HIGH,2,"angle=None averaged NaN as 0, masking no-valid-pairs as zero texture; fixed via nanmean-style averaging"
 hillshade,2026-04-10T12:00:00Z,,,,"Horn's method correct. All backends consistent. NaN propagation correct. float32 adequate for [0,1] output."
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0cc2b4ec..2d3805ad 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
 -----------
 
 
+### Unreleased
+
+#### Bug fixes and improvements
+- Decode TIFF predictor=3 un-transpose by file byte order so big-endian floating-point TIFFs read back exactly
+
+
 ### Version 0.9.9 - 2026-05-05
 
 #### Bug fixes and improvements
diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py
index dcb382e1..3fad38e6 100644
--- a/xrspatial/geotiff/__init__.py
+++ b/xrspatial/geotiff/__init__.py
@@ -1401,6 +1401,7 @@ def read_geotiff_gpu(source: str, *,
                 source, offsets, byte_counts,
                 tw, th, width, height,
                 compression, predictor, file_dtype, samples,
+                byte_order=header.byte_order,
             )
         except Exception:
             pass
@@ -1423,6 +1424,7 @@ def read_geotiff_gpu(source: str, *,
                 compressed_tiles,
                 tw, th, width, height,
                 compression, predictor, file_dtype, samples,
+                byte_order=header.byte_order,
             )
         except (ValueError, Exception):
             # Unsupported compression -- fall back to CPU then transfer
diff --git a/xrspatial/geotiff/_compression.py b/xrspatial/geotiff/_compression.py
index ef604ed1..a8f88752 100644
--- a/xrspatial/geotiff/_compression.py
+++ b/xrspatial/geotiff/_compression.py
@@ -576,10 +576,18 @@ def predictor_encode(data: np.ndarray, width: int, height: int,
 # Decode: undo differencing, then un-transpose (lane b -> native byte bps-1-b).
 
 @ngjit
-def _fp_predictor_decode_row(row_data, width, bps):
+def _fp_predictor_decode_row(row_data, width, bps, big_endian):
     """Undo floating-point predictor for one row (in-place).
 
-    row_data: uint8 array of length width * bps
+    row_data: uint8 array of length width * bps.
+
+    Per TIFF Tech Note 3, lane 0 contains the most significant byte of
+    each sample regardless of the file's byte order.  After
+    un-transposing, the MSB has to land at the byte position that
+    corresponds to "most significant" *in the file's byte order*: index 0
+    for big-endian files and index ``bps-1`` for little-endian files.
+    The previous implementation always wrote MSB at index ``bps-1``,
+    which scrambled big-endian predictor=3 reads.
     """
     n = width * bps
 
@@ -587,26 +595,35 @@ def _fp_predictor_decode_row(row_data, width, bps):
     for i in range(1, n):
         row_data[i] = np.uint8((np.int32(row_data[i]) + np.int32(row_data[i - 1])) & 0xFF)
 
-    # Step 2: un-transpose bytes back to native sample order
+    # Step 2: un-transpose bytes back to the file's native sample order
     tmp = np.empty(n, dtype=np.uint8)
-    for sample in range(width):
-        for b in range(bps):
-            tmp[sample * bps + b] = row_data[(bps - 1 - b) * width + sample]
+    if big_endian:
+        # MSB lane (lane 0) -> byte index 0 in each output sample.
+        for sample in range(width):
+            for b in range(bps):
+                tmp[sample * bps + b] = row_data[b * width + sample]
+    else:
+        # MSB lane -> byte index ``bps-1`` (the high-order byte in LE).
+        for sample in range(width):
+            for b in range(bps):
+                tmp[sample * bps + b] = row_data[(bps - 1 - b) * width + sample]
     for i in range(n):
         row_data[i] = tmp[i]
 
 
 @ngjit
-def _fp_predictor_decode_rows(data, width, height, bps):
+def _fp_predictor_decode_rows(data, width, height, bps, big_endian):
     """Dispatch per-row decode from Numba, avoiding Python loop overhead."""
     row_len = width * bps
     for row in range(height):
         start = row * row_len
-        _fp_predictor_decode_row(data[start:start + row_len], width, bps)
+        _fp_predictor_decode_row(
+            data[start:start + row_len], width, bps, big_endian)
 
 
 def fp_predictor_decode(data: np.ndarray, width: int, height: int,
-                        bytes_per_sample: int) -> np.ndarray:
+                        bytes_per_sample: int,
+                        big_endian: bool = False) -> np.ndarray:
     """Undo floating-point predictor (predictor=3).
 
     Parameters
@@ -617,6 +634,10 @@ def fp_predictor_decode(data: np.ndarray, width: int, height: int,
         Tile/strip dimensions.
     bytes_per_sample : int
         Bytes per sample (e.g. 4 for float32, 8 for float64).
+    big_endian : bool
+        Whether the source file is big-endian (BOM ``MM``).  The
+        un-transpose puts the MSB lane at byte index 0 of each output
+        sample for BE files, and at byte index ``bps-1`` for LE files.
 
     Returns
     -------
@@ -624,7 +645,8 @@ def fp_predictor_decode(data: np.ndarray, width: int, height: int,
         Corrected array.
     """
     buf = np.ascontiguousarray(data)
-    _fp_predictor_decode_rows(buf, width, height, bytes_per_sample)
+    _fp_predictor_decode_rows(buf, width, height, bytes_per_sample,
+                              big_endian)
     return buf
 
 
diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index 0598a6f9..4cb5fe75 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -700,11 +700,14 @@ def _predictor_decode_kernel_u64(view, width, height, samples_per_pixel):
 
 
 @cuda.jit
-def _fp_predictor_decode_kernel(data, tmp, width, height, bps):
+def _fp_predictor_decode_kernel(data, tmp, width, height, bps, big_endian):
     """Undo floating-point predictor (predictor=3), one thread per row.
 
     data: flat uint8 device array
     tmp: scratch buffer, same size as data
+    big_endian: when True, place the MSB lane at byte index 0 of each
+        output sample (file is big-endian); when False, place it at
+        byte index ``bps-1`` (file is little-endian).
     """
     row = cuda.grid(1)
     if row >= height:
@@ -719,10 +722,17 @@ def _fp_predictor_decode_kernel(data, tmp, width, height, bps):
         data[idx] = numba_uint8(
             (numba_int32(data[idx]) + numba_int32(data[idx - 1])) & 0xFF)
 
-    # Step 2: un-transpose byte lanes (MSB-first) back to native order
-    for sample in range(width):
-        for b in range(bps):
-            tmp[start + sample * bps + b] = data[start + (bps - 1 - b) * width + sample]
+    # Step 2: un-transpose byte lanes back to the file's native sample
+    # order.  Lane 0 always contains the MSB byte (TIFF Tech Note 3); the
+    # MSB lands at byte index 0 (BE) or bps-1 (LE) of each output sample.
+    if big_endian:
+        for sample in range(width):
+            for b in range(bps):
+                tmp[start + sample * bps + b] = data[start + b * width + sample]
+    else:
+        for sample in range(width):
+            for b in range(bps):
+                tmp[start + sample * bps + b] = data[start + (bps - 1 - b) * width + sample]
 
     # Copy back
     for i in range(row_len):
@@ -1315,6 +1325,7 @@ def gpu_decode_tiles_from_file(
     predictor: int,
     dtype: np.dtype,
     samples: int = 1,
+    byte_order: str = '<',
 ):
     """Decode tiles from a file, using GDS if available.
 
@@ -1345,7 +1356,8 @@ def gpu_decode_tiles_from_file(
                 return _apply_predictor_and_assemble(
                     d_decomp, d_decomp_offsets, len(d_tiles),
                     tile_width, tile_height, image_width, image_height,
-                    predictor, dtype, samples, tile_bytes)
+                    predictor, dtype, samples, tile_bytes,
+                    byte_order=byte_order)
 
         # GDS read succeeded but nvCOMP can't decompress on GPU,
         # or it's LZW/deflate. Copy tiles to host and use normal path.
@@ -1357,7 +1369,8 @@ def gpu_decode_tiles_from_file(
 
     return gpu_decode_tiles(
         compressed_tiles, tile_width, tile_height,
-        image_width, image_height, compression, predictor, dtype, samples)
+        image_width, image_height, compression, predictor, dtype, samples,
+        byte_order=byte_order)
 
 
 def _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, compression):
@@ -1493,11 +1506,13 @@ def _gpu_predictor2_encode(d_decomp, tile_width, total_rows, dtype, samples):
 def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
                                     tile_width, tile_height,
                                     image_width, image_height,
-                                    predictor, dtype, samples, tile_bytes):
+                                    predictor, dtype, samples, tile_bytes,
+                                    byte_order: str = '<'):
     """Apply predictor decode and tile assembly on GPU."""
     import cupy
 
     bytes_per_pixel = dtype.itemsize * samples
+    big_endian = (byte_order == '>')
 
     if predictor == 2:
         total_rows = n_tiles * tile_height
@@ -1512,7 +1527,8 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
         bpg = math.ceil(total_rows / tpb)
         d_tmp = cupy.empty_like(d_decomp)
         _fp_predictor_decode_kernel[bpg, tpb](
-            d_decomp, d_tmp, tile_width * samples, total_rows, dtype.itemsize)
+            d_decomp, d_tmp, tile_width * samples, total_rows,
+            dtype.itemsize, big_endian)
         cuda.synchronize()
 
     tiles_across = math.ceil(image_width / tile_width)
@@ -1532,10 +1548,15 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
     cuda.synchronize()
 
     if samples > 1:
-        return d_output.view(dtype=cupy.dtype(dtype)).reshape(
+        out = d_output.view(dtype=cupy.dtype(dtype)).reshape(
             image_height, image_width, samples)
-    return d_output.view(dtype=cupy.dtype(dtype)).reshape(
-        image_height, image_width)
+    else:
+        out = d_output.view(dtype=cupy.dtype(dtype)).reshape(
+            image_height, image_width)
+    if big_endian and dtype.itemsize > 1:
+        # See gpu_decode_tiles for why BE samples need a final byteswap.
+        out = out.byteswap()
+    return out
 
 
 def gpu_decode_tiles(
@@ -1548,6 +1569,7 @@ def gpu_decode_tiles(
     predictor: int,
     dtype: np.dtype,
     samples: int = 1,
+    byte_order: str = '<',
 ):
     """Decode and assemble TIFF tiles entirely on GPU.
 
@@ -1759,7 +1781,8 @@ def gpu_decode_tiles(
         bpg = math.ceil(total_rows / tpb)
         d_tmp = cupy.empty_like(d_decomp)
         _fp_predictor_decode_kernel[bpg, tpb](
-            d_decomp, d_tmp, tile_width * samples, total_rows, dtype.itemsize)
+            d_decomp, d_tmp, tile_width * samples, total_rows,
+            dtype.itemsize, byte_order == '>')
         cuda.synchronize()
 
     # Assemble tiles into output image on GPU
@@ -1781,10 +1804,18 @@ def gpu_decode_tiles(
 
     # Reshape to image
     if samples > 1:
-        return d_output.view(dtype=cupy.dtype(dtype)).reshape(
+        out = d_output.view(dtype=cupy.dtype(dtype)).reshape(
             image_height, image_width, samples)
-    return d_output.view(dtype=cupy.dtype(dtype)).reshape(
-        image_height, image_width)
+    else:
+        out = d_output.view(dtype=cupy.dtype(dtype)).reshape(
+            image_height, image_width)
+    # The decoded byte stream is in the file's byte order; cupy view
+    # interprets it as native (always little-endian on supported GPUs),
+    # so big-endian samples that are wider than a byte must be swapped
+    # back to native before the values mean anything.
+    if byte_order == '>' and dtype.itemsize > 1:
+        out = out.byteswap()
+    return out
 
 
 # ---------------------------------------------------------------------------
diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py
index aacf89a7..a7ce2359 100644
--- a/xrspatial/geotiff/_reader.py
+++ b/xrspatial/geotiff/_reader.py
@@ -358,9 +358,9 @@ def _apply_predictor(chunk: np.ndarray, pred: int, width: int,
 
     Predictor=3 (floating-point) byte-swizzles each row into
     ``bytes_per_sample`` interleaved lanes of length ``width * samples``,
-    per TIFF Technical Note 3.  Passing ``bytes_per_sample * samples`` as
-    the lane count (the pre-fix behaviour) swizzles over the wrong lane
-    count and scrambles multi-band pixel values.
+    per TIFF Technical Note 3.  The un-transpose stage has to put the
+    MSB lane at the file's high-order byte position, which differs for
+    big- vs little-endian files; ``byte_order`` carries that.
     """
     if pred == 2:
         return predictor_decode(chunk, width, height,
@@ -368,7 +368,8 @@ def _apply_predictor(chunk: np.ndarray, pred: int, width: int,
                                 byte_order=byte_order)
     elif pred == 3:
         return fp_predictor_decode(chunk, width * samples, height,
-                                   bytes_per_sample)
+                                   bytes_per_sample,
+                                   big_endian=(byte_order == '>'))
     return chunk
 
 
diff --git a/xrspatial/geotiff/tests/test_predictor3_big_endian.py b/xrspatial/geotiff/tests/test_predictor3_big_endian.py
new file mode 100644
index 00000000..aec4c093
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_predictor3_big_endian.py
@@ -0,0 +1,92 @@
+"""Predictor=3 big-endian round-trip tests.
+
+The floating-point predictor (TIFF Tech Note 3) byte-swizzles each row
+into ``bytes_per_sample`` lanes, MSB-first.  The decoder un-transposes
+those lanes back to the file's native byte order; the byte position of
+the MSB differs between BE (index 0) and LE (index ``bps-1``).
+
+Before the fix the un-transpose always wrote MSB at index ``bps-1``, so
+big-endian predictor=3 files decoded to garbage values even though they
+came back as a clean ``float32``/``float64`` array (no error, just wrong
+numbers - the worst kind of bug for raster data).
+"""
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from xrspatial.geotiff._reader import read_to_array
+
+tifffile = pytest.importorskip("tifffile")
+pytest.importorskip("imagecodecs")  # tifffile needs it for predictor=3
+
+
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_big_endian_predictor3_round_trip(tmp_path, dtype):
+    """xrspatial reads a big-endian predictor=3 file exactly."""
+    arr = np.array(
+        [
+            [1.0, 2.0, 3.0, 4.0],
+            [5.0, 6.0, 7.0, 8.0],
+            [1.5, 2.5, 3.5, 4.5],
+            [10.0, 20.0, 30.0, 40.0],
+        ],
+        dtype=dtype,
+    )
+    path = tmp_path / f"be_pred3_{np.dtype(dtype).name}.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=3, compression="deflate")
+
+    out, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(out, arr)
+
+
+def test_little_endian_predictor3_still_round_trips(tmp_path):
+    """Sanity: the BE fix did not break the LE path."""
+    arr = np.linspace(-100.0, 100.0, 64, dtype=np.float32).reshape(8, 8)
+    path = tmp_path / "le_pred3_f32.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder="<", predictor=3, compression="deflate")
+
+    out, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(out, arr)
+
+
+def test_big_endian_predictor3_tiled(tmp_path):
+    """BE predictor=3 with tiled layout, multiple tiles per row."""
+    rng = np.random.RandomState(20260506)
+    arr = rng.standard_normal((32, 48)).astype(np.float32)
+    path = tmp_path / "be_pred3_tiled.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=3,
+        compression="deflate", tile=(16, 16))
+
+    out, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(out, arr)
+
+
+def test_big_endian_predictor3_gpu(tmp_path):
+    """The GPU decode path also handles big-endian predictor=3."""
+    cupy = pytest.importorskip("cupy")
+    if not cupy.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    from xrspatial.geotiff import open_geotiff
+
+    rng = np.random.RandomState(20260506)
+    arr = rng.standard_normal((32, 48)).astype(np.float32)
+    path = tmp_path / "be_pred3_gpu.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=3,
+        compression="deflate", tile=(16, 16))
+
+    cpu = open_geotiff(str(path)).values
+    np.testing.assert_array_equal(cpu, arr)
+
+    gpu_da = open_geotiff(str(path), gpu=True)
+    gpu_arr = gpu_da.data
+    if hasattr(gpu_arr, "get"):
+        gpu_arr = gpu_arr.get()
+    else:
+        gpu_arr = np.asarray(gpu_arr)
+    np.testing.assert_array_equal(gpu_arr, arr)