From b867c1283c7aa914bab9ba073e008422f3ef169f Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Fri, 8 May 2026 13:57:50 -0700
Subject: [PATCH 1/2] Apply LERC valid-mask in GPU decode path (depends on
 #1529)

The CPU LERC reader from #1529 honours the LERC valid-mask and writes
the file's nodata sentinel into masked pixels. The GPU LERC tile-decode
path was still dropping the mask, so masked pixels read back as 0 on
GPU but as NaN or the sentinel on CPU. Same bug, GPU side.

Changes:

_gpu_decode.py: the LERC branch now calls lerc_decompress_with_mask
per tile and keeps any returned valid-mask. After predictor decode and
tile assembly, _apply_lerc_mask_fill builds an invalid mask on host
(matching the GPU assembly kernel's tile-grid layout), copies it to
GPU once, and overwrites masked positions with the resolved fill
value. Tiles LERC reports as fully valid skip the host work, so the
no-mask path stays zero-copy.

gpu_decode_tiles and gpu_decode_tiles_from_file get a masked_fill
keyword that is forwarded through. read_geotiff_gpu computes it via
_resolve_masked_fill(ifd.nodata_str, file_dtype) for LERC sources.

tests/test_lerc_valid_mask_gpu.py: 4 tests covering float32+NaN,
float32+sentinel, uint16+sentinel, and the no-mask regression. Each
compares read_geotiff_gpu output to read_to_array output for the same
file. Skipped unless cupy + CUDA + lerc are available.

Out of scope: the encode side. The xrspatial writer still hard-codes
hasMask=False; the tests reuse the lerc_compress monkeypatch fixture
from the CPU PR to inject a valid-mask through lerc.encode directly.
---
 xrspatial/geotiff/__init__.py                 |  14 ++
 xrspatial/geotiff/_gpu_decode.py              | 105 +++++++++-
 .../geotiff/tests/test_lerc_valid_mask_gpu.py | 181 ++++++++++++++++++
 3 files changed, 295 insertions(+), 5 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py

diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py
index bd7d97fe2..4b27f245d 100644
--- a/xrspatial/geotiff/__init__.py
+++ b/xrspatial/geotiff/__init__.py
@@ -1617,7 +1617,9 @@ def read_geotiff_gpu(source: str, *,
 
     from ._reader import (
         _FileSource, _check_dimensions, MAX_PIXELS_DEFAULT, _coerce_path,
+        _resolve_masked_fill,
     )
+    from ._compression import COMPRESSION_LERC
     from ._header import (
         parse_header, parse_all_ifds, select_overview_ifd, validate_tile_layout,
     )
@@ -1709,6 +1711,16 @@ def read_geotiff_gpu(source: str, *,
     # Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline;
     # the CPU reader fills them with nodata and copies onto the GPU.
     has_sparse_tile = any(bc == 0 for bc in byte_counts)
+    # LERC tiles can carry a per-pixel valid mask that GDAL writes
+    # zero-filled in the data array.  Compute the nodata fill the same
+    # way the CPU reader does so the GPU decode path can restore it
+    # post-assembly (mirrors PR #1529 for the CPU path). Only the
+    # chunky (planar=1) GPU path threads masked_fill into its kernel
+    # call below; the planar=2 per-band branch falls back to the CPU
+    # reader for masked pixels (rare in practice -- LERC files
+    # typically use chunky layout).
+    masked_fill = (_resolve_masked_fill(ifd.nodata_str, file_dtype)
+                   if compression == COMPRESSION_LERC else None)
 
     # PlanarConfiguration=2 (separate bands): each band has its own list
     # of tiles back-to-back in TileOffsets / TileByteCounts. The GPU
@@ -1795,6 +1807,7 @@ def _read_once():
                 tw, th, width, height,
                 compression, predictor, file_dtype, samples,
                 byte_order=header.byte_order,
+                masked_fill=masked_fill,
             )
         except Exception as e:
             if gpu == 'strict':
@@ -1826,6 +1839,7 @@ def _read_once():
                 tw, th, width, height,
                 compression, predictor, file_dtype, samples,
                 byte_order=header.byte_order,
+                masked_fill=masked_fill,
             )
         except Exception as e:
             if gpu == 'strict':
diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index b2c51e09f..a230cc23b 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -1452,6 +1452,7 @@ def gpu_decode_tiles_from_file(
     dtype: np.dtype,
     samples: int = 1,
     byte_order: str = '<',
+    masked_fill=None,
 ):
     """Decode tiles from a file, using GDS if available.
 
@@ -1496,7 +1497,7 @@ def gpu_decode_tiles_from_file(
     return gpu_decode_tiles(
         compressed_tiles, tile_width, tile_height,
         image_width, image_height, compression, predictor, dtype, samples,
-        byte_order=byte_order)
+        byte_order=byte_order, masked_fill=masked_fill)
 
 
 def _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, compression):
@@ -1711,6 +1712,7 @@ def gpu_decode_tiles(
     dtype: np.dtype,
     samples: int = 1,
     byte_order: str = '<',
+    masked_fill=None,
 ):
     """Decode and assemble TIFF tiles entirely on GPU.
 
@@ -1730,6 +1732,12 @@ def gpu_decode_tiles(
         Output pixel dtype.
     samples : int
         Samples per pixel.
+    masked_fill : scalar or None
+        Value to write into pixels that LERC reports as invalid.  Only
+        consulted for LERC-compressed inputs (tag 34887).  ``None``
+        leaves any masked pixels at LERC's zero fill.  Use
+        ``_resolve_masked_fill(ifd.nodata_str, dtype)`` from
+        ``_reader.py`` to mirror the CPU reader's nodata semantics.
 
     Returns
     -------
@@ -1742,6 +1750,12 @@ def gpu_decode_tiles(
     bytes_per_pixel = dtype.itemsize * samples
     tile_bytes = tile_width * tile_height * bytes_per_pixel
 
+    # Per-tile LERC valid masks; populated only by the LERC branch
+    # below.  Kept as a flat local so the post-assembly fill block at
+    # the end of this function can find it regardless of which decode
+    # branch ran.
+    _lerc_masks: list[np.ndarray | None] | None = None
+
     # Try nvCOMP batch decompression first (much faster if available)
     nvcomp_result = _try_nvcomp_batch_decompress(
         compressed_tiles, tile_bytes, compression)
@@ -1869,19 +1883,34 @@ def gpu_decode_tiles(
             d_decomp_offsets = cupy.asarray(decomp_offsets)
 
     elif compression == 34887:  # LERC
-        from ._compression import lerc_decompress
+        from ._compression import lerc_decompress_with_mask
         raw_host = np.empty(n_tiles * tile_bytes, dtype=np.uint8)
+        # Per-tile valid masks captured from LERC.  None entries mean
+        # "all valid" (LERC returned no mask, or an all-True mask).
+        # The host-side mask buffer is materialised lazily: it stays
+        # None until at least one tile reports a real partial mask.
+        per_tile_masks: list[np.ndarray | None] = [None] * n_tiles
+        any_lerc_mask = False
         for i, tile in enumerate(compressed_tiles):
             start = i * tile_bytes
-            chunk = np.frombuffer(
-                lerc_decompress(tile, tile_width, tile_height, samples),
-                dtype=np.uint8)
+            decoded_bytes, valid_mask = lerc_decompress_with_mask(tile)
+            chunk = np.frombuffer(decoded_bytes, dtype=np.uint8)
             raw_host[start:start + min(len(chunk), tile_bytes)] = \
                 chunk[:tile_bytes] if len(chunk) >= tile_bytes else \
                 np.pad(chunk, (0, tile_bytes - len(chunk)))
+            if valid_mask is not None:
+                per_tile_masks[i] = np.asarray(valid_mask)
+                any_lerc_mask = True
         d_decomp = cupy.asarray(raw_host)
         decomp_offsets = np.arange(n_tiles, dtype=np.int64) * tile_bytes
         d_decomp_offsets = cupy.asarray(decomp_offsets)
+        if any_lerc_mask:
+            # Stash per-tile masks for the post-assembly fill pass below.
+            # Stored in a ``_lerc_masks`` local that the LERC fill block
+            # picks up after predictor decode and tile assembly.
+            _lerc_masks = per_tile_masks
+        else:
+            _lerc_masks = None
 
     elif compression == 1:  # Uncompressed
         raw_host = np.empty(n_tiles * tile_bytes, dtype=np.uint8)
@@ -1965,6 +1994,72 @@ def gpu_decode_tiles(
     if byte_order == '>' and dtype.itemsize > 1 and predictor != 2:
         # cupy.ndarray has no .byteswap(), so use the dtype-view helper.
         out = _xp_byteswap(out)
+
+    # LERC valid-mask fill: GDAL writes LERC TIFFs with masked pixels
+    # zero-filled in the data array, so without restoring nodata here a
+    # masked pixel reads back as a real zero measurement.  Mirrors the
+    # CPU path in ``_decode_strip_or_tile`` (PR #1529).
+    if _lerc_masks is not None and masked_fill is not None:
+        out = _apply_lerc_mask_fill(
+            out, _lerc_masks, tile_width, tile_height,
+            image_width, image_height, samples, masked_fill)
+    return out
+
+
+def _apply_lerc_mask_fill(out, per_tile_masks, tile_width, tile_height,
+                          image_width, image_height, samples, masked_fill):
+    """Write *masked_fill* into LERC-invalid positions of an assembled image.
+
+    Each tile contributes either an ``(h, w)``/``(h, w, samples)`` valid
+    mask (1=valid, 0=invalid) or ``None`` for "all valid".  The host
+    builds a single assembled boolean invalid-mask sized to the output
+    image, transfers it to the GPU once, and uses it to overwrite
+    masked positions on device.
+    """
+    import cupy
+
+    n_tiles = len(per_tile_masks)
+    tiles_across = math.ceil(image_width / tile_width)
+    tiles_down = math.ceil(image_height / tile_height)
+    if n_tiles != tiles_across * tiles_down:
+        # Defensive: tile grid mismatch means we cannot place masks
+        # safely.  Skip the fill rather than corrupt the output.
+        return out
+
+    invalid = np.zeros((image_height, image_width), dtype=bool)
+    for idx, mask in enumerate(per_tile_masks):
+        if mask is None:
+            continue
+        tr = idx // tiles_across
+        tc = idx % tiles_across
+        y0 = tr * tile_height
+        x0 = tc * tile_width
+        # Trim the tile mask to the visible portion of the image so
+        # right- and bottom-edge tile padding does not leak into the
+        # assembled mask.
+        h = min(tile_height, image_height - y0)
+        w = min(tile_width, image_width - x0)
+        if h <= 0 or w <= 0:
+            continue
+        m = np.asarray(mask)
+        # LERC may report the mask as (h, w) or (h, w, samples); for the
+        # invalid-fill we collapse multi-sample masks via "any sample
+        # invalid".
+        if m.ndim == 3:
+            m2 = (m == 0).any(axis=2)
+        else:
+            m2 = (m == 0)
+        invalid[y0:y0 + h, x0:x0 + w] = m2[:h, :w]
+
+    if not invalid.any():
+        return out
+
+    d_invalid = cupy.asarray(invalid)
+    if out.ndim == 3:
+        # Broadcast (H, W) mask across the sample axis.
+        out[d_invalid, ...] = out.dtype.type(masked_fill)
+    else:
+        out[d_invalid] = out.dtype.type(masked_fill)
     return out
 
 
diff --git a/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py b/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py
new file mode 100644
index 000000000..0f39697bf
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py
@@ -0,0 +1,181 @@
+"""GPU follow-up to PR #1529 (LERC valid-mask on decode).
+
+The CPU LERC reader honours the LERC valid-mask and writes the file's
+nodata sentinel into masked pixels.  The GPU LERC tile-decode path used
+to discard the mask, so masked pixels read back as LERC's zero fill
+(real-looking measurements at z == 0) on GPU but as NaN/sentinel on
+CPU.  These tests confirm the GPU path now matches the CPU path for
+representative LERC mask combinations.
+
+Mirrors the structure of ``test_lerc_valid_mask.py`` but compares
+``read_geotiff_gpu`` output to ``read_to_array`` output for each case.
+"""
+from __future__ import annotations
+
+import importlib.util
+
+import numpy as np
+import pytest
+
+lerc = pytest.importorskip("lerc")
+
+from xrspatial.geotiff._compression import LERC_AVAILABLE  # noqa: E402
+
+
+def _gpu_available() -> bool:
+    """True if cupy is importable and CUDA is initialised."""
+    if importlib.util.find_spec("cupy") is None:
+        return False
+    try:
+        import cupy
+        return bool(cupy.cuda.is_available())
+    except Exception:
+        return False
+
+
+_HAS_GPU = _gpu_available()
+_gpu_only = pytest.mark.skipif(
+    not (_HAS_GPU and LERC_AVAILABLE),
+    reason="cupy + CUDA + lerc required",
+)
+
+
+@pytest.fixture
+def lerc_writer_with_mask(monkeypatch):
+    """Patch ``lerc_compress`` to embed a valid-mask the writer can't pass.
+
+    The xrspatial writer hard-codes ``hasMask=False`` in its call to
+    ``lerc.encode``.  Tests inject a per-tile mask through this holder's
+    ``invalid`` predicate so the masked pixels survive the encode and
+    show up at decode time.  Same pattern as the CPU test fixture in
+    ``test_lerc_valid_mask.py``.
+    """
+    holder = {"invalid": None}
+
+    def _patched(data, width, height, samples=1,
+                 dtype=np.dtype('float32'), max_z_error=0.0):
+        if samples == 1:
+            arr = np.frombuffer(data, dtype=dtype).reshape(height, width)
+        else:
+            arr = np.frombuffer(data, dtype=dtype).reshape(
+                height, width, samples)
+        invalid_pred = holder["invalid"]
+        if invalid_pred is None:
+            mask = None
+            has_mask = False
+        else:
+            invalid = invalid_pred(arr)
+            mask = np.where(invalid, np.uint8(0), np.uint8(1))
+            has_mask = True
+        result = lerc.encode(arr, samples, has_mask, mask, max_z_error, 1)
+        if result[0] != 0:
+            raise RuntimeError(
+                f"LERC encode failed with error code {result[0]}")
+        return bytes(result[2])
+
+    monkeypatch.setattr(
+        "xrspatial.geotiff._compression.lerc_compress", _patched,
+    )
+    return holder
+
+
+def _read_cpu_gpu(path):
+    """Read *path* with both readers and return ``(cpu_array, gpu_host_array)``."""
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    cpu, _geo = read_to_array(path)
+    gpu_da = read_geotiff_gpu(path, gpu='strict')
+    gpu_host = gpu_da.data.get()
+    return cpu, gpu_host
+
+
+@_gpu_only
+class TestGpuLercValidMask:
+    """End-to-end TIFF round-trips comparing GPU vs CPU output."""
+
+    def test_float32_nan_nodata(self, tmp_path, lerc_writer_with_mask):
+        """Float32 LERC + NaN nodata: GPU output matches CPU output."""
+        from xrspatial.geotiff._writer import write
+
+        arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
+        invalid_positions = {(0, 1), (5, 4)}
+
+        def invalid_pred(a):
+            m = np.zeros(a.shape[:2], dtype=bool)
+            for r, c in invalid_positions:
+                m[r, c] = True
+            return m
+        lerc_writer_with_mask["invalid"] = invalid_pred
+
+        path = str(tmp_path / "lerc_mask_nan_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8,
+              nodata=float("nan"))
+
+        cpu, gpu = _read_cpu_gpu(path)
+        # NaN positions
+        for (r, c) in invalid_positions:
+            assert np.isnan(cpu[r, c])
+            assert np.isnan(gpu[r, c])
+        # Valid positions agree exactly
+        cpu_valid = np.where(np.isnan(cpu), 0.0, cpu)
+        gpu_valid = np.where(np.isnan(gpu), 0.0, gpu)
+        np.testing.assert_array_equal(cpu_valid, gpu_valid)
+
+    def test_float32_sentinel_nodata(self, tmp_path, lerc_writer_with_mask):
+        """Float32 LERC + sentinel nodata (-9999): GPU matches CPU."""
+        from xrspatial.geotiff._writer import write
+
+        arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
+        invalid_positions = {(0, 1), (3, 3), (7, 7)}
+
+        def invalid_pred(a):
+            m = np.zeros(a.shape[:2], dtype=bool)
+            for r, c in invalid_positions:
+                m[r, c] = True
+            return m
+        lerc_writer_with_mask["invalid"] = invalid_pred
+
+        path = str(tmp_path / "lerc_mask_sentinel_f32_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8,
+              nodata=-9999.0)
+
+        cpu, gpu = _read_cpu_gpu(path)
+        np.testing.assert_array_equal(cpu, gpu)
+        for (r, c) in invalid_positions:
+            assert gpu[r, c] == np.float32(-9999.0)
+
+    def test_uint16_sentinel_nodata(self, tmp_path, lerc_writer_with_mask):
+        """Uint16 LERC + sentinel nodata (65535): GPU matches CPU."""
+        from xrspatial.geotiff._writer import write
+
+        arr = (np.arange(1, 65, dtype=np.uint16) * 100).reshape(8, 8)
+        invalid_positions = {(0, 1), (4, 4)}
+
+        def invalid_pred(a):
+            m = np.zeros(a.shape[:2], dtype=bool)
+            for r, c in invalid_positions:
+                m[r, c] = True
+            return m
+        lerc_writer_with_mask["invalid"] = invalid_pred
+
+        path = str(tmp_path / "lerc_mask_uint16_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8,
+              nodata=65535)
+
+        cpu, gpu = _read_cpu_gpu(path)
+        np.testing.assert_array_equal(cpu, gpu)
+        for (r, c) in invalid_positions:
+            assert gpu[r, c] == np.uint16(65535)
+
+    def test_no_mask_roundtrip_bitexact(self, tmp_path):
+        """All-valid LERC (no encoded mask): GPU and CPU agree bit-exact."""
+        from xrspatial.geotiff._writer import write
+
+        arr = np.arange(64, dtype=np.float32).reshape(8, 8)
+        path = str(tmp_path / "lerc_no_mask_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8)
+
+        cpu, gpu = _read_cpu_gpu(path)
+        np.testing.assert_array_equal(cpu, arr)
+        np.testing.assert_array_equal(gpu, arr)

From bf4fdb8000af22f958d2e31a927ee038dd9df27c Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Sat, 9 May 2026 14:30:49 -0700
Subject: [PATCH 2/2] Account for LERC mask buffers in GPU memory budget

_apply_lerc_mask_fill bypassed _check_gpu_memory when allocating the
H*W boolean invalid mask and the cupy temporary backing the boolean
indexing assignment. On near-budget decodes that pushed the GPU into
OOM during the mask transfer rather than failing fast with the usual
xrspatial budget message. Add explicit budget checks for both the
mask buffer and the index temporary (worst-case one int64 per pixel)
so the LERC path stays under the same envelope as the rest of the
GPU decode.
---
 xrspatial/geotiff/_gpu_decode.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index a230cc23b..ac8ba4bb6 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -2054,6 +2054,14 @@ def _apply_lerc_mask_fill(out, per_tile_masks, tile_width, tile_height,
     if not invalid.any():
         return out
 
+    # Account for the boolean mask buffer up front so a borderline-OK
+    # decode doesn't tip into CUDA OOM on the mask transfer. Boolean
+    # indexing on cupy materialises a temporary index array (typically
+    # int64 of length sum(invalid)); cap that at the worst case of
+    # one int64 per pixel so the budget covers both buffers.
+    _check_gpu_memory(invalid.nbytes, what="LERC valid-mask buffer")
+    _check_gpu_memory(invalid.size * 8, what="LERC mask index buffer")
+
     d_invalid = cupy.asarray(invalid)
     if out.ndim == 3:
         # Broadcast (H, W) mask across the sample axis.