Skip to content

Commit 907a3cd

Browse files
authored
Improve polygonize performance: single-pass tracing, JIT merge helpers, batch shapely (#1010)
* Improve polygonize performance: single-pass tracing, JIT merge helpers, batch shapely (#1008) Replace the two-pass _follow with a single-pass implementation using a dynamically-grown buffer. This eliminates retracing every polygon boundary a second time, which was the dominant cost for rasters with many small regions. Add @ngjit to _point_in_ring, _simplify_ring, and _signed_ring_area so the dask chunk-merge path runs compiled instead of interpreted. Use shapely.polygons() batch constructor for hole-free polygons in _to_geopandas (shapely 2.0+, with fallback for older versions). * Add performance regression tests for polygonize (#1008) - Buffer growth: snake-shaped polygon with >64 boundary points - JIT merge helpers: direct tests of _simplify_ring, _signed_ring_area, _point_in_ring - Dask merge: checkerboard pattern forcing many boundary merges - Geopandas batch: mixed hole-free and holed polygons through the shapely.polygons() batch path * Revert single-pass _follow, keep JIT merge helpers and batch shapely (#1008) Benchmarks showed the single-pass _follow was 15-30% slower than the original two-pass version. The buffer growth check in the inner loop adds overhead that numba doesn't optimize away, and the two-pass approach benefits from the data being in cache on the second pass. Reverted _follow to the original two-pass implementation. The JIT merge helpers (2.3-2.6x dask speedup) and batch shapely construction (1.3-1.6x geopandas speedup) are kept.
1 parent be5f984 commit 907a3cd

File tree

2 files changed

+125
-13
lines changed

2 files changed

+125
-13
lines changed

xrspatial/polygonize.py

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -474,11 +474,34 @@ def _to_geopandas(
474474
column_name: str,
475475
):
476476
import geopandas as gpd
477+
import shapely
477478
from shapely.geometry import Polygon
478479

479-
# Convert list of point arrays to shapely Polygons.
480-
polygons = list(map(
481-
lambda points: Polygon(points[0], points[1:]), polygon_points))
480+
if hasattr(shapely, 'polygons'):
481+
# Shapely 2.0+: batch-construct hole-free polygons via
482+
# linearrings -> polygons pipeline (both are C-level batch ops).
483+
no_holes = [i for i, pts in enumerate(polygon_points)
484+
if len(pts) == 1]
485+
486+
if len(no_holes) == len(polygon_points):
487+
# All hole-free: batch create LinearRings then Polygons.
488+
rings = [shapely.linearrings(pts[0])
489+
for pts in polygon_points]
490+
polygons = list(shapely.polygons(rings))
491+
else:
492+
polygons = [None] * len(polygon_points)
493+
if no_holes:
494+
rings = [shapely.linearrings(polygon_points[i][0])
495+
for i in no_holes]
496+
batch = shapely.polygons(rings)
497+
for idx, poly in zip(no_holes, batch):
498+
polygons[idx] = poly
499+
for i, pts in enumerate(polygon_points):
500+
if polygons[i] is None:
501+
polygons[i] = Polygon(pts[0], pts[1:])
502+
else:
503+
# Shapely < 2.0 fallback.
504+
polygons = [Polygon(pts[0], pts[1:]) for pts in polygon_points]
482505

483506
df = gpd.GeoDataFrame({column_name: column, "geometry": polygons})
484507
return df
@@ -823,32 +846,40 @@ def _trace_rings(edge_set):
823846
return rings
824847

825848

849+
@ngjit
826850
def _simplify_ring(ring):
827851
"""Remove collinear (redundant) vertices from a closed ring."""
828852
n = len(ring) - 1 # exclude closing point
829853
if n <= 3:
830854
return ring
831-
keep = []
855+
# Single pass into pre-allocated output.
856+
out = np.empty((n + 1, 2), dtype=np.float64)
857+
count = 0
832858
for i in range(n):
833-
prev = ring[(i - 1) % n]
834-
curr = ring[i]
835-
nxt = ring[(i + 1) % n]
836-
if not ((prev[0] == curr[0] == nxt[0]) or
837-
(prev[1] == curr[1] == nxt[1])):
838-
keep.append(curr)
839-
if len(keep) < n:
840-
keep.append(keep[0])
841-
return np.array(keep, dtype=np.float64)
859+
pi = (i - 1) % n
860+
ni = (i + 1) % n
861+
if not ((ring[pi, 0] == ring[i, 0] == ring[ni, 0]) or
862+
(ring[pi, 1] == ring[i, 1] == ring[ni, 1])):
863+
out[count, 0] = ring[i, 0]
864+
out[count, 1] = ring[i, 1]
865+
count += 1
866+
if count < n:
867+
out[count, 0] = out[0, 0]
868+
out[count, 1] = out[0, 1]
869+
count += 1
870+
return out[:count].copy()
842871
return ring
843872

844873

874+
@ngjit
845875
def _signed_ring_area(ring):
846876
"""Shoelace formula for signed area of a closed ring."""
847877
x = ring[:, 0]
848878
y = ring[:, 1]
849879
return 0.5 * (np.dot(x[:-1], y[1:]) - np.dot(x[1:], y[:-1]))
850880

851881

882+
@ngjit
852883
def _point_in_ring(px, py, ring):
853884
"""Ray-casting point-in-polygon test."""
854885
n = len(ring) - 1

xrspatial/tests/test_polygonize.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,3 +457,84 @@ def test_polygonize_dask_cupy_matches_numpy(chunks):
457457
assert val in areas_dcp, f"Value {val} missing from dask+cupy result"
458458
assert_allclose(areas_dcp[val], areas_np[val],
459459
err_msg=f"Area mismatch for value {val}")
460+
461+
462+
# --- Performance-related regression tests (#1008) ---
463+
464+
def test_polygonize_1008_jit_merge_helpers():
465+
"""JIT-compiled _simplify_ring, _signed_ring_area, _point_in_ring."""
466+
from ..polygonize import _point_in_ring, _signed_ring_area, _simplify_ring
467+
468+
# Unit square: CCW exterior.
469+
square = np.array([
470+
[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]], dtype=np.float64)
471+
472+
assert_allclose(_signed_ring_area(square), 1.0)
473+
474+
# Point inside.
475+
assert _point_in_ring(0.5, 0.5, square) is True
476+
# Point outside.
477+
assert _point_in_ring(2.0, 2.0, square) is False
478+
479+
# Square with collinear midpoints on each edge.
480+
with_collinear = np.array([
481+
[0, 0], [0.5, 0], [1, 0], [1, 0.5], [1, 1],
482+
[0.5, 1], [0, 1], [0, 0.5], [0, 0]], dtype=np.float64)
483+
simplified = _simplify_ring(with_collinear)
484+
# Should remove the midpoints, leaving 4 corners + closing point.
485+
assert simplified.shape == (5, 2)
486+
assert_allclose(_signed_ring_area(simplified), 1.0)
487+
488+
# Ring with no collinear points should be returned unchanged.
489+
triangle = np.array([
490+
[0, 0], [2, 0], [1, 2], [0, 0]], dtype=np.float64)
491+
assert _simplify_ring(triangle) is triangle
492+
493+
494+
@dask_array_available
495+
def test_polygonize_1008_dask_merge_many_boundary_polygons():
496+
"""Dask merge with many boundary-crossing polygons of the same value.
497+
498+
Checkerboard pattern in small chunks forces many boundary polygons
499+
through the merge path, exercising the JIT-compiled helpers.
500+
"""
501+
# 8x8 checkerboard, chunks of 4x4.
502+
data = np.zeros((8, 8), dtype=np.int32)
503+
data[::2, ::2] = 1
504+
data[1::2, 1::2] = 1
505+
506+
raster_np = xr.DataArray(data)
507+
vals_np, polys_np = polygonize(raster_np, connectivity=4)
508+
areas_np = _area_by_value(vals_np, polys_np)
509+
510+
raster_da = xr.DataArray(da.from_array(data, chunks=(4, 4)))
511+
vals_da, polys_da = polygonize(raster_da, connectivity=4)
512+
areas_da = _area_by_value(vals_da, polys_da)
513+
514+
for val in areas_np:
515+
assert val in areas_da
516+
assert_allclose(areas_da[val], areas_np[val],
517+
err_msg=f"Area mismatch for value {val}")
518+
519+
520+
@pytest.mark.skipif(gpd is None, reason="geopandas not installed")
521+
def test_polygonize_1008_geopandas_batch_with_holes():
522+
"""Batch shapely construction: mix of hole-free and holed polygons."""
523+
# Outer ring of 0s with inner block of 1s containing a 2 (hole in 1).
524+
data = np.zeros((6, 6), dtype=np.int32)
525+
data[1:5, 1:5] = 1
526+
data[2:4, 2:4] = 2
527+
528+
raster = xr.DataArray(data)
529+
df = polygonize(raster, return_type="geopandas", connectivity=4)
530+
531+
assert isinstance(df, gpd.GeoDataFrame)
532+
assert len(df) == 3 # values 0, 1, 2
533+
534+
# Value 1 polygon should have a hole (the 2-region).
535+
row_1 = df[df.DN == 1].iloc[0]
536+
geom = row_1.geometry
537+
assert len(list(geom.interiors)) == 1
538+
539+
# Total area should equal raster size.
540+
assert_allclose(df.geometry.area.sum(), 36.0)

0 commit comments

Comments
 (0)