From a9e372607f4c136f28450c9a11a51668e73c82ee Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:06:22 -0500
Subject: [PATCH 1/5] refactor: replace cross-product ray-triangle test with
 Moller-Trumbore barycentric algorithm

The previous f_intersects_triangle used three cross products and sign
checks against the triangle normal, which is winding-order dependent.
If vertices are loaded in the wrong order, the sign flips and
intersections are missed.

Replace with the Moller-Trumbore algorithm, which computes barycentric
coordinates (u, v) directly from the vertices. This is vertex
winding-order independent, uses two cross products instead of three,
and no longer depends on triangle%n.

Also remove the now-dead tri%n copy in f_model_is_inside_flat.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/common/m_model.fpp | 59 ++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 36 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index 642d3b9522..8265375979 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -619,7 +619,6 @@ contains
                         nHits = 0
                         do q = 1, ntrs
                             tri%v(:, :) = gpu_trs_v(:, :, q, pid)
-                            tri%n(1:3) = gpu_trs_n(1:3, q, pid)
                             nHits = nHits + f_intersects_triangle(ray, tri)
                         end do
                         ! if the ray intersected an odd number of times, we must be inside
@@ -638,11 +637,13 @@ contains
 
     end function f_model_is_inside_flat
 
-    ! From https://www.scratchapixel.com/lessons/3e-basic-rendering/ray-tracing-rendering-a-triangle/ray-triangle-intersection-geometric-solution.html
-    !> This procedure checks if a ray intersects a triangle.
+    !> This procedure checks if a ray intersects a triangle using the
+    !! Moller-Trumbore algorithm (barycentric coordinates). Unlike the
+    !! previous cross-product sign test, this is vertex winding-order
+    !! independent.
     !! @param ray      Ray.
     !! @param triangle Triangle.
-    !! @return         True if the ray intersects the triangle, false otherwise.
+    !! @return         1 if the ray intersects the triangle, 0 otherwise.
     function f_intersects_triangle(ray, triangle) result(intersects)
 
         $:GPU_ROUTINE(parallelism='[seq]')
@@ -652,46 +653,32 @@ contains
 
         integer :: intersects
 
-        real(wp) :: N(3), P(3), C(3), edge(3), vp(3)
-        real(wp) :: d, t, NdotRayDirection
+        real(wp) :: edge1(3), edge2(3), h(3), s(3), q(3)
+        real(wp) :: a, f, u, v, t
 
         intersects = 0
 
-        N(1:3) = triangle%n(1:3)
-        NdotRayDirection = dot_product(N(1:3), ray%d(1:3))
-        if (abs(NdotRayDirection) < 0.0000001_wp) then
-            return
-        end if
+        edge1 = triangle%v(2, :) - triangle%v(1, :)
+        edge2 = triangle%v(3, :) - triangle%v(1, :)
+        h = f_cross(ray%d, edge2)
+        a = dot_product(edge1, h)
 
-        d = -sum(N(:)*triangle%v(1, :))
-        t = -(sum(N(:)*ray%o(:)) + d)/NdotRayDirection
-        if (t < 0) then
-            return
-        end if
+        if (abs(a) < 1e-7_wp) return
 
-        P = ray%o + t*ray%d
-        edge = triangle%v(2, :) - triangle%v(1, :)
-        vp = P - triangle%v(1, :)
-        C = f_cross(edge, vp)
-        if (sum(N(:)*C(:)) < 0) then
-            return
-        end if
+        f = 1.0_wp/a
+        s = ray%o - triangle%v(1, :)
+        u = f*dot_product(s, h)
 
-        edge = triangle%v(3, :) - triangle%v(2, :)
-        vp = P - triangle%v(2, :)
-        C = f_cross(edge, vp)
-        if (sum(N(:)*C(:)) < 0) then
-            return
-        end if
+        if (u < 0.0_wp .or. u > 1.0_wp) return
 
-        edge = triangle%v(1, :) - triangle%v(3, :)
-        vp = P - triangle%v(3, :)
-        C = f_cross(edge, vp)
-        if (sum(N(:)*C(:)) < 0) then
-            return
-        end if
+        q = f_cross(s, edge1)
+        v = f*dot_product(ray%d, q)
+
+        if (v < 0.0_wp .or. u + v > 1.0_wp) return
+
+        t = f*dot_product(edge2, q)
 
-        intersects = 1
+        if (t > 0.0_wp) intersects = 1
 
     end function f_intersects_triangle
 

From a1d3d9e8ea2b5a90657e241bfdcd9d5931e77f78 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:27:56 -0500
Subject: [PATCH 2/5] refactor: replace ray casting with generalized winding
 number for inside/outside test

Replace the 26-ray casting approach in f_model_is_inside_flat with the
generalized winding number (Jacobson et al., SIGGRAPH 2013). The winding
number sums the solid angle subtended by each triangle at the query
point using the Van Oosterom-Strackee formula.

This is robust to small triangles: a tiny triangle contributes a
proportionally small solid angle rather than being missed entirely by
rays. It is also winding-order independent and branch-free in the
inner loop, which is GPU-friendly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/common/m_model.fpp | 82 ++++++++++++++++++++++--------------------
 1 file changed, 44 insertions(+), 38 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index 8265375979..e6849cb402 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -583,11 +583,16 @@ contains
 
     end function f_model_is_inside
 
-    !> This procedure, given a cell center will determine if a point exists instide a surface
-    !! @param ntrs     Number of triangles in the model
-    !! @param pid      Patch ID od this model
+    !> This procedure determines if a point is inside a surface using
+    !! the generalized winding number (Jacobson et al., SIGGRAPH 2013).
+    !! The winding number is the sum of signed solid angles subtended
+    !! by each triangle, normalized by 4*pi. Returns ~1.0 inside,
+    !! ~0.0 outside. Unlike ray casting, this is robust to small
+    !! triangles and vertex winding order.
+    !! @param ntrs     Number of triangles in the model.
+    !! @param pid      Patch ID of this model.
     !! @param point    Point to test.
-    !! @return fraction The perfentage of candidate rays cast indicate that we are inside the model
+    !! @return fraction Winding number (~1.0 inside, ~0.0 outside).
     function f_model_is_inside_flat(ntrs, pid, point) result(fraction)
 
         $:GPU_ROUTINE(parallelism='[seq]')
@@ -597,43 +602,44 @@ contains
         real(wp), dimension(1:3), intent(in) :: point
 
         real(wp) :: fraction
-        type(t_ray) :: ray
-        type(t_triangle) :: tri
-        integer :: i, j, k, q, nInOrOut, nHits
 
-        ! cast 26 rays from the point and count the number at leave the boundary
-        nInOrOut = 0
-        do i = -1, 1
-            do j = -1, 1
-                do k = -1, 1
-                    if (i /= 0 .or. j /= 0 .or. k /= 0) then
-                        ! We cannot get inersections if the ray is exactly in line with triangle plane
-                        if (p == 0 .and. k == 0) cycle
-
-                        ! generate the ray
-                        ray%o = point
-                        ray%d(:) = [real(i, wp), real(j, wp), real(k, wp)]
-                        ray%d = ray%d/sqrt(real(abs(i) + abs(j) + abs(k), wp))
-
-                        ! count the number of intersections
-                        nHits = 0
-                        do q = 1, ntrs
-                            tri%v(:, :) = gpu_trs_v(:, :, q, pid)
-                            nHits = nHits + f_intersects_triangle(ray, tri)
-                        end do
-                        ! if the ray intersected an odd number of times, we must be inside
-                        nInOrOut = nInOrOut + mod(nHits, 2)
-                    end if
-                end do
-            end do
+        real(wp) :: r1(3), r2(3), r3(3)
+        real(wp) :: r1_mag, r2_mag, r3_mag
+        real(wp) :: numerator, denominator
+        integer :: q
+
+        fraction = 0.0_wp
+
+        do q = 1, ntrs
+            r1 = gpu_trs_v(1, :, q, pid) - point
+            r2 = gpu_trs_v(2, :, q, pid) - point
+            r3 = gpu_trs_v(3, :, q, pid) - point
+
+            r1_mag = sqrt(dot_product(r1, r1))
+            r2_mag = sqrt(dot_product(r2, r2))
+            r3_mag = sqrt(dot_product(r3, r3))
+
+            ! Van Oosterom-Strackee formula:
+            ! tan(Omega/2) = numerator / denominator
+            ! numerator = scalar triple product r1 . (r2 x r3)
+            numerator = r1(1)*(r2(2)*r3(3) - r2(3)*r3(2)) &
+                        + r1(2)*(r2(3)*r3(1) - r2(1)*r3(3)) &
+                        + r1(3)*(r2(1)*r3(2) - r2(2)*r3(1))
+
+            denominator = r1_mag*r2_mag*r3_mag &
+                          + dot_product(r1, r2)*r3_mag &
+                          + dot_product(r2, r3)*r1_mag &
+                          + dot_product(r3, r1)*r2_mag
+
+            ! Solid angle = 2 * atan2(num, den).
+            ! atan2(0,0) = 0 per IEEE 754, so degenerate triangles
+            ! contribute nothing without special casing.
+            fraction = fraction + atan2(numerator, denominator)
         end do
 
-        if (p == 0) then
-            ! in 2D, we skipped 8 rays
-            fraction = real(nInOrOut)/18._wp
-        else
-            fraction = real(nInOrOut)/26._wp
-        end if
+        ! Winding number = total solid angle / (4 * pi)
+        ! Each triangle contributes 2*atan2, so sum / (2*pi)
+        fraction = fraction/(2.0_wp*acos(-1.0_wp))
 
     end function f_model_is_inside_flat
 

From 28d8d500ffa8a0249db6ba61bb3023c6a3d55356 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:39:08 -0500
Subject: [PATCH 3/5] fix: add 2D winding number branch for p==0 simulations

The 3D solid angle (Van Oosterom-Strackee) degenerates when all
triangles are coplanar (z=0), producing near-zero winding numbers
everywhere. For 2D (p==0), use the 2D winding number instead: sum
the signed angle subtended by each boundary edge via atan2(cross,dot).
This uses the existing gpu_boundary_v and gpu_boundary_edge_count
arrays that are already computed and uploaded for 2D STL models.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/common/m_model.fpp | 85 ++++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 32 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index e6849cb402..e84ae93f02 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -585,10 +585,11 @@ contains
 
     !> This procedure determines if a point is inside a surface using
     !! the generalized winding number (Jacobson et al., SIGGRAPH 2013).
-    !! The winding number is the sum of signed solid angles subtended
-    !! by each triangle, normalized by 4*pi. Returns ~1.0 inside,
+    !! In 3D, sums the solid angle subtended by each triangle (Van
+    !! Oosterom-Strackee formula). In 2D (p==0), sums the signed
+    !! angle subtended by each boundary edge. Returns ~1.0 inside,
     !! ~0.0 outside. Unlike ray casting, this is robust to small
-    !! triangles and vertex winding order.
+    !! triangles/edges and vertex winding order.
     !! @param ntrs     Number of triangles in the model.
     !! @param pid      Patch ID of this model.
     !! @param point    Point to test.
@@ -606,40 +607,60 @@ contains
         real(wp) :: r1(3), r2(3), r3(3)
         real(wp) :: r1_mag, r2_mag, r3_mag
         real(wp) :: numerator, denominator
+        real(wp) :: d1(2), d2(2)
         integer :: q
 
         fraction = 0.0_wp
 
-        do q = 1, ntrs
-            r1 = gpu_trs_v(1, :, q, pid) - point
-            r2 = gpu_trs_v(2, :, q, pid) - point
-            r3 = gpu_trs_v(3, :, q, pid) - point
-
-            r1_mag = sqrt(dot_product(r1, r1))
-            r2_mag = sqrt(dot_product(r2, r2))
-            r3_mag = sqrt(dot_product(r3, r3))
-
-            ! Van Oosterom-Strackee formula:
-            ! tan(Omega/2) = numerator / denominator
-            ! numerator = scalar triple product r1 . (r2 x r3)
-            numerator = r1(1)*(r2(2)*r3(3) - r2(3)*r3(2)) &
-                        + r1(2)*(r2(3)*r3(1) - r2(1)*r3(3)) &
-                        + r1(3)*(r2(1)*r3(2) - r2(2)*r3(1))
-
-            denominator = r1_mag*r2_mag*r3_mag &
-                          + dot_product(r1, r2)*r3_mag &
-                          + dot_product(r2, r3)*r1_mag &
-                          + dot_product(r3, r1)*r2_mag
-
-            ! Solid angle = 2 * atan2(num, den).
-            ! atan2(0,0) = 0 per IEEE 754, so degenerate triangles
-            ! contribute nothing without special casing.
-            fraction = fraction + atan2(numerator, denominator)
-        end do
+        if (p == 0) then
+            ! 2D winding number: sum signed angles subtended by
+            ! each boundary edge at the query point.
+            do q = 1, gpu_boundary_edge_count(pid)
+                d1(1) = gpu_boundary_v(q, 1, 1, pid) - point(1)
+                d1(2) = gpu_boundary_v(q, 1, 2, pid) - point(2)
+                d2(1) = gpu_boundary_v(q, 2, 1, pid) - point(1)
+                d2(2) = gpu_boundary_v(q, 2, 2, pid) - point(2)
+
+                ! Signed angle = atan2(d1 x d2, d1 . d2)
+                fraction = fraction + atan2( &
+                           d1(1)*d2(2) - d1(2)*d2(1), &
+                           d1(1)*d2(1) + d1(2)*d2(2))
+            end do
+
+            ! 2D winding number = total angle / (2*pi)
+            fraction = fraction/(2.0_wp*acos(-1.0_wp))
+        else
+            ! 3D winding number: sum solid angles via Van
+            ! Oosterom-Strackee formula.
+            do q = 1, ntrs
+                r1 = gpu_trs_v(1, :, q, pid) - point
+                r2 = gpu_trs_v(2, :, q, pid) - point
+                r3 = gpu_trs_v(3, :, q, pid) - point
+
+                r1_mag = sqrt(dot_product(r1, r1))
+                r2_mag = sqrt(dot_product(r2, r2))
+                r3_mag = sqrt(dot_product(r3, r3))
+
+                ! tan(Omega/2) = numerator / denominator
+                ! numerator = scalar triple product r1 . (r2 x r3)
+                numerator = r1(1)*(r2(2)*r3(3) - r2(3)*r3(2)) &
+                            + r1(2)*(r2(3)*r3(1) - r2(1)*r3(3)) &
+                            + r1(3)*(r2(1)*r3(2) - r2(2)*r3(1))
+
+                denominator = r1_mag*r2_mag*r3_mag &
+                              + dot_product(r1, r2)*r3_mag &
+                              + dot_product(r2, r3)*r1_mag &
+                              + dot_product(r3, r1)*r2_mag
+
+                ! atan2(0,0) = 0 per IEEE 754, so degenerate
+                ! triangles contribute nothing.
+                fraction = fraction + atan2(numerator, denominator)
+            end do
 
-        ! Winding number = total solid angle / (4 * pi)
-        ! Each triangle contributes 2*atan2, so sum / (2*pi)
-        fraction = fraction/(2.0_wp*acos(-1.0_wp))
+            ! Winding number = total solid angle / (4*pi)
+            ! Each triangle contributes 2*atan2, so sum / (2*pi)
+            fraction = fraction/(2.0_wp*acos(-1.0_wp))
+        end if
 
     end function f_model_is_inside_flat
 

From a819dd9fab4a61e11687094ac72f8507c67ff508 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:48:47 -0500
Subject: [PATCH 4/5] fix: guard degenerate triangles explicitly and fix lint

Add explicit tiny() guard for degenerate triangles (zero-area or query
point on vertex) instead of relying on atan2(0,0) behavior, which is
compiler-dependent. Add comment explaining the Moller-Trumbore epsilon
threshold.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/common/m_model.fpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index e84ae93f02..655cddadad 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -641,6 +641,10 @@ contains
                 r2_mag = sqrt(dot_product(r2, r2))
                 r3_mag = sqrt(dot_product(r3, r3))
 
+                ! Skip degenerate triangles (zero-area or query
+                ! point coincident with a vertex).
+                if (r1_mag*r2_mag*r3_mag < tiny(1.0_wp)) cycle
+
                 ! tan(Omega/2) = numerator / denominator
                 ! numerator = scalar triple product r1 . (r2 x r3)
                 numerator = r1(1)*(r2(2)*r3(3) - r2(3)*r3(2)) &
@@ -652,8 +656,6 @@ contains
                               + dot_product(r2, r3)*r1_mag &
                               + dot_product(r3, r1)*r2_mag
 
-                ! atan2(0,0) = 0 per IEEE 754, so degenerate
-                ! triangles contribute nothing.
                 fraction = fraction + atan2(numerator, denominator)
             end do
 
@@ -690,6 +692,8 @@ contains
         h = f_cross(ray%d, edge2)
         a = dot_product(edge1, h)
 
+        ! Ray nearly parallel to triangle plane; threshold matches
+        ! the original implementation and is safe for wp precision.
         if (abs(a) < 1e-7_wp) return
 
         f = 1.0_wp/a

From 58273b10ed6c8fb586b5dfa42de799cdcec0c1c3 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:57:16 -0500
Subject: [PATCH 5/5] fix: precision-safe epsilon, accurate comments per review

- Use max(1e-7, 10*epsilon) for Moller-Trumbore parallel-ray threshold
  so single-precision builds get a sensible floor above machine epsilon.
- Fix comment on degenerate guard: only fires for vertex-coincident
  query points (subnormal/zero magnitudes), not general degeneracy.
- Fix winding number normalization comment: each atan2 returns
  Omega/2, so dividing by 2*pi gives sum(Omega)/(4*pi).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/common/m_model.fpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp
index 655cddadad..f655130662 100644
--- a/src/common/m_model.fpp
+++ b/src/common/m_model.fpp
@@ -641,8 +641,8 @@ contains
                 r2_mag = sqrt(dot_product(r2, r2))
                 r3_mag = sqrt(dot_product(r3, r3))
 
-                ! Skip degenerate triangles (zero-area or query
-                ! point coincident with a vertex).
+                ! Skip if query point is coincident with a vertex
+                ! (magnitudes are zero/subnormal).
                 if (r1_mag*r2_mag*r3_mag < tiny(1.0_wp)) cycle
 
                 ! tan(Omega/2) = numerator / denominator
@@ -659,8 +659,8 @@ contains
                 fraction = fraction + atan2(numerator, denominator)
             end do
 
-            ! Winding number = total solid angle / (4*pi)
-            ! Each triangle contributes 2*atan2, so sum / (2*pi)
+            ! Each atan2 returns Omega/2 per triangle; divide
+            ! by 2*pi to get winding number = sum(Omega)/(4*pi).
             fraction = fraction/(2.0_wp*acos(-1.0_wp))
         end if
 
@@ -692,9 +692,9 @@ contains
         h = f_cross(ray%d, edge2)
         a = dot_product(edge1, h)
 
-        ! Ray nearly parallel to triangle plane; threshold matches
-        ! the original implementation and is safe for wp precision.
-        if (abs(a) < 1e-7_wp) return
+        ! Ray nearly parallel to triangle plane. In single precision
+        ! builds epsilon(1.0) ~ 1.2e-7, so use 10*epsilon as a floor.
+        if (abs(a) < max(1e-7_wp, 10.0_wp*epsilon(1.0_wp))) return
 
         f = 1.0_wp/a
         s = ray%o - triangle%v(1, :)