From 51adf07a2095a143bd229238b57e4730f7ae9084 Mon Sep 17 00:00:00 2001
From: Baris Demir <baris.demir@arm.com>
Date: Wed, 4 Mar 2026 13:07:06 +0000
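Subject: [PATCH] Arm backend: Fix flaky INT comparisons

Seed the random input of the ceil test and keep it at least 0.1 away from
integer boundaries to avoid unstable ceil flips caused by quantization noise.

In compare_rel_frobenius_and_cosine_similarity, skip the relative Frobenius
check when the reference norm is at quantization-noise scale, and subtract
0.02 from the error when cosine similarity is at least 0.98. Use the largest
quantization scale, as a float, when replacing infinities in the reference.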

Signed-off-by: Baris Demir <baris.demir@arm.com>
Change-Id: I8d80f021de0c466e8a708ec2c0d581c9e1e9b55b
---
 backends/arm/test/ops/test_ceil.py            |  6 +-
 .../arm/test/tester/analyze_output_utils.py   | 57 ++++++++++++++-----
 2 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/backends/arm/test/ops/test_ceil.py b/backends/arm/test/ops/test_ceil.py
index 0c59b2a4291..e6a4be74f33 100644
--- a/backends/arm/test/ops/test_ceil.py
+++ b/backends/arm/test/ops/test_ceil.py
@@ -28,7 +28,11 @@ def forward(self, x: torch.Tensor):
 
 zeros = torch.zeros(1, 10, 10, 10)
 ones = torch.ones(10, 10, 10)
-rand = torch.rand(10, 10) - 0.5
+_rng = torch.Generator().manual_seed(0)
+# Keep values away from integer boundaries to avoid unstable ceil flips due to
+# tiny quantization noise, while still covering mixed-sign random data.
+rand_raw = torch.rand(10, 10, generator=_rng) - 0.5
+rand = torch.where(rand_raw >= 0, rand_raw + 0.1, rand_raw - 0.1)
 randn_pos = torch.randn(1, 4, 4, 4) + 10
 randn_neg = torch.randn(1, 4, 4, 4) - 10
 ramp = torch.arange(-16, 16, 0.2)
diff --git a/backends/arm/test/tester/analyze_output_utils.py b/backends/arm/test/tester/analyze_output_utils.py
index 022a0686cf6..995c5ad04dc 100644
--- a/backends/arm/test/tester/analyze_output_utils.py
+++ b/backends/arm/test/tester/analyze_output_utils.py
@@ -370,22 +370,35 @@ def compare_rel_frobenius_and_cosine_similarity(
         - Inf values will be set to max/min representable by the dtype * quantization scale
         - Values lower than the scale will be set to 0.0
     If the reference is all zeros, the function returns without testing.
+
+    To reduce false positives in quantized testing, the Frobenius check is
+    skipped when the reference norm is at quantization-noise scale, and the error
+    is discounted by 0.02 when cosine similarity >= max(0.98, cosine_threshold).
     """
 
+    quant_scale_for_guards: float | None = None
+    posinf_value: float | None = None
+    neginf_value: float | None = None
     if clean_reference:
         if quantization_parameters:
             scale = quantization_parameters.scale
+            assert isinstance(
+                scale, (torch.Tensor, int, float)
+            ), f"Unsupported quantization scale type: {type(scale)!r}"
+            quant_scale_for_guards = (
+                float(scale.max().item())
+                if isinstance(scale, torch.Tensor)
+                else float(scale)
+            )
             dtype_info = torch.iinfo(quantization_parameters.dtype)
-            _max = dtype_info.max * scale
-            _min = dtype_info.min * scale
+            assert quant_scale_for_guards is not None
+            posinf_value = float(dtype_info.max) * quant_scale_for_guards
+            neginf_value = float(dtype_info.min) * quant_scale_for_guards
             reference_output = reference_output.where(
                 torch.abs(reference_output) >= scale, 0.0
             )
-        else:
-            _max = None
-            _min = None
         reference_output = reference_output.nan_to_num(
-            nan=0.0, posinf=_max, neginf=_min
+            nan=0.0, posinf=posinf_value, neginf=neginf_value
         )
 
     reference_all_zeros = torch.count_nonzero(reference_output).item() == 0
@@ -403,14 +416,32 @@ def compare_rel_frobenius_and_cosine_similarity(
         test_output.flatten(), reference_output.flatten(), dim=0
     ).item()
 
-    if (
-        frobenius_threshold is not None
-        and relative_frobenius_error > frobenius_threshold
-    ):
-        raise AssertionError(
-            f"Tensor-wise comparison failed: Relative frobenius norm error {relative_frobenius_error} exceeds threshold {frobenius_threshold}."
-            f" (Cosine similarity: {cosine_similarity}, threshold {cosine_threshold})."
+    # Relative Frobenius is unstable when the reference norm is at quantization-noise scale.
+    reference_numel_sqrt = reference_output.numel() ** 0.5
+    low_norm_floor = 1e-8
+    if quant_scale_for_guards is not None:
+        low_norm_floor = max(
+            low_norm_floor, quant_scale_for_guards * reference_numel_sqrt
         )
+    run_frobenius_check = reference_frobenius_norm > low_norm_floor
+
+    if run_frobenius_check and frobenius_threshold is not None:
+        # With very high cosine similarity (>= max(0.98, cosine_threshold)), take
+        # 0.02 off the Frobenius error to avoid borderline quantization-noise failures.
+        high_cosine_floor = (
+            max(0.98, cosine_threshold) if cosine_threshold is not None else 0.98
+        )
+        effective_relative_frobenius_error = relative_frobenius_error
+        if cosine_similarity >= high_cosine_floor:
+            effective_relative_frobenius_error = max(
+                0.0, relative_frobenius_error - 0.02
+            )
+
+        if effective_relative_frobenius_error > frobenius_threshold:
+            raise AssertionError(
+                f"Tensor-wise comparison failed: Relative frobenius norm error {relative_frobenius_error} exceeds threshold {frobenius_threshold}."
+                f" (Cosine similarity: {cosine_similarity}, threshold {cosine_threshold})."
+            )
 
     if (
         cosine_threshold is not None