From 51adf07a2095a143bd229238b57e4730f7ae9084 Mon Sep 17 00:00:00 2001 From: Baris Demir Date: Wed, 4 Mar 2026 13:07:06 +0000 Subject: [PATCH] Arm backend: Fix flaky INT comparisons Signed-off-by: Baris Demir Change-Id: I8d80f021de0c466e8a708ec2c0d581c9e1e9b55b --- backends/arm/test/ops/test_ceil.py | 6 +- .../arm/test/tester/analyze_output_utils.py | 57 ++++++++++++++----- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/backends/arm/test/ops/test_ceil.py b/backends/arm/test/ops/test_ceil.py index 0c59b2a4291..e6a4be74f33 100644 --- a/backends/arm/test/ops/test_ceil.py +++ b/backends/arm/test/ops/test_ceil.py @@ -28,7 +28,11 @@ def forward(self, x: torch.Tensor): zeros = torch.zeros(1, 10, 10, 10) ones = torch.ones(10, 10, 10) -rand = torch.rand(10, 10) - 0.5 +_rng = torch.Generator().manual_seed(0) +# Keep values away from integer boundaries to avoid unstable ceil flips due to +# tiny quantization noise, while still covering mixed-sign random data. +rand_raw = torch.rand(10, 10, generator=_rng) - 0.5 +rand = torch.where(rand_raw >= 0, rand_raw + 0.1, rand_raw - 0.1) randn_pos = torch.randn(1, 4, 4, 4) + 10 randn_neg = torch.randn(1, 4, 4, 4) - 10 ramp = torch.arange(-16, 16, 0.2) diff --git a/backends/arm/test/tester/analyze_output_utils.py b/backends/arm/test/tester/analyze_output_utils.py index 022a0686cf6..995c5ad04dc 100644 --- a/backends/arm/test/tester/analyze_output_utils.py +++ b/backends/arm/test/tester/analyze_output_utils.py @@ -370,22 +370,35 @@ def compare_rel_frobenius_and_cosine_similarity( - Inf values will be set to max/min representable by the dtype * quantization scale - Values lower than the scale will be set to 0.0 If the reference is all zeros, the function returns without testing. + + To reduce false positives in quantized testing, the Frobenius check is + skipped when reference norm is at quantization-noise scale, and a small + Frobenius overflow is accepted when cosine similarity is very high. """ + quant_scale_for_guards: float | None = None + posinf_value: float | None = None + neginf_value: float | None = None if clean_reference: if quantization_parameters: scale = quantization_parameters.scale + assert isinstance( + scale, (torch.Tensor, int, float) + ), f"Unsupported quantization scale type: {type(scale)!r}" + quant_scale_for_guards = ( + float(scale.max().item()) + if isinstance(scale, torch.Tensor) + else float(scale) + ) dtype_info = torch.iinfo(quantization_parameters.dtype) - _max = dtype_info.max * scale - _min = dtype_info.min * scale + assert quant_scale_for_guards is not None + posinf_value = float(dtype_info.max) * quant_scale_for_guards + neginf_value = float(dtype_info.min) * quant_scale_for_guards reference_output = reference_output.where( torch.abs(reference_output) >= scale, 0.0 ) - else: - _max = None - _min = None reference_output = reference_output.nan_to_num( - nan=0.0, posinf=_max, neginf=_min + nan=0.0, posinf=posinf_value, neginf=neginf_value ) reference_all_zeros = torch.count_nonzero(reference_output).item() == 0 @@ -403,14 +416,32 @@ def compare_rel_frobenius_and_cosine_similarity( test_output.flatten(), reference_output.flatten(), dim=0 ).item() - if ( - frobenius_threshold is not None - and relative_frobenius_error > frobenius_threshold - ): - raise AssertionError( - f"Tensor-wise comparison failed: Relative frobenius norm error {relative_frobenius_error} exceeds threshold {frobenius_threshold}." - f" (Cosine similarity: {cosine_similarity}, threshold {cosine_threshold})." + # Relative Frobenius is unstable when the reference norm is at quantization-noise scale. + reference_numel_sqrt = reference_output.numel() ** 0.5 + low_norm_floor = 1e-8 + if quant_scale_for_guards is not None: + low_norm_floor = max( + low_norm_floor, quant_scale_for_guards * reference_numel_sqrt ) + run_frobenius_check = reference_frobenius_norm > low_norm_floor + + if run_frobenius_check and frobenius_threshold is not None: + # If cosine is very high, slightly discount Frobenius error to avoid + # borderline failures dominated by quantization noise. + high_cosine_floor = ( + max(0.98, cosine_threshold) if cosine_threshold is not None else 0.98 + ) + effective_relative_frobenius_error = relative_frobenius_error + if cosine_similarity >= high_cosine_floor: + effective_relative_frobenius_error = max( + 0.0, relative_frobenius_error - 0.02 + ) + + if effective_relative_frobenius_error > frobenius_threshold: + raise AssertionError( + f"Tensor-wise comparison failed: Relative frobenius norm error {relative_frobenius_error} exceeds threshold {frobenius_threshold}." + f" (Cosine similarity: {cosine_similarity}, threshold {cosine_threshold})." + ) if ( cosine_threshold is not None