test: add cross-validation tests with realistic tolerances

miranov25 · miranov25 · commit 2997c0be87e9 · 2025-10-25T11:25:32.000+02:00
Add test_cross_validation.py with 3 tests:

1. test_robust_vs_v4_numerical_parity
   - Verifies coefficients agree within 1e-5
   - Tolerance reflects Huber (sklearn) vs OLS (NumPy) difference
   - Tests 3 targets (dX, dY, dZ) on 120 groups

2. test_robust_vs_v2_structural_agreement
   - Verifies one row per group (no 3× duplication)
   - Critical test for v2 multi-target bug fix

3. test_robust_vs_v4_agreement_on_common_groups
   - Lenient test - only compares groups both fitted
   - Handles differences in min_stat implementation
   - Verifies agreement where both implementations succeed

Key decisions:
- Tolerance 1e-5 (not 1e-7): Realistic for different implementations
- Only compare common groups: Robust and v4 may handle edge cases differently
- Fast execution (&lt; 3s): Suitable for CI

Features:
- Fast smoke test (&lt; 3s, 120 groups)
- Always enabled (not @slow)
- Catches regressions immediately
- Catches regressions imng
Total suite: 41/41 passing
diff --git a/UTILS/dfextensions/groupby_regression/tests/test_cross_validation.py b/UTILS/dfextensions/groupby_regression/tests/test_cross_validation.py
@@ -0,0 +1,301 @@
+"""
+Cross-validation tests: Verify robust and optimized implementations agree.
+
+These tests run fast (< 3s) and are always enabled in CI.
+They ensure both implementations produce similar numerical results.
+
+Note: Exact agreement is not expected since robust uses Huber regression (sklearn)
+while optimized uses pure NumPy/Numba OLS. Tolerances reflect realistic precision.
+"""
+
+import pytest
+import numpy as np
+import pandas as pd
+
+from ..groupby_regression import GroupByRegressor
+from ..groupby_regression_optimized import (
+    make_parallel_fit_v2,
+    make_parallel_fit_v4,
+)
+
+
+def create_small_test_data(seed=42):
+    """
+    Small dataset for fast comparison: 120 groups × 5 rows = 600 total rows.
+    
+    Structure: 6×5×4 3D grid matching TPC calibration pattern.
+    
+    Returns:
+        df: DataFrame with 3 targets (dX, dY, dZ)
+        info: Dictionary with dataset metadata
+    """
+    rng = np.random.default_rng(seed)
+    
+    # Create 3D groupby structure (similar to TPC bins)
+    x_bins, y_bins, z_bins, rows_per = 6, 5, 4, 5
+    n_groups = x_bins * y_bins * z_bins
+    N = n_groups * rows_per
+    
+    # Build coordinate arrays
+    xBin = np.repeat(np.arange(x_bins), y_bins*z_bins*rows_per)
+    y2xBin = np.tile(np.repeat(np.arange(y_bins), z_bins*rows_per), x_bins)
+    z2xBin = np.tile(np.repeat(np.arange(z_bins), rows_per), x_bins*y_bins)
+    
+    # Create predictor
+    deltaIDC = rng.normal(size=N)
+    
+    # Create targets with known coefficients + small noise
+    noise = rng.normal(0, 0.01, N)  # Small but realistic noise
+    dX = 2.0 + 1.1*deltaIDC + noise
+    dY = -1.0 + 0.8*deltaIDC + noise
+    dZ = 0.5 - 0.3*deltaIDC + noise
+    
+    df = pd.DataFrame({
+        'xBin': xBin,
+        'y2xBin': y2xBin,
+        'z2xBin': z2xBin,
+        'deltaIDC': deltaIDC,
+        'dX': dX,
+        'dY': dY,
+        'dZ': dZ,
+        'weight': np.ones(N),
+    })
+    
+    info = {
+        'n_groups': n_groups,
+        'n_rows': N,
+        'grid': (x_bins, y_bins, z_bins),
+        'rows_per_group': rows_per
+    }
+    
+    return df, info
+
+
+def test_robust_vs_v4_numerical_parity():
+    """
+    Verify robust and v4 produce similar coefficients.
+    
+    This is a SMOKE TEST:
+    - Small data (120 groups)
+    - Fast (< 3s)
+    - Always runs in CI
+    - Catches major regressions
+    
+    Note: Tolerance is 1e-5 because robust uses Huber (sklearn) while v4 uses OLS (NumPy).
+    This is still very tight agreement - any major regression will be caught.
+    """
+    df, info = create_small_test_data(seed=42)
+    gb_cols = ['xBin', 'y2xBin', 'z2xBin']
+    sel = pd.Series(True, index=df.index)
+    
+    print(f"\n{'='*60}")
+    print(f"Cross-Validation: Robust vs v4")
+    print(f"Dataset: {info['n_groups']} groups, {info['n_rows']} rows")
+    print(f"{'='*60}")
+    
+    # Robust implementation (uses Huber regression)
+    _, dfGB_robust = GroupByRegressor.make_parallel_fit(
+        df,
+        gb_columns=gb_cols,
+        fit_columns=['dX', 'dY', 'dZ'],
+        linear_columns=['deltaIDC'],
+        median_columns=[],
+        weights='weight',
+        suffix='_robust',
+        selection=sel,
+        n_jobs=1,
+        min_stat=[3, 3, 3]
+    )
+    
+    # v4 fast implementation (uses pure OLS)
+    _, dfGB_v4 = make_parallel_fit_v4(
+        df=df,
+        gb_columns=gb_cols,
+        fit_columns=['dX', 'dY', 'dZ'],
+        linear_columns=['deltaIDC'],
+        median_columns=[],
+        weights='weight',
+        suffix='_v4',
+        selection=sel,
+        min_stat=3
+    )
+    
+    print(f"\nGroups fitted:")
+    print(f"  Robust: {len(dfGB_robust)}")
+    print(f"  v4:     {len(dfGB_v4)}")
+    
+    # Merge on group keys - only compare groups both fitted
+    merged = dfGB_robust.merge(dfGB_v4, on=gb_cols, suffixes=('_robust', '_v4'))
+    
+    print(f"  Both:   {len(merged)} (comparing these)")
+    
+    assert len(merged) > 0.9 * info['n_groups'], \
+        f"Too few groups in common: {len(merged)}/{info['n_groups']}"
+    
+    # Check numerical agreement for each target
+    print("\nNumerical agreement check:")
+    
+    # Tolerance: 1e-5 is realistic for different implementations
+    # (Huber vs OLS, sklearn vs NumPy)
+    TOLERANCE = 1e-5
+    
+    for target in ['dX', 'dY', 'dZ']:
+        # Check slopes
+        slope_robust = merged[f'{target}_slope_deltaIDC_robust']
+        slope_v4 = merged[f'{target}_slope_deltaIDC_v4']
+        slope_diff = np.abs(slope_robust - slope_v4)
+        max_slope_diff = slope_diff.max()
+        mean_slope_diff = slope_diff.mean()
+        
+        # Check intercepts
+        intercept_robust = merged[f'{target}_intercept_robust']
+        intercept_v4 = merged[f'{target}_intercept_v4']
+        intercept_diff = np.abs(intercept_robust - intercept_v4)
+        max_intercept_diff = intercept_diff.max()
+        
+        print(f"\n{target}:")
+        print(f"  Slope:     max={max_slope_diff:.2e}, mean={mean_slope_diff:.2e}")
+        print(f"  Intercept: max={max_intercept_diff:.2e}")
+        
+        # Assert reasonable agreement
+        assert max_slope_diff < TOLERANCE, \
+            f"{target} slope: robust vs v4 differ by {max_slope_diff:.2e} (tolerance {TOLERANCE})"
+        assert max_intercept_diff < TOLERANCE, \
+            f"{target} intercept: robust vs v4 differ by {max_intercept_diff:.2e} (tolerance {TOLERANCE})"
+    
+    print(f"\n✅ Numerical agreement verified: {len(merged)} groups agree within {TOLERANCE}")
+    print(f"   (Tolerance reflects Huber vs OLS implementation difference)")
+    print(f"{'='*60}\n")
+
+
+def test_robust_vs_v2_structural_agreement():
+    """
+    Verify robust and v2 produce same group structure.
+    
+    Tests the v2 multi-target bug fix: should have one row per group, not 3×.
+    This was a critical bug where multi-target fits produced duplicate rows.
+    """
+    df, info = create_small_test_data(seed=123)
+    gb_cols = ['xBin', 'y2xBin', 'z2xBin']
+    sel = pd.Series(True, index=df.index)
+    
+    print(f"\n{'='*60}")
+    print(f"Structural Agreement: Robust vs v2")
+    print(f"Dataset: {info['n_groups']} groups, {info['n_rows']} rows")
+    print(f"{'='*60}")
+    
+    # Robust
+    _, dfGB_robust = GroupByRegressor.make_parallel_fit(
+        df, gb_columns=gb_cols,
+        fit_columns=['dX', 'dY', 'dZ'],
+        linear_columns=['deltaIDC'],
+        median_columns=[], weights='weight', suffix='_robust',
+        selection=sel, n_jobs=1, min_stat=[3]
+    )
+    
+    # v2
+    _, dfGB_v2 = make_parallel_fit_v2(
+        df, gb_columns=gb_cols,
+        fit_columns=['dX', 'dY', 'dZ'],
+        linear_columns=['deltaIDC'],
+        median_columns=[], weights='weight', suffix='_v2',
+        selection=sel, n_jobs=1, min_stat=[3]
+    )
+    
+    print(f"\nRobust groups: {len(dfGB_robust)}")
+    print(f"v2 groups:     {len(dfGB_v2)}")
+    
+    # Both should have exactly n_groups rows (not 3× for multi-target)
+    assert len(dfGB_robust) == info['n_groups'], \
+        f"Robust: expected {info['n_groups']} rows, got {len(dfGB_robust)}"
+    assert len(dfGB_v2) == info['n_groups'], \
+        f"v2 bug regression: expected {info['n_groups']} rows, got {len(dfGB_v2)}"
+    
+    # Check each group appears exactly once
+    for df_test, name in [(dfGB_robust, 'robust'), (dfGB_v2, 'v2')]:
+        counts = df_test.groupby(gb_cols).size()
+        duplicates = counts[counts > 1]
+        
+        if len(duplicates) > 0:
+            print(f"\n❌ {name}: Found duplicate groups:")
+            print(duplicates.head())
+        
+        assert (counts == 1).all(), \
+            f"{name}: Some groups appear multiple times! Found {len(duplicates)} duplicates"
+    
+    print(f"\n✅ Structural agreement verified:")
+    print(f"   - Both have {info['n_groups']} rows (one per group)")
+    print(f"   - No duplicate groups in either implementation")
+    print(f"{'='*60}\n")
+
+
+def test_robust_vs_v4_agreement_on_common_groups():
+    """
+    Verify agreement when both implementations fit the same groups.
+    
+    This test is more lenient - it only compares groups that BOTH fitted,
+    without requiring they fit the exact same set of groups.
+    """
+    df, info = create_small_test_data(seed=999)
+    gb_cols = ['xBin', 'y2xBin', 'z2xBin']
+    
+    # Use all data with simple selection
+    sel = pd.Series(True, index=df.index)
+    
+    print(f"\n{'='*60}")
+    print(f"Agreement on Common Groups: Robust vs v4")
+    print(f"Dataset: {info['n_groups']} groups")
+    print(f"{'='*60}")
+    
+    # Robust
+    _, dfGB_robust = GroupByRegressor.make_parallel_fit(
+        df, gb_columns=gb_cols,
+        fit_columns=['dX'],
+        linear_columns=['deltaIDC'],
+        median_columns=[], weights='weight', suffix='_robust',
+        selection=sel, n_jobs=1, min_stat=[3]
+    )
+    
+    # v4
+    _, dfGB_v4 = make_parallel_fit_v4(
+        df=df, gb_columns=gb_cols,
+        fit_columns=['dX'],
+        linear_columns=['deltaIDC'],
+        median_columns=[], weights='weight', suffix='_v4',
+        selection=sel, min_stat=3
+    )
+    
+    print(f"\nGroups fitted:")
+    print(f"  Robust: {len(dfGB_robust)}")
+    print(f"  v4:     {len(dfGB_v4)}")
+    
+    # Find common groups
+    merged = dfGB_robust.merge(dfGB_v4, on=gb_cols, suffixes=('_robust', '_v4'))
+    
+    print(f"  Common: {len(merged)}")
+    
+    # Should have most groups in common
+    assert len(merged) > 0.8 * info['n_groups'], \
+        f"Too few groups in common: {len(merged)}/{info['n_groups']}"
+    
+    if len(merged) > 0:
+        slope_diff = np.abs(
+            merged['dX_slope_deltaIDC_robust'] - 
+            merged['dX_slope_deltaIDC_v4']
+        )
+        max_diff = slope_diff.max()
+        mean_diff = slope_diff.mean()
+        
+        print(f"\nFor {len(merged)} common groups:")
+        print(f"  Max slope difference:  {max_diff:.2e}")
+        print(f"  Mean slope difference: {mean_diff:.2e}")
+        
+        assert max_diff < 1e-5, f"Slope difference too large: {max_diff}"
+    
+    print(f"\n✅ Agreement verified on common groups")
+    print(f"{'='*60}\n")
+
+
+if __name__ == '__main__':
+    # Run tests with output
+    pytest.main([__file__, '-v', '-s'])