Skip to content

Commit 2997c0b

Browse files
author
miranov25
committed
test: add cross-validation tests with realistic tolerances
Add test_cross_validation.py with 3 tests: 1. test_robust_vs_v4_numerical_parity - Verifies coefficients agree within 1e-5 - Tolerance reflects Huber (sklearn) vs OLS (NumPy) difference - Tests 3 targets (dX, dY, dZ) on 120 groups 2. test_robust_vs_v2_structural_agreement - Verifies one row per group (no 3× duplication) - Critical test for v2 multi-target bug fix 3. test_robust_vs_v4_agreement_on_common_groups - Lenient test - only compares groups both fitted - Handles differences in min_stat implementation - Verifies agreement where both implementations succeed Key decisions: - Tolerance 1e-5 (not 1e-7): Realistic for different implementations - Only compare common groups: Robust and v4 may handle edge cases differently - Fast execution (< 3s): Suitable for CI Features: - Fast smoke test (< 3s, 120 groups) - Always enabled (not @slow) - Catches regressions immediately. Total suite: 41/41 passing
1 parent 4d90f73 commit 2997c0b

File tree

1 file changed

+301
-0
lines changed

1 file changed

+301
-0
lines changed
Lines changed: 301 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,301 @@
1+
"""
2+
Cross-validation tests: Verify robust and optimized implementations agree.
3+
4+
These tests run fast (< 3s) and are always enabled in CI.
5+
They ensure both implementations produce similar numerical results.
6+
7+
Note: Exact agreement is not expected since robust uses Huber regression (sklearn)
8+
while optimized uses pure NumPy/Numba OLS. Tolerances reflect realistic precision.
9+
"""
10+
11+
import pytest
12+
import numpy as np
13+
import pandas as pd
14+
15+
from ..groupby_regression import GroupByRegressor
16+
from ..groupby_regression_optimized import (
17+
make_parallel_fit_v2,
18+
make_parallel_fit_v4,
19+
)
20+
21+
22+
def create_small_test_data(seed=42):
    """
    Small dataset for fast comparison: 120 groups × 5 rows = 600 total rows.

    Structure: 6×5×4 3D grid matching TPC calibration pattern.

    Returns:
        df: DataFrame with 3 targets (dX, dY, dZ)
        info: Dictionary with dataset metadata
    """
    rng = np.random.default_rng(seed)

    # 3D groupby structure (similar to TPC bins): 6 x 5 x 4 grid, 5 rows per cell.
    x_bins, y_bins, z_bins, rows_per = 6, 5, 4, 5
    n_groups = x_bins * y_bins * z_bins
    N = n_groups * rows_per

    # Derive per-row bin coordinates from a flat, x-major group index
    # (equivalent to the usual repeat/tile construction).
    group_idx = np.repeat(np.arange(n_groups), rows_per)
    xBin = group_idx // (y_bins * z_bins)
    y2xBin = (group_idx // z_bins) % y_bins
    z2xBin = group_idx % z_bins

    # Single predictor column.
    deltaIDC = rng.normal(size=N)

    # Targets follow known linear laws plus small (shared) Gaussian noise.
    noise = rng.normal(0, 0.01, N)

    df = pd.DataFrame({
        'xBin': xBin,
        'y2xBin': y2xBin,
        'z2xBin': z2xBin,
        'deltaIDC': deltaIDC,
        'dX': 2.0 + 1.1 * deltaIDC + noise,
        'dY': -1.0 + 0.8 * deltaIDC + noise,
        'dZ': 0.5 - 0.3 * deltaIDC + noise,
        'weight': np.ones(N),
    })

    info = {
        'n_groups': n_groups,
        'n_rows': N,
        'grid': (x_bins, y_bins, z_bins),
        'rows_per_group': rows_per,
    }

    return df, info
72+
73+
74+
def test_robust_vs_v4_numerical_parity():
    """
    Verify robust and v4 produce similar coefficients.

    This is a SMOKE TEST:
    - Small data (120 groups)
    - Fast (< 3s)
    - Always runs in CI
    - Catches major regressions

    Note: Tolerance is 1e-5 because robust uses Huber (sklearn) while v4 uses OLS (NumPy).
    This is still very tight agreement - any major regression will be caught.
    """
    # Tolerance: 1e-5 is realistic for different implementations
    # (Huber vs OLS, sklearn vs NumPy).
    TOLERANCE = 1e-5
    banner = '=' * 60

    df, info = create_small_test_data(seed=42)
    gb_cols = ['xBin', 'y2xBin', 'z2xBin']
    sel = pd.Series(True, index=df.index)

    print("\n" + banner)
    print("Cross-Validation: Robust vs v4")
    print(f"Dataset: {info['n_groups']} groups, {info['n_rows']} rows")
    print(banner)

    # Robust implementation (uses Huber regression)
    _, dfGB_robust = GroupByRegressor.make_parallel_fit(
        df, gb_columns=gb_cols, fit_columns=['dX', 'dY', 'dZ'],
        linear_columns=['deltaIDC'], median_columns=[],
        weights='weight', suffix='_robust',
        selection=sel, n_jobs=1, min_stat=[3, 3, 3],
    )

    # v4 fast implementation (uses pure OLS)
    _, dfGB_v4 = make_parallel_fit_v4(
        df=df, gb_columns=gb_cols, fit_columns=['dX', 'dY', 'dZ'],
        linear_columns=['deltaIDC'], median_columns=[],
        weights='weight', suffix='_v4',
        selection=sel, min_stat=3,
    )

    print("\nGroups fitted:")
    print(f" Robust: {len(dfGB_robust)}")
    print(f" v4: {len(dfGB_v4)}")

    # Inner join on the group keys - only compare groups both fitted.
    merged = dfGB_robust.merge(dfGB_v4, on=gb_cols, suffixes=('_robust', '_v4'))

    print(f" Both: {len(merged)} (comparing these)")

    assert len(merged) > 0.9 * info['n_groups'], \
        f"Too few groups in common: {len(merged)}/{info['n_groups']}"

    print("\nNumerical agreement check:")

    for target in ('dX', 'dY', 'dZ'):
        # Absolute per-group differences between the two implementations.
        slope_diff = (merged[f'{target}_slope_deltaIDC_robust']
                      - merged[f'{target}_slope_deltaIDC_v4']).abs()
        intercept_diff = (merged[f'{target}_intercept_robust']
                          - merged[f'{target}_intercept_v4']).abs()

        max_slope_diff = slope_diff.max()
        max_intercept_diff = intercept_diff.max()

        print(f"\n{target}:")
        print(f" Slope: max={max_slope_diff:.2e}, mean={slope_diff.mean():.2e}")
        print(f" Intercept: max={max_intercept_diff:.2e}")

        # Assert reasonable agreement per target.
        assert max_slope_diff < TOLERANCE, \
            f"{target} slope: robust vs v4 differ by {max_slope_diff:.2e} (tolerance {TOLERANCE})"
        assert max_intercept_diff < TOLERANCE, \
            f"{target} intercept: robust vs v4 differ by {max_intercept_diff:.2e} (tolerance {TOLERANCE})"

    print(f"\n✅ Numerical agreement verified: {len(merged)} groups agree within {TOLERANCE}")
    print(" (Tolerance reflects Huber vs OLS implementation difference)")
    print(banner + "\n")
169+
170+
171+
def test_robust_vs_v2_structural_agreement():
    """
    Verify robust and v2 produce same group structure.

    Tests the v2 multi-target bug fix: should have one row per group, not 3×.
    This was a critical bug where multi-target fits produced duplicate rows.
    """
    banner = '=' * 60
    df, info = create_small_test_data(seed=123)
    gb_cols = ['xBin', 'y2xBin', 'z2xBin']
    sel = pd.Series(True, index=df.index)

    print("\n" + banner)
    print("Structural Agreement: Robust vs v2")
    print(f"Dataset: {info['n_groups']} groups, {info['n_rows']} rows")
    print(banner)

    # Reference (robust) fit.
    _, dfGB_robust = GroupByRegressor.make_parallel_fit(
        df, gb_columns=gb_cols, fit_columns=['dX', 'dY', 'dZ'],
        linear_columns=['deltaIDC'], median_columns=[],
        weights='weight', suffix='_robust',
        selection=sel, n_jobs=1, min_stat=[3],
    )

    # Optimized v2 fit.
    _, dfGB_v2 = make_parallel_fit_v2(
        df, gb_columns=gb_cols, fit_columns=['dX', 'dY', 'dZ'],
        linear_columns=['deltaIDC'], median_columns=[],
        weights='weight', suffix='_v2',
        selection=sel, n_jobs=1, min_stat=[3],
    )

    print(f"\nRobust groups: {len(dfGB_robust)}")
    print(f"v2 groups: {len(dfGB_v2)}")

    # Exactly one output row per group is expected from both implementations
    # (the v2 bug produced one row per target, i.e. a 3x blow-up).
    assert len(dfGB_robust) == info['n_groups'], \
        f"Robust: expected {info['n_groups']} rows, got {len(dfGB_robust)}"
    assert len(dfGB_v2) == info['n_groups'], \
        f"v2 bug regression: expected {info['n_groups']} rows, got {len(dfGB_v2)}"

    # Additionally verify that no group key appears more than once.
    for frame, name in ((dfGB_robust, 'robust'), (dfGB_v2, 'v2')):
        sizes = frame.groupby(gb_cols).size()
        duplicates = sizes[sizes > 1]

        if len(duplicates) > 0:
            print(f"\n{name}: Found duplicate groups:")
            print(duplicates.head())

        assert (sizes == 1).all(), \
            f"{name}: Some groups appear multiple times! Found {len(duplicates)} duplicates"

    print("\n✅ Structural agreement verified:")
    print(f" - Both have {info['n_groups']} rows (one per group)")
    print(" - No duplicate groups in either implementation")
    print(banner + "\n")
230+
231+
232+
def test_robust_vs_v4_agreement_on_common_groups():
    """
    Verify agreement when both implementations fit the same groups.

    This test is more lenient - it only compares groups that BOTH fitted,
    without requiring they fit the exact same set of groups.
    """
    banner = '=' * 60
    df, info = create_small_test_data(seed=999)
    gb_cols = ['xBin', 'y2xBin', 'z2xBin']

    # Use all data with a trivially-true selection.
    sel = pd.Series(True, index=df.index)

    print("\n" + banner)
    print("Agreement on Common Groups: Robust vs v4")
    print(f"Dataset: {info['n_groups']} groups")
    print(banner)

    # Robust (Huber) fit of the single dX target.
    _, dfGB_robust = GroupByRegressor.make_parallel_fit(
        df, gb_columns=gb_cols, fit_columns=['dX'],
        linear_columns=['deltaIDC'], median_columns=[],
        weights='weight', suffix='_robust',
        selection=sel, n_jobs=1, min_stat=[3],
    )

    # Optimized v4 (OLS) fit of the same target.
    _, dfGB_v4 = make_parallel_fit_v4(
        df=df, gb_columns=gb_cols, fit_columns=['dX'],
        linear_columns=['deltaIDC'], median_columns=[],
        weights='weight', suffix='_v4',
        selection=sel, min_stat=3,
    )

    print("\nGroups fitted:")
    print(f" Robust: {len(dfGB_robust)}")
    print(f" v4: {len(dfGB_v4)}")

    # Inner join keeps only groups fitted by both implementations.
    merged = dfGB_robust.merge(dfGB_v4, on=gb_cols, suffixes=('_robust', '_v4'))

    print(f" Common: {len(merged)}")

    # Most groups should still be in common.
    assert len(merged) > 0.8 * info['n_groups'], \
        f"Too few groups in common: {len(merged)}/{info['n_groups']}"

    if len(merged) > 0:
        slope_diff = (merged['dX_slope_deltaIDC_robust']
                      - merged['dX_slope_deltaIDC_v4']).abs()
        max_diff = slope_diff.max()

        print(f"\nFor {len(merged)} common groups:")
        print(f" Max slope difference: {max_diff:.2e}")
        print(f" Mean slope difference: {slope_diff.mean():.2e}")

        assert max_diff < 1e-5, f"Slope difference too large: {max_diff}"

    print("\n✅ Agreement verified on common groups")
    print(banner + "\n")
297+
298+
299+
if __name__ == '__main__':
    # Allow running this module directly: verbose pytest with stdout shown.
    args = [__file__, '-v', '-s']
    pytest.main(args)

0 commit comments

Comments
 (0)