Skip to content

Commit 6ebf223

Browse files
author
miranov25
committed
Fix precision measurement in AliasDataFrame compression
Fixes two critical bugs in compress_columns precision measurement:

1. Integer overflow in RMSE calculation:
   - uint8*uint8 arithmetic caused overflow (248*248 wraps in uint8)
   - Cast to float64 before calculation to prevent overflow
   - Added errstate context and robust median fallback

2. Non-finite value handling:
   - NaN/inf values contaminated precision metrics
   - Now filter to finite values before calculating statistics
   - Track and report excluded sample count

Changes:
- Cast original/decompressed to float64 before diff calculation
- Apply finite mask: only calculate metrics on valid (finite) pairs
- Add fields: n_samples, n_total, fraction_nonfinite
- Update describe_compression to show sample counts and non-finite %
- Consistent output structure: always same 6 fields in precision_info

Impact:
- dEdxTPC RMSE now correct: 0.54 (was showing 57.6 due to overflow)
- Diagnostics match ROOT validation
- Clear reporting when data has NaN/inf values

Example output:
  Precision: RMSE=0.545488, Max=2.500000, Mean=0.014717
  Samples: 9,632,172/9,632,172, Non-finite: 0.00%

Related: Compression feature for TPC residuals (35% file size reduction)
ATO-628
1 parent f2e537f commit 6ebf223

File tree

1 file changed

+35
-5
lines changed

1 file changed

+35
-5
lines changed

UTILS/dfextensions/AliasDataFrame.py

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -676,12 +676,36 @@ def compress_columns(self, compression_spec, suffix='_c', drop_original=True,
676676
self.materialize_alias(temp_decompressed)
677677
decompressed_values = self.df[temp_decompressed].values
678678

679-
# Compute precision metrics
680-
diff = original_values - decompressed_values
679+
# Compute precision metrics on finite values only
680+
orig = original_values.astype(np.float64)
681+
decomp = decompressed_values.astype(np.float64)
682+
finite_mask = np.isfinite(orig) & np.isfinite(decomp)
683+
684+
n_total = len(orig)
685+
n_finite = int(finite_mask.sum())
686+
687+
# Always calculate on finite subset (NaN if empty)
688+
if n_finite > 0:
689+
diff = orig[finite_mask] - decomp[finite_mask]
690+
with np.errstate(over='ignore', invalid='ignore'):
691+
rmse = float(np.sqrt(np.mean(diff ** 2)))
692+
if not np.isfinite(rmse):
693+
rmse = float(np.sqrt(np.median(diff ** 2)) * 1.2533)
694+
max_error = float(np.max(np.abs(diff)))
695+
mean_error = float(np.mean(diff))
696+
else:
697+
rmse = float('nan')
698+
max_error = float('nan')
699+
mean_error = float('nan')
700+
701+
# Always same structure
681702
precision_info = {
682-
'rmse': float(np.sqrt(np.mean(diff**2))),
683-
'max_error': float(np.max(np.abs(diff))),
684-
'mean_error': float(np.mean(diff))
703+
'n_samples': n_finite,
704+
'n_total': n_total,
705+
'fraction_nonfinite': float((n_total - n_finite) / n_total) if n_total > 0 else 0.0,
706+
'rmse': rmse,
707+
'max_error': max_error,
708+
'mean_error': mean_error
685709
}
686710

687711
# Clean up temporary column
@@ -883,3 +907,9 @@ def describe_compression(self):
883907
print(f" Precision: RMSE={prec['rmse']:.6f}, "
884908
f"Max={prec['max_error']:.6f}, "
885909
f"Mean={prec['mean_error']:.6f}")
910+
# Add sample count info
911+
n_samples = prec.get('n_samples', 0)
912+
n_total = prec.get('n_total', n_samples)
913+
frac_nonfinite = prec.get('fraction_nonfinite', 0.0)
914+
#if frac_nonfinite >= 0:
915+
print(f" Samples: {n_samples:,}/{n_total:,}, "f"Non-finite: {frac_nonfinite*100:.2f}%")

0 commit comments

Comments
 (0)