From 38eb02089ec09ca1401b8caea7f072dce1c42706 Mon Sep 17 00:00:00 2001
From: Aurash Karimi <a.karimi@imperial.ac.uk>
Date: Tue, 19 May 2026 13:16:10 +0100
Subject: [PATCH 1/6] add similar library to render diffs from regression test
 results

---
 Cargo.lock          | 10 ++++++
 Cargo.toml          |  1 +
 tests/regression.rs | 77 ++++++++++++++++++++++++++++-----------------
 3 files changed, 59 insertions(+), 29 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index b37a8226f..585d2bc86 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1051,6 +1051,7 @@ dependencies = [
  "rstest",
  "serde",
  "serde_string_enum",
+ "similar",
  "strum 0.28.0",
  "tempfile",
  "toml",
@@ -1513,6 +1514,15 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 
+[[package]]
+name = "similar"
+version = "3.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04d93e861ede2e497b47833469b8ec9d5c07fa4c78ce7a00f6eb7dd8168b4b3f"
+dependencies = [
+ "bstr",
+]
+
 [[package]]
 name = "siphasher"
 version = "1.0.1"
diff --git a/Cargo.toml b/Cargo.toml
index 1e71909c0..0a28dcfd7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -43,6 +43,7 @@ erased-serde = "0.4.10"
 assert_cmd = "2.2.2"
 map-macro = "0.3.0"
 rstest = {version = "0.26.1", default-features = false, features = ["crate-name"]}
+similar = "3.1.0"
 yaml-rust2 = {version = "0.11.0", default-features = false}
 
 [build-dependencies]
diff --git a/tests/regression.rs b/tests/regression.rs
index 9466d08c0..8b156395a 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -2,7 +2,9 @@
 use anyhow::Result;
 use float_cmp::approx_eq;
 use itertools::Itertools;
+use similar::{ChangeTag, TextDiff};
 use std::env;
+use std::fmt::Write as _;
 use std::fs::{File, read_dir};
 use std::io::{BufRead, BufReader};
 use std::path::{Path, PathBuf};
@@ -95,43 +97,28 @@ fn compare_lines(
     let lines1 = read_lines(&output_dir1.join(file_name));
     let lines2 = read_lines(&output_dir2.join(file_name));
 
-    // Check for different number of lines
-    if lines1.len() != lines2.len() {
-        errors.push(format!(
-            "{file_name}: Different number of lines: {} vs {}",
-            lines1.len(),
-            lines2.len()
-        ));
+    // check number of lines equal
+    let mut has_mismatch = lines1.len() != lines2.len();
+
+    // check each line is the same within numerical tolerance
+    if !has_mismatch {
+        has_mismatch = lines1
+            .iter()
+            .zip(&lines2)
+            .any(|(line1, line2)| !compare_line(line1, line2));
     }
 
-    // Compare each line
-    for (idx, (line1, line2)) in lines1.into_iter().zip(lines2).enumerate() {
-        let line_num = idx + 1; // (1-based) line number
-        if !compare_line(line_num, &line1, &line2, file_name, errors) {
-            errors.push(format!(
-                "{file_name}: line {line_num}:\n    + \"{line1}\"\n    - \"{line2}\""
-            ));
-        }
+    if has_mismatch {
+        let diff = render_diff(&lines1, &lines2);
+        errors.push(format!("{file_name}: output differs\n{diff}"));
     }
 }
 
-fn compare_line(
-    num: usize,
-    line1: &str,
-    line2: &str,
-    file_name: &str,
-    errors: &mut Vec<String>,
-) -> bool {
+fn compare_line(line1: &str, line2: &str) -> bool {
     let fields1 = line1.split(',').collect_vec();
     let fields2 = line2.split(',').collect_vec();
     if fields1.len() != fields2.len() {
-        errors.push(format!(
-            "{}: line {}: Different number of fields: {} vs {}",
-            file_name,
-            num,
-            fields1.len(),
-            fields2.len()
-        ));
+        return false;
     }
 
     // Check every field matches
@@ -141,6 +128,38 @@ fn compare_line(
     })
 }
 
+/// Given two lists of lines which don't match, render a diff showing
+/// which lines were added/removed, with line numbers from the original files.
+fn render_diff(lines1: &[String], lines2: &[String]) -> String {
+    let text1 = lines1.join("\n");
+    let text2 = lines2.join("\n");
+    let diff = TextDiff::from_lines(&text1, &text2);
+
+    let mut out = String::new();
+    let mut line_num1 = 1;
+    let mut line_num2 = 1;
+    for change in diff.iter_all_changes() {
+        let line = change.to_string();
+        let line = line.trim_end_matches('\n');
+        match change.tag() {
+            ChangeTag::Delete => {
+                let _ = writeln!(out, "-L{line_num1}: {line}");
+                line_num1 += 1;
+            }
+            ChangeTag::Insert => {
+                let _ = writeln!(out, "+L{line_num2}: {line}");
+                line_num2 += 1;
+            }
+            ChangeTag::Equal => {
+                line_num1 += 1;
+                line_num2 += 1;
+            }
+        }
+    }
+
+    out
+}
+
 /// Parse a string into an `f64`, returning `None` if parsing fails or value is infinite/NaN
 fn parse_finite(s: &str) -> Option<f64> {
     s.parse().ok().filter(|f: &f64| f.is_finite())

From 50ad02312bab406e73ce0cdaaf68f154d6504953 Mon Sep 17 00:00:00 2001
From: Aurash Karimi <a.karimi@imperial.ac.uk>
Date: Tue, 19 May 2026 16:01:30 +0100
Subject: [PATCH 2/6] Make diffing of regression tests less spammy when most
 lines are within floating precision add tests that check diffs rendered are
 as expected

---
 Cargo.lock          |  10 +++
 Cargo.toml          |   1 +
 tests/regression.rs | 204 +++++++++++++++++++++++++++++++++++++-------
 3 files changed, 185 insertions(+), 30 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 585d2bc86..e2cb20d89 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1046,6 +1046,7 @@ dependencies = [
  "itertools 0.14.0",
  "log",
  "map-macro",
+ "ordered-float",
  "petgraph",
  "platform-info",
  "rstest",
@@ -1144,6 +1145,15 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
 
+[[package]]
+name = "ordered-float"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "percent-encoding"
 version = "2.3.2"
diff --git a/Cargo.toml b/Cargo.toml
index 0a28dcfd7..3b4d2d7f8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -42,6 +42,7 @@ erased-serde = "0.4.10"
 [dev-dependencies]
 assert_cmd = "2.2.2"
 map-macro = "0.3.0"
+ordered-float = "5.1.0"
 rstest = {version = "0.26.1", default-features = false, features = ["crate-name"]}
 similar = "3.1.0"
 yaml-rust2 = {version = "0.11.0", default-features = false}
diff --git a/tests/regression.rs b/tests/regression.rs
index 8b156395a..da628b6ea 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -2,7 +2,8 @@
 use anyhow::Result;
 use float_cmp::approx_eq;
 use itertools::Itertools;
-use similar::{ChangeTag, TextDiff};
+use ordered_float::NotNan;
+use similar::{Algorithm, DiffOp, DiffTag, capture_diff_slices};
 use std::env;
 use std::fmt::Write as _;
 use std::fs::{File, read_dir};
@@ -97,10 +98,8 @@ fn compare_lines(
     let lines1 = read_lines(&output_dir1.join(file_name));
     let lines2 = read_lines(&output_dir2.join(file_name));
 
-    // check number of lines equal
+    // Check whether files differ using the existing field-by-field tolerance rules.
     let mut has_mismatch = lines1.len() != lines2.len();
-
-    // check each line is the same within numerical tolerance
     if !has_mismatch {
         has_mismatch = lines1
             .iter()
@@ -109,7 +108,8 @@ fn compare_lines(
     }
 
     if has_mismatch {
-        let diff = render_diff(&lines1, &lines2);
+        let diff_ops = capture_csv_diff_ops(&lines1, &lines2);
+        let diff = render_diff(&diff_ops, &lines1, &lines2);
         errors.push(format!("{file_name}: output differs\n{diff}"));
     }
 }
@@ -121,38 +121,70 @@ fn compare_line(line1: &str, line2: &str) -> bool {
         return false;
     }
 
-    // Check every field matches
-    fields1.into_iter().zip(fields2).all(|(f1, f2)| {
-        // First try to compare fields as floating-point values, falling back on string comparison
-        try_compare_floats(f1, f2).unwrap_or_else(|| f1 == f2)
-    })
+    fields1
+        .into_iter()
+        .zip(fields2)
+        .all(|(f1, f2)| try_compare_floats(f1, f2).unwrap_or_else(|| f1 == f2))
+}
+
+fn capture_csv_diff_ops(lines1: &[String], lines2: &[String]) -> Vec<DiffOp> {
+    let parsed1 = parse_csv_lines(lines1);
+    let parsed2 = parse_csv_lines(lines2);
+    capture_diff_slices(Algorithm::Myers, &parsed1, &parsed2)
+}
+
+fn has_non_equal_diff_ops(diff_ops: &[DiffOp]) -> bool {
+    diff_ops.iter().any(|op| op.tag() != DiffTag::Equal)
 }
 
 /// Given two lists of lines which don't match, render a diff showing
 /// which lines were added/removed, with line numbers from the original files.
-fn render_diff(lines1: &[String], lines2: &[String]) -> String {
-    let text1 = lines1.join("\n");
-    let text2 = lines2.join("\n");
-    let diff = TextDiff::from_lines(&text1, &text2);
-
+fn render_diff(diff_ops: &[DiffOp], lines1: &[String], lines2: &[String]) -> String {
     let mut out = String::new();
-    let mut line_num1 = 1;
-    let mut line_num2 = 1;
-    for change in diff.iter_all_changes() {
-        let line = change.to_string();
-        let line = line.trim_end_matches('\n');
-        match change.tag() {
-            ChangeTag::Delete => {
-                let _ = writeln!(out, "-L{line_num1}: {line}");
-                line_num1 += 1;
+    for op in diff_ops {
+        let (tag, old_range, new_range) = op.as_tag_tuple();
+        match tag {
+            DiffTag::Equal => {}
+            DiffTag::Delete => {
+                for old_idx in old_range {
+                    let _ = writeln!(out, "-L{}: {}", old_idx + 1, lines1[old_idx]);
+                }
             }
-            ChangeTag::Insert => {
-                let _ = writeln!(out, "+L{line_num2}: {line}");
-                line_num2 += 1;
+            DiffTag::Insert => {
+                for new_idx in new_range {
+                    let _ = writeln!(out, "+L{}: {}", new_idx + 1, lines2[new_idx]);
+                }
             }
-            ChangeTag::Equal => {
-                line_num1 += 1;
-                line_num2 += 1;
+            DiffTag::Replace => {
+                let old_start = old_range.start;
+                let new_start = new_range.start;
+                let paired_len = old_range.len().min(new_range.len());
+
+                for idx in 0..paired_len {
+                    let old_idx = old_start + idx;
+                    let new_idx = new_start + idx;
+                    if !compare_line(&lines1[old_idx], &lines2[new_idx]) {
+                        let _ = writeln!(out, "-L{}: {}", old_idx + 1, lines1[old_idx]);
+                        let _ = writeln!(out, "+L{}: {}", new_idx + 1, lines2[new_idx]);
+                    }
+                }
+
+                for (old_idx, line) in lines1
+                    .iter()
+                    .enumerate()
+                    .take(old_range.end)
+                    .skip(old_start + paired_len)
+                {
+                    let _ = writeln!(out, "-L{}: {}", old_idx + 1, line);
+                }
+                for (new_idx, line) in lines2
+                    .iter()
+                    .enumerate()
+                    .take(new_range.end)
+                    .skip(new_start + paired_len)
+                {
+                    let _ = writeln!(out, "+L{}: {}", new_idx + 1, line);
+                }
             }
         }
     }
@@ -160,6 +192,43 @@ fn render_diff(lines1: &[String], lines2: &[String]) -> String {
     out
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+struct CsvLine(Vec<CsvField>);
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+enum CsvField {
+    Float(NotNan<f64>),
+    Text(String),
+}
+
+fn parse_csv_lines(lines: &[String]) -> Vec<CsvLine> {
+    lines.iter().map(|line| parse_csv_line(line)).collect()
+}
+
+fn parse_csv_line(line: &str) -> CsvLine {
+    CsvLine(line.split(',').map(parse_csv_field).collect())
+}
+
+fn parse_csv_field(field: &str) -> CsvField {
+    if let Some(float) = parse_finite(field) {
+        CsvField::Float(quantise_float(float))
+    } else {
+        CsvField::Text(field.to_string())
+    }
+}
+
+fn quantise_float(value: f64) -> NotNan<f64> {
+    let scaled = value / FLOAT_CMP_TOLERANCE;
+    let quantised = if scaled.is_finite() {
+        scaled.round() * FLOAT_CMP_TOLERANCE
+    } else {
+        value
+    };
+
+    let quantised = if quantised == -0.0 { 0.0 } else { quantised };
+    NotNan::new(quantised).expect("quantised float should always be finite")
+}
+
 /// Parse a string into an `f64`, returning `None` if parsing fails or value is infinite/NaN
 fn parse_finite(s: &str) -> Option<f64> {
     s.parse().ok().filter(|f: &f64| f.is_finite())
@@ -204,3 +273,78 @@ fn read_lines(path: &Path) -> Vec<String> {
         .map_while(Result::ok)
         .collect()
 }
+
+#[test]
+fn tolerated_float_change_yields_no_diff_ops() {
+    let old_lines = vec!["asset,1.00000000001".to_string()];
+    let new_lines = vec!["asset,1.00000000002".to_string()];
+
+    let diff_ops = capture_csv_diff_ops(&old_lines, &new_lines);
+    assert!(!has_non_equal_diff_ops(&diff_ops));
+}
+
+#[test]
+fn parse_csv_lines_normalises_floats_within_tolerance() {
+    let lines1 = vec![
+        "asset_a,1.000000000001,region_1".to_string(),
+        "asset_b,2.5,region_2".to_string(),
+    ];
+    let lines2 = vec![
+        "asset_a,1.000000000002,region_1".to_string(),
+        "asset_b,2.5,region_2".to_string(),
+    ];
+
+    let parsed1 = parse_csv_lines(&lines1);
+    let parsed2 = parse_csv_lines(&lines2);
+
+    assert_eq!(parsed1, parsed2);
+}
+
+#[test]
+fn render_diff_ignores_tolerated_float_changes() {
+    let old_lines = vec![
+        "asset_a,1.00000000001".to_string(),
+        "asset_b,2.0".to_string(),
+    ];
+    let new_lines = vec![
+        "asset_a,1.00000000002".to_string(),
+        "asset_b,3.0".to_string(),
+    ];
+
+    let diff_ops = capture_csv_diff_ops(&old_lines, &new_lines);
+    let diff = render_diff(&diff_ops, &old_lines, &new_lines);
+
+    assert!(!diff.contains("asset_a"));
+    assert!(diff.contains("asset_b,2.0"));
+    assert!(diff.contains("asset_b,3.0"));
+}
+
+#[test]
+fn render_diff_ignores_tolerated_float_differences_one_line_missing() {
+    let old_lines = vec![
+        "asset_a,1.000000000001".to_string(),
+        "asset_b,2.000000000001".to_string(),
+        "asset_c,3.0".to_string(),
+        "asset_d,4.000000000001".to_string(),
+        "asset_e,5.000000000001".to_string(),
+        "asset_f,6.000000000001".to_string(),
+    ];
+    let new_lines = vec![
+        "asset_a,1.000000000002".to_string(),
+        "asset_b,2.000000000002".to_string(),
+        "asset_d,4.000000000002".to_string(),
+        "asset_e,5.000000000002".to_string(),
+        "asset_f,6.000000000002".to_string(),
+    ];
+
+    let diff_ops = capture_csv_diff_ops(&old_lines, &new_lines);
+    let diff = render_diff(&diff_ops, &old_lines, &new_lines);
+
+    assert!(!diff.contains("asset_a"));
+    assert!(!diff.contains("asset_b"));
+    assert!(!diff.contains("asset_d"));
+    assert!(!diff.contains("asset_e"));
+    assert!(!diff.contains("asset_f"));
+    assert!(diff.contains("-L3: asset_c,3.0"));
+    assert!(!diff.contains("asset_c,9.0"));
+}

From 797fdf8777cc88411ac4337d318649f9a60b7f2e Mon Sep 17 00:00:00 2001
From: Aurash Karimi <a.karimi@imperial.ac.uk>
Date: Tue, 19 May 2026 16:12:19 +0100
Subject: [PATCH 3/6] add test for quantisation edge case

---
 tests/regression.rs | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/tests/regression.rs b/tests/regression.rs
index da628b6ea..a42e67bad 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -137,8 +137,8 @@ fn has_non_equal_diff_ops(diff_ops: &[DiffOp]) -> bool {
     diff_ops.iter().any(|op| op.tag() != DiffTag::Equal)
 }
 
-/// Given two lists of lines which don't match, render a diff showing
-/// which lines were added/removed, with line numbers from the original files.
+/// Render a line-based diff from `DiffOp`s, including old/new line numbers.
+/// For replaced lines, pairs that are equal under `compare_line` are omitted.
 fn render_diff(diff_ops: &[DiffOp], lines1: &[String], lines2: &[String]) -> String {
     let mut out = String::new();
     for op in diff_ops {
@@ -348,3 +348,23 @@ fn render_diff_ignores_tolerated_float_differences_one_line_missing() {
     assert!(diff.contains("-L3: asset_c,3.0"));
     assert!(!diff.contains("asset_c,9.0"));
 }
+
+#[test]
+fn render_diff_ignores_quantisation_boundary_tolerance_case() {
+    let old_lines = vec![
+        "asset_a,25.852906323049822".to_string(),
+        "asset_b,2.0".to_string(),
+    ];
+    let new_lines = vec![
+        "asset_a,25.852906323050078".to_string(),
+        "asset_b,3.0".to_string(),
+    ];
+
+    // `asset_a` differs only within tolerance, while `asset_b` is a real change.
+    let diff_ops = capture_csv_diff_ops(&old_lines, &new_lines);
+    let diff = render_diff(&diff_ops, &old_lines, &new_lines);
+
+    assert!(!diff.contains("asset_a"));
+    assert!(diff.contains("-L2: asset_b,2.0"));
+    assert!(diff.contains("+L2: asset_b,3.0"));
+}

From 3d5b4a71e8e3c8d2be4d0de1c77db3042d362bcc Mon Sep 17 00:00:00 2001
From: Aurash Karimi <a.karimi@imperial.ac.uk>
Date: Wed, 20 May 2026 11:37:04 +0100
Subject: [PATCH 4/6] add colours to  diffing render

---
 Cargo.lock          | 12 +++++++++++-
 Cargo.toml          |  1 +
 tests/regression.rs | 29 +++++++++++++++++++++++------
 3 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e2cb20d89..6339f39c6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -311,6 +311,15 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "colored"
+version = "3.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34"
+dependencies = [
+ "windows-sys 0.61.0",
+]
+
 [[package]]
 name = "convert_case"
 version = "0.8.0"
@@ -499,7 +508,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4316185f709b23713e41e3195f90edef7fb00c3ed4adc79769cf09cc762a3b29"
 dependencies = [
  "chrono",
- "colored",
+ "colored 2.2.0",
  "log",
 ]
 
@@ -1031,6 +1040,7 @@ dependencies = [
  "chrono",
  "clap",
  "clap-markdown",
+ "colored 3.1.1",
  "csv",
  "derive_more",
  "dirs",
diff --git a/Cargo.toml b/Cargo.toml
index 3b4d2d7f8..88d625913 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,6 +41,7 @@ erased-serde = "0.4.10"
 
 [dev-dependencies]
 assert_cmd = "2.2.2"
+colored = "3.0.0"
 map-macro = "0.3.0"
 ordered-float = "5.1.0"
 rstest = {version = "0.26.1", default-features = false, features = ["crate-name"]}
diff --git a/tests/regression.rs b/tests/regression.rs
index a42e67bad..389c86ae8 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -1,5 +1,6 @@
 //! Common code for running regression tests.
 use anyhow::Result;
+use colored::Colorize;
 use float_cmp::approx_eq;
 use itertools::Itertools;
 use ordered_float::NotNan;
@@ -147,12 +148,20 @@ fn render_diff(diff_ops: &[DiffOp], lines1: &[String], lines2: &[String]) -> Str
             DiffTag::Equal => {}
             DiffTag::Delete => {
                 for old_idx in old_range {
-                    let _ = writeln!(out, "-L{}: {}", old_idx + 1, lines1[old_idx]);
+                    let _ = writeln!(
+                        out,
+                        "{}",
+                        format!("-L{}: {}", old_idx + 1, lines1[old_idx]).red()
+                    );
                 }
             }
             DiffTag::Insert => {
                 for new_idx in new_range {
-                    let _ = writeln!(out, "+L{}: {}", new_idx + 1, lines2[new_idx]);
+                    let _ = writeln!(
+                        out,
+                        "{}",
+                        format!("+L{}: {}", new_idx + 1, lines2[new_idx]).green()
+                    );
                 }
             }
             DiffTag::Replace => {
@@ -164,8 +173,16 @@ fn render_diff(diff_ops: &[DiffOp], lines1: &[String], lines2: &[String]) -> Str
                     let old_idx = old_start + idx;
                     let new_idx = new_start + idx;
                     if !compare_line(&lines1[old_idx], &lines2[new_idx]) {
-                        let _ = writeln!(out, "-L{}: {}", old_idx + 1, lines1[old_idx]);
-                        let _ = writeln!(out, "+L{}: {}", new_idx + 1, lines2[new_idx]);
+                        let _ = writeln!(
+                            out,
+                            "{}",
+                            format!("-L{}: {}", old_idx + 1, lines1[old_idx]).red()
+                        );
+                        let _ = writeln!(
+                            out,
+                            "{}",
+                            format!("+L{}: {}", new_idx + 1, lines2[new_idx]).green()
+                        );
                     }
                 }
 
@@ -175,7 +192,7 @@ fn render_diff(diff_ops: &[DiffOp], lines1: &[String], lines2: &[String]) -> Str
                     .take(old_range.end)
                     .skip(old_start + paired_len)
                 {
-                    let _ = writeln!(out, "-L{}: {}", old_idx + 1, line);
+                    let _ = writeln!(out, "{}", format!("-L{}: {}", old_idx + 1, line).red());
                 }
                 for (new_idx, line) in lines2
                     .iter()
@@ -183,7 +200,7 @@ fn render_diff(diff_ops: &[DiffOp], lines1: &[String], lines2: &[String]) -> Str
                     .take(new_range.end)
                     .skip(new_start + paired_len)
                 {
-                    let _ = writeln!(out, "+L{}: {}", new_idx + 1, line);
+                    let _ = writeln!(out, "{}", format!("+L{}: {}", new_idx + 1, line).green());
                 }
             }
         }

From c9da3d583c94b4a5f4084c252e2e0245eaba8930 Mon Sep 17 00:00:00 2001
From: Aurash Karimi <a.karimi@imperial.ac.uk>
Date: Thu, 28 May 2026 14:25:45 +0100
Subject: [PATCH 5/6] add binning method for floating comparisons add more
 robust testing and more cases for csv diff

---
 tests/regression.rs | 583 +++++++++++++++++++++++++++-----------------
 1 file changed, 355 insertions(+), 228 deletions(-)

diff --git a/tests/regression.rs b/tests/regression.rs
index 389c86ae8..4c4cce222 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -1,12 +1,9 @@
 //! Common code for running regression tests.
 use anyhow::Result;
 use colored::Colorize;
-use float_cmp::approx_eq;
-use itertools::Itertools;
-use ordered_float::NotNan;
-use similar::{Algorithm, DiffOp, DiffTag, capture_diff_slices};
+use similar::{Algorithm, ChangeTag, TextDiff};
 use std::env;
-use std::fmt::Write as _;
+use std::fmt::Write;
 use std::fs::{File, read_dir};
 use std::io::{BufRead, BufReader};
 use std::path::{Path, PathBuf};
@@ -38,9 +35,19 @@ define_regression_test_with_patches!(simple_ironing_out);
 
 // ------  END: regression tests  ------
 
-/// The tolerance when comparing floating-point values in CSV files
+/// Tolerance for comparing floating-point values in CSV lines.
 const FLOAT_CMP_TOLERANCE: f64 = 1e-10;
 
+/// Return a value offset from the base by a fraction of the tolerance.
+fn offset_from(base: f64, multiplier: f64) -> f64 {
+    base + FLOAT_CMP_TOLERANCE * multiplier
+}
+
+/// Build a CSV line from a name and floating-point value.
+fn csv_line(name: &str, value: f64) -> String {
+    format!("{name},{value}")
+}
+
 /// Run a regression test for the given example with optional extra arguments to `muse2 run`.
 fn run_regression_test(example: &str, extra_args: &[&str]) {
     // Allow user to set output dir for regression tests so they can examine results. This is
@@ -80,7 +87,9 @@ fn compare_output_dirs(cur_output_dir1: &Path, test_data_dir: &Path, debug_model
 
     let mut errors = Vec::new();
     for file_name in file_names1 {
-        compare_lines(cur_output_dir1, test_data_dir, &file_name, &mut errors);
+        if let Some(diff) = diff_csv_file(cur_output_dir1, test_data_dir, &file_name) {
+            errors.push(format!("{file_name}: output differs\n{diff}"));
+        }
     }
 
     assert!(
@@ -90,177 +99,197 @@ fn compare_output_dirs(cur_output_dir1: &Path, test_data_dir: &Path, debug_model
     );
 }
 
-fn compare_lines(
-    output_dir1: &Path,
-    output_dir2: &Path,
-    file_name: &str,
-    errors: &mut Vec<String>,
-) {
+fn diff_csv_file(output_dir1: &Path, output_dir2: &Path, file_name: &str) -> Option<String> {
     let lines1 = read_lines(&output_dir1.join(file_name));
     let lines2 = read_lines(&output_dir2.join(file_name));
 
-    // Check whether files differ using the existing field-by-field tolerance rules.
-    let mut has_mismatch = lines1.len() != lines2.len();
-    if !has_mismatch {
-        has_mismatch = lines1
-            .iter()
-            .zip(&lines2)
-            .any(|(line1, line2)| !compare_line(line1, line2));
-    }
-
-    if has_mismatch {
-        let diff_ops = capture_csv_diff_ops(&lines1, &lines2);
-        let diff = render_diff(&diff_ops, &lines1, &lines2);
-        errors.push(format!("{file_name}: output differs\n{diff}"));
-    }
+    compute_normalised_diff(&lines1, &lines2)
 }
 
-fn compare_line(line1: &str, line2: &str) -> bool {
-    let fields1 = line1.split(',').collect_vec();
-    let fields2 = line2.split(',').collect_vec();
-    if fields1.len() != fields2.len() {
-        return false;
+/// Compute a line diff after replacing tolerance-equivalent rows with a shared canonical row.
+fn compute_normalised_diff(lines1: &[String], lines2: &[String]) -> Option<String> {
+    if files_match_with_tolerance(lines1, lines2) {
+        return None;
     }
 
-    fields1
-        .into_iter()
-        .zip(fields2)
-        .all(|(f1, f2)| try_compare_floats(f1, f2).unwrap_or_else(|| f1 == f2))
-}
+    let (normalised_lines1, normalised_lines2) = normalise_tolerant_equivalents(lines1, lines2);
 
-fn capture_csv_diff_ops(lines1: &[String], lines2: &[String]) -> Vec<DiffOp> {
-    let parsed1 = parse_csv_lines(lines1);
-    let parsed2 = parse_csv_lines(lines2);
-    capture_diff_slices(Algorithm::Myers, &parsed1, &parsed2)
+    (normalised_lines1 != normalised_lines2)
+        .then(|| render_normalised_diff(&normalised_lines1, &normalised_lines2))
 }
 
-fn has_non_equal_diff_ops(diff_ops: &[DiffOp]) -> bool {
-    diff_ops.iter().any(|op| op.tag() != DiffTag::Equal)
+/// Quick pre-check to skip normalisation/diff rendering when rows already match within tolerance.
+fn files_match_with_tolerance(lines1: &[String], lines2: &[String]) -> bool {
+    lines1.len() == lines2.len()
+        && lines1
+            .iter()
+            .zip(lines2)
+            .all(|(line1, line2)| lines_match_with_tolerance(line1, line2))
 }
 
-/// Render a line-based diff from `DiffOp`s, including old/new line numbers.
-/// For replaced lines, pairs that are equal under `compare_line` are omitted.
-fn render_diff(diff_ops: &[DiffOp], lines1: &[String], lines2: &[String]) -> String {
+/// Render a human-readable diff from already-normalised lines.
+///
+/// Reported line numbers are relative to the normalised views and may differ from the original
+/// unnormalised row positions when tolerance-based replacements occur.
+fn render_normalised_diff(old_lines: &[String], new_lines: &[String]) -> String {
+    let old = old_lines.join("\n");
+    let new = new_lines.join("\n");
+    let diff = TextDiff::configure()
+        .algorithm(Algorithm::Myers)
+        .diff_lines(&old, &new);
     let mut out = String::new();
-    for op in diff_ops {
-        let (tag, old_range, new_range) = op.as_tag_tuple();
-        match tag {
-            DiffTag::Equal => {}
-            DiffTag::Delete => {
-                for old_idx in old_range {
-                    let _ = writeln!(
-                        out,
-                        "{}",
-                        format!("-L{}: {}", old_idx + 1, lines1[old_idx]).red()
-                    );
-                }
+    let mut old_line_number = 1;
+    let mut new_line_number = 1;
+    let mut old_index = 0usize;
+    let mut new_index = 0usize;
+
+    for change in diff.iter_all_changes() {
+        let line = match change.tag() {
+            ChangeTag::Equal => {
+                old_line_number += 1;
+                new_line_number += 1;
+                old_index += 1;
+                new_index += 1;
+                continue;
             }
-            DiffTag::Insert => {
-                for new_idx in new_range {
-                    let _ = writeln!(
-                        out,
-                        "{}",
-                        format!("+L{}: {}", new_idx + 1, lines2[new_idx]).green()
-                    );
-                }
+            ChangeTag::Delete => {
+                let value = &old_lines[old_index];
+                let line = format!("-L{old_line_number}: {value}").red().to_string();
+                old_line_number += 1;
+                old_index += 1;
+                line
             }
-            DiffTag::Replace => {
-                let old_start = old_range.start;
-                let new_start = new_range.start;
-                let paired_len = old_range.len().min(new_range.len());
-
-                for idx in 0..paired_len {
-                    let old_idx = old_start + idx;
-                    let new_idx = new_start + idx;
-                    if !compare_line(&lines1[old_idx], &lines2[new_idx]) {
-                        let _ = writeln!(
-                            out,
-                            "{}",
-                            format!("-L{}: {}", old_idx + 1, lines1[old_idx]).red()
-                        );
-                        let _ = writeln!(
-                            out,
-                            "{}",
-                            format!("+L{}: {}", new_idx + 1, lines2[new_idx]).green()
-                        );
-                    }
-                }
-
-                for (old_idx, line) in lines1
-                    .iter()
-                    .enumerate()
-                    .take(old_range.end)
-                    .skip(old_start + paired_len)
-                {
-                    let _ = writeln!(out, "{}", format!("-L{}: {}", old_idx + 1, line).red());
-                }
-                for (new_idx, line) in lines2
-                    .iter()
-                    .enumerate()
-                    .take(new_range.end)
-                    .skip(new_start + paired_len)
-                {
-                    let _ = writeln!(out, "{}", format!("+L{}: {}", new_idx + 1, line).green());
-                }
+            ChangeTag::Insert => {
+                let value = &new_lines[new_index];
+                let line = format!("+L{new_line_number}: {value}").green().to_string();
+                new_line_number += 1;
+                new_index += 1;
+                line
             }
-        }
+        };
+        out.push_str(&line);
+        out.push('\n');
     }
 
+    writeln!(
+        out,
+        "\nNote: line numbers and floats are approximate within tolerance: {FLOAT_CMP_TOLERANCE}."
+    )
+    .expect("writing to String should not fail");
+    writeln!(
+        out,
+        "Small discrepancies are possible between the diff and the actual files."
+    )
+    .expect("writing to String should not fail");
+
     out
 }
 
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-struct CsvLine(Vec<CsvField>);
+/// Replace rows that are equal within `FLOAT_CMP_TOLERANCE` with a shared canonical row.
+///
+/// We build bins keyed by canonical line values. A line is added to the first existing bin only
+/// if it matches every row already in that bin; otherwise a new bin is created with that line as
+/// the canonical value.
+///
+/// We process `lines1` first and then `lines2` so canonical keys are stable with respect to the
+/// old output ordering.
+fn normalise_tolerant_equivalents(
+    lines1: &[String],
+    lines2: &[String],
+) -> (Vec<String>, Vec<String>) {
+    let mut bins = Vec::new();
+
+    let mut bins1 = Vec::with_capacity(lines1.len());
+    for line in lines1 {
+        let bin = assign_line_to_bin(line, &mut bins);
+        bins1.push(bin);
+    }
+
+    let mut bins2 = Vec::with_capacity(lines2.len());
+    for line in lines2 {
+        let bin = assign_line_to_bin(line, &mut bins);
+        bins2.push(bin);
+    }
 
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-enum CsvField {
-    Float(NotNan<f64>),
-    Text(String),
+    let normalised1 = bins1
+        .into_iter()
+        .map(|bin| {
+            bins.get(bin)
+                .expect("bin key should exist")
+                .canonical
+                .clone()
+        })
+        .collect();
+    let normalised2 = bins2
+        .into_iter()
+        .map(|bin| {
+            bins.get(bin)
+                .expect("bin key should exist")
+                .canonical
+                .clone()
+        })
+        .collect();
+
+    (normalised1, normalised2)
 }
 
-fn parse_csv_lines(lines: &[String]) -> Vec<CsvLine> {
-    lines.iter().map(|line| parse_csv_line(line)).collect()
+/// Assign a line to the first bin it can join without breaking complete-link tolerance.
+fn assign_line_to_bin(line: &str, bins: &mut Vec<LineBin>) -> usize {
+    if let Some(bin) = find_matching_bin(line, bins) {
+        bins[bin].members.push(line.to_string());
+        return bin;
+    }
+
+    bins.push(LineBin::new(line));
+    bins.len() - 1
 }
 
-fn parse_csv_line(line: &str) -> CsvLine {
-    CsvLine(line.split(',').map(parse_csv_field).collect())
+/// Find the first bin whose current members all match the candidate line.
+fn find_matching_bin(line: &str, bins: &[LineBin]) -> Option<usize> {
+    bins.iter().position(|bin| line_matches_bin(line, bin))
 }
 
-fn parse_csv_field(field: &str) -> CsvField {
-    if let Some(float) = parse_finite(field) {
-        CsvField::Float(quantise_float(float))
-    } else {
-        CsvField::Text(field.to_string())
-    }
+/// Check whether a candidate line is within tolerance of every line already in a bin.
+fn line_matches_bin(line: &str, bin: &LineBin) -> bool {
+    bin.members
+        .iter()
+        .all(|member| lines_match_with_tolerance(line, member))
 }
 
-fn quantise_float(value: f64) -> NotNan<f64> {
-    let scaled = value / FLOAT_CMP_TOLERANCE;
-    let quantised = if scaled.is_finite() {
-        scaled.round() * FLOAT_CMP_TOLERANCE
-    } else {
-        value
-    };
+#[derive(Debug)]
+struct LineBin {
+    canonical: String,
+    members: Vec<String>,
+}
 
-    let quantised = if quantised == -0.0 { 0.0 } else { quantised };
-    NotNan::new(quantised).expect("quantised float should always be finite")
+impl LineBin {
+    /// Create a new bin seeded with the provided line.
+    fn new(line: &str) -> Self {
+        Self {
+            canonical: line.to_string(),
+            members: vec![line.to_string()],
+        }
+    }
 }
 
-/// Parse a string into an `f64`, returning `None` if parsing fails or value is infinite/NaN
+/// Parse a string into an `f64`, returning `None` if parsing fails or value is infinite/NaN.
 fn parse_finite(s: &str) -> Option<f64> {
     s.parse().ok().filter(|f: &f64| f.is_finite())
 }
 
-fn try_compare_floats(s1: &str, s2: &str) -> Option<bool> {
-    let float1 = parse_finite(s1)?;
-    let float2 = parse_finite(s2)?;
+fn lines_match_with_tolerance(line1: &str, line2: &str) -> bool {
+    let fields1: Vec<_> = line1.split(',').collect();
+    let fields2: Vec<_> = line2.split(',').collect();
+    if fields1.len() != fields2.len() {
+        return false;
+    }
 
-    Some(approx_eq!(
-        f64,
-        float1,
-        float2,
-        epsilon = FLOAT_CMP_TOLERANCE
-    ))
+    fields1.into_iter().zip(fields2).all(|(field1, field2)| {
+        match (parse_finite(field1), parse_finite(field2)) {
+            (Some(float1), Some(float2)) => (float1 - float2).abs() <= FLOAT_CMP_TOLERANCE,
+            _ => field1 == field2,
+        }
+    })
 }
 
 /// Get the names of CSV files expected to appear in the given folder
@@ -291,97 +320,195 @@ fn read_lines(path: &Path) -> Vec<String> {
         .collect()
 }
 
-#[test]
-fn tolerated_float_change_yields_no_diff_ops() {
-    let old_lines = vec!["asset,1.00000000001".to_string()];
-    let new_lines = vec!["asset,1.00000000002".to_string()];
+mod tests {
+    use super::*;
 
-    let diff_ops = capture_csv_diff_ops(&old_lines, &new_lines);
-    assert!(!has_non_equal_diff_ops(&diff_ops));
-}
+    #[test]
+    fn tolerated_float_change_is_normalised_away() {
+        let old_lines = vec![csv_line("asset", offset_from(1.0, 0.25))];
+        let new_lines = vec![csv_line("asset", offset_from(1.0, 0.75))];
 
-#[test]
-fn parse_csv_lines_normalises_floats_within_tolerance() {
-    let lines1 = vec![
-        "asset_a,1.000000000001,region_1".to_string(),
-        "asset_b,2.5,region_2".to_string(),
-    ];
-    let lines2 = vec![
-        "asset_a,1.000000000002,region_1".to_string(),
-        "asset_b,2.5,region_2".to_string(),
-    ];
-
-    let parsed1 = parse_csv_lines(&lines1);
-    let parsed2 = parse_csv_lines(&lines2);
-
-    assert_eq!(parsed1, parsed2);
-}
+        let (normalised_old, normalised_new) =
+            normalise_tolerant_equivalents(&old_lines, &new_lines);
+        assert_eq!(normalised_old, normalised_new);
+    }
 
-#[test]
-fn render_diff_ignores_tolerated_float_changes() {
-    let old_lines = vec![
-        "asset_a,1.00000000001".to_string(),
-        "asset_b,2.0".to_string(),
-    ];
-    let new_lines = vec![
-        "asset_a,1.00000000002".to_string(),
-        "asset_b,3.0".to_string(),
-    ];
-
-    let diff_ops = capture_csv_diff_ops(&old_lines, &new_lines);
-    let diff = render_diff(&diff_ops, &old_lines, &new_lines);
-
-    assert!(!diff.contains("asset_a"));
-    assert!(diff.contains("asset_b,2.0"));
-    assert!(diff.contains("asset_b,3.0"));
-}
+    #[test]
+    fn normalise_lines_rounds_float_fields() {
+        let lines1 = vec![
+            csv_line("asset_a", offset_from(1.0, 0.1)) + ",region_1",
+            "asset_b,2.5,region_2".to_string(),
+        ];
+        let expected = vec![
+            csv_line("asset_a", offset_from(1.0, 0.2)) + ",region_1",
+            "asset_b,2.5,region_2".to_string(),
+        ];
+
+        let (normalised_lines, normalised_expected) =
+            normalise_tolerant_equivalents(&lines1, &expected);
+        assert_eq!(normalised_lines, normalised_expected);
+    }
 
-#[test]
-fn render_diff_ignores_tolerated_float_differences_one_line_missing() {
-    let old_lines = vec![
-        "asset_a,1.000000000001".to_string(),
-        "asset_b,2.000000000001".to_string(),
-        "asset_c,3.0".to_string(),
-        "asset_d,4.000000000001".to_string(),
-        "asset_e,5.000000000001".to_string(),
-        "asset_f,6.000000000001".to_string(),
-    ];
-    let new_lines = vec![
-        "asset_a,1.000000000002".to_string(),
-        "asset_b,2.000000000002".to_string(),
-        "asset_d,4.000000000002".to_string(),
-        "asset_e,5.000000000002".to_string(),
-        "asset_f,6.000000000002".to_string(),
-    ];
-
-    let diff_ops = capture_csv_diff_ops(&old_lines, &new_lines);
-    let diff = render_diff(&diff_ops, &old_lines, &new_lines);
-
-    assert!(!diff.contains("asset_a"));
-    assert!(!diff.contains("asset_b"));
-    assert!(!diff.contains("asset_d"));
-    assert!(!diff.contains("asset_e"));
-    assert!(!diff.contains("asset_f"));
-    assert!(diff.contains("-L3: asset_c,3.0"));
-    assert!(!diff.contains("asset_c,9.0"));
-}
+    #[test]
+    fn render_diff_ignores_tolerated_float_changes() {
+        let old_lines = vec![
+            csv_line("asset_a", offset_from(1.0, 0.25)),
+            "asset_b,2.0".to_string(),
+        ];
+        let new_lines = vec![
+            csv_line("asset_a", offset_from(1.0, 0.75)),
+            "asset_b,3.0".to_string(),
+        ];
+
+        let diff = compute_normalised_diff(&old_lines, &new_lines)
+            .expect("asset_b changed so diff is expected");
+
+        assert!(!diff.contains("asset_a"));
+        assert!(diff.contains("asset_b,2.0"));
+        assert!(diff.contains("asset_b,3.0"));
+    }
+
+    #[test]
+    fn render_diff_ignores_tolerated_float_differences_one_line_missing() {
+        let old_lines = vec![
+            csv_line("asset_a", offset_from(1.0, 0.1)),
+            csv_line("asset_b", offset_from(2.0, 0.1)),
+            "asset_c,3.0".to_string(),
+            csv_line("asset_d", offset_from(4.0, 0.1)),
+            csv_line("asset_e", offset_from(5.0, 0.1)),
+            csv_line("asset_f", offset_from(6.0, 0.1)),
+        ];
+        let new_lines = vec![
+            csv_line("asset_a", offset_from(1.0, 0.2)),
+            csv_line("asset_b", offset_from(2.0, 0.2)),
+            csv_line("asset_d", offset_from(4.0, 0.2)),
+            csv_line("asset_e", offset_from(5.0, 0.2)),
+            csv_line("asset_f", offset_from(6.0, 0.2)),
+        ];
+
+        let diff = compute_normalised_diff(&old_lines, &new_lines)
+            .expect("asset_c is missing so this should still report a diff");
+
+        assert!(!diff.contains("asset_a"));
+        assert!(!diff.contains("asset_b"));
+        assert!(!diff.contains("asset_d"));
+        assert!(!diff.contains("asset_e"));
+        assert!(!diff.contains("asset_f"));
+        assert!(diff.contains("-L3: asset_c,3.0"));
+        assert!(!diff.contains("asset_c,9.0"));
+    }
+
+    #[test]
+    fn render_diff_keeps_deleted_line_text_for_almost_duplicate_rows() {
+        let old_lines = vec![
+            csv_line("asset_dup", offset_from(1.0, 0.1)),
+            csv_line("asset_dup", offset_from(1.0, 0.2)),
+        ];
+        let new_lines = vec![csv_line("asset_dup", offset_from(1.0, 0.2))];
+
+        let diff = compute_normalised_diff(&old_lines, &new_lines)
+            .expect("A deleted duplicate-ish row should produce a reported diff");
+
+        let expected_deleted_line = csv_line("asset_dup", offset_from(1.0, 0.1));
+        assert!(
+            diff.contains(&format!("-L1: {expected_deleted_line}")),
+            "Deleted line should be shown with its original text, but diff was:\n{diff}"
+        );
+    }
+
+    #[test]
+    fn diff_computation_correctly_splits_bins_when_rows_exceed_tolerance() {
+        let old_lines = vec![
+            csv_line("asset_a", offset_from(1.0, 0.9)),
+            csv_line("asset_dup", offset_from(2.0, 0.0)),
+            "asset_z,9.0".to_string(),
+            csv_line("asset_dup", offset_from(2.0, 0.9)),
+            "asset_z,9.0".to_string(),
+        ];
+        let new_lines = vec![
+            csv_line("asset_a", offset_from(1.0, 0.9)),
+            csv_line("asset_dup", offset_from(2.0, 0.0)),
+            "asset_z,9.0".to_string(),
+            csv_line("asset_dup", offset_from(2.0, 2.1)),
+            "asset_z,9.0".to_string(),
+        ];
+
+        let diff = compute_normalised_diff(&old_lines, &new_lines)
+            .expect("A deleted row should produce a reported diff");
+
+        // 2.0000000009 is within tolerance of 2.0000000000 so they are binned together.
+        assert!(
+            diff.contains(&format!(
+                "-L4: {}",
+                csv_line("asset_dup", offset_from(2.0, 0.0))
+            )),
+            "The row that stays in the original bin should still be shown, but diff was:\n{diff}"
+        );
+        // 2.0000000021 is outside tolerance of that bin so it must split into a new bin.
+        assert!(
+            diff.contains(&format!(
+                "+L4: {}",
+                csv_line("asset_dup", offset_from(2.0, 2.1))
+            )),
+            "The out-of-tolerance row should split into its own bin, but diff was:\n{diff}"
+        );
+    }
 
-#[test]
-fn render_diff_ignores_quantisation_boundary_tolerance_case() {
-    let old_lines = vec![
-        "asset_a,25.852906323049822".to_string(),
-        "asset_b,2.0".to_string(),
-    ];
-    let new_lines = vec![
-        "asset_a,25.852906323050078".to_string(),
-        "asset_b,3.0".to_string(),
-    ];
-
-    // `asset_a` differs only within tolerance, while `asset_b` is a real change.
-    let diff_ops = capture_csv_diff_ops(&old_lines, &new_lines);
-    let diff = render_diff(&diff_ops, &old_lines, &new_lines);
-
-    assert!(!diff.contains("asset_a"));
-    assert!(diff.contains("-L2: asset_b,2.0"));
-    assert!(diff.contains("+L2: asset_b,3.0"));
+    // Not really a test, just demonstrates the approximation behaviour in a corner case
+    // where there are duplicates, the diff output can only show the actual number in the file
+    // within the tolerance
+    #[test]
+    fn diff_computation_is_approximate_and_order_dependent_part_1() {
+        let old_lines = vec![
+            csv_line("asset_a", offset_from(1.0, 0.1)),
+            csv_line("asset_dup", offset_from(2.0, 0.1)),
+            csv_line("asset_dup", offset_from(2.0, 0.2)),
+            "asset_z,9.0".to_string(),
+        ];
+        let new_lines = vec![
+            csv_line("asset_a", offset_from(1.0, 0.2)),
+            csv_line("asset_dup", offset_from(2.0, 0.2)),
+            "asset_z,9.0".to_string(),
+        ];
+
+        let diff = compute_normalised_diff(&old_lines, &new_lines)
+            .expect("A deleted row should produce a reported diff");
+
+        assert!(
+            diff.contains(&format!(
+                "-L3: {}",
+                csv_line("asset_dup", offset_from(2.0, 0.1))
+            )),
+            "A deleted duplicate-ish row should still be shown, but diff was:\n{diff}"
+        );
+    }
+
+    // Not really a test, just demonstrates the approximation behaviour in a corner case
+    // where there are duplicates, the diff output can only show the actual number in the file
+    // within the tolerance
+    #[test]
+    fn diff_computation_is_approximate_and_order_dependent_part_2() {
+        let old_lines = vec![
+            csv_line("asset_a", offset_from(1.0, 0.25)),
+            csv_line("asset_dup", offset_from(2.0, 0.2)),
+            csv_line("asset_dup", offset_from(2.0, 0.1)),
+            "asset_z,9.0".to_string(),
+        ];
+        let new_lines = vec![
+            csv_line("asset_a", offset_from(1.0, 0.75)),
+            csv_line("asset_dup", offset_from(2.0, 0.2)),
+            "asset_z,9.0".to_string(),
+        ];
+
+        let diff = compute_normalised_diff(&old_lines, &new_lines)
+            .expect("A deleted row should produce a reported diff");
+
+        assert!(
+            diff.contains(&format!(
+                "-L3: {}",
+                csv_line("asset_dup", offset_from(2.0, 0.2))
+            )),
+            "A deleted duplicate-ish row should still be shown, but diff was:\n{diff}"
+        );
+    }
 }

From 3bc9bf45a77cc33534e140bdcc50e14d4f469c96 Mon Sep 17 00:00:00 2001
From: Aurash Karimi <a.karimi@imperial.ac.uk>
Date: Thu, 28 May 2026 14:43:48 +0100
Subject: [PATCH 6/6] move testing helpers to test module

---
 tests/regression.rs | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/regression.rs b/tests/regression.rs
index 4c4cce222..89eac3e2c 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -38,16 +38,6 @@ define_regression_test_with_patches!(simple_ironing_out);
 /// Tolerance for comparing floating-point values in CSV lines.
 const FLOAT_CMP_TOLERANCE: f64 = 1e-10;
 
-/// Return a value offset from the base by a fraction of the tolerance.
-fn offset_from(base: f64, multiplier: f64) -> f64 {
-    base + FLOAT_CMP_TOLERANCE * multiplier
-}
-
-/// Build a CSV line from a name and floating-point value.
-fn csv_line(name: &str, value: f64) -> String {
-    format!("{name},{value}")
-}
-
 /// Run a regression test for the given example with optional extra arguments to `muse2 run`.
 fn run_regression_test(example: &str, extra_args: &[&str]) {
     // Allow user to set output dir for regression tests so they can examine results. This is
@@ -323,6 +313,16 @@ fn read_lines(path: &Path) -> Vec<String> {
 mod tests {
     use super::*;
 
+    /// Return a value offset from the base by a fraction of the tolerance.
+    fn offset_from(base: f64, multiplier: f64) -> f64 {
+        base + FLOAT_CMP_TOLERANCE * multiplier
+    }
+
+    /// Build a CSV line from a name and floating-point value.
+    fn csv_line(name: &str, value: f64) -> String {
+        format!("{name},{value}")
+    }
+
     #[test]
     fn tolerated_float_change_is_normalised_away() {
         let old_lines = vec![csv_line("asset", offset_from(1.0, 0.25))];