From b3c2640721c37870bd521e5a331db143a7ec35ce Mon Sep 17 00:00:00 2001
From: Hiroki Kobayashi <kobayashi@hiramu.net>
Date: Wed, 24 Jun 2026 17:37:07 +0900
Subject: [PATCH] fix(diar): deterministic, robust K-Means re-clustering via
 n_init (best-of-N)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The offline VBx speaker-count adjustment (re-clustering detected clusters down to the
constrained count) called KMeansClustering with a random seed and a single
initialization. This is both non-deterministic and fragile: small/boundary speakers
collapse run-to-run (observed on a 4-speaker meeting clip, cause-(ii) swinging
~10%↔~30% across runs; the smallest speaker's recall flips 80%↔0%).

- KMeansClustering: default unseeded fallback now uses a fixed seed (0) instead of
  UInt64.random; add clusterWithCentroidsNInit which runs N deterministic
  initializations (seeds base..base+N-1) and returns the lowest-inertia result
  (sklearn-style n_init).
- VBxClustering: the speaker-count re-clustering now uses n_init=10, baseSeed=0.

Result: re-clustering is fully deterministic and robustly keeps fragile speakers
(the 4-speaker clip now scores ~9.2% consistently across 5+ runs and on-device).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../Offline/Clustering/KMeansClustering.swift | 41 ++++++++++++++++++-
 .../Offline/Clustering/VBxClustering.swift    |  8 +++-
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/Sources/FluidAudio/Diarizer/Offline/Clustering/KMeansClustering.swift b/Sources/FluidAudio/Diarizer/Offline/Clustering/KMeansClustering.swift
index a0615e119..296c03624 100644
--- a/Sources/FluidAudio/Diarizer/Offline/Clustering/KMeansClustering.swift
+++ b/Sources/FluidAudio/Diarizer/Offline/Clustering/KMeansClustering.swift
@@ -61,7 +61,7 @@ struct KMeansClustering {
             return (Array(0..<count), embeddings)
         }
 
-        var rng = SeededRNG(seed: seed ?? UInt64.random(in: 0...UInt64.max))
+        var rng = SeededRNG(seed: seed ?? 0)
         let normalized = normalizeEmbeddings(embeddings)
         var centroids = initializeCentroids(from: normalized, k: k, rng: &rng)
         var assignments = [Int](repeating: 0, count: count)
@@ -89,6 +89,45 @@ struct KMeansClustering {
         return (assignments, centroids)
     }
 
+    /// Runs K-Means `nInit` times with deterministic seeds (baseSeed, baseSeed+1, …) and returns
+    /// the lowest-inertia result (sklearn-style `n_init`). Single random-seed init is both
+    /// non-deterministic and fragile: it collapses small/boundary speakers run-to-run (on the ICT
+    /// 4-spk clip, speaker Komaki swung kept↔collapse, ~10%↔~30% observed). Best-of-N with fixed
+    /// seeds makes the re-clustering both reproducible and robust.
+    static func clusterWithCentroidsNInit(
+        embeddings: [[Double]],
+        numClusters: Int,
+        maxIterations: Int = 300,
+        nInit: Int = 10,
+        baseSeed: UInt64 = 0
+    ) -> (clusters: [Int], centroids: [[Double]]) {
+        guard embeddings.count > numClusters, nInit > 1 else {  // otherwise: one run, seed = baseSeed
+            return clusterWithCentroids(
+                embeddings: embeddings, numClusters: numClusters,
+                maxIterations: maxIterations, seed: baseSeed)
+        }
+        let normalized = normalizeEmbeddings(embeddings)
+        var best: (clusters: [Int], centroids: [[Double]])?
+        var bestInertia = Double.greatestFiniteMagnitude
+        for i in 0..<nInit {
+            let result = clusterWithCentroids(
+                embeddings: embeddings, numClusters: numClusters,
+                maxIterations: maxIterations, seed: baseSeed &+ UInt64(i))  // &+ wraps on overflow
+            // inertia = Σ ‖normalized(emb) − assignedCentroid‖² (centroids are in normalized space too)
+            var inertia: Double = 0
+            for (idx, c) in result.clusters.enumerated() where c >= 0 && c < result.centroids.count {
+                inertia += euclideanDistanceSquared(normalized[idx], result.centroids[c])
+            }
+            if inertia < bestInertia {  // strict <: on ties the earliest seed is kept
+                bestInertia = inertia
+                best = result
+            }
+        }
+        return best ?? clusterWithCentroids(  // fallback when no run beat the initial bestInertia
+            embeddings: embeddings, numClusters: numClusters,
+            maxIterations: maxIterations, seed: baseSeed)
+    }
+
     private static func normalizeEmbeddings(_ embeddings: [[Double]]) -> [[Double]] {
         embeddings.map { embedding in
             var norm: Double = 0
diff --git a/Sources/FluidAudio/Diarizer/Offline/Clustering/VBxClustering.swift b/Sources/FluidAudio/Diarizer/Offline/Clustering/VBxClustering.swift
index dc9232417..1ec518f5f 100644
--- a/Sources/FluidAudio/Diarizer/Offline/Clustering/VBxClustering.swift
+++ b/Sources/FluidAudio/Diarizer/Offline/Clustering/VBxClustering.swift
@@ -704,10 +704,14 @@ struct VBxClustering {
             "Speaker count \(detectedCount) outside bounds [\(constraints.minSpeakers), \(constraints.maxSpeakers)]; re-clustering to \(targetCount)"
         )
 
-        let (kmeansClusters, centroids) = KMeansClustering.clusterWithCentroids(
+        // Pick the lowest-inertia run out of n_init=10 deterministic inits (sklearn-style). A single
+        // random init collapses fragile speakers non-deterministically (seen on ICT/Komaki: ~10%↔~30%).
+        let (kmeansClusters, centroids) = KMeansClustering.clusterWithCentroidsNInit(
             embeddings: trainingEmbeddings,
             numClusters: targetCount,
-            maxIterations: 100
+            maxIterations: 100,
+            nInit: 10,
+            baseSeed: 0
         )
 
         return VBxOutput(