From b3c2640721c37870bd521e5a331db143a7ec35ce Mon Sep 17 00:00:00 2001
From: Hiroki Kobayashi
Date: Wed, 24 Jun 2026 17:37:07 +0900
Subject: [PATCH] fix(diar): deterministic, robust K-Means re-clustering via n_init (best-of-N)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The offline VBx speaker-count adjustment (re-clustering detected clusters down to the constrained count) called KMeansClustering with a random seed and a single initialization. This is both non-deterministic and fragile: small/boundary speakers collapse run-to-run (observed on a 4-speaker meeting clip, cause-(ii) swinging ~10%↔~30% across runs; the smallest speaker's recall flips 80%↔0%).

- KMeansClustering: default unseeded fallback now uses a fixed seed (0) instead of UInt64.random; add clusterWithCentroidsNInit which runs N deterministic initializations (seeds base..base+N-1) and returns the lowest-inertia result (sklearn-style n_init).
- VBxClustering: the speaker-count re-clustering now uses n_init=10, baseSeed=0.

Result: re-clustering is fully deterministic and robustly keeps fragile speakers (the 4-speaker clip now scores ~9.2% consistently across 5+ runs and on-device).

Co-Authored-By: Claude Opus 4.8
---
 .../Offline/Clustering/KMeansClustering.swift | 41 ++++++++++++++++++-
 .../Offline/Clustering/VBxClustering.swift | 8 +++-
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/Sources/FluidAudio/Diarizer/Offline/Clustering/KMeansClustering.swift b/Sources/FluidAudio/Diarizer/Offline/Clustering/KMeansClustering.swift
index a0615e119..296c03624 100644
--- a/Sources/FluidAudio/Diarizer/Offline/Clustering/KMeansClustering.swift
+++ b/Sources/FluidAudio/Diarizer/Offline/Clustering/KMeansClustering.swift
@@ -61,7 +61,7 @@ struct KMeansClustering {
 return (Array(0..
(clusters: [Int], centroids: [[Double]]) { + guard embeddings.count > numClusters, nInit > 1 else { + return clusterWithCentroids( + embeddings: embeddings, numClusters: numClusters, + maxIterations: maxIterations, seed: baseSeed) + } + let normalized = normalizeEmbeddings(embeddings) + var best: (clusters: [Int], centroids: [[Double]])? + var bestInertia = Double.greatestFiniteMagnitude + for i in 0..= 0 && c < result.centroids.count { + inertia += euclideanDistanceSquared(normalized[idx], result.centroids[c]) + } + if inertia < bestInertia { + bestInertia = inertia + best = result + } + } + return best ?? clusterWithCentroids( + embeddings: embeddings, numClusters: numClusters, + maxIterations: maxIterations, seed: baseSeed) + } + private static func normalizeEmbeddings(_ embeddings: [[Double]]) -> [[Double]] { embeddings.map { embedding in var norm: Double = 0 diff --git a/Sources/FluidAudio/Diarizer/Offline/Clustering/VBxClustering.swift b/Sources/FluidAudio/Diarizer/Offline/Clustering/VBxClustering.swift index dc9232417..1ec518f5f 100644 --- a/Sources/FluidAudio/Diarizer/Offline/Clustering/VBxClustering.swift +++ b/Sources/FluidAudio/Diarizer/Offline/Clustering/VBxClustering.swift @@ -704,10 +704,14 @@ struct VBxClustering { "Speaker count \(detectedCount) outside bounds [\(constraints.minSpeakers), \(constraints.maxSpeakers)]; re-clustering to \(targetCount)" ) - let (kmeansClusters, centroids) = KMeansClustering.clusterWithCentroids( + // n_init=10 の決定的初期化から最小 inertia を採用(sklearn 流)。単一ランダム初期化は + // 脆い話者を非決定的に collapse させる(ICT 小牧で実証、~10%↔~30% の揺れ)。 + let (kmeansClusters, centroids) = KMeansClustering.clusterWithCentroidsNInit( embeddings: trainingEmbeddings, numClusters: targetCount, - maxIterations: 100 + maxIterations: 100, + nInit: 10, + baseSeed: 0 ) return VBxOutput(