From 65d9cd79fd8911ed2a09b8619afafab215283eed Mon Sep 17 00:00:00 2001 From: Travis Date: Mon, 6 Apr 2026 15:00:36 -0700 Subject: [PATCH 1/2] Track optimal_accuracy time in stats; pre-allocate null_two scratch buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit optimal_accuracy was timed in AlignStageStats but never propagated to the global ThreadedTimed stats, silently inflating the "misc" bucket (~14% of alignment time on a 1000-HMM run). Add ThreadedTimed::OptimalAccuracy and wire it up in Stats::add_sample. Introduce NullTwoScratch to hold the four Vec allocations that null_two_score previously made on every call. The scratch is held on DefaultAlignStage and reused across calls. Buffer sizing: the largest buffers are core_posteriors (target_length + 1 floats) and match_sums/insert_sums (profile_length + 1 floats each). In the worst case — say, titin at ~34k AA against a large Pfam model — these reach roughly 160 KB total per thread. In practice they'll stabilize quickly at the high-water mark for the run. Two properties are maintained on every call via NullTwoScratch::prepare: - Allocate only when a new high-water mark is reached; otherwise reuse the existing allocation with no heap traffic. - Zero only the prefix of each buffer that the current call will touch, not the entire capacity. A 400-AA sequence after a 34k-AA sequence writes 401 zeros, not 34,000. 
Changes implemented with Claude --- libnail/src/align/mod.rs | 3 ++- libnail/src/align/scoring.rs | 42 ++++++++++++++++++++++++++++---- nail/src/pipeline/align_stage.rs | 4 ++- nail/src/stats.rs | 3 +++ 4 files changed, 45 insertions(+), 7 deletions(-) diff --git a/libnail/src/align/mod.rs b/libnail/src/align/mod.rs index d95325e..bfa2937 100644 --- a/libnail/src/align/mod.rs +++ b/libnail/src/align/mod.rs @@ -17,7 +17,8 @@ pub use optimal_accuracy::optimal_accuracy; mod scoring; pub use scoring::{ - cloud_score, e_value, null_one_score, null_two_score, p_value, Bits, Nats, Score, + cloud_score, e_value, null_one_score, null_two_score, p_value, Bits, Nats, NullTwoScratch, + Score, }; mod traceback; diff --git a/libnail/src/align/scoring.rs b/libnail/src/align/scoring.rs index e71cc75..bc6ff86 100644 --- a/libnail/src/align/scoring.rs +++ b/libnail/src/align/scoring.rs @@ -217,18 +217,50 @@ pub fn null_one_score(target_length: usize) -> Nats { Nats(target_length as f32 * p_null_loop.ln() + p_null_exit.ln()) } +/// Reusable scratch buffers for `null_two_score`. Allocate once per thread via +/// `NullTwoScratch::default()` and pass `&mut scratch` on every call to avoid +/// per-call heap allocation. 
+#[derive(Default, Clone)] +pub struct NullTwoScratch { + pub expected_prob_ratios: Vec<f32>, + pub match_sums: Vec<f32>, + pub insert_sums: Vec<f32>, + pub core_posteriors: Vec<f32>, +} + +impl NullTwoScratch { + fn prepare(&mut self, profile_length: usize, target_length: usize) { + let ratio_len = Profile::MAX_DEGENERATE_ALPHABET_SIZE; + let profile_len = profile_length + 1; + let target_len = target_length + 1; + + self.expected_prob_ratios.clear(); + self.expected_prob_ratios.resize(ratio_len, 0.0); + + self.match_sums.clear(); + self.match_sums.resize(profile_len, 0.0); + + self.insert_sums.clear(); + self.insert_sums.resize(profile_len, 0.0); + + self.core_posteriors.clear(); + self.core_posteriors.resize(target_len, 0.0); + } +} + /// Compute the null two score adjustment: the composition bias. pub fn null_two_score( posterior_matrix: &impl DpMatrix, profile: &Profile, target: &Sequence, row_bounds: &RowBounds, + scratch: &mut NullTwoScratch, ) -> Nats { - // TODO: prevent these allocations? - let mut expected_prob_ratios: Vec<f32> = vec![0.0; Profile::MAX_DEGENERATE_ALPHABET_SIZE]; - let mut match_sums: Vec<f32> = vec![0.0; profile.length + 1]; - let mut insert_sums: Vec<f32> = vec![0.0; profile.length + 1]; - let mut core_posteriors: Vec<f32> = vec![0.0; target.length + 1]; + scratch.prepare(profile.length, target.length); + let expected_prob_ratios = &mut scratch.expected_prob_ratios; + let match_sums = &mut scratch.match_sums; + let insert_sums = &mut scratch.insert_sums; + let core_posteriors = &mut scratch.core_posteriors; let mut core_state_sum: f32 = 0.0; // what: for each position in the model, take the sum of diff --git a/nail/src/pipeline/align_stage.rs b/nail/src/pipeline/align_stage.rs index 01c7478..b2732c9 100644 --- a/nail/src/pipeline/align_stage.rs +++ b/nail/src/pipeline/align_stage.rs @@ -6,7 +6,7 @@ use libnail::{ align::{ backward, forward, null_one_score, null_two_score, optimal_accuracy, p_value, posterior, structs::{Alignment, AlignmentBuilder, DpMatrixSparse, RowBounds, 
Trace}, - traceback, Bits, + traceback, Bits, NullTwoScratch, }, structs::{Profile, Sequence}, }; @@ -72,6 +72,7 @@ pub struct DefaultAlignStage { backward_matrix: DpMatrixSparse, posterior_matrix: DpMatrixSparse, optimal_matrix: DpMatrixSparse, + null_two_scratch: NullTwoScratch, forward_p_value_threshold: f64, target_count: usize, config: AlignConfig, @@ -184,6 +185,7 @@ impl AlignStage for DefaultAlignStage { profile, target, bounds, + &mut self.null_two_scratch, )); stats.null_two_time(now.elapsed()); score diff --git a/nail/src/stats.rs b/nail/src/stats.rs index 5aba9cc..29ec6e1 100644 --- a/nail/src/stats.rs +++ b/nail/src/stats.rs @@ -97,6 +97,7 @@ pub enum ThreadedTimed { Forward, Backward, Posterior, + OptimalAccuracy, Traceback, NullTwo, OutputWrite, @@ -115,6 +116,7 @@ impl Debug for ThreadedTimed { ThreadedTimed::Forward => "forward", ThreadedTimed::Backward => "backward", ThreadedTimed::Posterior => "posterior", + ThreadedTimed::OptimalAccuracy => "optimal accuracy", ThreadedTimed::Traceback => "traceback", ThreadedTimed::NullTwo => "null two", }; @@ -264,6 +266,7 @@ impl Stats { self.add_threaded_time(ThreadedTimed::Backward, stats.backward_time); self.add_threaded_time(ThreadedTimed::Posterior, stats.posterior_time); + self.add_threaded_time(ThreadedTimed::OptimalAccuracy, stats.optimal_accuracy_time); self.add_threaded_time(ThreadedTimed::Traceback, stats.traceback_time); self.add_threaded_time(ThreadedTimed::NullTwo, stats.null_two_time); } From 2aa76d5d99ccb91fb5ca9c1883dc56a13320bb71 Mon Sep 17 00:00:00 2001 From: Travis Date: Mon, 6 Apr 2026 15:21:13 -0700 Subject: [PATCH 2/2] NullTwoScratch - add a forgotten file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I forgot to add this file. This replaces clear()+resize() with an explicit zero_prefix helper that makes the memory strategy unambiguous: - If the buffer is smaller than needed, resize (allocating only at a new high-water mark). 
- Then zero exactly the prefix the current call will touch; resize() alone only initializes the newly appended tail. The clear()+resize() idiom was already correct — it wrote exactly n zeros regardless of capacity — but the intent wasn't obvious from reading it. --- libnail/src/align/scoring.rs | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/libnail/src/align/scoring.rs b/libnail/src/align/scoring.rs index bc6ff86..e02c52e 100644 --- a/libnail/src/align/scoring.rs +++ b/libnail/src/align/scoring.rs @@ -230,21 +230,23 @@ pub struct NullTwoScratch { impl NullTwoScratch { fn prepare(&mut self, profile_length: usize, target_length: usize) { - let ratio_len = Profile::MAX_DEGENERATE_ALPHABET_SIZE; - let profile_len = profile_length + 1; - let target_len = target_length + 1; - - self.expected_prob_ratios.clear(); - self.expected_prob_ratios.resize(ratio_len, 0.0); - - self.match_sums.clear(); - self.match_sums.resize(profile_len, 0.0); - - self.insert_sums.clear(); - self.insert_sums.resize(profile_len, 0.0); - - self.core_posteriors.clear(); - self.core_posteriors.resize(target_len, 0.0); + // Grow each buffer if the current call needs more capacity than we've + // seen before; otherwise reuse the existing allocation. Either way, + // zero only the prefix that this call will actually read/write — + // leaving stale values beyond the active range is intentional. + Self::zero_prefix(&mut self.expected_prob_ratios, Profile::MAX_DEGENERATE_ALPHABET_SIZE); + Self::zero_prefix(&mut self.match_sums, profile_length + 1); + Self::zero_prefix(&mut self.insert_sums, profile_length + 1); + Self::zero_prefix(&mut self.core_posteriors, target_length + 1); + } + + #[inline] + fn zero_prefix(v: &mut Vec<f32>, n: usize) { + if v.len() < n { + v.resize(n, 0.0); // allocates only when a new high-water mark is reached + } + // resize() zero-fills only the appended tail, so always clear the reused prefix too + v[..n].fill(0.0); // only touches the n elements this call will use } }