From 476ce2942ecc7d1b08774f5d8f061b301b600036 Mon Sep 17 00:00:00 2001 From: Lars Francke Date: Thu, 12 Mar 2026 09:03:11 +0100 Subject: [PATCH 1/2] fix: add wall-clock certificate expiry check to webhook TLS rotation The rotation interval uses tokio's monotonic clock, but certificate validity uses wall-clock time. When these diverge (hibernation, VM migration, cgroup freezing), the certificate can expire before rotation. Add a periodic wall-clock check (every 5 minutes) that compares SystemTime::now() against the certificate's not_after field and triggers early rotation if the cert is within 4 hours of expiry. Fixes: https://github.com/stackabletech/operator-rs/issues/1174 --- .../src/tls/cert_resolver.rs | 34 +++++++++++++++---- crates/stackable-webhook/src/tls/mod.rs | 30 ++++++++++++++++ 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/crates/stackable-webhook/src/tls/cert_resolver.rs b/crates/stackable-webhook/src/tls/cert_resolver.rs index d355b956c..ff0beb365 100644 --- a/crates/stackable-webhook/src/tls/cert_resolver.rs +++ b/crates/stackable-webhook/src/tls/cert_resolver.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{sync::Arc, time::SystemTime}; use arc_swap::ArcSwap; use snafu::{OptionExt, ResultExt, Snafu}; @@ -57,6 +57,9 @@ pub struct CertificateResolver { /// Using a [`ArcSwap`] (over e.g. [`tokio::sync::RwLock`]), so that we can easily /// (and performant) bridge between async write and sync read. current_certified_key: ArcSwap, + /// The wall-clock expiry time (`not_after`) of the current certificate. + /// Used to detect clock drift between monotonic and wall-clock time. + current_not_after: ArcSwap, subject_alterative_dns_names: Arc>, certificate_tx: mpsc::Sender, @@ -68,7 +71,7 @@ impl CertificateResolver { certificate_tx: mpsc::Sender, ) -> Result { let subject_alterative_dns_names = Arc::new(subject_alterative_dns_names); - let certified_key = Self::generate_new_certificate_inner( + let (certified_key, not_after) = Self::generate_new_certificate_inner( subject_alterative_dns_names.clone(), &certificate_tx, ) @@ -77,20 +80,37 @@ impl CertificateResolver { Ok(Self { subject_alterative_dns_names, current_certified_key: ArcSwap::new(certified_key), + current_not_after: ArcSwap::new(Arc::new(not_after)), certificate_tx, }) } pub async fn rotate_certificate(&self) -> Result<()> { - let certified_key = self.generate_new_certificate().await?; + let (certified_key, not_after) = self.generate_new_certificate().await?; // TODO: Sign the new cert somehow with the old cert. See https://github.com/stackabletech/decisions/issues/56 self.current_certified_key.store(certified_key); + self.current_not_after.store(Arc::new(not_after)); Ok(()) } - async fn generate_new_certificate(&self) -> Result> { + /// Returns `true` if the current certificate is expired or will expire + /// within the given `buffer` duration according to wall-clock time. + /// + /// This catches cases where the monotonic timer (used by `tokio::time`) + /// has drifted from wall-clock time, e.g. due to system hibernation. + pub fn needs_rotation(&self, buffer: std::time::Duration) -> bool { + let not_after = **self.current_not_after.load(); + // If subtraction underflows (buffer > time since epoch), fall back to + // UNIX_EPOCH so that the comparison always triggers rotation. + let deadline = not_after + .checked_sub(buffer) + .unwrap_or(SystemTime::UNIX_EPOCH); + SystemTime::now() >= deadline + } + + async fn generate_new_certificate(&self) -> Result<(Arc, SystemTime)> { let subject_alterative_dns_names = self.subject_alterative_dns_names.clone(); Self::generate_new_certificate_inner(subject_alterative_dns_names, &self.certificate_tx) .await @@ -106,7 +126,7 @@ impl CertificateResolver { async fn generate_new_certificate_inner( subject_alterative_dns_names: Arc>, certificate_tx: &mpsc::Sender, - ) -> Result> { + ) -> Result<(Arc, SystemTime)> { // The certificate generations can take a while, so we use `spawn_blocking` let (cert, certified_key) = tokio::task::spawn_blocking(move || { let tls_provider = @@ -144,12 +164,14 @@ impl CertificateResolver { .await .context(TokioSpawnBlockingSnafu)??; + let not_after = cert.tbs_certificate.validity.not_after.to_system_time(); + certificate_tx .send(cert) .await .map_err(|_err| CertificateResolverError::SendCertificateToChannel)?; - Ok(certified_key) + Ok((certified_key, not_after)) } } diff --git a/crates/stackable-webhook/src/tls/mod.rs b/crates/stackable-webhook/src/tls/mod.rs index 3c5d36482..f5c4eee07 100644 --- a/crates/stackable-webhook/src/tls/mod.rs +++ b/crates/stackable-webhook/src/tls/mod.rs @@ -42,6 +42,14 @@ pub const WEBHOOK_CA_LIFETIME: Duration = Duration::from_hours_unchecked(24); pub const WEBHOOK_CERTIFICATE_LIFETIME: Duration = Duration::from_hours_unchecked(24); pub const WEBHOOK_CERTIFICATE_ROTATION_INTERVAL: Duration = Duration::from_hours_unchecked(20); +/// How often to check wall-clock time for certificate expiry (5 minutes). +/// This catches clock drift from system hibernation, VM migration, etc. +const WALL_CLOCK_CHECK_INTERVAL: Duration = Duration::from_minutes_unchecked(5); + +/// Rotate the certificate when it is within this buffer of expiry according +/// to wall-clock time (4 hours before the 24h certificate expires). +const WALL_CLOCK_EXPIRY_BUFFER: Duration = Duration::from_hours_unchecked(4); + pub type Result = std::result::Result; #[derive(Debug, Snafu)] @@ -156,6 +164,13 @@ impl TlsServer { let start = tokio::time::Instant::now() + *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL; let mut interval = tokio::time::interval_at(start, *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL); + // A separate, shorter interval to check wall-clock time against the + // certificate's not_after. This catches monotonic/wall-clock drift + // caused by hibernation, VM migration, cgroup freezing, etc. + let wall_clock_check_start = tokio::time::Instant::now() + *WALL_CLOCK_CHECK_INTERVAL; + let mut wall_clock_check = + tokio::time::interval_at(wall_clock_check_start, *WALL_CLOCK_CHECK_INTERVAL); + let tls_acceptor = TlsAcceptor::from(Arc::new(config)); let tcp_listener = TcpListener::bind(socket_addr) .await @@ -207,6 +222,21 @@ impl TlsServer { .context(RotateCertificateSnafu)? } + // Wall-clock check: detect if the certificate is near expiry + // even though the monotonic rotation interval hasn't fired yet. + // This handles clock drift from hibernation, VM migration, etc. + _ = wall_clock_check.tick() => { + if cert_resolver.needs_rotation(*WALL_CLOCK_EXPIRY_BUFFER) { + tracing::info!("wall-clock check detected certificate near expiry, rotating early"); + cert_resolver + .rotate_certificate() + .await + .context(RotateCertificateSnafu)?; + // Reset the monotonic interval so we don't double-rotate + interval.reset(); + } + } + // This is cancellation-safe. If cancelled, no new connections are accepted. tcp_connection = tcp_listener.accept() => { let (tcp_stream, remote_addr) = match tcp_connection { From 003a628c41b8d64983cb829a6325ba434bbf9ef0 Mon Sep 17 00:00:00 2001 From: Lars Francke Date: Thu, 12 Mar 2026 11:09:41 +0100 Subject: [PATCH 2/2] refactor: replace dual rotation timers with single wall-clock check Remove the monotonic 20h rotation interval and the supplementary wall-clock check. Instead, use a single periodic check (every 5 min) that compares wall-clock time against the certificate's not_after. Also derive the expiry buffer from the certificate lifetime (1/6) so it scales if the lifetime ever changes, and add comments documenting the relationship between lifetime and check interval. --- crates/stackable-webhook/src/tls/mod.rs | 69 +++++++++++-------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/crates/stackable-webhook/src/tls/mod.rs b/crates/stackable-webhook/src/tls/mod.rs index f5c4eee07..da61b1764 100644 --- a/crates/stackable-webhook/src/tls/mod.rs +++ b/crates/stackable-webhook/src/tls/mod.rs @@ -39,16 +39,25 @@ use crate::{ mod cert_resolver; pub const WEBHOOK_CA_LIFETIME: Duration = Duration::from_hours_unchecked(24); -pub const WEBHOOK_CERTIFICATE_LIFETIME: Duration = Duration::from_hours_unchecked(24); -pub const WEBHOOK_CERTIFICATE_ROTATION_INTERVAL: Duration = Duration::from_hours_unchecked(20); -/// How often to check wall-clock time for certificate expiry (5 minutes). -/// This catches clock drift from system hibernation, VM migration, etc. -const WALL_CLOCK_CHECK_INTERVAL: Duration = Duration::from_minutes_unchecked(5); - -/// Rotate the certificate when it is within this buffer of expiry according -/// to wall-clock time (4 hours before the 24h certificate expires). -const WALL_CLOCK_EXPIRY_BUFFER: Duration = Duration::from_hours_unchecked(4); +/// The wall-clock lifetime of generated webhook certificates. If this is ever +/// reduced, ensure it stays well above [`CERTIFICATE_ROTATION_CHECK_INTERVAL`] +/// (currently 5 minutes), otherwise the certificate could expire between checks. +const WEBHOOK_CERTIFICATE_LIFETIME_HOURS: u64 = 24; +pub const WEBHOOK_CERTIFICATE_LIFETIME: Duration = + Duration::from_hours_unchecked(WEBHOOK_CERTIFICATE_LIFETIME_HOURS); + +/// How often to check whether the certificate needs rotation. This is +/// intentionally independent of the certificate lifetime — it controls how +/// quickly we detect wall-clock drift (from hibernation, VM migration, etc.), +/// not how long the certificate lives. +const CERTIFICATE_ROTATION_CHECK_INTERVAL: Duration = Duration::from_minutes_unchecked(5); + +/// Rotate the certificate when less than 1/6 of its lifetime remains +/// (4 hours for the current 24h lifetime). Derived from +/// [`WEBHOOK_CERTIFICATE_LIFETIME`] so it scales if the lifetime changes. +const CERTIFICATE_EXPIRY_BUFFER: Duration = + Duration::from_minutes_unchecked(WEBHOOK_CERTIFICATE_LIFETIME_HOURS * 60 / 6); pub type Result = std::result::Result; @@ -161,15 +170,12 @@ impl TlsServer { router, } = self; - let start = tokio::time::Instant::now() + *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL; - let mut interval = tokio::time::interval_at(start, *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL); - - // A separate, shorter interval to check wall-clock time against the - // certificate's not_after. This catches monotonic/wall-clock drift - // caused by hibernation, VM migration, cgroup freezing, etc. - let wall_clock_check_start = tokio::time::Instant::now() + *WALL_CLOCK_CHECK_INTERVAL; - let mut wall_clock_check = - tokio::time::interval_at(wall_clock_check_start, *WALL_CLOCK_CHECK_INTERVAL); + // Periodically check whether the certificate needs rotation based on + // wall-clock time. This avoids the monotonic vs wall-clock drift problem + // that can occur during hibernation, VM migration, or cgroup freezing. + let check_start = tokio::time::Instant::now() + *CERTIFICATE_ROTATION_CHECK_INTERVAL; + let mut rotation_check = + tokio::time::interval_at(check_start, *CERTIFICATE_ROTATION_CHECK_INTERVAL); let tls_acceptor = TlsAcceptor::from(Arc::new(config)); let tcp_listener = TcpListener::bind(socket_addr) @@ -198,11 +204,10 @@ impl TlsServer { loop { let tls_acceptor = tls_acceptor.clone(); - // Wait for either a new TCP connection or the certificate rotation interval tick tokio::select! { // We opt for a biased execution of arms to make sure we always check if a - // shutdown signal was received or the certificate needs rotation based on the - // interval. This ensures, we always use a valid certificate for the TLS connection. + // shutdown signal was received or the certificate needs rotation before + // accepting new connections. biased; // Once a shutdown signal is received (this future becomes `Poll::Ready`), break out @@ -213,27 +218,15 @@ impl TlsServer { break; } - // This is cancellation-safe. If this branch is cancelled, the tick is NOT consumed. - // As such, we will not miss rotating the certificate. - _ = interval.tick() => { - cert_resolver - .rotate_certificate() - .await - .context(RotateCertificateSnafu)? - } - - // Wall-clock check: detect if the certificate is near expiry - // even though the monotonic rotation interval hasn't fired yet. - // This handles clock drift from hibernation, VM migration, etc. - _ = wall_clock_check.tick() => { - if cert_resolver.needs_rotation(*WALL_CLOCK_EXPIRY_BUFFER) { - tracing::info!("wall-clock check detected certificate near expiry, rotating early"); + // Check wall-clock time to decide if the certificate needs rotation. + // This is cancellation-safe: if cancelled, the tick is NOT consumed. + _ = rotation_check.tick() => { + if cert_resolver.needs_rotation(*CERTIFICATE_EXPIRY_BUFFER) { + tracing::info!("certificate approaching expiry, rotating"); cert_resolver .rotate_certificate() .await .context(RotateCertificateSnafu)?; - // Reset the monotonic interval so we don't double-rotate - interval.reset(); } }