From 49133cdc221aeb1cd9c556dd915370765a97bd02 Mon Sep 17 00:00:00 2001
From: flashboy
Date: Wed, 28 Jan 2026 11:28:27 -0500
Subject: [PATCH 1/3] Add restart policy + exit on consecutive health failures

---
 backend/src/lib/server.rs | 32 +++++++++++++++++++++++++++++---
 docker-compose.yml        |  1 +
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/backend/src/lib/server.rs b/backend/src/lib/server.rs
index 8aa9935..a134f8b 100644
--- a/backend/src/lib/server.rs
+++ b/backend/src/lib/server.rs
@@ -29,6 +29,12 @@ use super::serializable_event::SerializableEventData;
 /// Stores the Unix timestamp (in seconds) of the last event received from the ring
 type LastEventTime = Arc<AtomicU64>;
 
+/// Tracks consecutive unhealthy health checks
+type ConsecutiveUnhealthyCount = Arc<AtomicU64>;
+
+/// Number of consecutive unhealthy checks before triggering process exit
+const UNHEALTHY_THRESHOLD: u64 = 3;
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct TopAccessesData {
     pub account: Vec<Vec<String>>,
@@ -224,7 +230,7 @@ async fn run_event_forwarder_task(
         .duration_since(UNIX_EPOCH)
         .unwrap_or_default()
         .as_secs();
-    last_event_time.store(now_secs, Ordering::Relaxed);
+    // last_event_time.store(now_secs, Ordering::Relaxed);
 
     // Track txn_hash from TxnHeaderStart events
     if let EventName::TxnHeaderStart = event_data.event_name {
@@ -362,6 +368,7 @@ async fn handle_connection(
 
 async fn health_handler(
     last_event_time: LastEventTime,
+    consecutive_unhealthy: ConsecutiveUnhealthyCount,
 ) -> Result<Response<Full<Bytes>>, hyper::Error> {
     let now_secs = SystemTime::now()
         .duration_since(UNIX_EPOCH)
@@ -371,10 +378,25 @@ async fn health_handler(
     let last_event = last_event_time.load(Ordering::Relaxed);
     let is_healthy = now_secs.saturating_sub(last_event) <= 10;
 
     let body = if is_healthy {
+        consecutive_unhealthy.store(0, Ordering::Relaxed);
         info!("Health check passed");
         r#"{"success": true}"#
     } else {
-        warn!("Health check failed - last event time: {} seconds ago", now_secs.saturating_sub(last_event));
+        let count = consecutive_unhealthy.fetch_add(1, Ordering::Relaxed) + 1;
+        warn!(
+            "Health check failed - last event time: {} seconds ago (consecutive failures: {})",
+            now_secs.saturating_sub(last_event),
+            count
+        );
+
+        if count >= UNHEALTHY_THRESHOLD {
+            error!(
+                "Health check failed {} consecutive times, exiting to trigger restart",
+                count
+            );
+            std::process::exit(1);
+        }
+
         r#"{"success": false}"#
     };
@@ -392,15 +414,19 @@ async fn run_health_server(
     let listener = tokio::net::TcpListener::bind(health_addr).await?;
     info!("Health server listening on: {}", health_addr);
 
+    let consecutive_unhealthy: ConsecutiveUnhealthyCount = Arc::new(AtomicU64::new(0));
+
     loop {
         let (stream, _) = listener.accept().await?;
         let io = hyper_util::rt::TokioIo::new(stream);
         let last_event_time = last_event_time.clone();
+        let consecutive_unhealthy = consecutive_unhealthy.clone();
 
         tokio::spawn(async move {
             let service = service_fn(move |_req: Request<hyper::body::Incoming>| {
                 let last_event_time = last_event_time.clone();
-                async move { health_handler(last_event_time).await }
+                let consecutive_unhealthy = consecutive_unhealthy.clone();
+                async move { health_handler(last_event_time, consecutive_unhealthy).await }
             });
 
             if let Err(e) = http1::Builder::new().serve_connection(io, service).await {
diff --git a/docker-compose.yml b/docker-compose.yml
index 30565fb..32a424e 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,6 +7,7 @@ services:
       dockerfile: Dockerfile
     container_name: backend
     network_mode: host
+    restart: always
     environment:
       - RUST_LOG=info
     volumes:
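Note on PATCH 1/3: `restart: always` only takes effect once the process exits, and the exit path above lives inside the request handler, so something has to poll the health endpoint regularly for the failure counter to ever advance. This compose file does not add such a poller. Below is a minimal sketch of a compose-level healthcheck that would generate that traffic; the port (8080) is an assumption, since the actual bind address of the health server is configured elsewhere and not shown in this diff, and it assumes curl is present in the image:

    healthcheck:
      # Port 8080 is an assumption; the handler ignores the request path,
      # so any path works. localhost is correct under network_mode: host.
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 10s
      timeout: 3s
      retries: 3

Docker's healthcheck never restarts a container on its own; in this design its only role is to supply the periodic requests that let the handler count failures and call std::process::exit(1), after which restart: always brings the container back.
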
From 9a63d0732118db422ba735809b9793a7fa93152c Mon Sep 17 00:00:00 2001
From: flashboy
Date: Wed, 28 Jan 2026 11:33:59 -0500
Subject: [PATCH 2/3] store last event time

---
 backend/src/lib/server.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/lib/server.rs b/backend/src/lib/server.rs
index a134f8b..cdb35c9 100644
--- a/backend/src/lib/server.rs
+++ b/backend/src/lib/server.rs
@@ -230,7 +230,7 @@ async fn run_event_forwarder_task(
         .duration_since(UNIX_EPOCH)
         .unwrap_or_default()
         .as_secs();
-    // last_event_time.store(now_secs, Ordering::Relaxed);
+    last_event_time.store(now_secs, Ordering::Relaxed);
 
     // Track txn_hash from TxnHeaderStart events
     if let EventName::TxnHeaderStart = event_data.event_name {
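Note on PATCH 2/3: this restores the store that PATCH 1 left commented out. Without it, last_event_time never advances, every health check eventually reports a huge gap, and the service exit-loops under restart: always. A self-contained sketch of the shared-timestamp pattern the handler depends on, with simplified names rather than the actual module layout:

    use std::sync::Arc;
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    fn now_secs() -> u64 {
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs()
    }

    fn main() {
        // Shared between the event forwarder (writer) and the health handler (reader).
        let last_event_time = Arc::new(AtomicU64::new(now_secs()));

        // Writer side: record the arrival time of each event.
        last_event_time.store(now_secs(), Ordering::Relaxed);

        // Reader side: how stale is the event stream?
        let age = now_secs().saturating_sub(last_event_time.load(Ordering::Relaxed));
        println!("seconds since last event: {}", age);
    }

Relaxed ordering is adequate here: the timestamp is a single independent value and no other memory is synchronized through it.
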
From 37f7a4abb5850bb6f7db57cb1c86e45c6412772f Mon Sep 17 00:00:00 2001
From: flashboy
Date: Wed, 28 Jan 2026 11:39:32 -0500
Subject: [PATCH 3/3] change to 30 second interval instead of 3 failed attempts

---
 backend/src/lib/server.rs | 45 ++++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/backend/src/lib/server.rs b/backend/src/lib/server.rs
index cdb35c9..a2c470c 100644
--- a/backend/src/lib/server.rs
+++ b/backend/src/lib/server.rs
@@ -29,11 +29,11 @@ use super::serializable_event::SerializableEventData;
 /// Stores the Unix timestamp (in seconds) of the last event received from the ring
 type LastEventTime = Arc<AtomicU64>;
 
-/// Tracks consecutive unhealthy health checks
-type ConsecutiveUnhealthyCount = Arc<AtomicU64>;
+/// Seconds without events before health check reports unhealthy
+const UNHEALTHY_THRESHOLD_SECS: u64 = 10;
 
-/// Number of consecutive unhealthy checks before triggering process exit
-const UNHEALTHY_THRESHOLD: u64 = 3;
+/// Seconds without events before triggering process exit
+const EXIT_THRESHOLD_SECS: u64 = 30;
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct TopAccessesData {
@@ -368,35 +368,34 @@ async fn handle_connection(
 
 async fn health_handler(
     last_event_time: LastEventTime,
-    consecutive_unhealthy: ConsecutiveUnhealthyCount,
 ) -> Result<Response<Full<Bytes>>, hyper::Error> {
     let now_secs = SystemTime::now()
         .duration_since(UNIX_EPOCH)
         .unwrap_or_default()
         .as_secs();
     let last_event = last_event_time.load(Ordering::Relaxed);
-    let is_healthy = now_secs.saturating_sub(last_event) <= 10;
+    let time_since_last_event = now_secs.saturating_sub(last_event);
+
+    // Exit process if no events received for EXIT_THRESHOLD_SECS
+    if time_since_last_event >= EXIT_THRESHOLD_SECS {
+        error!(
+            "No events received for {} seconds (threshold: {}), exiting to trigger restart",
+            time_since_last_event,
+            EXIT_THRESHOLD_SECS
+        );
+        std::process::exit(1);
+    }
+
+    let is_healthy = time_since_last_event <= UNHEALTHY_THRESHOLD_SECS;
 
     let body = if is_healthy {
-        consecutive_unhealthy.store(0, Ordering::Relaxed);
         info!("Health check passed");
         r#"{"success": true}"#
     } else {
-        let count = consecutive_unhealthy.fetch_add(1, Ordering::Relaxed) + 1;
         warn!(
-            "Health check failed - last event time: {} seconds ago (consecutive failures: {})",
-            now_secs.saturating_sub(last_event),
-            count
+            "Health check failed - last event time: {} seconds ago",
+            time_since_last_event
         );
-
-        if count >= UNHEALTHY_THRESHOLD {
-            error!(
-                "Health check failed {} consecutive times, exiting to trigger restart",
-                count
-            );
-            std::process::exit(1);
-        }
-
         r#"{"success": false}"#
     };
 
@@ -414,19 +413,15 @@ async fn run_health_server(
     let listener = tokio::net::TcpListener::bind(health_addr).await?;
     info!("Health server listening on: {}", health_addr);
 
-    let consecutive_unhealthy: ConsecutiveUnhealthyCount = Arc::new(AtomicU64::new(0));
-
     loop {
         let (stream, _) = listener.accept().await?;
         let io = hyper_util::rt::TokioIo::new(stream);
         let last_event_time = last_event_time.clone();
-        let consecutive_unhealthy = consecutive_unhealthy.clone();
 
         tokio::spawn(async move {
             let service = service_fn(move |_req: Request<hyper::body::Incoming>| {
                 let last_event_time = last_event_time.clone();
-                let consecutive_unhealthy = consecutive_unhealthy.clone();
-                async move { health_handler(last_event_time, consecutive_unhealthy).await }
+                async move { health_handler(last_event_time).await }
             });
 
             if let Err(e) = http1::Builder::new().serve_connection(io, service).await {
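Note on PATCH 3/3: replacing the failure counter with a wall-clock threshold makes the trigger independent of how often the endpoint is polled (three failed checks equaled roughly 30 seconds only under a 10-second poll interval), but the exit still runs inside health_handler, so it continues to depend on an external poller hitting the endpoint. One caveat worth checking: if last_event_time starts at 0 rather than process start time (its initialization is not shown in these patches), the very first poll before any event arrives would already exceed EXIT_THRESHOLD_SECS and exit immediately. Below is a sketch of an in-process watchdog that would decouple the exit from external traffic entirely, assuming a tokio runtime and the same Arc<AtomicU64> timestamp; run_watchdog is a hypothetical helper, not part of these patches:

    use std::sync::Arc;
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{Duration, SystemTime, UNIX_EPOCH};

    const EXIT_THRESHOLD_SECS: u64 = 30;

    // Hypothetical watchdog: exits the process when events stop flowing,
    // whether or not anyone polls /health. `restart: always` then restarts it.
    async fn run_watchdog(last_event_time: Arc<AtomicU64>) {
        let mut ticker = tokio::time::interval(Duration::from_secs(5));
        loop {
            ticker.tick().await;
            let now = SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap_or_default()
                .as_secs();
            let age = now.saturating_sub(last_event_time.load(Ordering::Relaxed));
            if age >= EXIT_THRESHOLD_SECS {
                eprintln!("no events for {} seconds, exiting", age);
                std::process::exit(1);
            }
        }
    }

Spawned alongside the health server with tokio::spawn(run_watchdog(last_event_time.clone())), this checks the same timestamp every 5 seconds regardless of external traffic, and the health endpoint can go back to being purely a reporting surface.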