From e4d6c42fc6976aeca6ef08c232684742298aeb20 Mon Sep 17 00:00:00 2001 From: Jordan Gonzalez <30836115+duncanista@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:16:02 +0100 Subject: [PATCH 1/6] add retry strategy --- bottlecap/src/traces/trace_flusher.rs | 28 ++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/bottlecap/src/traces/trace_flusher.rs b/bottlecap/src/traces/trace_flusher.rs index 7e43eab23..317cbabc0 100644 --- a/bottlecap/src/traces/trace_flusher.rs +++ b/bottlecap/src/traces/trace_flusher.rs @@ -6,6 +6,7 @@ use libdd_common::Endpoint; use libdd_trace_utils::{ config_utils::trace_intake_url_prefixed, send_data::SendData, + send_with_retry::{RetryBackoffType, RetryStrategy}, trace_utils::{self}, tracer_payload::TracerPayloadCollection, }; @@ -19,6 +20,11 @@ use crate::lifecycle::invocation::processor::S_TO_MS; use crate::traces::http_client::HttpClient; use crate::traces::trace_aggregator_service::AggregatorHandle; +/// Retry strategy for trace flushing: 5 retries with no delay between attempts. +fn trace_retry_strategy() -> RetryStrategy { + RetryStrategy::new(5, 0, RetryBackoffType::Constant, None) +} + pub struct TraceFlusher { pub aggregator_handle: AggregatorHandle, pub config: Arc, @@ -113,7 +119,11 @@ impl TraceFlusher { let traces_with_tags: Vec<_> = trace_builders .into_iter() .map(|info| { - let trace = info.builder.with_api_key(api_key.as_str()).build(); + let trace = info + .builder + .with_api_key(api_key.as_str()) + .with_retry_strategy(trace_retry_strategy()) + .build(); (trace, info.header_tags) }) .collect(); @@ -125,12 +135,16 @@ impl TraceFlusher { let additional_traces: Vec<_> = traces_with_tags .iter() .filter_map(|(trace, tags)| match trace.get_payloads() { - TracerPayloadCollection::V07(payloads) => Some(SendData::new( - trace.len(), - TracerPayloadCollection::V07(payloads.clone()), - tags.to_tracer_header_tags(), - &endpoint, - )), + TracerPayloadCollection::V07(payloads) => { + let mut send_data = SendData::new( + trace.len(), + TracerPayloadCollection::V07(payloads.clone()), + tags.to_tracer_header_tags(), + &endpoint, + ); + send_data.set_retry_strategy(trace_retry_strategy()); + Some(send_data) + } // All payloads in the extension are V07 (produced by // collect_pb_trace_chunks), so this branch is unreachable. _ => None, From 4d03c5e155022585abfbb406bf676773af5f3eed Mon Sep 17 00:00:00 2001 From: Jordan Gonzalez <30836115+duncanista@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:22:28 +0100 Subject: [PATCH 2/6] add retries for stats --- bottlecap/src/traces/stats_flusher.rs | 56 +++++++++++++++------------ bottlecap/src/traces/trace_flusher.rs | 19 +++++++-- 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/bottlecap/src/traces/stats_flusher.rs b/bottlecap/src/traces/stats_flusher.rs index 0096df803..e663b4f98 100644 --- a/bottlecap/src/traces/stats_flusher.rs +++ b/bottlecap/src/traces/stats_flusher.rs @@ -10,6 +10,8 @@ use crate::config; use crate::lifecycle::invocation::processor::S_TO_MS; use crate::traces::http_client::HttpClient; use crate::traces::stats_aggregator::StatsAggregator; + +const STATS_FLUSH_RETRY_COUNT: usize = 5; use dogstatsd::api_key::ApiKeyFactory; use libdd_common::Endpoint; use libdd_trace_protobuf::pb; @@ -93,32 +95,38 @@ impl StatsFlusher { let stats_url = trace_stats_url(&self.config.site); - let start = std::time::Instant::now(); - - let resp = stats_utils::send_stats_payload_with_client( - serialized_stats_payload, - endpoint, - api_key.as_str(), - Some(&self.http_client), - ) - .await; - let elapsed = start.elapsed(); - debug!( - "STATS | Stats request to {} took {} ms", - stats_url, - elapsed.as_millis() - ); - match resp { - Ok(()) => { - debug!("STATS | Successfully flushed stats"); - None - } - Err(e) => { - // Network/server errors are temporary - return stats for retry - error!("STATS | Error sending stats: {e:?}"); - Some(stats) + for attempt in 1..=STATS_FLUSH_RETRY_COUNT { + let start = std::time::Instant::now(); + let resp = stats_utils::send_stats_payload_with_client( + serialized_stats_payload.clone(), + endpoint, + api_key.as_str(), + Some(&self.http_client), + ) + .await; + let elapsed = start.elapsed(); + + match resp { + Ok(()) => { + debug!( + "STATS | Successfully flushed stats to {stats_url} in {} ms (attempt {attempt}/{STATS_FLUSH_RETRY_COUNT})", + elapsed.as_millis() + ); + return None; + } + Err(e) => { + error!( + "STATS | Failed to send stats to {stats_url} in {} ms (attempt {attempt}/{STATS_FLUSH_RETRY_COUNT}): {e:?}", + elapsed.as_millis() + ); + } } } + + error!( + "STATS | Exhausted all {STATS_FLUSH_RETRY_COUNT} attempts, returning stats for redrive" + ); + Some(stats) } /// Flushes stats from the aggregator. diff --git a/bottlecap/src/traces/trace_flusher.rs b/bottlecap/src/traces/trace_flusher.rs index 317cbabc0..b5c0acd42 100644 --- a/bottlecap/src/traces/trace_flusher.rs +++ b/bottlecap/src/traces/trace_flusher.rs @@ -188,12 +188,23 @@ impl TraceFlusher { debug!("TRACES | Flushing {} traces", coalesced_traces.len()); for trace in &coalesced_traces { - let send_result = trace.send(&http_client).await.last_result; - - if let Err(e) = send_result { - error!("TRACES | Request failed: {e:?}"); + let result = trace.send(&http_client).await; + + if let Err(e) = &result.last_result { + error!( + "TRACES | Request failed after {} attempts ({} timeouts, {} network errors, {} status code errors): {e:?}", + result.requests_count, + result.errors_timeout, + result.errors_network, + result.errors_status_code, + ); return Some(coalesced_traces); } + + debug!( + "TRACES | Successfully sent trace ({} attempts, {} bytes)", + result.requests_count, result.bytes_sent, + ); } debug!("TRACES | Flushing took {} ms", start.elapsed().as_millis()); From 406a09c86958774ffa5e38c27048ff181ae2d7d9 Mon Sep 17 00:00:00 2001 From: Jordan Gonzalez <30836115+duncanista@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:23:36 +0100 Subject: [PATCH 3/6] consistent retries to 3 --- bottlecap/src/traces/stats_flusher.rs | 2 +- bottlecap/src/traces/trace_flusher.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bottlecap/src/traces/stats_flusher.rs b/bottlecap/src/traces/stats_flusher.rs index e663b4f98..7bb12ec42 100644 --- a/bottlecap/src/traces/stats_flusher.rs +++ b/bottlecap/src/traces/stats_flusher.rs @@ -11,7 +11,7 @@ use crate::lifecycle::invocation::processor::S_TO_MS; use crate::traces::http_client::HttpClient; use crate::traces::stats_aggregator::StatsAggregator; -const STATS_FLUSH_RETRY_COUNT: usize = 5; +const STATS_FLUSH_RETRY_COUNT: usize = 3; use dogstatsd::api_key::ApiKeyFactory; use libdd_common::Endpoint; use libdd_trace_protobuf::pb; diff --git a/bottlecap/src/traces/trace_flusher.rs b/bottlecap/src/traces/trace_flusher.rs index b5c0acd42..df2139a37 100644 --- a/bottlecap/src/traces/trace_flusher.rs +++ b/bottlecap/src/traces/trace_flusher.rs @@ -20,9 +20,9 @@ use crate::lifecycle::invocation::processor::S_TO_MS; use crate::traces::http_client::HttpClient; use crate::traces::trace_aggregator_service::AggregatorHandle; -/// Retry strategy for trace flushing: 5 retries with no delay between attempts. +/// Retry strategy for trace flushing: 3 retries with no delay between attempts. fn trace_retry_strategy() -> RetryStrategy { - RetryStrategy::new(5, 0, RetryBackoffType::Constant, None) + RetryStrategy::new(3, 0, RetryBackoffType::Constant, None) } pub struct TraceFlusher { From 95933b42e9597d82d62f250e100fc3095a7d345b Mon Sep 17 00:00:00 2001 From: Jordan Gonzalez <30836115+duncanista@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:26:52 +0100 Subject: [PATCH 4/6] consistent env var across crates --- bottlecap/src/traces/stats_flusher.rs | 13 +++++-------- bottlecap/src/traces/trace_flusher.rs | 13 +++++++++++-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/bottlecap/src/traces/stats_flusher.rs b/bottlecap/src/traces/stats_flusher.rs index 7bb12ec42..a52c31021 100644 --- a/bottlecap/src/traces/stats_flusher.rs +++ b/bottlecap/src/traces/stats_flusher.rs @@ -6,12 +6,11 @@ use std::sync::Arc; use tokio::sync::Mutex; use tokio::sync::OnceCell; +use crate::FLUSH_RETRY_COUNT; use crate::config; use crate::lifecycle::invocation::processor::S_TO_MS; use crate::traces::http_client::HttpClient; use crate::traces::stats_aggregator::StatsAggregator; - -const STATS_FLUSH_RETRY_COUNT: usize = 3; use dogstatsd::api_key::ApiKeyFactory; use libdd_common::Endpoint; use libdd_trace_protobuf::pb; @@ -95,7 +94,7 @@ impl StatsFlusher { let stats_url = trace_stats_url(&self.config.site); - for attempt in 1..=STATS_FLUSH_RETRY_COUNT { + for attempt in 1..=FLUSH_RETRY_COUNT { let start = std::time::Instant::now(); let resp = stats_utils::send_stats_payload_with_client( serialized_stats_payload.clone(), @@ -109,23 +108,21 @@ impl StatsFlusher { match resp { Ok(()) => { debug!( - "STATS | Successfully flushed stats to {stats_url} in {} ms (attempt {attempt}/{STATS_FLUSH_RETRY_COUNT})", + "STATS | Successfully flushed stats to {stats_url} in {} ms (attempt {attempt}/{FLUSH_RETRY_COUNT})", elapsed.as_millis() ); return None; } Err(e) => { error!( - "STATS | Failed to send stats to {stats_url} in {} ms (attempt {attempt}/{STATS_FLUSH_RETRY_COUNT}): {e:?}", + "STATS | Failed to send stats to {stats_url} in {} ms (attempt {attempt}/{FLUSH_RETRY_COUNT}): {e:?}", elapsed.as_millis() ); } } } - error!( - "STATS | Exhausted all {STATS_FLUSH_RETRY_COUNT} attempts, returning stats for redrive" - ); + error!("STATS | Exhausted all {FLUSH_RETRY_COUNT} attempts, returning stats for redrive"); Some(stats) } diff --git a/bottlecap/src/traces/trace_flusher.rs b/bottlecap/src/traces/trace_flusher.rs index df2139a37..fa320e22b 100644 --- a/bottlecap/src/traces/trace_flusher.rs +++ b/bottlecap/src/traces/trace_flusher.rs @@ -15,14 +15,23 @@ use std::sync::Arc; use tokio::task::JoinSet; use tracing::{debug, error}; +use crate::FLUSH_RETRY_COUNT; use crate::config::Config; use crate::lifecycle::invocation::processor::S_TO_MS; use crate::traces::http_client::HttpClient; use crate::traces::trace_aggregator_service::AggregatorHandle; -/// Retry strategy for trace flushing: 3 retries with no delay between attempts. +/// Retry strategy for trace flushing using the shared `FLUSH_RETRY_COUNT` +/// with no delay between attempts. In Lambda, every millisecond of wall-clock +/// time matters, and the per-attempt request timeout already bounds how long +/// each retry can take. fn trace_retry_strategy() -> RetryStrategy { - RetryStrategy::new(3, 0, RetryBackoffType::Constant, None) + RetryStrategy::new( + u32::try_from(FLUSH_RETRY_COUNT).expect("FLUSH_RETRY_COUNT fits in u32"), + 0, + RetryBackoffType::Constant, + None, + ) } pub struct TraceFlusher { From 812520b31c75308443bf6937eed9c60e1f9846fb Mon Sep 17 00:00:00 2001 From: Jordan Gonzalez <30836115+duncanista@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:30:45 +0100 Subject: [PATCH 5/6] remove expect --- bottlecap/src/traces/trace_flusher.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bottlecap/src/traces/trace_flusher.rs b/bottlecap/src/traces/trace_flusher.rs index fa320e22b..89f40050e 100644 --- a/bottlecap/src/traces/trace_flusher.rs +++ b/bottlecap/src/traces/trace_flusher.rs @@ -27,7 +27,7 @@ use crate::traces::trace_aggregator_service::AggregatorHandle; /// each retry can take. fn trace_retry_strategy() -> RetryStrategy { RetryStrategy::new( - u32::try_from(FLUSH_RETRY_COUNT).expect("FLUSH_RETRY_COUNT fits in u32"), + u32::try_from(FLUSH_RETRY_COUNT).unwrap_or(3), 0, RetryBackoffType::Constant, None, From 5910350dba3eac7a1362ddaf6bc5da58ce041d74 Mon Sep 17 00:00:00 2001 From: Jordan Gonzalez <30836115+duncanista@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:43:25 +0100 Subject: [PATCH 6/6] reduce logging on one failure in stats --- bottlecap/src/traces/stats_flusher.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bottlecap/src/traces/stats_flusher.rs b/bottlecap/src/traces/stats_flusher.rs index a52c31021..1d4b4ae4d 100644 --- a/bottlecap/src/traces/stats_flusher.rs +++ b/bottlecap/src/traces/stats_flusher.rs @@ -114,7 +114,7 @@ impl StatsFlusher { return None; } Err(e) => { - error!( + debug!( "STATS | Failed to send stats to {stats_url} in {} ms (attempt {attempt}/{FLUSH_RETRY_COUNT}): {e:?}", elapsed.as_millis() );