Skip to content

Commit 2c0d4bf

Browse files
authored
fix(spans): Detect subprocess crash during startup health check (#109832)
The SpanFlusher subprocess can crash during startup due to Redis connectivity issues, but the main consumer thread continues to wait in _wait_for_process_to_become_healthy() for a healthy signal that will never come. This change makes the health check in the main thread verify whether the subprocess is still alive or has already terminated, so that startup fails fast instead of waiting for the full unhealthy timeout. Refs STREAM-741
1 parent 45b07af commit 2c0d4bf

2 files changed

Lines changed: 39 additions & 4 deletions

File tree

src/sentry/spans/consumers/process/flusher.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,14 @@ def _wait_for_process_to_become_healthy(self, process_index: int):
179179
if self.process_healthy_since[process_index].value != 0:
180180
break
181181

182+
process = self.processes[process_index]
183+
if not process.is_alive():
184+
shards = self.process_to_shards_map[process_index]
185+
exitcode = getattr(process, "exitcode", None)
186+
raise RuntimeError(
187+
f"process {process_index} (shards {shards}) exited during startup (exitcode={exitcode})"
188+
)
189+
182190
if time.time() - start_time > max_unhealthy_seconds:
183191
shards = self.process_to_shards_map[process_index]
184192
raise RuntimeError(

tests/sentry/spans/consumers/process/test_flusher.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -165,15 +165,14 @@ def test_multi_producer_sliced_integration_with_arroyo_local_producer() -> None:
165165
manager.close()
166166

167167

168-
def test_flusher_waits_for_processes_to_start() -> None:
168+
def test_flusher_waits_for_exited_processes_during_startup() -> None:
169169
"""Test that the flusher waits for all processes to become healthy during initialization."""
170170
buffer = SpansBuffer(assigned_shards=[0])
171171

172-
# Patch SpanFlusher.main to never set healthy_since, simulating a process that fails to start
172+
# exit without setting healthy_since, simulating a process that fails early
173173
def never_healthy_main(
174174
buffer, shards, stopped, current_drift, backpressure_since, healthy_since, produce_to_pipe
175175
):
176-
# Don't set healthy_since.value, simulating a process that never becomes healthy
177176
return
178177

179178
with (
@@ -183,7 +182,35 @@ def never_healthy_main(
183182
"spans.buffer.flusher.max-unhealthy-seconds": 0.5,
184183
"spans.buffer.flusher.use-stuck-detector": False,
185184
}
186-
), # Should raise RuntimeError because the process never reports as healthy
185+
),
186+
pytest.raises(RuntimeError, match="process 0 \\(shards \\[0\\]\\) exited during startup"),
187+
):
188+
SpanFlusher(
189+
buffer,
190+
next_step=Noop(),
191+
produce_to_pipe=lambda _: None,
192+
)
193+
194+
195+
def test_flusher_timeout_waiting_for_processes_startup() -> None:
196+
"""Test that the flusher times out when a process stays alive but never becomes healthy."""
197+
buffer = SpansBuffer(assigned_shards=[0])
198+
199+
# block without setting healthy_since, simulating a process that hangs during startup
200+
def hang_main(
201+
buffer, shards, stopped, current_drift, backpressure_since, healthy_since, produce_to_pipe
202+
):
203+
while not stopped.value:
204+
sleep(0.05)
205+
206+
with (
207+
mock.patch.object(SpanFlusher, "main", hang_main),
208+
override_options(
209+
{
210+
"spans.buffer.flusher.max-unhealthy-seconds": 0.5,
211+
"spans.buffer.flusher.use-stuck-detector": False,
212+
}
213+
),
187214
pytest.raises(RuntimeError, match="process 0 \\(shards \\[0\\]\\) didn't start up"),
188215
):
189216
SpanFlusher(

0 commit comments

Comments
 (0)