Skip to content

Commit 2c78489

Browse files
committed
Handle exceptions in data reload loop to prevent silent data staleness
The Reloader thread/process in LocalDataIngester crashes on any unhandled exception (e.g. transient network errors when reading from remote filesystems like GCS). Once the reload loop dies, TensorBoard continues serving stale data with no indication to the user. Wrap the reload loop body in a try/except so that transient errors are logged and the next reload cycle proceeds normally.
1 parent 81e6137 commit 2c78489

2 files changed

Lines changed: 52 additions & 12 deletions

File tree

tensorboard/backend/event_processing/data_ingester.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -98,18 +98,24 @@ def start(self):
9898

9999
def _reload():
100100
while True:
101-
start = time.time()
102-
logger.info("TensorBoard reload process beginning")
103-
for path, name in self._path_to_run.items():
104-
self._multiplexer.AddRunsFromDirectory(path, name)
105-
logger.info(
106-
"TensorBoard reload process: Reload the whole Multiplexer"
107-
)
108-
self._multiplexer.Reload()
109-
duration = time.time() - start
110-
logger.info(
111-
"TensorBoard done reloading. Load took %0.3f secs", duration
112-
)
101+
try:
102+
start = time.time()
103+
logger.info("TensorBoard reload process beginning")
104+
for path, name in self._path_to_run.items():
105+
self._multiplexer.AddRunsFromDirectory(path, name)
106+
logger.info(
107+
"TensorBoard reload process: Reload the whole Multiplexer"
108+
)
109+
self._multiplexer.Reload()
110+
duration = time.time() - start
111+
logger.info(
112+
"TensorBoard done reloading. Load took %0.3f secs",
113+
duration,
114+
)
115+
except Exception:
116+
logger.error(
117+
"TensorBoard reload failed", exc_info=True
118+
)
113119
if self._reload_interval == 0:
114120
# Only load the multiplexer once. Do not continuously reload.
115121
break

tensorboard/backend/event_processing/data_ingester_test.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,40 @@ def __init__(
6161
self.window_title = window_title
6262

6363

64+
class ReloadErrorHandlingTest(tb_test.TestCase):
    """Tests that errors raised during a reload pass do not escape `start()`.

    Every test builds the ingester with `reload_interval=0`, for which the
    reload loop performs a single pass and then breaks. These tests therefore
    verify that an exception inside that pass is contained; they do not
    exercise a second reload cycle.
    """

    def _make_blocking_ingester(self):
        """Returns a `LocalDataIngester` configured for one blocking reload."""
        flags = FakeFlags(
            logdir="logdir", reload_interval=0, reload_task="blocking"
        )
        return data_ingester.LocalDataIngester(flags)

    def test_reload_continues_after_exception(self):
        """An error from `AddRunsFromDirectory` must not propagate."""
        ingester = self._make_blocking_ingester()
        multiplexer = ingester._multiplexer
        # Make AddRunsFromDirectory raise on the first call.
        patch_add = mock.patch.object(
            multiplexer,
            "AddRunsFromDirectory",
            side_effect=OSError("network error"),
        )
        patch_reload = mock.patch.object(multiplexer, "Reload")
        with patch_add as mock_add, patch_reload as mock_reload:
            # Should not raise despite the OSError.
            ingester.start()
        mock_add.assert_called_once()
        # Reload should not be called since AddRunsFromDirectory raised
        # first and aborted the rest of that reload pass.
        mock_reload.assert_not_called()

    def test_reload_continues_after_reload_exception(self):
        """An error from `Reload()` must not propagate either."""
        ingester = self._make_blocking_ingester()
        multiplexer = ingester._multiplexer
        patch_add = mock.patch.object(multiplexer, "AddRunsFromDirectory")
        patch_reload = mock.patch.object(
            multiplexer, "Reload", side_effect=RuntimeError("reload failed")
        )
        with patch_add, patch_reload as mock_reload:
            # Should not raise despite the RuntimeError.
            ingester.start()
        # Confirm the failing call was actually reached; without this the
        # test would pass even if Reload() were never invoked.
        mock_reload.assert_called_once()
96+
97+
6498
class GetEventFileActiveFilterTest(tb_test.TestCase):
6599
def testDisabled(self):
66100
flags = FakeFlags(logdir="logdir", reload_multifile=False)

0 commit comments

Comments
 (0)