|
| 1 | +# Copyright 2023–2025 Google LLC |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# https://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +"""Tests for MetricLogger's abort-on-NaN/Inf loss handling in write_metrics.""" |
| 16 | +import unittest |
| 17 | +from types import SimpleNamespace |
| 18 | +from unittest import mock |
| 19 | + |
| 20 | +import numpy as np |
| 21 | + |
| 22 | +from maxtext.common.metric_logger import MetricLogger |
| 23 | + |
| 24 | + |
class MetricLoggerAbortTest(unittest.TestCase):
  """Checks how `MetricLogger.write_metrics` reacts to NaN/Inf training loss.

  Each test builds a logger without running `__init__`, swaps the individual
  metric writers for mocks, and calls `write_metrics` with a hand-made loss.
  """

  # Writer methods replaced by mocks so that no test touches a real sink.
  _WRITER_NAMES = (
      "log_metrics",
      "write_metrics_to_tensorboard",
      "write_metrics_locally",
      "write_metrics_for_gcs",
      "write_metrics_to_managed_mldiagnostics",
  )

  def _make_logger(self, abort_on_nan_loss, abort_on_inf_loss):
    """Returns a `MetricLogger` whose config switches every writer on.

    Args:
      abort_on_nan_loss: Value stored in `config.abort_on_nan_loss`.
      abort_on_inf_loss: Value stored in `config.abort_on_inf_loss`.
    """
    logger = MetricLogger.__new__(MetricLogger)  # skip __init__
    logger.config = SimpleNamespace(
        abort_on_nan_loss=abort_on_nan_loss,
        abort_on_inf_loss=abort_on_inf_loss,
        enable_tensorboard=True,
        metrics_file="/tmp/fake_metrics.jsonl",
        gcs_metrics=True,
        managed_mldiagnostics=True,
    )
    return logger

  def _metrics(self, loss):
    """Wraps `loss` in the nested metrics dict handed to `write_metrics`."""
    return {"scalar": {"learning/loss": loss}}

  def _patch_writers(self, logger):
    """Returns a context manager that mocks every writer method on `logger`.

    Entering the context manager yields a dict mapping each name in
    `_WRITER_NAMES` to the mock that replaced it.
    """
    replacements = {name: mock.DEFAULT for name in self._WRITER_NAMES}
    return mock.patch.multiple(logger, **replacements)

  def _assert_each_writer_called_once(self, writers):
    """Asserts that every mocked writer in `writers` ran exactly once."""
    for name in self._WRITER_NAMES:
      with self.subTest(writer=name):
        writers[name].assert_called_once()

  @mock.patch("jax.process_index", return_value=0)
  def test_abort_on_nan_exits_after_writes(self, _):
    """A NaN loss exits with code 1 and every writer has still been called."""
    logger = self._make_logger(True, False)

    with self._patch_writers(logger) as writers:
      with self.assertRaises(SystemExit) as cm:
        logger.write_metrics(self._metrics(np.nan), step=1, is_training=True)

    self.assertEqual(cm.exception.code, 1)
    self._assert_each_writer_called_once(writers)

  @mock.patch("jax.process_index", return_value=0)
  def test_abort_on_inf_exits_after_writes(self, _):
    """An Inf loss exits and every writer has still been called."""
    logger = self._make_logger(False, True)

    with self._patch_writers(logger) as writers:
      with self.assertRaises(SystemExit):
        logger.write_metrics(self._metrics(np.inf), step=1, is_training=True)

    # The test name promises "after writes", so verify the writers really ran.
    self._assert_each_writer_called_once(writers)

  @mock.patch("jax.process_index", return_value=1)  # skip gcs branch
  def test_finite_loss_does_not_exit(self, _):
    """A finite loss must not raise `SystemExit`, even with both flags set."""
    logger = self._make_logger(True, True)

    with self._patch_writers(logger):
      logger.write_metrics(self._metrics(1.23), step=1, is_training=True)

  @mock.patch("jax.process_index", return_value=1)  # skip gcs branch
  def test_abort_flags_disabled_does_not_exit(self, _):
    """A NaN loss must not raise `SystemExit` when both flags are off."""
    logger = self._make_logger(False, False)

    with self._patch_writers(logger):
      logger.write_metrics(self._metrics(np.nan), step=1, is_training=True)
0 commit comments