From e04551231590e76a1415f874ac4b25b7c4a8f73e Mon Sep 17 00:00:00 2001
From: Curtis Anderson <curtis@mlcommons.org>
Date: Fri, 26 Jun 2026 17:12:34 -0700
Subject: [PATCH 1/4] test(perf): eliminate ~30s of unintended waits in three
 fast-lane tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- test_collects_multiple_errors: pass skip_remote_checks=True. Both
  dependency checks are mocked to raise, but the test left
  hosts=['node1','node2'] in args, which triggered a real SSH probe
  to nonexistent hosts and ate ~20s of TCP connect timeouts before
  the assertion ran.
- test_bcast_precedes_barrier_in_executed_heredoc_with_mocked_mpi4py:
  patch time.sleep around the in-process exec of SHARED_FS_PROBE_SCRIPT.
  The probe's rank-0 D-49 quiesce path calls time.sleep(5.0); the unit
  test only locks call ordering, not timing.
- test_rank0_emits_markers_and_non_rank0_silent[0]: same root cause —
  rank 0 hits the 5s quiesce. monkeypatch time.sleep for the test.

No behavioral changes to production code.
---
 tests/unit/test_cluster_collector.py  | 5 +++++
 tests/unit/test_shared_fs_probe.py    | 6 ++++++
 tests/unit/test_validation_helpers.py | 6 +++++-
 3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/tests/unit/test_cluster_collector.py b/tests/unit/test_cluster_collector.py
index a5eda47b..2489e4b3 100755
--- a/tests/unit/test_cluster_collector.py
+++ b/tests/unit/test_cluster_collector.py
@@ -3554,6 +3554,11 @@ def test_rank0_emits_markers_and_non_rank0_silent(
             ["probe", str(tmp_path), "silence-test-uuid"],
         )
 
+        # The probe's rank-0 D-49 quiesce path sleeps 5s; neutralize for the
+        # unit test (we're only locking the stdout marker contract).
+        import time as _time
+        monkeypatch.setattr(_time, "sleep", lambda *_a, **_kw: None)
+
         from mlpstorage_py.cluster_collector import SHARED_FS_PROBE_SCRIPT
 
         captured = io.StringIO()
diff --git a/tests/unit/test_shared_fs_probe.py b/tests/unit/test_shared_fs_probe.py
index a5c1bf0f..a0a94a8e 100644
--- a/tests/unit/test_shared_fs_probe.py
+++ b/tests/unit/test_shared_fs_probe.py
@@ -618,15 +618,21 @@ class _FakeMPI:
         saved_argv = sys.argv
         saved_mpi4py = sys.modules.get("mpi4py")
         saved_mpi = sys.modules.get("mpi4py.MPI")
+        # The probe's rank-0 D-49 quiesce path sleeps 5s; neutralize for the
+        # unit test (we're only locking call ordering, not timing).
+        import time as _time
+        saved_sleep = _time.sleep
         try:
             sys.modules["mpi4py"] = fake_mpi4py
             sys.modules["mpi4py.MPI"] = _FakeMPI()
+            _time.sleep = lambda *_a, **_kw: None
             sys.argv = ["<probe>", str(tmp_path), "test-uuid", out_file]
             namespace = {"__name__": "__main__"}
             # The heredoc body calls sys.exit at the end; trap it.
             with pytest.raises(SystemExit):
                 exec(SHARED_FS_PROBE_SCRIPT, namespace)
         finally:
+            _time.sleep = saved_sleep
             sys.argv = saved_argv
             if saved_mpi4py is not None:
                 sys.modules["mpi4py"] = saved_mpi4py
diff --git a/tests/unit/test_validation_helpers.py b/tests/unit/test_validation_helpers.py
index 69ae443f..c121c90d 100755
--- a/tests/unit/test_validation_helpers.py
+++ b/tests/unit/test_validation_helpers.py
@@ -155,7 +155,11 @@ def test_collects_multiple_errors(self, mock_dlio, mock_mpi):
         mock_logger = MagicMock()
 
         with pytest.raises(DependencyError) as exc_info:
-            validate_benchmark_environment(args, logger=mock_logger)
+            # skip_remote_checks: hosts=['node1','node2'] would otherwise
+            # trigger a real SSH probe to nonexistent hosts (~20s of
+            # connect timeouts); this test only asserts that multiple
+            # errors accumulate, which the MPI+DLIO mocks already cover.
+            validate_benchmark_environment(args, logger=mock_logger, skip_remote_checks=True)
 
         # First error should be raised (MPI)
         assert "MPI not found" in str(exc_info.value)

From b4c8e220273fa37c111579764a447b1457e903eb Mon Sep 17 00:00:00 2001
From: Curtis Anderson <curtis@mlcommons.org>
Date: Fri, 26 Jun 2026 17:12:51 -0700
Subject: [PATCH 2/4] test: mark genuinely-slow tests as slow, declare marker
 in kv_cache

- tests/integration: mark test_init_then_closed_datagen_no_env_var slow
  (~17.5s; full in-process CLI dispatcher exercising init + datagen).
- kv_cache_benchmark/pyproject.toml: declare the 'slow' marker and
  default to -m 'not slow' (parity with the root suite). Without this,
  the two kv_cache slow marks below would emit PytestUnknownMarkWarning
  and the marked tests would still run by default.
- kv_cache_benchmark/tests: mark test_gpu_overflow_to_cpu slow
  (~32s; 100 x 10K-token allocations) and
  test_profile_allocate_vs_access_overhead slow (~5.8s; profiling).

Net effect on default test run: root suite drops from 86s to 39s,
kv_cache suite drops from 155s to 98s.
---
 kv_cache_benchmark/pyproject.toml                     | 5 ++++-
 kv_cache_benchmark/tests/test_kv_cache.py             | 2 ++
 tests/integration/test_canonical_layout_end_to_end.py | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/kv_cache_benchmark/pyproject.toml b/kv_cache_benchmark/pyproject.toml
index 3eaf156c..64db0488 100755
--- a/kv_cache_benchmark/pyproject.toml
+++ b/kv_cache_benchmark/pyproject.toml
@@ -110,4 +110,7 @@ ignore_missing_imports = true
 testpaths = ["tests", "."]
 python_files = ["test_*.py"]
 python_functions = ["test_*"]
-addopts = "-v --tb=short"
+addopts = "-v --tb=short -m 'not slow'"
+markers = [
+    "slow: tests that take >5s (e.g., large GPU-overflow allocations, profiling). Excluded from the default suite; opt in with `pytest -m slow` (or `pytest -m ''` to run everything).",
+]
diff --git a/kv_cache_benchmark/tests/test_kv_cache.py b/kv_cache_benchmark/tests/test_kv_cache.py
index 31d5b1af..b493a817 100644
--- a/kv_cache_benchmark/tests/test_kv_cache.py
+++ b/kv_cache_benchmark/tests/test_kv_cache.py
@@ -1102,6 +1102,7 @@ def test_allocation_prefers_gpu(self, multi_tier_cache_with_gpu):
         assert success is True
         assert location == 'gpu'
     
+    @pytest.mark.slow
     def test_gpu_overflow_to_cpu(self, multi_tier_cache_with_gpu):
         """When GPU is full, should overflow to CPU."""
         # Fill GPU with large allocations
@@ -3899,6 +3900,7 @@ def test_part5_one_tier_nvme_only_eviction(self, tiny_model):
 class TestBottleneckProfiling:
     """Profile bottleneck detection in the KV cache benchmark."""
 
+    @pytest.mark.slow
     def test_profile_allocate_vs_access_overhead(self):
         """Profile allocate vs access operations to identify bottleneck ratios."""
         import time as time_mod
diff --git a/tests/integration/test_canonical_layout_end_to_end.py b/tests/integration/test_canonical_layout_end_to_end.py
index 98c62592..fc4f6df0 100644
--- a/tests/integration/test_canonical_layout_end_to_end.py
+++ b/tests/integration/test_canonical_layout_end_to_end.py
@@ -402,6 +402,7 @@ class TestInitThenRunFullCliDispatch:
     runs on any dev box (does not require DLIO/openmpi).
     """
 
+    @pytest.mark.slow
     def test_init_then_closed_datagen_no_env_var(self, tmp_path, monkeypatch):
         """RED today: the second invocation raises ConfigurationError E101
         even though `mlpstorage init` wrote a valid sentinel.

From 69d4ab50a7dc8b347e274a1a5c1003bb234a1530 Mon Sep 17 00:00:00 2001
From: Curtis Anderson <curtis@mlcommons.org>
Date: Fri, 26 Jun 2026 17:13:02 -0700
Subject: [PATCH 3/4] ci: run all four test suites, not just tests/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous workflow ran 'uv run pytest', which picked up only the
root pyproject's testpaths=['tests']. The three sibling suites
(mlpstorage_py/tests, vdb_benchmark/tests, kv_cache_benchmark/tests)
were never executed in CI, so regressions in those areas could land
without CI catching them — exactly the gap that PRs #551-#560 had
to fix by hand.

Each suite is invoked in its own step:
- tests/ and vdb_benchmark/tests/ can't be collected in one pytest
  process (both define a top-level 'tests' package whose conftest.py
  modules collide via pytest's ImportPathMismatchError).
- Each suite's pyproject defines its own -m 'not slow' default, and
  pytest loads only one configuration file per run, so one pytest
  process per suite is the correct boundary.

Also installs vdb_benchmark and kv_cache_benchmark editable so their
imports resolve.
---
 .github/workflows/test.yml | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2ff5c5e1..c1703fc3 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -45,8 +45,29 @@ jobs:
         sudo apt-get install -y libopenmpi-dev openmpi-common openmpi-bin 
 
     - name: Install package and test dependencies
-      run: uv sync # Automatically creates the environment and locks dependencies
+      run: uv sync --extra test
 
-    # 4. Heavy Lifting (Runs only if linting passed)
-    - name: Run unit tests
-      run: uv run pytest # Runs your tests safely inside the managed environment
+    - name: Install subpackages (vdb_benchmark, kv_cache_benchmark)
+      # These have their own pyproject.toml and tests; install them editable
+      # so their imports resolve (a bare `-e <path>` installs no extras).
+      run: |
+        uv pip install -e ./vdb_benchmark
+        uv pip install -e ./kv_cache_benchmark
+
+    # 4. Run the four test suites separately.
+    # We can't collect them together: `tests/` and `vdb_benchmark/tests/`
+    # both have a top-level package named `tests` whose conftest.py modules
+    # collide under pytest's rootdir-relative import (ImportPathMismatchError).
+    # Each suite's pyproject defines its own `slow` marker and `-m 'not slow'`
+    # default, so subprocess-level invocation is correct.
+    - name: Run root test suite (tests/)
+      run: uv run pytest tests
+
+    - name: Run mlpstorage_py test suite
+      run: uv run pytest mlpstorage_py/tests
+
+    - name: Run vdb_benchmark test suite
+      run: uv run pytest vdb_benchmark/tests
+
+    - name: Run kv_cache_benchmark test suite
+      run: uv run pytest kv_cache_benchmark/tests

From 1e26fee0a4f5ba18cfe44be2c3b83a77271467de Mon Sep 17 00:00:00 2001
From: Curtis Anderson <curtis@mlcommons.org>
Date: Fri, 26 Jun 2026 17:13:09 -0700
Subject: [PATCH 4/4] chore: bump version 3.0.23 -> 3.0.25; regenerate uv.lock

PR #550 bumps 3.0.23 -> 3.0.24. This PR lands on top, so bump to
3.0.25 directly. uv.lock regenerated to reflect the new project
version (no dependency changes).
---
 pyproject.toml | 2 +-
 uv.lock        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a2f6b505..3b47387b 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "mlpstorage"
-version = "3.0.23"
+version = "3.0.25"
 description = "MLPerf Storage Benchmark Suite"
 readme = "README.md"
 license = {text = "Apache-2.0"}
diff --git a/uv.lock b/uv.lock
index 944e15f8..6e1ed50e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -518,7 +518,7 @@ wheels = [
 
 [[package]]
 name = "mlpstorage"
-version = "3.0.23"
+version = "3.0.25"
 source = { editable = "." }
 dependencies = [
     { name = "dlio-benchmark", marker = "sys_platform == 'linux'" },