From d2b3f4471c9ab6cfc7d82aff6fd6f3cc98a9a5ba Mon Sep 17 00:00:00 2001
From: lif <>
Date: Fri, 20 Mar 2026 06:17:15 +0000
Subject: [PATCH 1/2] phd-runner option to defer guest cleanup on failure

When `--manual-stop-on-failure` is passed, each propolis-server in failed
test cases is left running (if it hadn't been shut down by the test case
prior to failure explicitly), and its address is echoed to the operator
such that they can e.g. connect to its serial console to investigate or
debug whatever may have caused the test failure. The test suite pauses
until the instances left in this state are shut down manually, then
continues running further tests (unless interrupted).

Aside from convenience, this can be useful vs. reproducing test failures
with manually-reconstructed scenarios via a transcription of a phd-test's
instance spec and steps, which may have differences due to human-scale
timing of guest shell command invocations.
---
 phd-tests/framework/src/lib.rs         | 20 ++++++++-
 phd-tests/framework/src/test_vm/mod.rs | 61 +++++++++++++++++++++++++-
 phd-tests/runner/src/config.rs         | 10 +++++
 phd-tests/runner/src/execute.rs        | 19 +++++++-
 4 files changed, 106 insertions(+), 4 deletions(-)
diff --git a/phd-tests/framework/src/lib.rs b/phd-tests/framework/src/lib.rs
index 96cb2177b..12491eb75 100644
--- a/phd-tests/framework/src/lib.rs
+++ b/phd-tests/framework/src/lib.rs
@@ -43,7 +43,10 @@ use test_vm::{
     environment::EnvironmentSpec, spec::VmSpec, VmConfig, VmLocation,
 };
 use tokio::{
-    sync::mpsc::{UnboundedReceiver, UnboundedSender},
+    sync::{
+        mpsc::{UnboundedReceiver, UnboundedSender},
+        watch,
+    },
     task::JoinHandle,
 };
 
@@ -63,6 +66,8 @@ pub(crate) mod zfs;
 pub struct TestCtx {
     pub(crate) framework: Arc<Framework>,
     pub(crate) output_dir: Utf8PathBuf,
+    pub(crate) success_sigint_rx:
+        Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,
 }
 
 /// An instance of the PHD test framework.
@@ -239,6 +244,17 @@ impl TestCtx {
         )
         .await
     }
+
+    /// When phd-runner is configured to leave instances running on failed
+    /// tests, the watch channel whose Receiver is passed to this function is
+    /// used to indicate to the instance cleanup task that a test *has* failed.
+    pub fn set_cleanup_task_outcome_receiver(
+        &mut self,
+        success_rx: watch::Receiver<Option<bool>>,
+        sigint_rx: watch::Receiver<bool>,
+    ) {
+        self.success_sigint_rx = Some((success_rx, sigint_rx));
+    }
 }
 
 // The framework implementation includes some "runner-only" functions
@@ -330,7 +346,7 @@ impl Framework {
     pub fn test_ctx(self: &Arc<Self>, fully_qualified_name: String) -> TestCtx {
         let output_dir =
             self.tmp_directory.as_path().join(&fully_qualified_name);
-        TestCtx { framework: self.clone(), output_dir }
+        TestCtx { framework: self.clone(), output_dir, success_sigint_rx: None }
     }
 
     /// Resets the state of any stateful objects in the framework to prepare it
diff --git a/phd-tests/framework/src/test_vm/mod.rs b/phd-tests/framework/src/test_vm/mod.rs
index d9d8b1c64..dbd0b0465 100644
--- a/phd-tests/framework/src/test_vm/mod.rs
+++ b/phd-tests/framework/src/test_vm/mod.rs
@@ -43,7 +43,7 @@ use propolis_client::{
 use propolis_client::{Client, ResponseValue};
 use thiserror::Error;
 use tokio::{
-    sync::{mpsc::UnboundedSender, oneshot},
+    sync::{mpsc::UnboundedSender, oneshot, watch},
     task::JoinHandle,
     time::timeout,
 };
@@ -272,6 +272,13 @@ pub struct TestVm {
 
     state: VmState,
 
+    /// If we should wait for operator intervention before terminating this
+    /// instance, a `Some((success_rx, _))` here will be sent a `Some(false)`.
+    /// While waiting, a `Some((_, sigint_rx))` will be sent a `true` if the
+    /// user sends a keyboard interrupt to indicate we should stop waiting.
+    success_sigint_rx:
+        Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,
+
     /// Sending a task handle to this channel will ensure that the task runs to
     /// completion as part of the post-test cleanup fixture (i.e. before any
     /// other tests run).
@@ -328,6 +335,7 @@ impl TestVm {
                 environment.clone(),
                 params,
                 ctx.framework.cleanup_task_channel(),
+                ctx.success_sigint_rx.clone(),
             ),
         }
     }
@@ -338,6 +346,10 @@ impl TestVm {
         environment_spec: EnvironmentSpec,
         mut params: ServerProcessParameters,
         cleanup_task_tx: UnboundedSender<JoinHandle<()>>,
+        success_sigint_rx: Option<(
+            watch::Receiver<Option<bool>>,
+            watch::Receiver<bool>,
+        )>,
     ) -> Result<Self> {
         let metrics = environment_spec.metrics.as_ref().map(|m| match m {
             MetricsLocation::Local => {
@@ -372,6 +384,7 @@ impl TestVm {
             guest_os,
             state: VmState::New,
             cleanup_task_tx,
+            success_sigint_rx,
         })
     }
 
@@ -1134,6 +1147,9 @@ impl Drop for TestVm {
 
         let disks: Vec<_> = self.vm_spec().disk_handles.drain(..).collect();
 
+        let success_sigint_rx_opt = self.success_sigint_rx.take();
+        let name = self.vm_spec().vm_name.to_owned();
+
         // The order in which the task destroys objects is important: the server
         // can't be killed until the client has gotten a chance to shut down
         // the VM, and the disks can't be destroyed until the server process has
@@ -1144,6 +1160,49 @@ impl Drop for TestVm {
                 // kept alive until the server process is gone.
                 let _disks = disks;
 
+                // Check if we should let the user access the instance of a
+                // failed testcase before ensuring its demolition
+                if let Some((mut success_rx, sigint_rx)) = success_sigint_rx_opt {
+                    while success_rx.changed().await.is_ok() {
+                        match *success_rx.borrow() {
+                            None => continue,
+                            Some(true) => break,
+                            Some(false) => {}
+                        }
+                        let sock = server.server_addr();
+                        let ip = sock.ip();
+                        let port = sock.port();
+                        let mut uninformed = true;
+                        // States that might be worth inspecting out-of-band
+                        while let Ok(InstanceState::Running
+                            | InstanceState::Migrating
+                            | InstanceState::Rebooting
+                            | InstanceState::Repairing
+                        ) = client
+                            .instance_get()
+                            .send()
+                            .await
+                            .map(|inst| inst.instance.state)
+                        {
+                            if *sigint_rx.borrow() { break }
+                            if uninformed {
+                                // write to stderr irrespective of log level
+                                eprintln!(r#"
+propolis-server {name:?} left running at address {sock}
+phd-runner will resume when this instance is shut down; e.g. by one of:
+
+$ propolis-cli -s {ip} -p {port} serial
+localhost:~# poweroff
+
+$ propolis-cli -s {ip} -p {port} state stop
+"#);
+                                uninformed = false;
+                            }
+                            tokio::time::sleep(Duration::from_secs(1)).await;
+                        }
+                    }
+                }
+
                 // Try to make sure the server's kernel VMM is cleaned up before
                 // killing the server process. This is best-effort; if it fails,
                 // the kernel VMM is leaked. This generally indicates a bug in
diff --git a/phd-tests/runner/src/config.rs b/phd-tests/runner/src/config.rs
index 313d389d3..88fe66617 100644
--- a/phd-tests/runner/src/config.rs
+++ b/phd-tests/runner/src/config.rs
@@ -202,6 +202,16 @@ pub struct RunOptions {
     // number of seconds, but i'm lazy...
     #[clap(long, value_parser, default_value_t = 60 * 20)]
     pub max_buildomat_wait_secs: u64,
+
+    /// When a testcase fails while this is enabled, any instances started by
+    /// the failed test are left running and phd-runner waits until they are
+    /// manually shut down out-of-band by the operator.
+    ///
+    /// This feature is intended to give the operator a chance to inspect the
+    /// state of the guest(s) easily without necessarily having to reconstruct
+    /// the scenario by hand.
+    #[clap(long, value_parser)]
+    pub manual_stop_on_failure: bool,
 }
 
 #[derive(Args, Debug)]
diff --git a/phd-tests/runner/src/execute.rs b/phd-tests/runner/src/execute.rs
index f7d878c39..978de8080 100644
--- a/phd-tests/runner/src/execute.rs
+++ b/phd-tests/runner/src/execute.rs
@@ -101,8 +101,18 @@ pub async fn run_tests_with_ctx(
         let framework = ctx.clone();
         let tc = execution.tc;
         let mut sigint_rx_task = sigint_rx.clone();
+        let mut test_ctx = framework.test_ctx(tc.fully_qualified_name());
+        let mut success_tx = None;
+        if run_opts.manual_stop_on_failure {
+            let sigint_rx_cleanup = sigint_rx.clone();
+            let (tx, success_rx) = watch::channel(None);
+            test_ctx.set_cleanup_task_outcome_receiver(
+                success_rx,
+                sigint_rx_cleanup,
+            );
+            success_tx = Some(tx);
+        }
         let test_outcome = tokio::spawn(async move {
-            let test_ctx = framework.test_ctx(tc.fully_qualified_name());
             tokio::select! {
                 // Ensure interrupt signals are always handled instead of
                 // continuing to run the test.
@@ -144,6 +154,13 @@ pub async fn run_tests_with_ctx(
             }
         );
 
+        if let Some(tx) = success_tx {
+            let succeeded = !matches!(&test_outcome, TestOutcome::Failed(_));
+            if let Err(e) = tx.send(Some(succeeded)) {
+                error!("Error sending outcome to instance cleanup tasks: {e}");
+            }
+        }
+
         match test_outcome {
             TestOutcome::Passed => stats.tests_passed += 1,
             TestOutcome::Failed(_) => {

From e2470bd1e23429ed6c73c40acc4c122750d03a43 Mon Sep 17 00:00:00 2001
From: lif <>
Date: Thu, 26 Mar 2026 04:23:16 +0000
Subject: [PATCH 2/2] PR feedback, thanks ixi

---
 phd-tests/framework/src/lib.rs         |  18 ++--
 phd-tests/framework/src/test_vm/mod.rs | 132 +++++++++++++++----------
 phd-tests/runner/src/execute.rs        |  13 ++-
 3 files changed, 93 insertions(+), 70 deletions(-)

diff --git a/phd-tests/framework/src/lib.rs b/phd-tests/framework/src/lib.rs
index 12491eb75..69257f810 100644
--- a/phd-tests/framework/src/lib.rs
+++ b/phd-tests/framework/src/lib.rs
@@ -40,13 +40,11 @@ use log_config::LogConfig;
 use port_allocator::PortAllocator;
 pub use test_vm::TestVm;
 use test_vm::{
-    environment::EnvironmentSpec, spec::VmSpec, VmConfig, VmLocation,
+    environment::EnvironmentSpec, spec::VmSpec, TestVmManualStop, VmConfig,
+    VmLocation,
 };
 use tokio::{
-    sync::{
-        mpsc::{UnboundedReceiver, UnboundedSender},
-        watch,
-    },
+    sync::mpsc::{UnboundedReceiver, UnboundedSender},
     task::JoinHandle,
 };
 
@@ -66,8 +64,7 @@ pub(crate) mod zfs;
 pub struct TestCtx {
     pub(crate) framework: Arc<Framework>,
     pub(crate) output_dir: Utf8PathBuf,
-    pub(crate) success_sigint_rx:
-        Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,
+    pub(crate) manual_stop: Option<TestVmManualStop>,
 }
 
 /// An instance of the PHD test framework.
@@ -250,10 +247,9 @@ impl TestCtx {
     /// used to indicate to the instance cleanup task that a test *has* failed.
     pub fn set_cleanup_task_outcome_receiver(
         &mut self,
-        success_rx: watch::Receiver<Option<bool>>,
-        sigint_rx: watch::Receiver<bool>,
+        manual_stop: TestVmManualStop,
     ) {
-        self.success_sigint_rx = Some((success_rx, sigint_rx));
+        self.manual_stop = Some(manual_stop);
     }
 }
 
@@ -346,7 +342,7 @@ impl Framework {
     pub fn test_ctx(self: &Arc<Self>, fully_qualified_name: String) -> TestCtx {
         let output_dir =
             self.tmp_directory.as_path().join(&fully_qualified_name);
-        TestCtx { framework: self.clone(), output_dir, success_sigint_rx: None }
+        TestCtx { framework: self.clone(), output_dir, manual_stop: None }
     }
 
     /// Resets the state of any stateful objects in the framework to prepare it
diff --git a/phd-tests/framework/src/test_vm/mod.rs b/phd-tests/framework/src/test_vm/mod.rs
index dbd0b0465..3f580e6da 100644
--- a/phd-tests/framework/src/test_vm/mod.rs
+++ b/phd-tests/framework/src/test_vm/mod.rs
@@ -273,11 +273,8 @@ pub struct TestVm {
     state: VmState,
 
     /// If we should wait for operator intervention before terminating this
-    /// instance, a `Some((success_rx, _))` here will be sent a `Some(false)`.
-    /// While waiting, a `Some((_, sigint_rx))` will be sent a `true` if the
-    /// user sends a keyboard interrupt to indicate we should stop waiting.
-    success_sigint_rx:
-        Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,
+    /// instance, this will be populated.
+    manual_stop: Option<TestVmManualStop>,
 
     /// Sending a task handle to this channel will ensure that the task runs to
     /// completion as part of the post-test cleanup fixture (i.e. before any
@@ -335,7 +332,7 @@ impl TestVm {
                 environment.clone(),
                 params,
                 ctx.framework.cleanup_task_channel(),
-                ctx.success_sigint_rx.clone(),
+                ctx.manual_stop.clone(),
             ),
         }
     }
@@ -346,10 +343,7 @@ impl TestVm {
         environment_spec: EnvironmentSpec,
         mut params: ServerProcessParameters,
         cleanup_task_tx: UnboundedSender<JoinHandle<()>>,
-        success_sigint_rx: Option<(
-            watch::Receiver<Option<bool>>,
-            watch::Receiver<bool>,
-        )>,
+        manual_stop: Option<TestVmManualStop>,
     ) -> Result<Self> {
         let metrics = environment_spec.metrics.as_ref().map(|m| match m {
             MetricsLocation::Local => {
@@ -384,7 +378,7 @@ impl TestVm {
             guest_os,
             state: VmState::New,
             cleanup_task_tx,
-            success_sigint_rx,
+            manual_stop,
         })
     }
 
@@ -1147,8 +1141,8 @@ impl Drop for TestVm {
 
         let disks: Vec<_> = self.vm_spec().disk_handles.drain(..).collect();
 
-        let success_sigint_rx_opt = self.success_sigint_rx.take();
-        let name = self.vm_spec().vm_name.to_owned();
+        let manual_stop_opt = self.manual_stop.take();
+        let vm_name = self.vm_spec().vm_name.to_owned();
 
         // The order in which the task destroys objects is important: the server
         // can't be killed until the client has gotten a chance to shut down
@@ -1162,45 +1156,8 @@ impl Drop for TestVm {
 
                 // Check if we should let the user access the instance of a
                 // failed testcase before ensuring its demolition
-                if let Some((mut success_rx, sigint_rx)) = success_sigint_rx_opt {
-                    while success_rx.changed().await.is_ok() {
-                        match *success_rx.borrow() {
-                            None => continue,
-                            Some(true) => break,
-                            Some(false) => {}
-                        }
-                        let sock = server.server_addr();
-                        let ip = sock.ip();
-                        let port = sock.port();
-                        let mut uninformed = true;
-                        // States that might be worth inspecting out-of-band
-                        while let Ok(InstanceState::Running
-                            | InstanceState::Migrating
-                            | InstanceState::Rebooting
-                            | InstanceState::Repairing
-                        ) = client
-                            .instance_get()
-                            .send()
-                            .await
-                            .map(|inst| inst.instance.state)
-                        {
-                            if *sigint_rx.borrow() { break }
-                            if uninformed {
-                                // write to stderr irrespective of log level
-                                eprintln!(r#"
-propolis-server {name:?} left running at address {sock}
-phd-runner will resume when this instance is shut down; e.g. by one of:
-
-$ propolis-cli -s {ip} -p {port} serial
-localhost:~# poweroff
-
-$ propolis-cli -s {ip} -p {port} state stop
-"#);
-                                uninformed = false;
-                            }
-                            tokio::time::sleep(Duration::from_secs(1)).await;
-                        }
-                    }
+                if let Some(manual_stop) = manual_stop_opt {
+                    manual_stop.wait_for_stop(vm_name, &client, &server).await;
                 }
 
                 // Try to make sure the server's kernel VMM is cleaned up before
@@ -1292,3 +1249,74 @@ async fn try_ensure_vm_destroyed(client: &Client) {
         error!(%error, "VM not destroyed after 5 seconds");
     }
 }
+
+/// For waiting for instances of failed testcases to be manually shut down,
+/// when phd-runner is invoked with --manual-stop-on-failure
+#[derive(Clone)]
+pub struct TestVmManualStop {
+    test_name: String,
+    /// If we should wait for operator intervention before terminating this
+    /// instance, this will be sent a `Some(false)`.
+    success_rx: watch::Receiver<Option<bool>>,
+    /// While waiting for instance shutdown, this may be sent a `true` if the
+    /// user sends a keyboard interrupt to indicate we should stop waiting.
+    sigint_rx: watch::Receiver<bool>,
+}
+impl TestVmManualStop {
+    pub fn new(
+        test_name: String,
+        success_rx: watch::Receiver<Option<bool>>,
+        sigint_rx: watch::Receiver<bool>,
+    ) -> Self {
+        Self { test_name, success_rx, sigint_rx }
+    }
+    async fn wait_for_stop(
+        mut self,
+        vm_name: String,
+        client: &Client,
+        server: &server::PropolisServer,
+    ) {
+        if self.success_rx.changed().await.is_ok() {
+            let success_opt = *self.success_rx.borrow();
+            if let Some(false) = success_opt {
+                let sock = server.server_addr();
+                let ip = sock.ip();
+                let port = sock.port();
+                let test_name = self.test_name;
+                let mut uninformed = true;
+                // States that might be worth inspecting out-of-band
+                while let Ok(
+                    InstanceState::Running
+                    | InstanceState::Migrating
+                    | InstanceState::Rebooting
+                    | InstanceState::Repairing,
+                ) = client
+                    .instance_get()
+                    .send()
+                    .await
+                    .map(|inst| inst.instance.state)
+                {
+                    if *self.sigint_rx.borrow() {
+                        break;
+                    }
+                    if uninformed {
+                        error!(
+                            r#"
+test {test_name:?} failed. propolis-server {vm_name:?} was left running,
+with API accessible at http://{sock}
+phd-runner will resume when this instance is shut down; e.g. by one of:
+
+$ propolis-cli -s {ip} -p {port} serial
+localhost:~# poweroff
+
+$ propolis-cli -s {ip} -p {port} state stop
+"#
+                        );
+                        uninformed = false;
+                    }
+                    tokio::time::sleep(Duration::from_secs(1)).await;
+                }
+            }
+        }
+    }
+}
diff --git a/phd-tests/runner/src/execute.rs b/phd-tests/runner/src/execute.rs
index 978de8080..ed9de04df 100644
--- a/phd-tests/runner/src/execute.rs
+++ b/phd-tests/runner/src/execute.rs
@@ -5,6 +5,7 @@
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 
+use phd_framework::test_vm::TestVmManualStop;
 use phd_tests::phd_testcase::{Framework, TestCase, TestOutcome};
 use tokio::signal::unix::{signal, SignalKind};
 use tokio::sync::watch;
@@ -104,12 +105,12 @@ pub async fn run_tests_with_ctx(
         let mut test_ctx = framework.test_ctx(tc.fully_qualified_name());
         let mut success_tx = None;
         if run_opts.manual_stop_on_failure {
-            let sigint_rx_cleanup = sigint_rx.clone();
             let (tx, success_rx) = watch::channel(None);
-            test_ctx.set_cleanup_task_outcome_receiver(
+            test_ctx.set_cleanup_task_outcome_receiver(TestVmManualStop::new(
+                tc.fully_qualified_name(),
                 success_rx,
-                sigint_rx_cleanup,
-            );
+                sigint_rx.clone(),
+            ));
             success_tx = Some(tx);
         }
         let test_outcome = tokio::spawn(async move {
@@ -156,9 +157,7 @@ pub async fn run_tests_with_ctx(
 
         if let Some(tx) = success_tx {
             let succeeded = !matches!(&test_outcome, TestOutcome::Failed(_));
-            if let Err(e) = tx.send(Some(succeeded)) {
-                error!("Error sending outcome to instance cleanup tasks: {e}");
-            }
+            let _: Result<_, _> = tx.send(Some(succeeded));
         }
 
         match test_outcome {