Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions phd-tests/framework/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ use test_vm::{
environment::EnvironmentSpec, spec::VmSpec, VmConfig, VmLocation,
};
use tokio::{
sync::mpsc::{UnboundedReceiver, UnboundedSender},
sync::{
mpsc::{UnboundedReceiver, UnboundedSender},
watch,
},
task::JoinHandle,
};

Expand All @@ -63,6 +66,8 @@ pub(crate) mod zfs;
pub struct TestCtx {
/// Shared handle to the framework state this test runs against.
pub(crate) framework: Arc<Framework>,
/// Per-test output directory (the framework's tmp directory joined with
/// the test's fully qualified name — see `Framework::test_ctx`).
pub(crate) output_dir: Utf8PathBuf,
/// Present only when phd-runner is configured to leave failed tests'
/// instances running: `(success_rx, sigint_rx)` — a receiver observing
/// the test outcome and a receiver observing whether the user sent a
/// keyboard interrupt. See `set_cleanup_task_outcome_receiver`.
pub(crate) success_sigint_rx:
Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,
}

/// An instance of the PHD test framework.
Expand Down Expand Up @@ -239,6 +244,17 @@ impl TestCtx {
)
.await
}

/// When phd-runner is configured to leave instances running on failed
/// tests, registers the pair of watch receivers the instance cleanup
/// path uses to decide whether to keep a VM alive: `success_rx`
/// observes the test outcome (`Some(true)` on success, `Some(false)`
/// on failure; `None` until the test finishes), and `sigint_rx`
/// observes whether the user has sent a keyboard interrupt to stop
/// waiting.
pub fn set_cleanup_task_outcome_receiver(
    &mut self,
    success_rx: watch::Receiver<Option<bool>>,
    sigint_rx: watch::Receiver<bool>,
) {
    self.success_sigint_rx = Some((success_rx, sigint_rx));
}
}

// The framework implementation includes some "runner-only" functions
Expand Down Expand Up @@ -330,7 +346,7 @@ impl Framework {
pub fn test_ctx(self: &Arc<Self>, fully_qualified_name: String) -> TestCtx {
let output_dir =
self.tmp_directory.as_path().join(&fully_qualified_name);
TestCtx { framework: self.clone(), output_dir }
TestCtx { framework: self.clone(), output_dir, success_sigint_rx: None }
}

/// Resets the state of any stateful objects in the framework to prepare it
Expand Down
61 changes: 60 additions & 1 deletion phd-tests/framework/src/test_vm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ use propolis_client::{
use propolis_client::{Client, ResponseValue};
use thiserror::Error;
use tokio::{
sync::{mpsc::UnboundedSender, oneshot},
sync::{mpsc::UnboundedSender, oneshot, watch},
task::JoinHandle,
time::timeout,
};
Expand Down Expand Up @@ -272,6 +272,13 @@ pub struct TestVm {

state: VmState,

/// If we should wait for operator intervention before terminating this
/// instance, the `success_rx` half of this pair observes a `Some(false)`
/// when the test fails. While waiting, the `sigint_rx` half observes
/// `true` if the user sends a keyboard interrupt to indicate we should
/// stop waiting.
success_sigint_rx:
Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,

/// Sending a task handle to this channel will ensure that the task runs to
/// completion as part of the post-test cleanup fixture (i.e. before any
/// other tests run).
Expand Down Expand Up @@ -328,6 +335,7 @@ impl TestVm {
environment.clone(),
params,
ctx.framework.cleanup_task_channel(),
ctx.success_sigint_rx.clone(),
),
}
}
Expand All @@ -338,6 +346,10 @@ impl TestVm {
environment_spec: EnvironmentSpec,
mut params: ServerProcessParameters,
cleanup_task_tx: UnboundedSender<JoinHandle<()>>,
success_sigint_rx: Option<(
watch::Receiver<Option<bool>>,
watch::Receiver<bool>,
)>,
) -> Result<Self> {
let metrics = environment_spec.metrics.as_ref().map(|m| match m {
MetricsLocation::Local => {
Expand Down Expand Up @@ -372,6 +384,7 @@ impl TestVm {
guest_os,
state: VmState::New,
cleanup_task_tx,
success_sigint_rx,
})
}

Expand Down Expand Up @@ -1134,6 +1147,9 @@ impl Drop for TestVm {

let disks: Vec<_> = self.vm_spec().disk_handles.drain(..).collect();

let success_sigint_rx_opt = self.success_sigint_rx.take();
let name = self.vm_spec().vm_name.to_owned();

// The order in which the task destroys objects is important: the server
// can't be killed until the client has gotten a chance to shut down
// the VM, and the disks can't be destroyed until the server process has
Expand All @@ -1144,6 +1160,49 @@ impl Drop for TestVm {
// kept alive until the server process is gone.
let _disks = disks;

// Check whether we should let the user access the instance of a
// failed testcase before ensuring its demolition.
if let Some((mut success_rx, sigint_rx)) = success_sigint_rx_opt {
Comment thread
lifning marked this conversation as resolved.
Outdated
while success_rx.changed().await.is_ok() {
match *success_rx.borrow() {
None => continue,
Some(true) => break,
Some(false) => {}
}
Comment thread
lifning marked this conversation as resolved.
Outdated
let sock = server.server_addr();
let ip = sock.ip();
let port = sock.port();
let mut uninformed = true;
// States that might be worth inspecting out-of-band
while let Ok(InstanceState::Running
| InstanceState::Migrating
| InstanceState::Rebooting
| InstanceState::Repairing
) = client
.instance_get()
.send()
.await
.map(|inst| inst.instance.state)
{
if *sigint_rx.borrow() { break }
if uninformed {
// write to stderr irrespective of log level
eprintln!(r#"
propolis-server {name:?} left running at address {sock}
Comment thread
lifning marked this conversation as resolved.
Outdated
phd-runner will resume when this instance is shut down; e.g. by one of:

$ propolis-cli -s {ip} -p {port} serial
localhost:~# poweroff

$ propolis-cli -s {ip} -p {port} state stop
"#);
Comment thread
lifning marked this conversation as resolved.
Outdated
uninformed = false;
}
tokio::time::sleep(Duration::from_secs(1)).await;
}
}
}

// Try to make sure the server's kernel VMM is cleaned up before
// killing the server process. This is best-effort; if it fails,
// the kernel VMM is leaked. This generally indicates a bug in
Expand Down
10 changes: 10 additions & 0 deletions phd-tests/runner/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,16 @@ pub struct RunOptions {
// number of seconds, but i'm lazy...
#[clap(long, value_parser, default_value_t = 60 * 20)]
pub max_buildomat_wait_secs: u64,

/// When a testcase fails while this is enabled, any instances started by
/// the failed test are left running and phd-runner waits until they are
/// manually shut down out-of-band by the operator.
///
/// This feature is intended to give the operator a chance to inspect the
/// state of the guest(s) easily without necessarily having to reconstruct
/// the scenario by hand.
#[clap(long, value_parser)]
pub manual_stop_on_failure: bool,
}

#[derive(Args, Debug)]
Expand Down
19 changes: 18 additions & 1 deletion phd-tests/runner/src/execute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,18 @@ pub async fn run_tests_with_ctx(
let framework = ctx.clone();
let tc = execution.tc;
let mut sigint_rx_task = sigint_rx.clone();
let mut test_ctx = framework.test_ctx(tc.fully_qualified_name());
let mut success_tx = None;
if run_opts.manual_stop_on_failure {
let sigint_rx_cleanup = sigint_rx.clone();
let (tx, success_rx) = watch::channel(None);
test_ctx.set_cleanup_task_outcome_receiver(
success_rx,
sigint_rx_cleanup,
);
success_tx = Some(tx);
}
let test_outcome = tokio::spawn(async move {
let test_ctx = framework.test_ctx(tc.fully_qualified_name());
tokio::select! {
// Ensure interrupt signals are always handled instead of
// continuing to run the test.
Expand Down Expand Up @@ -144,6 +154,13 @@ pub async fn run_tests_with_ctx(
}
);

if let Some(tx) = success_tx {
let succeeded = !matches!(&test_outcome, TestOutcome::Failed(_));
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(also if you clone the outcome and send that along it instead of just failed-or-not, might be nice to have "test failed because ..." as part of the prelude to a particular VM's informational blurb? might get too wordy. also not really attached to this idea as much as it'd be nice to distinguish the receivers versus the "is Option<bool> the test status, or is that the bool, hmm")

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in my experience the cause of the failure is usually right above this message in the log, so i didn't feel a particular pull to pass it along, but i definitely wouldn't be opposed if you can think of a case for it

if let Err(e) = tx.send(Some(succeeded)) {
error!("Error sending outcome to instance cleanup tasks: {e}");
Comment thread
lifning marked this conversation as resolved.
Outdated
}
}

match test_outcome {
TestOutcome::Passed => stats.tests_passed += 1,
TestOutcome::Failed(_) => {
Expand Down
Loading