Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions phd-tests/framework/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ use log_config::LogConfig;
use port_allocator::PortAllocator;
pub use test_vm::TestVm;
use test_vm::{
environment::EnvironmentSpec, spec::VmSpec, VmConfig, VmLocation,
environment::EnvironmentSpec, spec::VmSpec, TestVmManualStop, VmConfig,
VmLocation,
};
use tokio::{
sync::mpsc::{UnboundedReceiver, UnboundedSender},
Expand All @@ -63,6 +64,7 @@ pub(crate) mod zfs;
/// Per-test context handed to each PHD test case: a shared handle to the
/// framework, the test's private output directory, and an optional hook for
/// holding failed tests' VMs open for manual inspection.
pub struct TestCtx {
    /// Shared handle to the framework that created this context.
    pub(crate) framework: Arc<Framework>,
    /// Directory for this test's artifacts and logs, derived from the test's
    /// fully-qualified name under the framework's temporary directory.
    pub(crate) output_dir: Utf8PathBuf,
    /// When phd-runner is configured to leave instances of failed tests
    /// running (`--manual-stop-on-failure`), this carries the channels used
    /// to learn the test's outcome and to wait for an operator-driven
    /// shutdown; `None` otherwise.
    pub(crate) manual_stop: Option<TestVmManualStop>,
}

/// An instance of the PHD test framework.
Expand Down Expand Up @@ -239,6 +241,16 @@ impl TestCtx {
)
.await
}

/// Installs the manual-stop hook for this test.
///
/// When phd-runner is configured to leave instances running on failed tests
/// (`--manual-stop-on-failure`), the runner constructs a [`TestVmManualStop`]
/// — which wraps the watch receiver that will be told whether the test
/// failed — and registers it here. VMs created from this context will then
/// wait for the operator to shut the instance down before tearing it down
/// when the test fails.
pub fn set_cleanup_task_outcome_receiver(
    &mut self,
    manual_stop: TestVmManualStop,
) {
    self.manual_stop = Some(manual_stop);
}
}

// The framework implementation includes some "runner-only" functions
Expand Down Expand Up @@ -330,7 +342,7 @@ impl Framework {
/// Builds a [`TestCtx`] for the test with the given fully-qualified name.
///
/// The context gets its own output directory (named after the test) under
/// the framework's temporary directory and starts with no manual-stop hook
/// installed; the runner may add one later via
/// `set_cleanup_task_outcome_receiver`.
pub fn test_ctx(self: &Arc<Self>, fully_qualified_name: String) -> TestCtx {
    TestCtx {
        framework: Arc::clone(self),
        output_dir: self.tmp_directory.as_path().join(&fully_qualified_name),
        manual_stop: None,
    }
}

/// Resets the state of any stateful objects in the framework to prepare it
Expand Down
89 changes: 88 additions & 1 deletion phd-tests/framework/src/test_vm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ use propolis_client::{
use propolis_client::{Client, ResponseValue};
use thiserror::Error;
use tokio::{
sync::{mpsc::UnboundedSender, oneshot},
sync::{mpsc::UnboundedSender, oneshot, watch},
task::JoinHandle,
time::timeout,
};
Expand Down Expand Up @@ -272,6 +272,10 @@ pub struct TestVm {

state: VmState,

/// If we should wait for operator intervention before terminating this
/// instance, this will be populated.
manual_stop: Option<TestVmManualStop>,

/// Sending a task handle to this channel will ensure that the task runs to
/// completion as part of the post-test cleanup fixture (i.e. before any
/// other tests run).
Expand Down Expand Up @@ -328,6 +332,7 @@ impl TestVm {
environment.clone(),
params,
ctx.framework.cleanup_task_channel(),
ctx.manual_stop.clone(),
),
}
}
Expand All @@ -338,6 +343,7 @@ impl TestVm {
environment_spec: EnvironmentSpec,
mut params: ServerProcessParameters,
cleanup_task_tx: UnboundedSender<JoinHandle<()>>,
manual_stop: Option<TestVmManualStop>,
) -> Result<Self> {
let metrics = environment_spec.metrics.as_ref().map(|m| match m {
MetricsLocation::Local => {
Expand Down Expand Up @@ -372,6 +378,7 @@ impl TestVm {
guest_os,
state: VmState::New,
cleanup_task_tx,
manual_stop,
})
}

Expand Down Expand Up @@ -1134,6 +1141,9 @@ impl Drop for TestVm {

let disks: Vec<_> = self.vm_spec().disk_handles.drain(..).collect();

let manual_stop_opt = self.manual_stop.take();
let vm_name = self.vm_spec().vm_name.to_owned();

// The order in which the task destroys objects is important: the server
// can't be killed until the client has gotten a chance to shut down
// the VM, and the disks can't be destroyed until the server process has
Expand All @@ -1144,6 +1154,12 @@ impl Drop for TestVm {
// kept alive until the server process is gone.
let _disks = disks;

// Check if we should let the user access the instance of a
// failed testcase before ensuring its demolition
if let Some(manual_stop) = manual_stop_opt {
manual_stop.wait_for_stop(vm_name, &client, &server).await;
}

// Try to make sure the server's kernel VMM is cleaned up before
// killing the server process. This is best-effort; if it fails,
// the kernel VMM is leaked. This generally indicates a bug in
Expand Down Expand Up @@ -1233,3 +1249,74 @@ async fn try_ensure_vm_destroyed(client: &Client) {
error!(%error, "VM not destroyed after 5 seconds");
}
}

/// For waiting for instances of failed testcases to be manually shut down,
/// when phd-runner is invoked with --manual-stop-on-failure
#[derive(Clone)]
pub struct TestVmManualStop {
    /// Fully-qualified name of the test this hook belongs to; quoted in the
    /// operator-facing message printed while waiting.
    test_name: String,
    /// Receives the test's outcome: starts as `None`; the runner sends
    /// `Some(false)` if the test failed (triggering the manual-stop wait)
    /// and `Some(true)` if it passed.
    success_rx: watch::Receiver<Option<bool>>,
    /// While waiting for instance shutdown, this may be sent a `true` if the
    /// user sends a keyboard interrupt to indicate we should stop waiting.
    sigint_rx: watch::Receiver<bool>,
}
impl TestVmManualStop {
    /// Bundles the test's name with the outcome and SIGINT watch receivers.
    pub fn new(
        test_name: String,
        success_rx: watch::Receiver<Option<bool>>,
        sigint_rx: watch::Receiver<bool>,
    ) -> Self {
        Self { test_name, success_rx, sigint_rx }
    }

    /// Holds up VM teardown until the instance is shut down out-of-band.
    ///
    /// Waits for the test outcome to arrive on `success_rx`. If the test
    /// failed (`Some(false)`), prints connection instructions once and then
    /// polls the server's instance state at 1 Hz, returning when the
    /// instance leaves a running-like state, the state query fails, or a
    /// keyboard interrupt is observed on `sigint_rx`.
    async fn wait_for_stop(
        mut self,
        vm_name: String,
        client: &Client,
        server: &server::PropolisServer,
    ) {
        // If the sender half was dropped without reporting an outcome
        // (e.g. the runner is shutting down), skip waiting entirely.
        if self.success_rx.changed().await.is_ok() {
            let success_opt = *self.success_rx.borrow();
            if let Some(false) = success_opt {
                let sock = server.server_addr();
                let ip = sock.ip();
                let port = sock.port();
                let test_name = self.test_name;
                // Print the instructions only on the first loop pass.
                let mut uninformed = true;
                // States that might be worth inspecting out-of-band
                while let Ok(
                    InstanceState::Running
                    | InstanceState::Migrating
                    | InstanceState::Rebooting
                    | InstanceState::Repairing,
                ) = client
                    .instance_get()
                    .send()
                    .await
                    .map(|inst| inst.instance.state)
                {
                    // A keyboard interrupt means the operator wants the
                    // runner to stop waiting and proceed with teardown.
                    if *self.sigint_rx.borrow() {
                        break;
                    }
                    if uninformed {
                        error!(
                            r#"
test {test_name:?} failed. propolis-server {vm_name:?} was left running,
with API accessible at http://{sock}
phd-runner will resume when this instance is shut down; e.g. by one of:

$ propolis-cli -s {ip} -p {port} serial
localhost:~# poweroff

$ propolis-cli -s {ip} -p {port} state stop
"#
                        );
                        uninformed = false;
                    }
                    // Poll at one-second intervals; cheap, and responsive
                    // enough for an interactive debugging session.
                    tokio::time::sleep(Duration::from_secs(1)).await;
                }
            }
        }
    }
}
10 changes: 10 additions & 0 deletions phd-tests/runner/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,16 @@ pub struct RunOptions {
// number of seconds, but i'm lazy...
#[clap(long, value_parser, default_value_t = 60 * 20)]
pub max_buildomat_wait_secs: u64,

/// When a testcase fails while this is enabled, any instances started by
/// the failed test are left running and phd-runner waits until they are
/// manually shut down out-of-band by the operator.
///
/// This feature is intended to give the operator a chance to inspect the
/// state of the guest(s) easily without necessarily having to reconstruct
/// the scenario by hand.
#[clap(long, value_parser)]
pub manual_stop_on_failure: bool,
}

#[derive(Args, Debug)]
Expand Down
18 changes: 17 additions & 1 deletion phd-tests/runner/src/execute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use std::sync::Arc;
use std::time::{Duration, Instant};

use phd_framework::test_vm::TestVmManualStop;
use phd_tests::phd_testcase::{Framework, TestCase, TestOutcome};
use tokio::signal::unix::{signal, SignalKind};
use tokio::sync::watch;
Expand Down Expand Up @@ -101,8 +102,18 @@ pub async fn run_tests_with_ctx(
let framework = ctx.clone();
let tc = execution.tc;
let mut sigint_rx_task = sigint_rx.clone();
let mut test_ctx = framework.test_ctx(tc.fully_qualified_name());
let mut success_tx = None;
if run_opts.manual_stop_on_failure {
let (tx, success_rx) = watch::channel(None);
test_ctx.set_cleanup_task_outcome_receiver(TestVmManualStop::new(
tc.fully_qualified_name(),
success_rx,
sigint_rx.clone(),
));
success_tx = Some(tx);
}
let test_outcome = tokio::spawn(async move {
let test_ctx = framework.test_ctx(tc.fully_qualified_name());
tokio::select! {
// Ensure interrupt signals are always handled instead of
// continuing to run the test.
Expand Down Expand Up @@ -144,6 +155,11 @@ pub async fn run_tests_with_ctx(
}
);

if let Some(tx) = success_tx {
let succeeded = !matches!(&test_outcome, TestOutcome::Failed(_));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(also, if you clone the outcome and send that along instead of just a failed-or-not flag, it might be nice to have "test failed because ..." as part of the prelude to a particular VM's informational blurb? might get too wordy. also not really attached to this idea as much as it'd be nice to distinguish the receivers versus the "is Option<bool> the test status, or is that the bool, hmm")

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in my experience the cause of the failure is usually right above this message in the log, so i didn't feel a particular pull to pass it along, but i definitely wouldn't be opposed if you can think of a case for it

let _: Result<_, _> = tx.send(Some(succeeded));
}

match test_outcome {
TestOutcome::Passed => stats.tests_passed += 1,
TestOutcome::Failed(_) => {
Expand Down
Loading