diff --git a/phd-tests/framework/src/lib.rs b/phd-tests/framework/src/lib.rs index 96cb2177b..69257f810 100644 --- a/phd-tests/framework/src/lib.rs +++ b/phd-tests/framework/src/lib.rs @@ -40,7 +40,8 @@ use log_config::LogConfig; use port_allocator::PortAllocator; pub use test_vm::TestVm; use test_vm::{ - environment::EnvironmentSpec, spec::VmSpec, VmConfig, VmLocation, + environment::EnvironmentSpec, spec::VmSpec, TestVmManualStop, VmConfig, + VmLocation, }; use tokio::{ sync::mpsc::{UnboundedReceiver, UnboundedSender}, @@ -63,6 +64,7 @@ pub(crate) mod zfs; pub struct TestCtx { pub(crate) framework: Arc, pub(crate) output_dir: Utf8PathBuf, + pub(crate) manual_stop: Option, } /// An instance of the PHD test framework. @@ -239,6 +241,16 @@ impl TestCtx { ) .await } + + /// When phd-runner is configured to leave instances running on failed + /// tests, the watch channel whose Receiver is passed to this function is + /// used to indicate to the instance cleanup task that a test *has* failed. + pub fn set_cleanup_task_outcome_receiver( + &mut self, + manual_stop: TestVmManualStop, + ) { + self.manual_stop = Some(manual_stop); + } } // The framework implementation includes some "runner-only" functions @@ -330,7 +342,7 @@ impl Framework { pub fn test_ctx(self: &Arc, fully_qualified_name: String) -> TestCtx { let output_dir = self.tmp_directory.as_path().join(&fully_qualified_name); - TestCtx { framework: self.clone(), output_dir } + TestCtx { framework: self.clone(), output_dir, manual_stop: None } } /// Resets the state of any stateful objects in the framework to prepare it diff --git a/phd-tests/framework/src/test_vm/mod.rs b/phd-tests/framework/src/test_vm/mod.rs index d9d8b1c64..3f580e6da 100644 --- a/phd-tests/framework/src/test_vm/mod.rs +++ b/phd-tests/framework/src/test_vm/mod.rs @@ -43,7 +43,7 @@ use propolis_client::{ use propolis_client::{Client, ResponseValue}; use thiserror::Error; use tokio::{ - sync::{mpsc::UnboundedSender, oneshot}, + sync::{mpsc::UnboundedSender, oneshot, watch}, task::JoinHandle, time::timeout, }; @@ -272,6 +272,10 @@ pub struct TestVm { state: VmState, + /// If we should wait for operator intervention before terminating this + /// instance, this will be populated. + manual_stop: Option, + /// Sending a task handle to this channel will ensure that the task runs to /// completion as part of the post-test cleanup fixture (i.e. before any /// other tests run). @@ -328,6 +332,7 @@ impl TestVm { environment.clone(), params, ctx.framework.cleanup_task_channel(), + ctx.manual_stop.clone(), ), } } @@ -338,6 +343,7 @@ impl TestVm { environment_spec: EnvironmentSpec, mut params: ServerProcessParameters, cleanup_task_tx: UnboundedSender>, + manual_stop: Option, ) -> Result { let metrics = environment_spec.metrics.as_ref().map(|m| match m { MetricsLocation::Local => { @@ -372,6 +378,7 @@ impl TestVm { guest_os, state: VmState::New, cleanup_task_tx, + manual_stop, }) } @@ -1134,6 +1141,9 @@ impl Drop for TestVm { let disks: Vec<_> = self.vm_spec().disk_handles.drain(..).collect(); + let manual_stop_opt = self.manual_stop.take(); + let vm_name = self.vm_spec().vm_name.to_owned(); + // The order in which the task destroys objects is important: the server // can't be killed until the client has gotten a chance to shut down // the VM, and the disks can't be destroyed until the server process has @@ -1144,6 +1154,12 @@ impl Drop for TestVm { // kept alive until the server process is gone. let _disks = disks; + // Check if we should let the user access the instance of a + // failed testcase before ensuring its demolition + if let Some(manual_stop) = manual_stop_opt { + manual_stop.wait_for_stop(vm_name, &client, &server).await; + } + // Try to make sure the server's kernel VMM is cleaned up before // killing the server process. This is best-effort; if it fails, // the kernel VMM is leaked. This generally indicates a bug in @@ -1233,3 +1249,74 @@ async fn try_ensure_vm_destroyed(client: &Client) { error!(%error, "VM not destroyed after 5 seconds"); } } + +/// For waiting for instances of failed testcases to be manually shut down, +/// when phd-runner is invoked with --manual-stop-on-failure +#[derive(Clone)] +pub struct TestVmManualStop { + test_name: String, + /// If we should wait for operator intervention before terminating this + /// instance, this will be sent a `Some(false)`. + success_rx: watch::Receiver>, + /// While waiting for instance shutdown, this may be sent a `true` if the + /// user sends a keyboard interrupt to indicate we should stop waiting. + sigint_rx: watch::Receiver, +} +impl TestVmManualStop { + pub fn new( + test_name: String, + success_rx: watch::Receiver>, + sigint_rx: watch::Receiver, + ) -> Self { + Self { test_name, success_rx, sigint_rx } + } + async fn wait_for_stop( + mut self, + vm_name: String, + client: &Client, + server: &server::PropolisServer, + ) { + if self.success_rx.changed().await.is_ok() { + let success_opt = *self.success_rx.borrow(); + if let Some(false) = success_opt { + let sock = server.server_addr(); + let ip = sock.ip(); + let port = sock.port(); + let test_name = self.test_name; + let mut uninformed = true; + // States that might be worth inspecting out-of-band + while let Ok( + InstanceState::Running + | InstanceState::Migrating + | InstanceState::Rebooting + | InstanceState::Repairing, + ) = client + .instance_get() + .send() + .await + .map(|inst| inst.instance.state) + { + if *self.sigint_rx.borrow() { + break; + } + if uninformed { + error!( + r#" +test {test_name:?} failed. propolis-server {vm_name:?} was left running, +with API accessible at http://{sock} +phd-runner will resume when this instance is shut down; e.g. by one of: + +$ propolis-cli -s {ip} -p {port} serial +localhost:~# poweroff + +$ propolis-cli -s {ip} -p {port} state stop +"# + ); + uninformed = false; + } + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + } +} diff --git a/phd-tests/runner/src/config.rs b/phd-tests/runner/src/config.rs index 313d389d3..88fe66617 100644 --- a/phd-tests/runner/src/config.rs +++ b/phd-tests/runner/src/config.rs @@ -202,6 +202,16 @@ pub struct RunOptions { // number of seconds, but i'm lazy... #[clap(long, value_parser, default_value_t = 60 * 20)] pub max_buildomat_wait_secs: u64, + + /// When a testcase fails while this is enabled, any instances started by + /// the failed test are left running and phd-runner waits until they are + /// manually shut down out-of-band by the operator. + /// + /// This feature is intended to give the operator a chance to inspect the + /// state of the guest(s) easily without necessarily having to reconstruct + /// the scenario by hand. + #[clap(long, value_parser)] + pub manual_stop_on_failure: bool, } #[derive(Args, Debug)] diff --git a/phd-tests/runner/src/execute.rs b/phd-tests/runner/src/execute.rs index f7d878c39..ed9de04df 100644 --- a/phd-tests/runner/src/execute.rs +++ b/phd-tests/runner/src/execute.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use std::time::{Duration, Instant}; +use phd_framework::test_vm::TestVmManualStop; use phd_tests::phd_testcase::{Framework, TestCase, TestOutcome}; use tokio::signal::unix::{signal, SignalKind}; use tokio::sync::watch; @@ -101,8 +102,18 @@ pub async fn run_tests_with_ctx( let framework = ctx.clone(); let tc = execution.tc; let mut sigint_rx_task = sigint_rx.clone(); + let mut test_ctx = framework.test_ctx(tc.fully_qualified_name()); + let mut success_tx = None; + if run_opts.manual_stop_on_failure { + let (tx, success_rx) = watch::channel(None); + test_ctx.set_cleanup_task_outcome_receiver(TestVmManualStop::new( + tc.fully_qualified_name(), + success_rx, + sigint_rx.clone(), + )); + success_tx = Some(tx); + } let test_outcome = tokio::spawn(async move { - let test_ctx = framework.test_ctx(tc.fully_qualified_name()); tokio::select! { // Ensure interrupt signals are always handled instead of // continuing to run the test. @@ -144,6 +155,11 @@ pub async fn run_tests_with_ctx( } ); + if let Some(tx) = success_tx { + let succeeded = !matches!(&test_outcome, TestOutcome::Failed(_)); + let _: Result<_, _> = tx.send(Some(succeeded)); + } + match test_outcome { TestOutcome::Passed => stats.tests_passed += 1, TestOutcome::Failed(_) => {