Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions phd-tests/framework/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ use log_config::LogConfig;
use port_allocator::PortAllocator;
pub use test_vm::TestVm;
use test_vm::{
environment::EnvironmentSpec, spec::VmSpec, VmConfig, VmLocation,
environment::EnvironmentSpec, spec::VmSpec, TestVmManualStop, VmConfig,
VmLocation,
};
use tokio::{
sync::mpsc::{UnboundedReceiver, UnboundedSender},
Expand All @@ -63,6 +64,7 @@ pub(crate) mod zfs;
/// Per-test context handed to each PHD test case: a shared handle to the
/// framework, the test's private output directory, and an optional hook for
/// holding failed tests' VMs open for manual inspection.
pub struct TestCtx {
    /// Shared handle to the framework that created this context.
    pub(crate) framework: Arc<Framework>,
    /// Directory for this test's artifacts and logs, derived from the test's
    /// fully-qualified name under the framework's temporary directory.
    pub(crate) output_dir: Utf8PathBuf,
    /// When phd-runner is configured to leave instances of failed tests
    /// running (`--manual-stop-on-failure`), this carries the channels used
    /// to learn the test's outcome and to wait for an operator-driven
    /// shutdown; `None` otherwise.
    pub(crate) manual_stop: Option<TestVmManualStop>,
}

/// An instance of the PHD test framework.
Expand Down Expand Up @@ -239,6 +241,16 @@ impl TestCtx {
)
.await
}

/// Installs the manual-stop hook for this test.
///
/// When phd-runner is configured to leave instances running on failed tests
/// (`--manual-stop-on-failure`), the runner constructs a [`TestVmManualStop`]
/// — which wraps the watch receiver that will be told whether the test
/// failed — and registers it here. VMs created from this context will then
/// wait for the operator to shut the instance down before tearing it down
/// when the test fails.
pub fn set_cleanup_task_outcome_receiver(
    &mut self,
    manual_stop: TestVmManualStop,
) {
    self.manual_stop = Some(manual_stop);
}
}

// The framework implementation includes some "runner-only" functions
Expand Down Expand Up @@ -330,7 +342,7 @@ impl Framework {
/// Builds a [`TestCtx`] for the test with the given fully-qualified name.
///
/// The context gets its own output directory (named after the test) under
/// the framework's temporary directory and starts with no manual-stop hook
/// installed; the runner may add one later via
/// `set_cleanup_task_outcome_receiver`.
pub fn test_ctx(self: &Arc<Self>, fully_qualified_name: String) -> TestCtx {
    TestCtx {
        framework: Arc::clone(self),
        output_dir: self.tmp_directory.as_path().join(&fully_qualified_name),
        manual_stop: None,
    }
}

/// Resets the state of any stateful objects in the framework to prepare it
Expand Down
89 changes: 88 additions & 1 deletion phd-tests/framework/src/test_vm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ use propolis_client::{
use propolis_client::{Client, ResponseValue};
use thiserror::Error;
use tokio::{
sync::{mpsc::UnboundedSender, oneshot},
sync::{mpsc::UnboundedSender, oneshot, watch},
task::JoinHandle,
time::timeout,
};
Expand Down Expand Up @@ -272,6 +272,10 @@ pub struct TestVm {

state: VmState,

/// If we should wait for operator intervention before terminating this
/// instance, this will be populated.
manual_stop: Option<TestVmManualStop>,

/// Sending a task handle to this channel will ensure that the task runs to
/// completion as part of the post-test cleanup fixture (i.e. before any
/// other tests run).
Expand Down Expand Up @@ -328,6 +332,7 @@ impl TestVm {
environment.clone(),
params,
ctx.framework.cleanup_task_channel(),
ctx.manual_stop.clone(),
),
}
}
Expand All @@ -338,6 +343,7 @@ impl TestVm {
environment_spec: EnvironmentSpec,
mut params: ServerProcessParameters,
cleanup_task_tx: UnboundedSender<JoinHandle<()>>,
manual_stop: Option<TestVmManualStop>,
) -> Result<Self> {
let metrics = environment_spec.metrics.as_ref().map(|m| match m {
MetricsLocation::Local => {
Expand Down Expand Up @@ -372,6 +378,7 @@ impl TestVm {
guest_os,
state: VmState::New,
cleanup_task_tx,
manual_stop,
})
}

Expand Down Expand Up @@ -1134,6 +1141,9 @@ impl Drop for TestVm {

let disks: Vec<_> = self.vm_spec().disk_handles.drain(..).collect();

let manual_stop_opt = self.manual_stop.take();
let vm_name = self.vm_spec().vm_name.to_owned();

// The order in which the task destroys objects is important: the server
// can't be killed until the client has gotten a chance to shut down
// the VM, and the disks can't be destroyed until the server process has
Expand All @@ -1144,6 +1154,12 @@ impl Drop for TestVm {
// kept alive until the server process is gone.
let _disks = disks;

// Check if we should let the user access the instance of a
// failed testcase before ensuring its demolition
if let Some(manual_stop) = manual_stop_opt {
manual_stop.wait_for_stop(vm_name, &client, &server).await;
}

// Try to make sure the server's kernel VMM is cleaned up before
// killing the server process. This is best-effort; if it fails,
// the kernel VMM is leaked. This generally indicates a bug in
Expand Down Expand Up @@ -1233,3 +1249,74 @@ async fn try_ensure_vm_destroyed(client: &Client) {
error!(%error, "VM not destroyed after 5 seconds");
}
}

/// For waiting for instances of failed testcases to be manually shut down,
/// when phd-runner is invoked with --manual-stop-on-failure
#[derive(Clone)]
pub struct TestVmManualStop {
    /// Fully-qualified name of the test this hook belongs to; quoted in the
    /// operator-facing message printed while waiting.
    test_name: String,
    /// Receives the test's outcome: starts as `None`; the runner sends
    /// `Some(false)` if the test failed (triggering the manual-stop wait)
    /// and `Some(true)` if it passed.
    success_rx: watch::Receiver<Option<bool>>,
    /// While waiting for instance shutdown, this may be sent a `true` if the
    /// user sends a keyboard interrupt to indicate we should stop waiting.
    sigint_rx: watch::Receiver<bool>,
}
impl TestVmManualStop {
    /// Bundles the test's name with the outcome and SIGINT watch receivers.
    pub fn new(
        test_name: String,
        success_rx: watch::Receiver<Option<bool>>,
        sigint_rx: watch::Receiver<bool>,
    ) -> Self {
        Self { test_name, success_rx, sigint_rx }
    }

    /// Holds up VM teardown until the instance is shut down out-of-band.
    ///
    /// Waits for the test outcome to arrive on `success_rx`. If the test
    /// failed (`Some(false)`), prints connection instructions once and then
    /// polls the server's instance state at 1 Hz, returning when the
    /// instance leaves a running-like state, the state query fails, or a
    /// keyboard interrupt is observed on `sigint_rx`.
    async fn wait_for_stop(
        mut self,
        vm_name: String,
        client: &Client,
        server: &server::PropolisServer,
    ) {
        // If the sender half was dropped without reporting an outcome
        // (e.g. the runner is shutting down), skip waiting entirely.
        if self.success_rx.changed().await.is_ok() {
            let success_opt = *self.success_rx.borrow();
            if let Some(false) = success_opt {
                let sock = server.server_addr();
                let ip = sock.ip();
                let port = sock.port();
                let test_name = self.test_name;
                // Print the instructions only on the first loop pass.
                let mut uninformed = true;
                // States that might be worth inspecting out-of-band
                while let Ok(
                    InstanceState::Running
                    | InstanceState::Migrating
                    | InstanceState::Rebooting
                    | InstanceState::Repairing,
                ) = client
                    .instance_get()
                    .send()
                    .await
                    .map(|inst| inst.instance.state)
                {
                    // A keyboard interrupt means the operator wants the
                    // runner to stop waiting and proceed with teardown.
                    if *self.sigint_rx.borrow() {
                        break;
                    }
                    if uninformed {
                        error!(
                            r#"
test {test_name:?} failed. propolis-server {vm_name:?} was left running,
with API accessible at http://{sock}
phd-runner will resume when this instance is shut down; e.g. by one of:

$ propolis-cli -s {ip} -p {port} serial
localhost:~# poweroff

$ propolis-cli -s {ip} -p {port} state stop
"#
                        );
                        uninformed = false;
                    }
                    // Poll at one-second intervals; cheap, and responsive
                    // enough for an interactive debugging session.
                    tokio::time::sleep(Duration::from_secs(1)).await;
                }
            }
        }
    }
}
10 changes: 10 additions & 0 deletions phd-tests/runner/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,16 @@ pub struct RunOptions {
// number of seconds, but i'm lazy...
#[clap(long, value_parser, default_value_t = 60 * 20)]
pub max_buildomat_wait_secs: u64,

/// When a testcase fails while this is enabled, any instances started by
/// the failed test are left running and phd-runner waits until they are
/// manually shut down out-of-band by the operator.
///
/// This feature is intended to give the operator a chance to inspect the
/// state of the guest(s) easily without necessarily having to reconstruct
/// the scenario by hand.
#[clap(long, value_parser)]
pub manual_stop_on_failure: bool,
}

#[derive(Args, Debug)]
Expand Down
18 changes: 17 additions & 1 deletion phd-tests/runner/src/execute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use std::sync::Arc;
use std::time::{Duration, Instant};

use phd_framework::test_vm::TestVmManualStop;
use phd_tests::phd_testcase::{Framework, TestCase, TestOutcome};
use tokio::signal::unix::{signal, SignalKind};
use tokio::sync::watch;
Expand Down Expand Up @@ -101,8 +102,18 @@ pub async fn run_tests_with_ctx(
let framework = ctx.clone();
let tc = execution.tc;
let mut sigint_rx_task = sigint_rx.clone();
let mut test_ctx = framework.test_ctx(tc.fully_qualified_name());
let mut success_tx = None;
if run_opts.manual_stop_on_failure {
let (tx, success_rx) = watch::channel(None);
test_ctx.set_cleanup_task_outcome_receiver(TestVmManualStop::new(
tc.fully_qualified_name(),
success_rx,
sigint_rx.clone(),
));
success_tx = Some(tx);
}
let test_outcome = tokio::spawn(async move {
let test_ctx = framework.test_ctx(tc.fully_qualified_name());
tokio::select! {
// Ensure interrupt signals are always handled instead of
// continuing to run the test.
Expand Down Expand Up @@ -144,6 +155,11 @@ pub async fn run_tests_with_ctx(
}
);

if let Some(tx) = success_tx {
let succeeded = !matches!(&test_outcome, TestOutcome::Failed(_));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(also, if you clone the outcome and send that along instead of just a failed-or-not flag, it might be nice to have "test failed because ..." as part of the prelude to a particular VM's informational blurb? might get too wordy. also not really attached to this idea as much as it'd be nice to distinguish the receivers versus the "is Option<bool> the test status, or is that the bool, hmm")

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in my experience the cause of the failure is usually right above this message in the log, so i didn't feel a particular pull to pass it along, but i definitely wouldn't be opposed if you can think of a case for it

let _: Result<_, _> = tx.send(Some(succeeded));
}

match test_outcome {
TestOutcome::Passed => stats.tests_passed += 1,
TestOutcome::Failed(_) => {
Expand Down
Loading