Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions phd-tests/framework/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ use test_vm::{
environment::EnvironmentSpec, spec::VmSpec, VmConfig, VmLocation,
};
use tokio::{
sync::mpsc::{UnboundedReceiver, UnboundedSender},
sync::{
mpsc::{UnboundedReceiver, UnboundedSender},
watch,
},
task::JoinHandle,
};

Expand All @@ -63,6 +66,8 @@ pub(crate) mod zfs;
pub struct TestCtx {
/// Shared handle to the framework state this test runs against.
pub(crate) framework: Arc<Framework>,
/// Per-test output directory (the framework's tmp directory joined with
/// the test's fully qualified name — see `Framework::test_ctx`).
pub(crate) output_dir: Utf8PathBuf,
/// Present only when phd-runner is configured to leave failed tests'
/// instances running: `(success_rx, sigint_rx)` — a receiver observing
/// the test outcome and a receiver observing whether the user sent a
/// keyboard interrupt. See `set_cleanup_task_outcome_receiver`.
pub(crate) success_sigint_rx:
Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,
}

/// An instance of the PHD test framework.
Expand Down Expand Up @@ -239,6 +244,17 @@ impl TestCtx {
)
.await
}

/// When phd-runner is configured to leave instances running on failed
/// tests, registers the pair of watch receivers the instance cleanup
/// path uses to decide whether to keep a VM alive: `success_rx`
/// observes the test outcome (`Some(true)` on success, `Some(false)`
/// on failure; `None` until the test finishes), and `sigint_rx`
/// observes whether the user has sent a keyboard interrupt to stop
/// waiting.
pub fn set_cleanup_task_outcome_receiver(
    &mut self,
    success_rx: watch::Receiver<Option<bool>>,
    sigint_rx: watch::Receiver<bool>,
) {
    self.success_sigint_rx = Some((success_rx, sigint_rx));
}
}

// The framework implementation includes some "runner-only" functions
Expand Down Expand Up @@ -330,7 +346,7 @@ impl Framework {
pub fn test_ctx(self: &Arc<Self>, fully_qualified_name: String) -> TestCtx {
let output_dir =
self.tmp_directory.as_path().join(&fully_qualified_name);
TestCtx { framework: self.clone(), output_dir }
TestCtx { framework: self.clone(), output_dir, success_sigint_rx: None }
}

/// Resets the state of any stateful objects in the framework to prepare it
Expand Down
61 changes: 60 additions & 1 deletion phd-tests/framework/src/test_vm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ use propolis_client::{
use propolis_client::{Client, ResponseValue};
use thiserror::Error;
use tokio::{
sync::{mpsc::UnboundedSender, oneshot},
sync::{mpsc::UnboundedSender, oneshot, watch},
task::JoinHandle,
time::timeout,
};
Expand Down Expand Up @@ -272,6 +272,13 @@ pub struct TestVm {

state: VmState,

/// If we should wait for operator intervention before terminating this
/// instance, the `success_rx` half of this pair observes a `Some(false)`
/// when the test fails. While waiting, the `sigint_rx` half observes
/// `true` if the user sends a keyboard interrupt to indicate we should
/// stop waiting.
success_sigint_rx:
Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,

/// Sending a task handle to this channel will ensure that the task runs to
/// completion as part of the post-test cleanup fixture (i.e. before any
/// other tests run).
Expand Down Expand Up @@ -328,6 +335,7 @@ impl TestVm {
environment.clone(),
params,
ctx.framework.cleanup_task_channel(),
ctx.success_sigint_rx.clone(),
),
}
}
Expand All @@ -338,6 +346,10 @@ impl TestVm {
environment_spec: EnvironmentSpec,
mut params: ServerProcessParameters,
cleanup_task_tx: UnboundedSender<JoinHandle<()>>,
success_sigint_rx: Option<(
watch::Receiver<Option<bool>>,
watch::Receiver<bool>,
)>,
) -> Result<Self> {
let metrics = environment_spec.metrics.as_ref().map(|m| match m {
MetricsLocation::Local => {
Expand Down Expand Up @@ -372,6 +384,7 @@ impl TestVm {
guest_os,
state: VmState::New,
cleanup_task_tx,
success_sigint_rx,
})
}

Expand Down Expand Up @@ -1134,6 +1147,9 @@ impl Drop for TestVm {

let disks: Vec<_> = self.vm_spec().disk_handles.drain(..).collect();

let success_sigint_rx_opt = self.success_sigint_rx.take();
let name = self.vm_spec().vm_name.to_owned();

// The order in which the task destroys objects is important: the server
// can't be killed until the client has gotten a chance to shut down
// the VM, and the disks can't be destroyed until the server process has
Expand All @@ -1144,6 +1160,49 @@ impl Drop for TestVm {
// kept alive until the server process is gone.
let _disks = disks;

// Check whether we should let the user access the instance of a
// failed testcase before ensuring its demolition.
if let Some((mut success_rx, sigint_rx)) = success_sigint_rx_opt {
Comment thread
lifning marked this conversation as resolved.
Outdated
while success_rx.changed().await.is_ok() {
match *success_rx.borrow() {
None => continue,
Some(true) => break,
Some(false) => {}
}
Comment thread
lifning marked this conversation as resolved.
Outdated
let sock = server.server_addr();
let ip = sock.ip();
let port = sock.port();
let mut uninformed = true;
// States that might be worth inspecting out-of-band
while let Ok(InstanceState::Running
| InstanceState::Migrating
| InstanceState::Rebooting
| InstanceState::Repairing
) = client
.instance_get()
.send()
.await
.map(|inst| inst.instance.state)
{
if *sigint_rx.borrow() { break }
if uninformed {
// write to stderr irrespective of log level
eprintln!(r#"
propolis-server {name:?} left running at address {sock}
Comment thread
lifning marked this conversation as resolved.
Outdated
phd-runner will resume when this instance is shut down; e.g. by one of:

$ propolis-cli -s {ip} -p {port} serial
localhost:~# poweroff

$ propolis-cli -s {ip} -p {port} state stop
"#);
Comment thread
lifning marked this conversation as resolved.
Outdated
uninformed = false;
}
tokio::time::sleep(Duration::from_secs(1)).await;
}
}
}

// Try to make sure the server's kernel VMM is cleaned up before
// killing the server process. This is best-effort; if it fails,
// the kernel VMM is leaked. This generally indicates a bug in
Expand Down
10 changes: 10 additions & 0 deletions phd-tests/runner/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,16 @@ pub struct RunOptions {
// number of seconds, but i'm lazy...
#[clap(long, value_parser, default_value_t = 60 * 20)]
pub max_buildomat_wait_secs: u64,

/// When a testcase fails while this is enabled, any instances started by
/// the failed test are left running and phd-runner waits until they are
/// manually shut down out-of-band by the operator.
///
/// This feature is intended to give the operator a chance to inspect the
/// state of the guest(s) easily without necessarily having to reconstruct
/// the scenario by hand.
#[clap(long, value_parser)]
pub manual_stop_on_failure: bool,
}

#[derive(Args, Debug)]
Expand Down
19 changes: 18 additions & 1 deletion phd-tests/runner/src/execute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,18 @@ pub async fn run_tests_with_ctx(
let framework = ctx.clone();
let tc = execution.tc;
let mut sigint_rx_task = sigint_rx.clone();
let mut test_ctx = framework.test_ctx(tc.fully_qualified_name());
let mut success_tx = None;
if run_opts.manual_stop_on_failure {
let sigint_rx_cleanup = sigint_rx.clone();
let (tx, success_rx) = watch::channel(None);
test_ctx.set_cleanup_task_outcome_receiver(
success_rx,
sigint_rx_cleanup,
);
success_tx = Some(tx);
}
let test_outcome = tokio::spawn(async move {
let test_ctx = framework.test_ctx(tc.fully_qualified_name());
tokio::select! {
// Ensure interrupt signals are always handled instead of
// continuing to run the test.
Expand Down Expand Up @@ -144,6 +154,13 @@ pub async fn run_tests_with_ctx(
}
);

if let Some(tx) = success_tx {
let succeeded = !matches!(&test_outcome, TestOutcome::Failed(_));
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(also if you clone the outcome and send that along it instead of just failed-or-not, might be nice to have "test failed because ..." as part of the prelude to a particular VM's informational blurb? might get too wordy. also not really attached to this idea as much as it'd be nice to distinguish the receivers versus the "is Option<bool> the test status, or is that the bool, hmm")

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in my experience the cause of the failure is usually right above this message in the log, so i didn't feel a particular pull to pass it along, but i definitely wouldn't be opposed if you can think of a case for it

if let Err(e) = tx.send(Some(succeeded)) {
error!("Error sending outcome to instance cleanup tasks: {e}");
Comment thread
lifning marked this conversation as resolved.
Outdated
}
}

match test_outcome {
TestOutcome::Passed => stats.tests_passed += 1,
TestOutcome::Failed(_) => {
Expand Down
Loading