oxidecomputer · jgallagher · Mar 13, 2026 · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/live-tests/Cargo.toml b/live-tests/Cargo.toml
@@ -15,6 +15,7 @@ omicron-workspace-hack.workspace = true
 [dev-dependencies]
 anyhow.workspace = true
 assert_matches.workspace = true
+chrono.workspace = true
 dns-service-client.workspace = true
 dropshot.workspace = true
 futures.workspace = true
@@ -35,10 +36,13 @@ omicron-test-utils.workspace = true
 omicron-uuid-kinds.workspace = true
 reqwest.workspace = true
 serde.workspace = true
+serde_json.workspace = true
 slog.workspace = true
 slog-error-chain.workspace = true
+strum.workspace = true
 textwrap.workspace = true
 tokio.workspace = true
+update-engine.workspace = true
 uuid.workspace = true
 
 [lints]

diff --git a/live-tests/README.adoc b/live-tests/README.adoc
@@ -6,7 +6,7 @@ This package is not built or tested by default because the tests generally can't
 
 == Why a separate test suite?
 
-What makes these tests different from the rest of the test suite is that they require connectivity to the underlay network of the deployed system and they make API calls to various components in that system and they assume that this will behave like a real production system.  By contrast, the normal tests instead _set up_ a bunch of components using simulated sled agents and localhost networking, which is great for starting from a predictable state and running tests in parallel, but the simulated sled agents and networking make it impossible to exercise quite a lot of Reconfigurator's functionality.  
+What makes these tests different from the rest of the test suite is that they require connectivity to the underlay network of the deployed system and they make API calls to various components in that system and they assume that this will behave like a real production system.  By contrast, the normal tests instead _set up_ a bunch of components using simulated sled agents and localhost networking, which is great for starting from a predictable state and running tests in parallel, but the simulated sled agents and networking make it impossible to exercise quite a lot of Reconfigurator's functionality.
 
 There are also the `end-to-end-tests`.  That environment is more realistic than the main test suite, but not faithful enough for many Reconfigurator tests.
 
@@ -22,14 +22,14 @@ First, deploy Omicron using `a4x2` or one of the hardware test rigs.
 
 Ensure the system's target blueprint is enabled. The live tests require this to avoid a case where the live tests generate blueprints based on a target blueprint that is not current, and then make a bunch of changes to the system unrelated to the tests.
 
-On a fresh system, you will have to enable the target blueprint yourself:
+Ensure the system's blueprint planner is disabled. Live tests may generate various blueprints with content that won't make sense to the planner, and we don't want the planner to run mid-test and attempt to "fix" the state of the system.
+
+On a fresh system, you will have to disable the blueprint planner yourself:
 
 ```
-omdb --destructive nexus blueprints target enable current
+omdb --destructive nexus reconfigurator-config set --planner-enabled false
 ```
 
-This essentially enables reconfigurator, causing it to constantly try to make the system match its target blueprint. You only need to do this once in the lifetime of the system, not every time you re-run the live tests.
-
 At this point the system is prepared for testing. In your Omicron workspace, run `cargo xtask live-tests` to build an archive and then follow the instructions:
 
 ```

diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs
@@ -12,7 +12,6 @@ use nexus_config::PostgresConfigWithUrl;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
 use nexus_types::deployment::SledFilter;
-use omicron_common::address::Ipv6Subnet;
 use slog::info;
 use slog::o;
 use std::ffi::OsStr;
@@ -69,6 +68,23 @@ impl LiveTestContext {
         &self.datastore
     }
 
+    /// Establish a new `DataStore` connection pointed at this deployed system's
+    /// database
+    ///
+    /// Most consumers should prefer `datastore()`, which returns a reference to
+    /// a `DataStore` constructed when this context was created. This method is
+    /// useful if a caller needs to reevaluate what Cockroach instances are
+    /// available in DNS (e.g., due to zone expungement) or needs a `DataStore`
+    /// instance that is not shared.
+    pub async fn new_datastore_connection(
+        &self,
+    ) -> anyhow::Result<(OpContext, Arc<DataStore>)> {
+        let log = &self.logctx.log;
+        let datastore = create_datastore(log, &self.resolver).await?;
+        let opctx = OpContext::for_tests(log.clone(), datastore.clone());
+        Ok((opctx, datastore))
+    }
+
     /// Returns a client for a Nexus internal API at the given socket address
     pub fn specific_internal_nexus_client(
         &self,
@@ -96,20 +112,14 @@ impl LiveTestContext {
 }
 
 fn create_resolver(log: &slog::Logger) -> Result<Resolver, anyhow::Error> {
-    // In principle, we should look at /etc/resolv.conf to find the DNS servers.
-    // In practice, this usually isn't populated today.  See
-    // oxidecomputer/omicron#2122.
-    //
-    // However, the address selected below should work for most existing Omicron
-    // deployments today.  That's because while the base subnet is in principle
-    // configurable in config-rss.toml, it's very uncommon to change it from the
-    // default value used here.
-    let subnet = Ipv6Subnet::new("fd00:1122:3344:0100::".parse().unwrap());
-    eprintln!("note: using DNS server for subnet {}", subnet.net());
-    internal_dns_resolver::Resolver::new_from_subnet(log.clone(), subnet)
-        .with_context(|| {
-            format!("creating DNS resolver for subnet {}", subnet.net())
-        })
+    // The internal DNS servers are populated in /etc/resolv.conf in the switch
+    // zone, which is where we expect live tests to run. Notify the user that
+    // we're going to attempt DNS resolution via the default system path.
+    eprintln!(
+        "note: using DNS from system config (typically /etc/resolv.conf)",
+    );
+    internal_dns_resolver::Resolver::new_from_system_conf(log.clone())
+        .context("creating DNS resolver from system config")
 }
 
 /// Creates a DataStore pointing at the CockroachDB cluster that's in DNS
@@ -213,7 +223,7 @@ async fn check_hardware_environment(
 ) -> Result<(), anyhow::Error> {
     const ALLOWED_GIMLET_SERIALS: &[&str] = &[
         // Serial number lists can be generated with:
-        // inventron env system list -Hpo serial -F type=gimlet <ENVIRONMENT>
+        // inventron env system list -Hpo serial -F type=cosmo -F type=gimlet <ENVIRONMENT>
 
         // test rig: "madrid"
         "BRM42220081",
@@ -222,19 +232,19 @@ async fn check_hardware_environment(
         "BRM42220004",
         // test rig: "london"
         "BRM42220036",
-        "BRM42220062",
+        "2CN2M459",
         "BRM42220030",
-        "BRM44220007",
+        "2RGCFG10",
         // test rig: "dublin"
-        "BRM42220026",
+        "2F8JEXDK",
         "BRM27230037",
         "BRM23230018",
         "BRM23230010",
         // test rig: "berlin"
         "BRM42220011",
         "BRM44220007",
         "BRM42220082",
-        "BRM06240029",
+        "271FVPY0",
     ];
 
     // Refuse to operate in an environment that might contain real Oxide

diff --git a/live-tests/tests/common/reconfigurator.rs b/live-tests/tests/common/reconfigurator.rs
@@ -28,9 +28,28 @@ use std::time::Duration;
 pub async fn blueprint_load_target_enabled(
     log: &slog::Logger,
     nexus: &nexus_lockstep_client::Client,
+) -> Result<Blueprint, anyhow::Error> {
+    blueprint_load_target_impl(log, nexus, true).await
+}
+
+/// Return the current target blueprint
+///
+/// Also validates that it's disabled.  If an operator has enabled execution, we
+/// don't want to proceed with tests.
+pub async fn blueprint_load_target_disabled(
+    log: &slog::Logger,
+    nexus: &nexus_lockstep_client::Client,
+) -> Result<Blueprint, anyhow::Error> {
+    blueprint_load_target_impl(log, nexus, false).await
+}
+
+async fn blueprint_load_target_impl(
+    log: &slog::Logger,
+    nexus: &nexus_lockstep_client::Client,
+    expect_enabled: bool,
 ) -> Result<Blueprint, anyhow::Error> {
     // Fetch the current target configuration.
-    info!(log, "editing current target blueprint");
+    info!(log, "loading current target blueprint");
     let target_blueprint = nexus
         .blueprint_target_view()
         .await
@@ -40,9 +59,11 @@ pub async fn blueprint_load_target_enabled(
     debug!(log, "found current target blueprint";
         "blueprint_id" => %target_blueprint.target_id
     );
+
+    let expect_inverse = if !expect_enabled { "enabled" } else { "disabled" };
     ensure!(
-        target_blueprint.enabled,
-        "refusing to operate on a system with target blueprint disabled"
+        target_blueprint.enabled == expect_enabled,
+        "refusing to operate on a system with target blueprint {expect_inverse}"
     );
 
     let blueprint = nexus
@@ -78,14 +99,59 @@ pub async fn blueprint_load_target_enabled(
 /// case, a developer enables the initial target blueprint before running these
 /// tests and then doesn't need to think about it again for the lifetime of
 /// their test environment.
-pub async fn blueprint_edit_current_target(
+pub async fn blueprint_edit_current_target_enabled<F>(
+    log: &slog::Logger,
+    nexus: &nexus_lockstep_client::Client,
+    edit_fn: F,
+) -> Result<(Blueprint, Blueprint), anyhow::Error>
+where
+    F: FnOnce(&mut BlueprintBuilder) -> Result<(), anyhow::Error>,
+{
+    blueprint_edit_current_target_impl(log, nexus, true, edit_fn).await
+}
+
+/// Modify the system by editing the current target blueprint
+///
+/// More precisely, this function:
+///
+/// - fetches the current target blueprint
+/// - creates a new BlueprintBuilder based on it
+/// - invokes the caller's `edit_fn`, which may modify the builder however it
+///   likes
+/// - generates a new blueprint (thus based on the current target)
+/// - uploads the new blueprint
+/// - sets the new blueprint as the current target
+/// - leaves the new target disabled (it is set with `enabled: false`)
+///
+/// ## Errors
+///
+/// This function fails if the current target blueprint is not already disabled.
+/// Callers of this function expect execution to be - and remain - disabled. If
+/// that isn't the case, we don't want to inadvertently proceed.
+pub async fn blueprint_edit_current_target_disabled<F>(
+    log: &slog::Logger,
+    nexus: &nexus_lockstep_client::Client,
+    edit_fn: F,
+) -> Result<(Blueprint, Blueprint), anyhow::Error>
+where
+    F: FnOnce(&mut BlueprintBuilder) -> Result<(), anyhow::Error>,
+{
+    blueprint_edit_current_target_impl(log, nexus, false, edit_fn).await
+}
+
+async fn blueprint_edit_current_target_impl<F>(
     log: &slog::Logger,
     nexus: &nexus_lockstep_client::Client,
-    edit_fn: &dyn Fn(&mut BlueprintBuilder) -> Result<(), anyhow::Error>,
-) -> Result<(Blueprint, Blueprint), anyhow::Error> {
+    expect_enabled: bool,
+    edit_fn: F,
+) -> Result<(Blueprint, Blueprint), anyhow::Error>
+where
+    F: FnOnce(&mut BlueprintBuilder) -> Result<(), anyhow::Error>,
+{
     // Fetch the current target configuration.
     info!(log, "editing current target blueprint");
-    let blueprint1 = blueprint_load_target_enabled(log, nexus).await?;
+    let blueprint1 =
+        blueprint_load_target_impl(log, nexus, expect_enabled).await?;
 
     // Make a new builder based on that blueprint and use `edit_fn` to edit it.
     let mut builder = BlueprintBuilder::new_based_on(
@@ -113,14 +179,15 @@ pub async fn blueprint_edit_current_target(
     );
     nexus
         .blueprint_target_set(&BlueprintTargetSet {
-            enabled: true,
+            enabled: expect_enabled,
             target_id: blueprint2.id,
         })
         .await
-        .expect("setting new target");
+        .context("setting new target")?;
     info!(log, "finished editing target blueprint";
         "old_target_id" => %blueprint1.id,
         "new_target_id" => %blueprint2.id,
+        "enabled" => %expect_enabled,
     );
 
     Ok((blueprint1, blueprint2))