Skip to content

Logs not archived on a fast failing propolis server #10211

@leftwo

Description

@leftwo

We had a problem where a propolis server failed very early after zone creation. The logs for this zone were not archived anywhere (that I could find).

Here are sled-agent logs showing the start of instance creation

2026-04-02 19:09:48.736Z INFO SledAgent/665 (InstanceManager) on BRM42220011: ensuring instance is registered
    file = sled-agent/src/instance_manager.rs:717
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    local_config = InstanceSledLocalConfig { hostname: Hostname("big-boi-cosmo-llm"), nics: [NetworkInterface { id: 7b8f48e6-75ca-4603-bf3a-480694a602c2, kind: Instance { id: 86110b83-9239-4f45-90b3-65c7f15bb118 }, name: Name("net0"), ip_config: DualStack { v4: PrivateIpv4Config { ip: 172.30.0.15, subnet: Ipv4Net { addr: 172.30.0.0, width: 22 },.... (snip)...., dhcp_config: DhcpConfig { dns_servers: [1.1.1.1, 9.9.9.9], host_domain: None, search_domains: [] }, delegated_zvols: [] }
    metadata = InstanceMetadata { silo_id: d1f0279c-d31e-422f-910e-216a766ae35f, project_id: 5d4a1ffc-533d-4c39-b64f-fbd0e11f559e }
    migration_id = None
    propolis_addr = [fdac:cbfb:faa2:102::1:10c]:12400
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    propolis_spec = VmmSpec(InstanceSpec { board: Board { cpus: 128, memory_mb: 828416, chipset: I440Fx(I440Fx { enable_pcie: false }), guest_hv_interface: Bhyve, cpuid: Some(Cpuid { entries: [CpuidEntry { leaf: 0, subleaf: None, eax: 13, ebx: 1752462657, ecx: 1145913699, edx: 1769238117 }, CpuidEntry { leaf: 1, subleaf: None, eax: 10489617, ebx: 2048, ecx:  ... (more snip) ... backend_id: Uuid(e455a487-1662-4e49-b1d2-17c80a87c353), pci_path: PciPath { bus: 0, device: 16, function: 0 }, serial_number: [98, 105, 103, 45, 98, 111, 105, 45, 99, 111, 115, 109, 111, 45, 108, 108, 109, 45, 117, 98] }), Name("pvpanic"): QemuPvpanic(QemuPvpanic { enable_isa: true }), Name("vsock"): VirtioSocket(VirtioSocket { guest_cid: 16, pci_path: PciPath { bus: 0, device: 25, function: 0 } })}, smbios: None })
    vmm_runtime = VmmRuntimeState { state: Starting, generation: Generation(2), time_updated: 2026-04-02T19:09:48.734994100Z }
2026-04-02 19:09:48.737Z INFO SledAgent/665 (InstanceManager) on BRM42220011: registering new instance
    file = sled-agent/src/instance_manager.rs:751
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    migration_id = None
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:48.737Z INFO SledAgent/665 (InstanceManager) on BRM42220011: initializing new Instance
    file = sled-agent/src/instance.rs:1980
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    migration_id = None
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    state = InstanceInitialState { vmm_spec: VmmSpec(InstanceSpec { board: Board { cpus: 128, memory_mb: 828416, chipset: I440Fx(I440Fx { enable_pcie: false }), guest_hv_interface: Bhyve, cpuid: Some(Cpuid { entries: [CpuidEntry { leaf: 0, subleaf: None, eax: 13, ... (big snip)...  dhcp_config: DhcpConfig { dns_servers: [1.1.1.1, 9.9.9.9], host_domain: None, search_domains: [] }, delegated_zvols: [] }, vmm_runtime: VmmRuntimeState { state: Starting, generation: Generation(2), time_updated: 2026-04-02T19:09:48.734994100Z }, propolis_addr: [fdac:cbfb:faa2:102::1:10c]:12400, migration_id: None }
2026-04-02 19:09:48.737Z INFO SledAgent/665 (dropshot (SledAgent)) on BRM42220011: request completed
    file = /home/build/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/dropshot-0.16.7/src/server.rs:867
    latency_us = 1498
    local_addr = [fdac:cbfb:faa2:102::1]:12345
    method = PUT
    remote_addr = [fdac:cbfb:faa2:103::4]:43870
    req_id = 0d08cbf7-88fe-4abb-976b-b9d3686b3246
    response_code = 200
    uri = /vmms/6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:48.830Z INFO SledAgent/665 (dropshot (SledAgent)) on BRM42220011: request completed
    file = /home/build/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/dropshot-0.16.7/src/server.rs:867
    latency_us = 125
    local_addr = [fdac:cbfb:faa2:102::1]:12345
    method = PUT
    remote_addr = [fdac:cbfb:faa2:103::4]:51216
    req_id = 8445f4c8-d98c-4ca7-9456-6b9f5d1dbc79
    response_code = 200
    uri = /vmms/6365676d-587b-4234-93ad-cc74fb95ce12/state
2026-04-02 19:09:48.831Z INFO SledAgent/665 (PortManager) on BRM42220011: Created OPTE port
    file = illumos-utils/src/opte/port_manager.rs:609
    port = Port { inner: PortInner(PortData { name: "opte268", ip: DualStack { v4: PrivateIpv4Config { ip: 172.30.0.15, subnet: Ipv4Net { addr: 172.30.0.0, width: 22 }, transit_ips: [] }, v6: PrivateIpv6Config { ip: fd95:82fa:83a6::f, subnet: Ipv6Net { addr: fd95:82fa:83a6::, width: 64 }, transit_ips: [] } }, mac: MacAddr6([168, 64, 37, 240, 0, 10]), slot: 0, vni: Vni { inner: 3385740 }, gateway: Gateway { mac: MacAddr([168, 64, 37, 255, 119, 119]), ips: DualStack { v4: 172.30.0.1, v6: fd95:82fa:83a6::1 } } }) }
2026-04-02 19:09:48.863Z INFO SledAgent/665 (InstanceManager) on BRM42220011: Configuring new Omicron zone: oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
    file = illumos-utils/src/zone.rs:304
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:48.886Z INFO SledAgent/665 (InstanceManager) on BRM42220011: Installing Omicron zone: oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
    file = illumos-utils/src/zone.rs:342
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:49.037Z INFO SledAgent/665 (HealthMonitor) on BRM42220011: Successfully retrieved SMF services
    file = illumos-utils/src/svcs.rs:41
2026-04-02 19:09:50.119Z INFO SledAgent/665 (InstanceManager) on BRM42220011: Profile for oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12:
    <!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1">
    <service_bundle type="profile" name="omicron">
      <service version="1" type="service" name="system/illumos/propolis-server">
        <instance enabled="true" name="default">
          <property_group type="application" name="config">
            <propval type="astring" name="datalink" value='oxControlInstance19'/>
            <propval type="astring" name="gateway" value='fdac:cbfb:faa2:102::1'/>
            <propval type="astring" name="listen_addr" value='fdac:cbfb:faa2:102::1:10c'/>
            <propval type="astring" name="listen_port" value='12400'/>
            <propval type="astring" name="metric_addr" value='dns'/>
          </property_group>
        </instance>
      </service>
    </service_bundle>
    file = sled-agent/src/profile.rs:34
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    prop2026-04-02 19:09:50.119Z INFO SledAgent/665 (InstanceManager) on BRM42220011: Booting oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12 zone
    file = illumos-utils/src/running_zone.rs:296
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    zone = oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:50.376Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/milestone/single-user:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 58.707557ms
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    zone = oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:50.442Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/milestone/single-user:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 123.808475ms
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    zone = oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:50.576Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/milestone/single-user:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 177.60285ms
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    zone = oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:50.762Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/milestone/single-user:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 365.834712ms
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    zone = oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:51.138Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/milestone/single-user:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 953.829633ms
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    zone = oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:52.101Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/milestone/single-user:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 1.485542147s
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    zone = oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:53.595Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/milestone/single-user:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 1.418449032s
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    zone = oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:55.022Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/milestone/single-user:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 1.387494671s
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    zone = oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:55.811Z WARN SledAgent/665 on BRM42220011: sample queue is too full, dropping oldest samples
    file = oximeter/instruments/src/kstat/sampler.rs:792
    n_current_samples = 0
    n_new_samples = 640
    n_overflow_samples = 140
    sled_id = 449dbb23-1bb7-44d2-b486-7ff74ddbe786
2026-04-02 19:09:56.433Z INFO SledAgent/665 (InstanceManager) on BRM42220011: Started propolis in zone: oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
    file = sled-agent/src/instance.rs:2582
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:56.442Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/system/illumos/propolis-server:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 39.547332ms
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:56.489Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/system/illumos/propolis-server:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 56.705466ms
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:56.555Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/system/illumos/propolis-server:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 252.3017ms
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:56.817Z WARN SledAgent/665 (InstanceManager) on BRM42220011: wait for service svc:/system/illumos/propolis-server:default in zone Some("oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12") failed: Property not found. retry in 401.999304ms
    file = illumos-utils/src/svc.rs:36
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:57.228Z INFO SledAgent/665 (InstanceManager) on BRM42220011: Propolis SMF service is online
    file = sled-agent/src/instance.rs:2593
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:57.230Z WARN SledAgent/665 (InstanceManager) on BRM42220011: failed to await http server (Communication Error: error sending request for url (http://[fdac:cbfb:faa2:102::1:10c]:12400/instance)), will retry in 41.539466ms
    error = Communication Error: error sending request for url (http://[fdac:cbfb:faa2:102::1:10c]:12400/instance)
    file = sled-agent/src/instance.rs:170
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:57.275Z INFO SledAgent/665 (InstanceManager) on BRM42220011: Propolis HTTP server online
    file = sled-agent/src/instance.rs:2621
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:57.355Z INFO SledAgent/665 (InstanceManager) on BRM42220011: result of instance_ensure call is Err(Error Response: status: 500 Internal Server Error; headers: {"content-type": "application/json", "x-request-id": "d6721302-4216-4b2c-a16b-71ee71d57441", "content-length": "124", "date": "Thu, 02 Apr 2026 19:09:57 GMT"}; value: Error { error_code: Some("Internal"), message: "Internal Server Error", request_id: "d6721302-4216-4b2c-a16b-71ee71d57441" })
    file = sled-agent/src/instance.rs:1234
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:57.355Z ERRO SledAgent/665 (InstanceManager) on BRM42220011: failed to create Propolis VM
    error = Failure from Propolis Client: Error Response: status: 500 Internal Server Error; headers: {"content-type": "application/json", "x-request-id": "d6721302-4216-4b2c-a16b-71ee71d57441", "content-length": "124", "date": "Thu, 02 Apr 2026 19:09:57 GMT"}; value: Error { error_code: Some("Internal"), message: "Internal Server Error", request_id: "d6721302-4216-4b2c-a16b-71ee71d57441" }
    file = sled-agent/src/instance.rs:2352
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12olis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
WARNING: Failed to delete the xde device. It must be deleted out of band, and it will not be possible to recreate the xde device until then. Error: CommandError(DeleteXde, System { {"msg":"Removed route set for subnet","v":0,"name":"SledAgent","level":30,"time":"2026-04-02T19:09:57.362474005Z","hostname":"BRM42220011","pid":665,"component":"PortManager","file":"illumos-utils/src/opte/port_manager.rs:1232","id":"RouterI { vni: Vni(3385740), kind: System }"}
errno{"msg":"Removed route set for subnet","v":0,"name":"SledAgent","level":30,"time":"2026-04-02T19:09:57.362503862Z","hostname":"BRM42220011","pid":665,"component":"PortManager","file":"illumos-utils/src/opte/port_manager.rs:1232","id":"RouterId { vni: Vni(3385740), kind: Custom(V4(Ipv4Net { addr: 172.30.0.0, width: 22 })) }"}
: {"msg":"Removed route set for subnet","v":0,"name":"SledAgent","level":30,"time":"2026-04-02T19:09:57.362533779Z","hostname":"BRM42220011","pid":665,"component":"PortManager","file":"illumos-utils/src/opte/port_manager.rs:1232","id":"RouterId { vni: Vni(3385740), kind: Custom(V6(Ipv6Net { addr: fd95:82fa:83a6::, width: 64 })) }"}
16, msg: "failed to destroy DLS devnet: 16" })
2026-04-02 19:09:57.362Z INFO SledAgent/665 (InstanceManager) on BRM42220011: instance runner exited main loop
    file = sled-agent/src/instance.rs:849
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:57.362Z INFO SledAgent/665 (InstanceManager) on BRM42220011: Publishing instance state update to Nexus
    file = sled-agent/src/instance.rs:943
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    state = SledVmmState { vmm_state: VmmRuntimeState { state: Failed, generation: Generation(3), time_updated: 2026-04-02T19:09:57.362633669Z }, migration_in: None, migration_out: None }
2026-04-02 19:09:59.170Z INFO SledAgent/665 (InstanceManager) on BRM42220011: halt_and_remove_logged: Previous zone state: Running
    file = illumos-utils/src/zone.rs:462
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    zone = oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
2026-04-02 19:09:59.170Z INFO SledAgent/665 (InstanceManager) on BRM42220011: Stopped and uninstalled zone
    file = illumos-utils/src/running_zone.rs:638
    instance_id = 86110b83-9239-4f45-90b3-65c7f15bb118
    propolis_id = 6365676d-587b-4234-93ad-cc74fb95ce12
    zone = oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12

If we are fast, we can grab the logs from propolis server, but once the zone goes away, the logs are gone and not archived anywhere. oxlog will give us the log file for the short time when the zone is running, but once the zone is torn down, oxlog does not know where to find any logs (and, it appears there are none to find).

Places looked:

BRM42220011 # oxlog logs --current --archived --extra --show-empty oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
BRM42220011 # find /pool -name \*oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12\* -print 2> /dev/null
/pool/ext/2fb71592-7d13-40ba-b6ab-ccfaed2e7291/crypt/zone/oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
BRM42220011 # ls -l /pool/ext/2fb71592-7d13-40ba-b6ab-ccfaed2e7291/crypt/zone/oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12
total 17
drwxr-xr-x   2 root     sys            2 Apr  2 19:09 dev
BRM42220011 # ls -l /pool/ext/2fb71592-7d13-40ba-b6ab-ccfaed2e7291/crypt/zone/oxz_propolis-server_6365676d-587b-4234-93ad-cc74fb95ce12/dev/ 
total 0
BRM42220011 # /opt/oxide/sled-agent/zone-bundle list-zones | grep 6365676d
BRM42220011 # /opt/oxide/sled-agent/zone-bundle list | grep 6365676d

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions