diff --git a/.github/buildomat/jobs/bench.sh b/.github/buildomat/jobs/bench.sh index b96773f0..019e6a39 100644 --- a/.github/buildomat/jobs/bench.sh +++ b/.github/buildomat/jobs/bench.sh @@ -39,8 +39,19 @@ fi pfexec /usr/lib/brand/omicron1/baseline -w /var/run/brand/omicron1/baseline +# Resolve the invoking user for ownership restoration. When this script is run +# elevated locally, the effective `id -un` resolves to root and would chown the +# tree to root. Prefer `$SUDO_USER` (set by sudo), then `logname`, which reports +# the login user across pfexec. Fall back to `id` in CI, however. +run_user=${SUDO_USER:-$(logname 2>/dev/null || id -un)} +run_group=$(id -gn "$run_user" 2>/dev/null || id -gn) + function cleanup { - pfexec chown -R `id -un`:`id -gn` . + # A restore-to-owner that resolves to root would defeat its own purpose. + # Skip rather than re-root the tree. + if [[ $run_user != root ]]; then + pfexec chown -R "$run_user":"$run_group" . + fi if [[ -z $BUILDOMAT_JOB_ID ]]; then pfexec rm -rf /input/xde fi @@ -67,7 +78,14 @@ function get_artifact { return $curl_res } -OUT_DIR=/work/bench-results +# TGT_BASE allows one to run this more easily in their local +# environment: +# +# TGT_BASE=/var/tmp ./bench.sh +# +TGT_BASE=${TGT_BASE:=/work} + +OUT_DIR=$TGT_BASE/bench-results mkdir -p $OUT_DIR mkdir -p target/criterion @@ -118,6 +136,6 @@ cargo ubench cp -r target/criterion $OUT_DIR cp -r target/xde-bench $OUT_DIR -pushd /work +pushd $TGT_BASE tar -caf bench-results.tgz bench-results popd diff --git a/.github/buildomat/jobs/opteadm.sh b/.github/buildomat/jobs/opteadm.sh index 6838a743..62cf2977 100755 --- a/.github/buildomat/jobs/opteadm.sh +++ b/.github/buildomat/jobs/opteadm.sh @@ -41,9 +41,16 @@ ptime -m cargo build --release popd +# TGT_BASE allows one to run this more easily in their local +# environment: +# +# TGT_BASE=/var/tmp ./opteadm.sh +# +TGT_BASE=${TGT_BASE:=/work} + for x in debug release do - mkdir -p /work/$x - cp target/$x/opteadm /work/$x/ - 
sha256sum "target/$x/opteadm" > "/work/$x/opteadm.$x.sha256" + mkdir -p $TGT_BASE/$x + cp target/$x/opteadm $TGT_BASE/$x/ + sha256sum "target/$x/opteadm" > "$TGT_BASE/$x/opteadm.$x.sha256" done diff --git a/.github/buildomat/jobs/test.sh b/.github/buildomat/jobs/test.sh index 544eea88..b32dd7ce 100755 --- a/.github/buildomat/jobs/test.sh +++ b/.github/buildomat/jobs/test.sh @@ -22,17 +22,36 @@ set -o xtrace pfexec pkg install brand/omicron1 brand/omicron1/tools opte +# TGT_BASE mirrors the artifact location used by xde.sh. Override it to match a +# local xde.sh run, e.g. TGT_BASE=/var/tmp ./test.sh, so the test binaries are +# found without having to write to the root-owned /work. +TGT_BASE=${TGT_BASE:=/work} + if [[ -z $BUILDOMAT_JOB_ID ]]; then echo Note: if you are running this locally, you must run the xde.sh job first echo to have the artifacts at the expected spot. pfexec mkdir -p /input/xde - pfexec ln -s /work /input/xde/work + # Remove any stale symlink left by an interrupted prior run so that + # re-creating the link below is idempotent across local re-runs. + pfexec rm -f /input/xde/work + pfexec ln -s $TGT_BASE /input/xde/work fi pfexec /usr/lib/brand/omicron1/baseline -w /var/run/brand/omicron1/baseline +# Resolve the invoking user for ownership restoration. When this script is run +# elevated locally, the effective `id -un` resolves to root and would chown the +# tree to root. Prefer `$SUDO_USER` (set by sudo), then `logname`, which reports +# the login user across pfexec. Fall back to `id` in CI, however. +run_user=${SUDO_USER:-$(logname 2>/dev/null || id -un)} +run_group=$(id -gn "$run_user" 2>/dev/null || id -gn) + function cleanup { - pfexec chown -R `id -un`:`id -gn` . + # A restore-to-owner that resolves to root would defeat its own purpose. + # Skip rather than re-root the tree. + if [[ $run_user != root ]]; then + pfexec chown -R "$run_user":"$run_group" .
+ fi if [[ -z $BUILDOMAT_JOB_ID ]]; then pfexec rm -rf /input/xde fi @@ -98,6 +117,9 @@ pfexec /input/xde/work/test/multicast_validation --nocapture --test-threads=1 pfexec chmod +x /input/xde/work/test/multicast_source_filter pfexec /input/xde/work/test/multicast_source_filter --nocapture --test-threads=1 +pfexec chmod +x /input/xde/work/test/multicast_multi_nexthop +pfexec /input/xde/work/test/multicast_multi_nexthop --nocapture --test-threads=1 + banner "teardown" # Ensure full driver teardown is exercised after tests complete pfexec rem_drv xde diff --git a/.github/buildomat/jobs/xde.sh b/.github/buildomat/jobs/xde.sh index cf676a1a..a41eda63 100755 --- a/.github/buildomat/jobs/xde.sh +++ b/.github/buildomat/jobs/xde.sh @@ -18,6 +18,7 @@ #: "=/work/test/multicast_multi_sub", #: "=/work/test/multicast_validation", #: "=/work/test/multicast_source_filter", +#: "=/work/test/multicast_multi_nexthop", #: "=/work/xde.conf", #: ] #: @@ -62,7 +63,7 @@ install_pkg jq pushd xde -cp xde.conf /work/xde.conf +cp xde.conf $TGT_BASE/xde.conf header "check style" ptime -m cargo +$NIGHTLY fmt -p xde -p xde-link -- --check @@ -140,9 +141,15 @@ multicast_source_filter_test=$( cargo build -q --test multicast_source_filter --message-format=json |\ jq -r "select(.profile.test == true) | .filenames[]" ) -mkdir -p /work/test -cp $loopback_test /work/test/loopback -cp $multicast_rx_test /work/test/multicast_rx -cp $multicast_multi_sub_test /work/test/multicast_multi_sub -cp $multicast_validation_test /work/test/multicast_validation -cp $multicast_source_filter_test /work/test/multicast_source_filter +cargo build --test multicast_multi_nexthop +multicast_multi_nexthop_test=$( + cargo build -q --test multicast_multi_nexthop --message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) +mkdir -p $TGT_BASE/test +cp $loopback_test $TGT_BASE/test/loopback +cp $multicast_rx_test $TGT_BASE/test/multicast_rx +cp $multicast_multi_sub_test $TGT_BASE/test/multicast_multi_sub 
+cp $multicast_validation_test $TGT_BASE/test/multicast_validation +cp $multicast_source_filter_test $TGT_BASE/test/multicast_source_filter +cp $multicast_multi_nexthop_test $TGT_BASE/test/multicast_multi_nexthop diff --git a/Cargo.lock b/Cargo.lock index 94f50821..134509c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2616,6 +2616,7 @@ name = "xde-tests" version = "0.1.0" dependencies = [ "anyhow", + "dlpi", "libnet", "opte-ioctl", "opte-test-utils", diff --git a/Cargo.toml b/Cargo.toml index 7fd38941..322367ea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ crc32fast = { version = "1", default-features = false } criterion = "0.8" ctor = "0.10" darling = "0.23" +dlpi = { git = "https://github.com/oxidecomputer/dlpi-sys", default-features = false } dyn-clone = "1.0" heapless = "0.8" ingot = "0.1.1" diff --git a/dtrace/opte-mcast-delivery.d b/dtrace/opte-mcast-delivery.d index babd5a1d..7d309576 100644 --- a/dtrace/opte-mcast-delivery.d +++ b/dtrace/opte-mcast-delivery.d @@ -137,6 +137,9 @@ mcast-local-delivery { @by_vni["DELIVER", this->vni] = count(); @by_port[this->port] = count(); @by_group["DELIVER", this->group_str] = count(); + + /* Per-(port,group,vni) delivery matrix for end-to-end fan-out verification. */ + @deliver_by_port_group[this->port, this->group_str, this->vni] = count(); } mcast-local-delivery @@ -165,6 +168,9 @@ mcast-underlay-fwd { @by_vni["UNDERLAY", this->vni] = count(); @by_underlay["UNDERLAY", this->underlay_str] = count(); @by_nexthop_unicast[this->next_hop_str] = count(); + + /* Per-(group,vni,next-hop) fan-out; cross-checks ddm-peers rear-port count. */ + @fwd_by_group_nh[this->underlay_str, this->vni, this->next_hop_str] = count(); } mcast-underlay-fwd @@ -421,6 +427,11 @@ mcast-source-filtered { @by_vni["FILTERED", this->vni] = count(); @by_port[this->port] = count(); @filtered_by_mode[this->mode_str] = count(); + + /* Per-(event,scope,group,vni) drops for end-to-end loss attribution. 
+ * The scope names the address space of the group column: overlay for the + * inner multicast group, underlay for the outer delivery address. */ + @drops["FILTERED", "overlay", this->dst_str, this->vni] = count(); } mcast-source-filtered @@ -454,6 +465,9 @@ mcast-fwd-source-filtered { @by_vni["FWD_FILT", this->vni] = count(); @by_nexthop_unicast[this->next_hop_str] = count(); @filtered_by_mode[this->mode_str] = count(); + + /* Per-(event,scope,group,vni) drops; see mcast-source-filtered. */ + @drops["FWD_FILT", "overlay", this->dst_str, this->vni] = count(); } mcast-fwd-source-filtered @@ -473,9 +487,13 @@ mcast-no-fwd-entry { /* arg0=underlay_ptr, arg1=vni */ this->underlay = (in6_addr_t *)arg0; this->vni = arg1; + this->underlay_str = inet_ntoa6(this->underlay); /* Always track aggregations */ @by_event["NOFWD"] = count(); + + /* Per-(event,scope,group,vni) drops; see mcast-source-filtered. */ + @drops["NOFWD", "underlay", this->underlay_str, this->vni] = count(); } mcast-no-fwd-entry @@ -499,10 +517,16 @@ END printa(@by_underlay); printf("\nLocal delivery by port:\n"); printa(@by_port); + printf("\nDelivery matrix (port, group, vni):\n"); + printa(@deliver_by_port_group); printf("\nForwarding by unicast next hop (routing address):\n"); printa(@by_nexthop_unicast); + printf("\nForwarding fan-out (underlay group, vni, next hop):\n"); + printa(@fwd_by_group_nh); printf("\nSource filtering by mode:\n"); printa(@filtered_by_mode); + printf("\nDrops (event, scope, group, vni):\n"); + printa(@drops); printf("\nConfig ops:\n"); printa(@cfg_counts); } diff --git a/xde-tests/Cargo.toml b/xde-tests/Cargo.toml index 6ca3dc3a..2747ea36 100644 --- a/xde-tests/Cargo.toml +++ b/xde-tests/Cargo.toml @@ -12,6 +12,7 @@ opte-test-utils.workspace = true oxide-vpc.workspace = true anyhow.workspace = true +dlpi.workspace = true libnet.workspace = true rand.workspace = true slog.workspace = true diff --git a/xde-tests/src/lib.rs b/xde-tests/src/lib.rs index a3772df9..8ed6398a 100644 
--- a/xde-tests/src/lib.rs +++ b/xde-tests/src/lib.rs @@ -8,6 +8,16 @@ use anyhow::Result; use anyhow::anyhow; use anyhow::bail; use opte_ioctl::OpteHdl; +use opte_test_utils::Ethernet; +use opte_test_utils::Ethertype; +use opte_test_utils::GENEVE_PORT; +use opte_test_utils::Geneve; +use opte_test_utils::HeaderLen; +use opte_test_utils::IngotIpProto; +use opte_test_utils::Ipv4; +use opte_test_utils::Ipv6; +use opte_test_utils::MsgBlk; +use opte_test_utils::Udp; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::Address; @@ -150,6 +160,20 @@ pub const GENEVE_UNDERLAY_FILTER: &str = "ip6 and udp port 6081"; /// The simnet pair creates a loopback underlay for multicast tests. pub const UNDERLAY_TEST_DEVICE: &str = "xde_test_sim1"; +/// Underlay device used to inject raw frames into the receive path. +/// +/// A frame written here (the simnet `end_a`) is received on its peer +/// [`UNDERLAY_TEST_DEVICE`] (`end_b`), rises through `xde_test_vnic1`'s MAC +/// client, and reaches XDE's `xde_rx` callback. +pub const UNDERLAY_INJECT_DEVICE: &str = "xde_test_sim0"; + +/// Service access point is bound on the raw injection stream purely to reach +/// DLPI's `DL_IDLE` state, a precondition of `dlpi_send`. For ethernet the +/// service access point is the ethertype. In `DLPI_RAW` it plays no role in +/// building the frame, so this is an unused experimental ethertype chosen to +/// avoid demuxing real inbound traffic back into the stream. +const INJECT_SAP: u32 = 0x4000; + /// This is a wrapper around the ztest::Zone object that encapsulates common /// logic needed for running the OPTE tests zones used in this test suite. pub struct OpteZone { @@ -729,6 +753,135 @@ pub fn ensure_underlay_admin_scoped_route_v6(interface: &str) -> Result<()> { Ok(()) } +/// Inject a raw Geneve-over-IPv6 multicast frame onto the underlay receive path. 
+/// +/// Builds the full wire frame for an IPv4 multicast datagram tunnelled in +/// Geneve and writes it to [`UNDERLAY_INJECT_DEVICE`] in DLPI raw mode, so it +/// arrives at XDE's `xde_rx` callback exactly as a frame from a remote sled +/// would. This exercises `handle_mcast_rx` in isolation: no Tx processing and +/// thus no `guest_loopback` same-sled delivery occurs, unlike a guest send via +/// [`OpteZone::send_udp_v4`]/[`OpteZone::send_udp_v6`]. +/// +/// `underlay_group` is the outer IPv6 multicast destination (the subscribed +/// [`MulticastUnderlay`] group). `inner_src`/`inner_dst` are the inner IPv4 +/// source (subject to source filtering) and multicast destination group. `vni` is the +/// Geneve VNI. The Rx path keys delivery on the outer group rather than the VNI, +/// but a well-formed value is required for the frame to parse. +/// +/// # Errors +/// +/// Returns an error if the DLPI link cannot be opened in raw mode or the frame +/// cannot be transmitted. +/// +/// # Examples +/// +/// ```ignore +/// inject_underlay_mcast_v4( +/// &mcast_underlay, // underlay_group +/// "10.0.0.1".parse().unwrap(), // inner_src +/// Ipv4Addr::from([224, 0, 0, 251]), // inner_dst +/// Vni::new(DEFAULT_MULTICAST_VNI)?, // vni +/// MCAST_TEST_PORT, // dst_port +/// b"rx-only", // payload +/// )?; +/// ``` +pub fn inject_underlay_mcast_v4( + underlay_group: &MulticastUnderlay, + inner_src: Ipv4Addr, + inner_dst: Ipv4Addr, + vni: Vni, + dst_port: u16, + payload: &[u8], +) -> Result<()> { + let outer_group = underlay_group.addr(); + let outer_group_bytes = outer_group.bytes(); + + // Inner Ethernet header. The Rx path rewrites this destination MAC to the + // canonical multicast MAC derived from the inner IP, so the value set here + // is overwritten before delivery. 
+ let inner_eth = Ethernet { + destination: MacAddr::from([0x01, 0x00, 0x5e, 0x00, 0x00, 0x01]), + source: MacAddr::from([0x00, 0x16, 0x3e, 0x00, 0x00, 0x01]), + ethertype: Ethertype::IPV4, + }; + let inner_ip = Ipv4 { + source: inner_src, + destination: inner_dst, + protocol: IngotIpProto::UDP, + hop_limit: 64, + total_len: (Ipv4::MINIMUM_LENGTH + Udp::MINIMUM_LENGTH + payload.len()) + as u16, + ..Default::default() + }; + let inner_udp = Udp { + source: 0x1234, + destination: dst_port, + length: (Udp::MINIMUM_LENGTH + payload.len()) as u16, + ..Default::default() + }; + + let mut inner_pkt = + MsgBlk::new_ethernet_pkt((inner_eth, inner_ip, inner_udp)); + if !payload.is_empty() { + inner_pkt.append(MsgBlk::copy(payload)); + } + let inner_len = inner_pkt.byte_len(); + + // Geneve with no options. The default protocol type is Ethernet (0x6558). + let geneve = Geneve { vni, ..Default::default() }; + + let outer_udp = Udp { + source: 0x1e61, + destination: GENEVE_PORT, + length: (Udp::MINIMUM_LENGTH + geneve.packet_length() + inner_len) + as u16, + ..Default::default() + }; + let outer_ip = Ipv6 { + source: "fd00::1".parse().unwrap(), + destination: outer_group, + next_header: IngotIpProto::UDP, + hop_limit: 64, + payload_len: outer_udp.length, + ..Default::default() + }; + // Outer Ethernet: IPv6 multicast MAC per RFC 2464 (33:33 + low 32 bits). + let outer_eth = Ethernet { + destination: MacAddr::from([ + 0x33, + 0x33, + outer_group_bytes[12], + outer_group_bytes[13], + outer_group_bytes[14], + outer_group_bytes[15], + ]), + source: MacAddr::from([0x00, 0x11, 0x22, 0x33, 0x44, 0x55]), + ethertype: Ethertype::IPV6, + }; + + let mut frame = + MsgBlk::new_ethernet_pkt((outer_eth, outer_ip, outer_udp, geneve)); + frame.append(inner_pkt); + let bytes: Vec<u8> = frame.iter().flat_map(|n| n.iter().copied()).collect(); + + // Open the underlay link in raw mode and transmit the assembled frame. + // The handle is closed when `_h` drops, before this function returns.
+ let handle = dlpi::open(UNDERLAY_INJECT_DEVICE, dlpi::sys::DLPI_RAW) + .map_err(|e| { + anyhow!("dlpi::open({UNDERLAY_INJECT_DEVICE}) failed: {e}") + })?; + let _h = dlpi::DropHandle(handle); + + // `dlpi_send` requires the stream in DL_IDLE, which `dlpi_bind` provides; + // an unbound send is rejected with DL_OUTSTATE. See [`INJECT_SAP`] for why + // the bound service access point is arbitrary in DLPI_RAW. + dlpi::bind(handle, INJECT_SAP) + .map_err(|e| anyhow!("dlpi::bind on {UNDERLAY_INJECT_DEVICE}: {e}"))?; + dlpi::send(handle, &[], &bytes, None) + .map_err(|e| anyhow!("dlpi::send on {UNDERLAY_INJECT_DEVICE}: {e}"))?; + Ok(()) +} + /// Global multicast group state that cleans up M2P mappings and forwarding /// entries on drop. Port-specific subscriptions are handled automatically by /// [`OptePort::drop()`]. diff --git a/xde-tests/tests/multicast_multi_nexthop.rs b/xde-tests/tests/multicast_multi_nexthop.rs index 4c1f9ffd..1e471412 100644 --- a/xde-tests/tests/multicast_multi_nexthop.rs +++ b/xde-tests/tests/multicast_multi_nexthop.rs @@ -4,18 +4,23 @@ // Copyright 2025 Oxide Computer Company -//! XDE multicast multi-next-hop fanout tests. +//! XDE multicast replication-target fanout and redundant-next-hop collapse +//! tests. //! -//! These tests validate that when multiple next hops are configured with -//! different replication modes, OPTE sends a separate packet to each next hop -//! with the correct replication flag in the Geneve header. +//! Distinct replication targets represent distinct multicast delivery sets, so +//! XDE emits one packet per target carrying the correct Geneve flag. Redundant +//! next hops sharing a target are alternate switch paths to the same delivery +//! set, so they collapse to a single per-flow copy via ECMP select-one rather +//! than fanning out a duplicate. 
use anyhow::Result; use opte_ioctl::OpteHdl; use opte_test_utils::geneve_verify; use oxide_vpc::api::DEFAULT_MULTICAST_VNI; +use oxide_vpc::api::IpAddr; use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; use oxide_vpc::api::McastForwardingNextHop; use oxide_vpc::api::MulticastUnderlay; use oxide_vpc::api::NextHopV6; @@ -26,20 +31,21 @@ use xde_tests::GENEVE_UNDERLAY_FILTER; use xde_tests::IPV4_MULTICAST_CIDR; use xde_tests::MCAST_TEST_PORT; use xde_tests::MulticastGroup; +use xde_tests::SNOOP_TIMEOUT_EXPECT_NONE; use xde_tests::SnoopGuard; use xde_tests::UNDERLAY_TEST_DEVICE; #[test] fn test_multicast_multi_nexthop_fanout() -> Result<()> { - // Test that multicast forwarding with multiple next hops sends packets to - // all configured destinations, each with the correct replication flag. + // Test that multicast forwarding with multiple replication targets sends one + // packet per target, each with the correct replication flag. // // This test configures two next hops with different replication modes: // - NextHop 1: External replication (to boundary switch) // - NextHop 2: Underlay replication (sled-to-sled) // - // After sending one multicast packet, we verify that two distinct Geneve - // packets appear on the underlay, each with the correct replication flag. + // After sending one multicast packet, we verify that the External and + // Underlay targets each produce a Geneve packet with the correct flag. let topol = xde_tests::two_node_topology()?; let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); @@ -54,8 +60,8 @@ fn test_multicast_multi_nexthop_fanout() -> Result<()> { // Use different addresses since NextHopV6 is the key in the forwarding table. // In production, these would be different switch addresses. // For single-sled testing, we use two synthetic addresses. 
- let nexthop1: oxide_vpc::api::Ipv6Addr = "fd77::1".parse().unwrap(); - let nexthop2: oxide_vpc::api::Ipv6Addr = "fd77::2".parse().unwrap(); + let nexthop1: Ipv6Addr = "fd77::1".parse().unwrap(); + let nexthop2: Ipv6Addr = "fd77::2".parse().unwrap(); mcast.set_forwarding(vec![ McastForwardingNextHop { @@ -194,3 +200,221 @@ fn test_multicast_multi_nexthop_fanout() -> Result<()> { Ok(()) } + +#[test] +fn test_multicast_dual_external_select_one() -> Result<()> { + // Two External next hops are redundant switch paths to the same external + // multicast network, so the flow must yield a single egress copy. Exercised + // for both any-source (ASM) and source-specific (SSM) entries, since + // selection is filter-aware. + + let topol = xde_tests::two_node_topology()?; + let sender_ip: IpAddr = topol.nodes[0].port.ip().into(); + + // ASM: both hops accept any source via the default `Exclude(empty)` filter. + assert_dual_select_one( + &topol, + Ipv4Addr::from([224, 1, 2, 101]), + MulticastUnderlay::new("ff04::e001:265".parse().unwrap()).unwrap(), + SourceFilter::default(), + Replication::External, + ["fd77::1", "fd77::2"], + )?; + + // SSM: both hops `Include` the sender, so both admit this flow's source and + // remain ECMP candidates. + assert_dual_select_one( + &topol, + Ipv4Addr::from([224, 1, 2, 102]), + MulticastUnderlay::new("ff04::e001:266".parse().unwrap()).unwrap(), + SourceFilter::Include([sender_ip].into_iter().collect()), + Replication::External, + ["fd77::1", "fd77::2"], + )?; + + Ok(()) +} + +#[test] +fn test_multicast_dual_underlay_select_one() -> Result<()> { + // Two Underlay next hops are redundant switch paths to the same sled + // subscribers, so the flow must leave this sled as a single underlay copy + // rather than a duplicate the Rx path could not dedup. Exercised for both + // ASM and SSM entries. 
+ + let topol = xde_tests::two_node_topology()?; + let sender_ip: IpAddr = topol.nodes[0].port.ip().into(); + + assert_dual_select_one( + &topol, + Ipv4Addr::from([224, 1, 2, 105]), + MulticastUnderlay::new("ff04::e001:269".parse().unwrap()).unwrap(), + SourceFilter::default(), + Replication::Underlay, + ["fd77::5", "fd77::6"], + )?; + + assert_dual_select_one( + &topol, + Ipv4Addr::from([224, 1, 2, 106]), + MulticastUnderlay::new("ff04::e001:270".parse().unwrap()).unwrap(), + SourceFilter::Include([sender_ip].into_iter().collect()), + Replication::Underlay, + ["fd77::5", "fd77::6"], + )?; + + Ok(()) +} + +#[test] +fn test_multicast_dual_both_select_one() -> Result<()> { + // Two Both next hops are redundant switch paths to the same external network + // and the same sled subscribers. Since both targets see the same candidate + // set, the egress and underlay selections land on the same switch. The flow + // leaves as a single copy carrying the Both flag while the peer is fully + // suppressed. Exercised for both ASM and SSM entries. + + let topol = xde_tests::two_node_topology()?; + let sender_ip: IpAddr = topol.nodes[0].port.ip().into(); + + assert_dual_select_one( + &topol, + Ipv4Addr::from([224, 1, 2, 103]), + MulticastUnderlay::new("ff04::e001:267".parse().unwrap()).unwrap(), + SourceFilter::default(), + Replication::Both, + ["fd77::3", "fd77::4"], + )?; + + assert_dual_select_one( + &topol, + Ipv4Addr::from([224, 1, 2, 104]), + MulticastUnderlay::new("ff04::e001:268".parse().unwrap()).unwrap(), + SourceFilter::Include([sender_ip].into_iter().collect()), + Replication::Both, + ["fd77::3", "fd77::4"], + )?; + + Ok(()) +} + +/// Program two redundant next hops sharing a replication target, send one +/// packet, and assert that exactly one copy leaves carrying the requested +/// replication flag. +/// +/// Switches sharing a target reach the same multicast delivery set, so a flow +/// needs a single copy per target. 
For a homogeneous pair, the egress and +/// underlay selections index the same candidate set with the same flow hash +/// and pick the same hop, so the result is one copy with the configured flag and +/// the peer is suppressed. +fn assert_dual_select_one( + topol: &xde_tests::Topology, + mcast_group: Ipv4Addr, + mcast_underlay: MulticastUnderlay, + source_filter: SourceFilter, + replication: Replication, + nexthops: [&str; 2], +) -> Result<()> { + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + let nexthop1: Ipv6Addr = nexthops[0].parse().unwrap(); + let nexthop2: Ipv6Addr = nexthops[1].parse().unwrap(); + + mcast.set_forwarding(vec![ + McastForwardingNextHop { + next_hop: NextHopV6::new(nexthop1, vni), + replication, + source_filter: source_filter.clone(), + }, + McastForwardingNextHop { + next_hop: NextHopV6::new(nexthop2, vni), + replication, + source_filter, + }, + ])?; + + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; + + topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 0 should succeed"); + + // Confirm both next hops are programmed for failover. + let hdl = OpteHdl::open()?; + let mfwd = hdl.dump_mcast_fwd()?; + let entry = mfwd + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast forwarding entry for underlay group"); + + assert_eq!( + entry + .next_hops + .iter() + .filter(|hop| hop.replication == replication) + .count(), + 2, + "expected both next hops programmed with {replication:?}; got: {:?}", + entry.next_hops + ); + + let sender_v4 = topol.nodes[0].port.ip(); + let payload = "dual select-one"; + + // 1st send: exactly one copy carrying the configured replication flag. 
+ { + let mut snoop = + SnoopGuard::start(UNDERLAY_TEST_DEVICE, GENEVE_UNDERLAY_FILTER)?; + + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + + let out = snoop.assert_packet("single underlay copy"); + let stdout = String::from_utf8_lossy(&out.stdout); + let packets = geneve_verify::extract_snoop_hex(&stdout) + .expect("snoop output should contain a hex dump"); + let bytes = geneve_verify::parse_snoop_hex(&packets[0]) + .expect("captured packet should parse as hex"); + let info = geneve_verify::parse_geneve_packet(&bytes) + .expect("captured packet should parse as Geneve"); + assert_eq!( + info.replication, + Some(replication), + "selected copy must carry {replication:?} replication" + ); + } + + // 2nd send: a snoop waiting for two packets must time out, proving the + // redundant switch path for the same target emitted no duplicate copy. + { + let mut snoop = SnoopGuard::start_with_count( + UNDERLAY_TEST_DEVICE, + GENEVE_UNDERLAY_FILTER, + 2, + )?; + + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + + if let Ok(out) = snoop.wait_with_timeout(SNOOP_TIMEOUT_EXPECT_NONE) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!( + "expected a single copy, but snoop captured a duplicate:\n{stdout}" + ); + } + } + + Ok(()) +} diff --git a/xde-tests/tests/multicast_rx.rs b/xde-tests/tests/multicast_rx.rs index 2415ab15..d97ad80e 100644 --- a/xde-tests/tests/multicast_rx.rs +++ b/xde-tests/tests/multicast_rx.rs @@ -27,6 +27,8 @@ use oxide_vpc::api::NextHopV6; use oxide_vpc::api::Replication; use oxide_vpc::api::SourceFilter; use oxide_vpc::api::Vni; +use std::thread; +use std::time::Duration; use xde_tests::GENEVE_UNDERLAY_FILTER; use xde_tests::IPV4_MULTICAST_CIDR; use xde_tests::IPV6_ADMIN_LOCAL_MULTICAST_CIDR; @@ -35,6 +37,7 @@ use xde_tests::MulticastGroup; use xde_tests::SNOOP_TIMEOUT_EXPECT_NONE; use xde_tests::SnoopGuard; use xde_tests::UNDERLAY_TEST_DEVICE; 
+use xde_tests::inject_underlay_mcast_v4; #[test] fn test_xde_multicast_rx_dual_family() -> Result<()> { @@ -267,6 +270,113 @@ fn test_xde_multicast_rx_dual_family() -> Result<()> { Ok(()) } +#[test] +fn test_multicast_rx_only_delivery() -> Result<()> { + // Rx-path isolation test: drive `handle_mcast_rx` directly by injecting a + // raw Geneve-over-IPv6 multicast frame onto the underlay, with no Tx ever + // issued from a guest. + // + // The dual-family test relies on `OpteZone::send_udp_v4`/`send_udp_v6`, which on + // a single sled also trigger the Tx `guest_loopback` same-sled delivery. + // Here, we never send from a guest, so a delivered packet can only have + // arrived via the underlay receive path. + + let topol = xde_tests::two_node_topology()?; + + // IPv4 multicast group mapped to its admin-local IPv6 underlay address per + // Omicron's map_external_to_underlay_ip() (last 4 bytes encode the IPv4). + let mcast_group = Ipv4Addr::from([224, 0, 0, 251]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + let mcast_underlay = + MulticastUnderlay::new("ff04::e000:fb".parse().unwrap()).unwrap(); + + // Establish the M2P mapping (cleaned up on drop). No forwarding entry is + // configured because forwarding drives Tx replication only. + let _mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Allow IPv4 multicast through the receiver's firewall and subscribe it. + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + topol.nodes[1].port.add_multicast_router_entry(mcast_cidr)?; + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe receiver port should succeed"); + + // Confirm the subscription is present before injecting. 
+ let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let p1 = topol.nodes[1].port.name().to_string(); + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry for underlay group"); + assert!( + s_entry.has_port(&p1), + "expected {p1} to be subscribed; got {:?}", + s_entry.subscribers + ); + + // Snoop the receiver's guest device for the delivered inner packet. + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter = + format!("udp and ip dst {mcast_group} and port {MCAST_TEST_PORT}"); + let mut snoop_rx = SnoopGuard::start(&dev_name_b, &filter)?; + + // Inject a raw underlay frame. The inner source mirrors a remote sender + // (node 0's overlay address). Note that delivery is keyed on the outer + // group, not the arrival VNIC or VNI. + // + // `SnoopGuard::start` spawns `snoop` and returns before the capture is + // actually live, so a single frame can race ahead of snoop and be missed. + // We therefore re-inject until the capture observes a frame; the resulting + // duplicate multicast deliveries are harmless. + // + // Injection runs on this initial thread by design. illumos privileges are + // per-LWP, and `dlpi_open` resolves the link through a dlmgmtd door call + // that requires privileges `pfexec` grants only to the process's first + // thread; a frame injected from a freshly spawned thread fails link lookup + // with ENOLINK. The blocking snoop wait needs no such privilege, so it runs + // on the worker thread instead. 
+ let payload = b"rx-only delivery"; + let inner_src = topol.nodes[0].port.ip(); + let ctx = format!("on {dev_name_b}"); + let snoop_handle = thread::spawn(move || snoop_rx.assert_packet(&ctx)); + + while !snoop_handle.is_finished() { + inject_underlay_mcast_v4( + &mcast_underlay, + inner_src, + mcast_group, + vni, + MCAST_TEST_PORT, + payload, + )?; + thread::sleep(Duration::from_millis(100)); + } + + let snoop_output = snoop_handle.join().unwrap(); + + let stdout = String::from_utf8_lossy(&snoop_output.stdout); + assert!( + stdout.contains("224.0.0.251"), + "expected destination 224.0.0.251 in snoop output:\n{stdout}" + ); + assert!( + stdout.contains("delivery"), + "expected payload substring 'delivery' in snoop output:\n{stdout}" + ); + + // L2 dest is rewritten by XDE to the canonical IPv4 multicast MAC per + // RFC 1112: 01:00:5e + low 23 bits of 224.0.0.251 -> 01:00:5e:00:00:fb. + assert!( + stdout.to_ascii_lowercase().contains("0100 5e00 00fb"), + "expected IPv4 multicast MAC '0100 5e00 00fb' in snoop output; got:\n{stdout}" + ); + + Ok(()) +} + #[test] fn test_reject_link_local_underlay_ff02() -> Result<()> { let hdl = OpteHdl::open()?; diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 2c814957..244cae1c 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -320,11 +320,11 @@ use oxide_vpc::engine::router; const ETHERNET_MTU: u16 = 1500; -// Type alias for multicast forwarding table: -// Maps underlay multicast addresses to next hops with replication and source filters. -// The source filter is the aggregated filter for the destination sled (union of -// all subscriber filters on that sled). Packets are only forwarded if the -// aggregated filter allows the source. +// Type alias for multicast forwarding table: maps underlay multicast addresses +// to switch next hops with replication and source filters. Each source filter is +// aggregated over the subscriber set reachable through that next hop. 
Packets are +// only forwarded to a selected next hop if its aggregated filter allows the +// source. type McastForwardingTable = BTreeMap< MulticastUnderlay, BTreeMap, @@ -2302,6 +2302,113 @@ struct MulticastRxContext<'a> { inner_eth_off: usize, } +/// The replication target an ECMP next hop selection runs over. +/// +/// A next hop is a switch endpoint, and the switch is the replication engine. +/// XDE's fanout is across replication targets, not redundant switch endpoints: +/// the targets are external egress and underlay delivery. Next hops sharing a +/// target are redundant paths to the same multicast delivery set, so a flow +/// should use one of them rather than one copy per switch. Both targets admit +/// `Both` (replication) next hops, which contribute to egress and underlay +/// independently. +#[derive(Clone, Copy)] +enum ReplicationTarget { + /// Egress to the external network via the switch front panel. + External, + /// Underlay delivery to sleds behind the switch. + Underlay, +} + +impl ReplicationTarget { + /// Whether a next hop with this `replication` mode serves this target. + fn includes(self, replication: &Replication) -> bool { + match self { + ReplicationTarget::External => { + matches!(replication, Replication::External | Replication::Both) + } + ReplicationTarget::Underlay => { + matches!(replication, Replication::Underlay | Replication::Both) + } + } + } +} + +/// The next hop chosen to carry a flow's single copy for each replication +/// target. +/// +/// A field is `None` when no next hop for that target admits the flow's source. +struct ReplicationSelection { + external: Option, + underlay: Option, +} + +/// Select one next hop per replication target to carry a flow's single copy. +/// +/// The control plane programs multiple next hops sharing a target for switch +/// redundancy, not to represent disjoint multicast destination sets. 
For a +/// given target, each candidate switch reaches the same external network or the +/// same sled subscribers because group membership is mirrored across the +/// redundant switches. A multicast stream, therefore, needs only a single copy +/// per target leaving this sled. Emitting to every candidate would duplicate +/// the stream, and a receiver cannot tell duplicate copies apart, so it cannot +/// deduplicate. +/// +/// One candidate is chosen per target per flow while all remain programmed in +/// the forwarding table, so any peer can carry the flow on failover. The caller +/// suppresses the redundant copy on the others. +/// +/// Only next hops whose source filter admits `inner_src` are candidates. For an +/// any-source group (the default `Exclude(empty)`) every hop for the target +/// qualifies. +/// +/// For a source-filtered group, only the hops that permit this source do, so a +/// denied source never selects a hop that would have dropped it while another +/// would have forwarded. +/// +/// Among the candidates, selection is keyed on the inner flow's L4 hash (the +/// flow's CRC32, the same key the V2B boundary path uses to ECMP over tunnel +/// endpoints). For multicast, that hash includes the inner source and group +/// (and L4 fields when present), so a given flow pins deterministically to one +/// switch across reboots and OPTE instances while distinct flows are spread +/// across switches. +/// +/// Each target is resolved independently. The eligible count is not known in +/// advance because the source filter depends on the flow, so candidates are +/// counted in one pass and the `hash % count` index is taken in a second, +/// mirroring the boundary path's `nth(hash % len)` without materialising the +/// filtered set. 
+fn select_nexthops( + next_hops: &BTreeMap, + inner_src: oxide_vpc::api::IpAddr, + l4_hash: u32, +) -> ReplicationSelection { + // A candidate is eligible when its source filter admits this flow and its + // next hop serves the target. The count pass precedes the indexing pass. + let select = |target: ReplicationTarget| { + let count = next_hops + .iter() + .filter(|(_, (replication, source_filter))| { + source_filter.allows(inner_src) && target.includes(replication) + }) + .count(); + (count > 0).then(|| l4_hash as usize % count).and_then(|idx| { + next_hops + .iter() + .filter(|(_, (replication, source_filter))| { + source_filter.allows(inner_src) + && target.includes(replication) + }) + .map(|(next_hop, _)| *next_hop) + .nth(idx) + }) + }; + + ReplicationSelection { + external: select(ReplicationTarget::External), + underlay: select(ReplicationTarget::Underlay), + } +} + /// Handle multicast packet forwarding for same-sled delivery and underlay /// replication based on the XDE-wide multicast forwarding table. /// @@ -2401,11 +2508,11 @@ fn handle_mcast_tx<'a>( } } - // Next hop forwarding: send packets to configured next hops. + // Next hop forwarding: send packets to configured switch next hops. // - // At the leaf level, we process all next hops in the forwarding table. - // Each next hop's `Replication` is a Tx-only instruction telling the switch - // which ports to replicate to: + // At the leaf level, we process the forwarding table, but we do not + // transmit to every next hop. 
Each next hop's `Replication` is a Tx-only + // instruction telling the chosen switch which ports to replicate to: // - External: ports set for external multicast traffic (egress to external networks) // - Underlay: replicate to other sleds (using multicast outer dst) // - Both: both external and underlay replication @@ -2422,12 +2529,32 @@ fn handle_mcast_tx<'a>( } if let Some(next_hops) = cpu_mcast_fwd.get(&underlay_key) { - // We found forwarding entries, replicate to each next hop + // A next hop is a switch, and the switch replicates to every destination + // in the requested target's multicast delivery set. Next hops sharing a + // target are redundant switch paths to that set: external candidates + // reach the same external multicast network, and underlay candidates + // reach the same sled subscribers. Emitting to every next hop for a + // target would duplicate the stream. + // + // We therefore run a two-pass `%` ECMP select-one per target. The first + // pass counts source-eligible candidates for the target, and the second + // selects `l4_hash % count` in the same stable order. Because the + // candidate switches are redundant, picking one avoids duplication. The + // two targets are selected independently, so a `Both` replication next + // hop can be the choice for one target and not the other. + let ReplicationSelection { + external: chosen_external, + underlay: chosen_underlay, + } = select_nexthops(next_hops, ctx.inner_src, ctx.l4_hash); + + // Iterate the programmed next hops, narrowing each to its choice. for (next_hop, (replication, source_filter)) in next_hops.iter() { - // Check aggregated source filter before forwarding. - // This filter is the union of all subscriber filters for - // this next hop. If no subscriber would accept this source, - // skip forwarding. + // Check aggregated source filter before forwarding. This filter is + // the union of all subscriber filters for destinations reachable + // through this next hop. 
If no subscriber would accept this source, + // we skip forwarding. Selection has already excluded this hop. This + // second check preserves per-hop drop telemetry for filtered + // entries. if !source_filter.allows(ctx.inner_src) { let xde = get_xde_state(); xde.stats.vals.mcast_tx_fwd_source_filtered().incr(1); @@ -2444,6 +2571,41 @@ fn handle_mcast_tx<'a>( continue; } + // Compose the per-flow selections into this hop's effective + // replication. A hop keeps a target only if it is that target's + // choice. + // + // A `Both` replication hop is narrowed when it is the choice for + // one target but not the other, and skipped entirely when it is the + // choice for neither. This emits exactly one external copy and one + // underlay copy per flow while each target can land on a different + // switch. + let keep_external = chosen_external.as_ref() == Some(next_hop); + let keep_underlay = chosen_underlay.as_ref() == Some(next_hop); + let effective_replication = match replication { + Replication::External => { + if keep_external { + Replication::External + } else { + continue; + } + } + Replication::Underlay => { + if keep_underlay { + Replication::Underlay + } else { + continue; + } + } + Replication::Both => match (keep_external, keep_underlay) { + (true, true) => Replication::Both, + (true, false) => Replication::External, + (false, true) => Replication::Underlay, + (false, false) => continue, + }, + Replication::Reserved => Replication::Reserved, + }; + // Clone packet with headers using pullup let Ok(mut fwd_pkt) = ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) @@ -2481,7 +2643,11 @@ fn handle_mcast_tx<'a>( } // Update Geneve multicast option with the Tx-only replication // instruction for the switch. - update_mcast_replication(&mut fwd_pkt, geneve_offset, *replication); + update_mcast_replication( + &mut fwd_pkt, + geneve_offset, + effective_replication, + ); // Route to switch unicast address to determine which underlay // port/MAC to use. 
Packet destination is multicast address with @@ -2525,7 +2691,7 @@ fn handle_mcast_tx<'a>( (AF_INET6 as usize, &outer_ip6 as *const _ as uintptr_t); // Fire DTrace probes and increment stats based on replication mode - match replication { + match effective_replication { oxide_vpc::api::Replication::Underlay => { __dtrace_probe_mcast__underlay__fwd( af, @@ -3848,7 +4014,9 @@ fn set_mcast_forwarding_hdlr( // Validation of admin-local IPv6 (ff04::/16) happens at deserialization let underlay = req.underlay; - // Fleet-level multicast: enforce DEFAULT_MULTICAST_VNI for all replication modes. + // Fleet-level multicast: enforce DEFAULT_MULTICAST_VNI for all replication + // modes. + // // NextHopV6.addr must be unicast (switch address for routing). // The packet will be sent to the multicast address (req.underlay). for entry in &req.next_hops {