Skip to content

Commit 947bef4

Browse files
committed
Review discussion: Cap size of routing cache.
This also includes more details on the relative costs we're trying to offset, and the futur ease for impling/finding a suitable hashmap.
1 parent 653f8e6 commit 947bef4

3 files changed

Lines changed: 57 additions & 6 deletions

File tree

crates/opte-api/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ pub use ulp::*;
4747
///
4848
/// We rely on CI and the check-api-version.sh script to verify that
4949
/// this number is incremented anytime the oxide-api code changes.
50-
pub const API_VERSION: u64 = 28;
50+
pub const API_VERSION: u64 = 29;
5151

5252
/// Major version of the OPTE package.
5353
pub const MAJOR_VERSION: u64 = 0;

xde/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// License, v. 2.0. If a copy of the MPL was not distributed with this
33
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
44

5-
// Copyright 2022 Oxide Computer Company
5+
// Copyright 2024 Oxide Computer Company
66

77
// xde - A mac provider for OPTE-based network implementations.
88
#![feature(extern_types)]

xde/src/route.rs

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
// Copyright 2024 Oxide Computer Company
6+
17
use crate::ip;
28
use crate::sys;
39
use crate::xde::xde_underlay_port;
@@ -18,6 +24,9 @@ use opte::engine::ip6::Ipv6Addr;
1824
/// The duration a cached route remains valid for.
1925
const MAX_ROUTE_LIFETIME: Duration = Duration::from_millis(100);
2026

27+
/// Maximum cache size, set to prevent excessive map modification latency.
28+
const MAX_CACHE_ENTRIES: usize = 512;
29+
2130
extern "C" {
2231
pub fn __dtrace_probe_next__hop(
2332
dst: uintptr_t,
@@ -144,7 +153,7 @@ fn netstack_rele(ns: *mut ip::netstack_t) {
144153
//
145154
// Let's say this host (sled1) wants to send a packet to sled2. Our
146155
// sled1 host lives on network `fd00:<rack1_sled1>::/64` while our
147-
// sled2 host lives on `fd00:<rack1_seld2>::/64` -- the key point
156+
// sled2 host lives on `fd00:<rack1_sled2>::/64` -- the key point
148157
// being they are two different networks and thus must be routed to
149158
// talk to each other. For sled1 to send this packet it will attempt
150159
// to look up destination `fd00:<rack1_sled2>::7777` (in this case
@@ -387,6 +396,35 @@ fn next_hop<'a>(
387396
}
388397

389398
/// A simple caching layer over `next_hop`.
399+
///
400+
/// [`next_hop`] has a latency distribution which roughly looks like this:
401+
/// ```text
402+
/// t(ns) Count
403+
/// 1024 | 337
404+
/// 1280 | 108
405+
/// 1536 |@@@@@@@@@@@@@@@@@@@@@ 376883
406+
/// 1792 |@@@@@@@@@@@@@@@ 264693
407+
/// 2048 |@ 17798
408+
/// 2304 |@ 14791
409+
/// 2560 |@@ 32901
410+
/// 2816 |@ 10730
411+
/// 3072 | 3459
412+
/// ```
413+
///
414+
/// Naturally, bringing this down to O(ns) is desirable. Usually, illumos
415+
/// holds `ire_t`s per `conn_t`, but we're aiming to be more fine-grained
416+
/// with DDM -- so we need a tradeoff between 'asking about the best route
417+
/// per-packet' and 'holding a route until it is expired'. We choose, for noe,
418+
/// to hold a route for 100ms.
419+
///
420+
/// Note, this uses a `BTreeMap`, but we would prefer the more consistent
421+
/// (faster) add/remove costs of a `HashMap`. As `BTreeMap` modification costs
422+
/// outpace the cost of `next_hop` between 256--512 entries, we currently set 512
423+
/// as a cap on cache size to prevent significant packet stalls. This may be tricky
424+
/// to tune.
425+
///
426+
/// (See: https://github.com/oxidecomputer/opte/pull/499#discussion_r1581164767
427+
/// for some performance numbers.)
390428
#[derive(Clone)]
391429
pub struct RouteCache(Arc<KRwLock<BTreeMap<RouteKey, CachedRoute>>>);
392430

@@ -432,15 +470,28 @@ impl RouteCache {
432470
_ => {}
433471
}
434472

473+
// We've had a definitive flow miss, but we need to cap the cache
474+
// size to prevent excessive modification latencies at high flow
475+
// counts.
476+
// If full and we have no old entry to update, drop the lock and do
477+
// not insert.
478+
// XXX: Want to profile in future to see if LRU expiry is
479+
// affordable/sane here.
480+
// XXX: A HashMap would exchange insert cost for lookup.
481+
let route_cache = (maybe_route.is_some()
482+
|| route_cache.len() <= MAX_CACHE_ENTRIES)
483+
.then_some(route_cache);
484+
435485
// `next_hop` might fail for myriad reasons, but we still
436486
// send the packet on an underlay device depending on our
437487
// progress. However, we do not want to cache bad mappings.
438-
match next_hop(&key, xde) {
439-
Ok(route) => {
488+
match (route_cache, next_hop(&key, xde)) {
489+
(Some(mut route_cache), Ok(route)) => {
440490
route_cache.insert(key, route.cached(xde, t));
441491
route
442492
}
443-
Err(underlay_dev) => Route {
493+
(None, Ok(route)) => route,
494+
(_, Err(underlay_dev)) => Route {
444495
src: EtherAddr::zero(),
445496
dst: EtherAddr::zero(),
446497
underlay_dev,

0 commit comments

Comments
 (0)