1+ // This Source Code Form is subject to the terms of the Mozilla Public
2+ // License, v. 2.0. If a copy of the MPL was not distributed with this
3+ // file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+ // Copyright 2024 Oxide Computer Company
6+
17use crate :: ip;
28use crate :: sys;
39use crate :: xde:: xde_underlay_port;
@@ -18,6 +24,9 @@ use opte::engine::ip6::Ipv6Addr;
/// The duration a cached route remains valid for.
///
/// Chosen as a tradeoff between per-packet route lookups and holding a
/// stale route indefinitely (see the `RouteCache` docs below).
const MAX_ROUTE_LIFETIME: Duration = Duration::from_millis(100);
2026
/// Maximum cache size, set to prevent excessive map modification latency.
///
/// `BTreeMap` insert/remove costs grow with entry count; 512 is the point
/// past which modification cost outpaces the cost of a fresh `next_hop`
/// lookup -- see the `RouteCache` docs for profiling details.
const MAX_CACHE_ENTRIES: usize = 512;
29+
2130extern "C" {
2231 pub fn __dtrace_probe_next__hop (
2332 dst : uintptr_t ,
@@ -144,7 +153,7 @@ fn netstack_rele(ns: *mut ip::netstack_t) {
144153//
145154// Let's say this host (sled1) wants to send a packet to sled2. Our
146155// sled1 host lives on network `fd00:<rack1_sled1>::/64` while our
147- // sled2 host lives on `fd00:<rack1_seld2 >::/64` -- the key point
156+ // sled2 host lives on `fd00:<rack1_sled2 >::/64` -- the key point
148157// being they are two different networks and thus must be routed to
149158// talk to each other. For sled1 to send this packet it will attempt
150159// to look up destination `fd00:<rack1_sled2>::7777` (in this case
@@ -387,6 +396,35 @@ fn next_hop<'a>(
387396}
388397
389398/// A simple caching layer over `next_hop`.
399+ ///
400+ /// [`next_hop`] has a latency distribution which roughly looks like this:
401+ /// ```text
402+ /// t(ns) Count
403+ /// 1024 | 337
404+ /// 1280 | 108
405+ /// 1536 |@@@@@@@@@@@@@@@@@@@@@ 376883
406+ /// 1792 |@@@@@@@@@@@@@@@ 264693
407+ /// 2048 |@ 17798
408+ /// 2304 |@ 14791
409+ /// 2560 |@@ 32901
410+ /// 2816 |@ 10730
411+ /// 3072 | 3459
412+ /// ```
413+ ///
414+ /// Naturally, bringing this down to O(ns) is desirable. Usually, illumos
415+ /// holds `ire_t`s per `conn_t`, but we're aiming to be more fine-grained
416+ /// with DDM -- so we need a tradeoff between 'asking about the best route
417+ /// per-packet' and 'holding a route until it is expired'. We choose, for noe,
418+ /// to hold a route for 100ms.
419+ ///
420+ /// Note, this uses a `BTreeMap`, but we would prefer the more consistent
421+ /// (faster) add/remove costs of a `HashMap`. As `BTreeMap` modification costs
422+ /// outpace the cost of `next_hop` between 256--512 entries, we currently set 512
423+ /// as a cap on cache size to prevent significant packet stalls. This may be tricky
424+ /// to tune.
425+ ///
426+ /// (See: https://github.com/oxidecomputer/opte/pull/499#discussion_r1581164767
427+ /// for some performance numbers.)
390428#[ derive( Clone ) ]
391429pub struct RouteCache ( Arc < KRwLock < BTreeMap < RouteKey , CachedRoute > > > ) ;
392430
@@ -432,15 +470,28 @@ impl RouteCache {
432470 _ => { }
433471 }
434472
473+ // We've had a definitive flow miss, but we need to cap the cache
474+ // size to prevent excessive modification latencies at high flow
475+ // counts.
476+ // If full and we have no old entry to update, drop the lock and do
477+ // not insert.
478+ // XXX: Want to profile in future to see if LRU expiry is
479+ // affordable/sane here.
480+ // XXX: A HashMap would exchange insert cost for lookup.
481+ let route_cache = ( maybe_route. is_some ( )
482+ || route_cache. len ( ) <= MAX_CACHE_ENTRIES )
483+ . then_some ( route_cache) ;
484+
435485 // `next_hop` might fail for myriad reasons, but we still
436486 // send the packet on an underlay device depending on our
437487 // progress. However, we do not want to cache bad mappings.
438- match next_hop ( & key, xde) {
439- Ok ( route) => {
488+ match ( route_cache , next_hop ( & key, xde) ) {
489+ ( Some ( mut route_cache ) , Ok ( route) ) => {
440490 route_cache. insert ( key, route. cached ( xde, t) ) ;
441491 route
442492 }
443- Err ( underlay_dev) => Route {
493+ ( None , Ok ( route) ) => route,
494+ ( _, Err ( underlay_dev) ) => Route {
444495 src : EtherAddr :: zero ( ) ,
445496 dst : EtherAddr :: zero ( ) ,
446497 underlay_dev,
0 commit comments