Skip to content

Commit e4bcfdf

Browse files
authored
fix(gateway): allow local sandbox jwt to not expire (NVIDIA#1721)
1 parent d5b79e5 commit e4bcfdf

15 files changed

Lines changed: 657 additions & 29 deletions

File tree

architecture/gateway.md

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -60,8 +60,11 @@ checks the returned pod binding against the live pod UID, and verifies the pod's
6060
controlling `Sandbox` ownerReference against the live Sandbox CR UID and
6161
sandbox-id label before minting the gateway JWT. Supervisors renew gateway JWTs
6262
in memory before expiry only while the sandbox record still exists. Older tokens
63-
are not server-revoked; deployments bound replay exposure with short
64-
`gateway_jwt.ttl_secs` lifetimes.
63+
are not server-revoked; shared deployments bound replay exposure with short
64+
`gateway_jwt.ttl_secs` lifetimes. The config default is
65+
`gateway_jwt.ttl_secs = 0` for local single-player Docker, Podman, and VM
66+
gateways; those tokens carry `exp = 0` and do not expire. Kubernetes and other
67+
shared deployments should set a positive TTL.
6568

6669
Gateway JWT signing-key rotation is currently an offline operator action. The
6770
runtime loads one active signing key and one matching public verification key

crates/openshell-core/src/config.rs

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -504,7 +504,8 @@ pub struct GatewayJwtConfig {
504504
/// hostname-or-`openshell` placeholder if unset.
505505
#[serde(default = "default_gateway_id")]
506506
pub gateway_id: String,
507-
/// Token lifetime in seconds. Defaults to 1 hour.
507+
/// Token lifetime in seconds. A value of 0 disables expiration and is
508+
/// intended only for local single-player deployments.
508509
#[serde(default = "default_sandbox_token_ttl_secs")]
509510
pub ttl_secs: u64,
510511
}
@@ -514,7 +515,7 @@ fn default_gateway_id() -> String {
514515
}
515516

516517
const fn default_sandbox_token_ttl_secs() -> u64 {
517-
3_600
518+
0
518519
}
519520

520521
fn default_roles_claim() -> String {
@@ -726,7 +727,7 @@ mod tests {
726727
#[cfg(unix)]
727728
use super::is_reachable_unix_socket;
728729
use super::{
729-
ComputeDriverKind, Config, DEFAULT_SERVICE_ROUTING_DOMAIN, detect_driver,
730+
ComputeDriverKind, Config, DEFAULT_SERVICE_ROUTING_DOMAIN, GatewayJwtConfig, detect_driver,
730731
docker_host_unix_socket_path, is_unix_socket, podman_socket_candidates_from_env,
731732
podman_socket_responds,
732733
};
@@ -781,6 +782,18 @@ mod tests {
781782
assert!(!cfg.auth.allow_unauthenticated_users);
782783
}
783784

785+
#[test]
786+
fn gateway_jwt_ttl_defaults_to_non_expiring() {
787+
let cfg: GatewayJwtConfig = serde_json::from_value(serde_json::json!({
788+
"signing_key_path": "/tmp/signing.pem",
789+
"public_key_path": "/tmp/public.pem",
790+
"kid_path": "/tmp/kid"
791+
}))
792+
.expect("gateway JWT config should deserialize with default ttl");
793+
794+
assert_eq!(cfg.ttl_secs, 0);
795+
}
796+
784797
#[test]
785798
fn service_routing_allows_loopback_plaintext_http_by_default() {
786799
let cfg = Config::new(None);

crates/openshell-sandbox/src/grpc_client.rs

Lines changed: 25 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -389,7 +389,7 @@ async fn refresh_token_loop(
389389
/// Compute the next refresh delay: 80 % of the time remaining until the
390390
/// current token's `exp`, plus up to 10 % jitter, with a small lower bound
391391
/// for already-expired tokens and capped at 12 h. If the token can't be parsed
392-
/// (legacy/non-JWT bearer)
392+
/// (legacy/non-JWT bearer) or carries the `exp = 0` non-expiring sentinel,
393393
/// default to 6 h.
394394
fn compute_refresh_delay(slot: &TokenSlot) -> Duration {
395395
let token = slot
@@ -404,11 +404,16 @@ fn compute_refresh_delay(slot: &TokenSlot) -> Duration {
404404
.map_or(0, |d| d.as_millis()),
405405
)
406406
.unwrap_or(i64::MAX);
407-
let remaining_ms = parse_jwt_exp_ms(bearer).map_or(21_600_000, |exp| exp - now_ms); // 6 h fallback
408-
let mut delay_ms = if remaining_ms <= 0 {
409-
1_000
410-
} else {
411-
(remaining_ms * 8 / 10).clamp(1_000, 43_200_000)
407+
let mut delay_ms = match parse_jwt_exp_ms(bearer) {
408+
Some(0) | None => 21_600_000,
409+
Some(exp) => {
410+
let remaining_ms = exp - now_ms;
411+
if remaining_ms <= 0 {
412+
1_000
413+
} else {
414+
(remaining_ms * 8 / 10).clamp(1_000, 43_200_000)
415+
}
416+
}
412417
};
413418
// Up to 10 % jitter, derived deterministically from token bytes so
414419
// unit tests are reproducible without injecting an RNG.
@@ -494,6 +499,20 @@ mod auth_tests {
494499
assert!((1..60).contains(&delay.as_secs()));
495500
}
496501

502+
#[test]
503+
fn compute_refresh_delay_treats_exp_zero_as_non_expiring() {
504+
use base64::Engine as _;
505+
let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(r#"{"exp":0}"#);
506+
let token = format!("h.{payload}.s");
507+
let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")).unwrap();
508+
let slot: TokenSlot = Arc::new(RwLock::new(bearer));
509+
let delay = compute_refresh_delay(&slot);
510+
assert!(
511+
(6 * 60 * 60..=7 * 60 * 60).contains(&delay.as_secs()),
512+
"non-expiring tokens should use the fallback refresh delay, got {delay:?}"
513+
);
514+
}
515+
497516
#[test]
498517
fn compute_refresh_delay_supports_short_token_ttl() {
499518
use base64::Engine as _;

crates/openshell-server/src/auth/sandbox_jwt.rs

Lines changed: 51 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@ use tracing::{debug, warn};
3131
/// reuse the same subject namespace without breaking handler equality
3232
/// checks.
3333
const SPIFFE_SUBJECT_PREFIX: &str = "spiffe://openshell/sandbox/";
34+
const SANDBOX_JWT_EXP_LEEWAY_SECS: i64 = 60;
3435

3536
/// JWT claim set serialized in every gateway-minted sandbox token.
3637
#[derive(Debug, Serialize, Deserialize)]
@@ -100,7 +101,11 @@ impl SandboxJwtIssuer {
100101
#[allow(clippy::result_large_err)] // `tonic::Status` is the natural error here
101102
pub fn mint(&self, sandbox_id: &str) -> Result<MintedToken, Status> {
102103
let now = now_secs();
103-
let exp = now + i64::try_from(self.ttl.as_secs()).unwrap_or(3_600);
104+
let exp = if self.ttl.is_zero() {
105+
0
106+
} else {
107+
now.saturating_add(i64::try_from(self.ttl.as_secs()).unwrap_or(3_600))
108+
};
104109
let claims = SandboxJwtClaims {
105110
sub: format!("{SPIFFE_SUBJECT_PREFIX}{sandbox_id}"),
106111
iss: self.issuer.clone(),
@@ -178,6 +183,7 @@ impl SandboxJwtAuthenticator {
178183
validation.set_issuer(&[&self.issuer]);
179184
validation.set_audience(&[&self.audience]);
180185
validation.set_required_spec_claims(&["iss", "aud", "exp", "sub"]);
186+
validation.validate_exp = false;
181187

182188
let data =
183189
decode::<SandboxJwtClaims>(token, &self.decoding_key, &validation).map_err(|e| {
@@ -186,6 +192,7 @@ impl SandboxJwtAuthenticator {
186192
})?;
187193

188194
let claims = data.claims;
195+
validate_exp(claims.exp)?;
189196
Ok(Some(Principal::Sandbox(SandboxPrincipal {
190197
sandbox_id: claims.sandbox_id,
191198
source: SandboxIdentitySource::BootstrapJwt { issuer: claims.iss },
@@ -212,6 +219,20 @@ impl Authenticator for SandboxJwtAuthenticator {
212219
}
213220
}
214221

222+
#[allow(clippy::result_large_err)]
223+
fn validate_exp(exp: i64) -> Result<(), Status> {
224+
if exp == 0 {
225+
return Ok(());
226+
}
227+
228+
if exp < now_secs().saturating_sub(SANDBOX_JWT_EXP_LEEWAY_SECS) {
229+
debug!("sandbox JWT expired");
230+
return Err(Status::unauthenticated("invalid token: ExpiredSignature"));
231+
}
232+
233+
Ok(())
234+
}
235+
215236
fn now_secs() -> i64 {
216237
i64::try_from(
217238
SystemTime::now()
@@ -236,12 +257,16 @@ mod tests {
236257
}
237258

238259
fn pair() -> (SandboxJwtIssuer, SandboxJwtAuthenticator) {
260+
pair_with_ttl(Duration::from_secs(3600))
261+
}
262+
263+
fn pair_with_ttl(ttl: Duration) -> (SandboxJwtIssuer, SandboxJwtAuthenticator) {
239264
let mat = generate_jwt_key().expect("jwt key");
240265
let issuer = SandboxJwtIssuer::from_pem(
241266
mat.signing_key_pem.as_bytes(),
242267
mat.kid.clone(),
243268
"test-gateway",
244-
Duration::from_secs(3600),
269+
ttl,
245270
)
246271
.unwrap();
247272
let auth = SandboxJwtAuthenticator::from_pem(
@@ -276,6 +301,30 @@ mod tests {
276301
}
277302
}
278303

304+
#[tokio::test]
305+
async fn ttl_zero_mints_non_expiring_token() {
306+
let (issuer, auth) = pair_with_ttl(Duration::ZERO);
307+
let minted = issuer.mint("sandbox-never").unwrap();
308+
assert_eq!(minted.expires_at_ms, 0);
309+
310+
let principal = auth
311+
.authenticate(&header_map_with_bearer(&minted.token), "/anything")
312+
.await
313+
.unwrap()
314+
.expect("exp=0 token should authenticate");
315+
assert!(matches!(principal, Principal::Sandbox(_)));
316+
317+
let mut validation = Validation::new(Algorithm::EdDSA);
318+
validation.algorithms = vec![Algorithm::EdDSA];
319+
validation.set_issuer(&["openshell-gateway:test-gateway"]);
320+
validation.set_audience(&["openshell-gateway:test-gateway"]);
321+
validation.set_required_spec_claims(&["iss", "aud", "exp", "sub"]);
322+
validation.validate_exp = false;
323+
let decoded = decode::<SandboxJwtClaims>(&minted.token, &auth.decoding_key, &validation)
324+
.expect("token should decode");
325+
assert_eq!(decoded.claims.exp, 0);
326+
}
327+
279328
#[tokio::test]
280329
async fn token_signed_by_other_key_is_rejected() {
281330
let (_, auth_a) = pair();

crates/openshell-server/src/cli.rs

Lines changed: 43 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -618,6 +618,13 @@ fn effective_single_driver(args: &RunArgs) -> Option<ComputeDriverKind> {
618618
}
619619
}
620620

621+
fn is_singleplayer_driver(args: &RunArgs) -> bool {
622+
matches!(
623+
effective_single_driver(args),
624+
Some(ComputeDriverKind::Docker | ComputeDriverKind::Podman | ComputeDriverKind::Vm)
625+
)
626+
}
627+
621628
fn resolve_mtls_auth_enabled(
622629
args: &RunArgs,
623630
matches: &ArgMatches,
@@ -634,10 +641,7 @@ fn resolve_mtls_auth_enabled(
634641
return false;
635642
}
636643

637-
matches!(
638-
effective_single_driver(args),
639-
Some(ComputeDriverKind::Docker | ComputeDriverKind::Podman | ComputeDriverKind::Vm)
640-
)
644+
is_singleplayer_driver(args)
641645
}
642646

643647
/// Build [`VmComputeConfig`] from the `[openshell.drivers.vm]` table
@@ -1376,6 +1380,41 @@ ssh_session_ttl_secs = 1234
13761380
assert_eq!(file.openshell.gateway.ssh_session_ttl_secs, Some(1234));
13771381
}
13781382

1383+
#[test]
1384+
fn singleplayer_driver_matches_only_one_local_driver() {
1385+
for driver in ["docker", "podman", "vm"] {
1386+
let (args, _) = parse_with_args(&[
1387+
"openshell-gateway",
1388+
"--db-url",
1389+
"sqlite::memory:",
1390+
"--drivers",
1391+
driver,
1392+
]);
1393+
assert!(
1394+
super::is_singleplayer_driver(&args),
1395+
"{driver} should be singleplayer"
1396+
);
1397+
}
1398+
1399+
let (k8s, _) = parse_with_args(&[
1400+
"openshell-gateway",
1401+
"--db-url",
1402+
"sqlite::memory:",
1403+
"--drivers",
1404+
"kubernetes",
1405+
]);
1406+
assert!(!super::is_singleplayer_driver(&k8s));
1407+
1408+
let (multi, _) = parse_with_args(&[
1409+
"openshell-gateway",
1410+
"--db-url",
1411+
"sqlite::memory:",
1412+
"--drivers",
1413+
"docker,podman",
1414+
]);
1415+
assert!(!super::is_singleplayer_driver(&multi));
1416+
}
1417+
13791418
#[test]
13801419
fn file_populates_service_routing_fields() {
13811420
let _lock = ENV_LOCK

crates/openshell-server/src/defaults.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ pub fn complete_local_jwt_config() -> Result<Option<GatewayJwtConfig>> {
104104
public_key_path: paths.public_key,
105105
kid_path: paths.kid,
106106
gateway_id: "openshell".to_string(),
107-
ttl_secs: 3_600,
107+
ttl_secs: 0,
108108
})),
109109
_ => Err(miette::miette!(
110110
"partial local sandbox JWT state in {}: expected jwt/signing.pem, jwt/public.pem, and jwt/kid",
@@ -237,6 +237,6 @@ mod tests {
237237
assert_eq!(config.public_key_path, tmp.path().join("jwt/public.pem"));
238238
assert_eq!(config.kid_path, tmp.path().join("jwt/kid"));
239239
assert_eq!(config.gateway_id, "openshell");
240-
assert_eq!(config.ttl_secs, 3_600);
240+
assert_eq!(config.ttl_secs, 0);
241241
}
242242
}

crates/openshell-server/src/lib.rs

Lines changed: 51 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -705,6 +705,7 @@ async fn build_compute_runtime(
705705
) -> Result<ComputeRuntime> {
706706
let driver = configured_compute_driver(config)?;
707707
info!(driver = %driver, "Using compute driver");
708+
warn_if_kubernetes_sandbox_jwt_expiry_disabled(config, driver);
708709

709710
match driver {
710711
ComputeDriverKind::Kubernetes => {
@@ -878,13 +879,30 @@ fn configured_compute_driver(config: &Config) -> Result<ComputeDriverKind> {
878879
}
879880
}
880881

882+
fn kubernetes_sandbox_jwt_expiry_disabled(config: &Config, driver: ComputeDriverKind) -> bool {
883+
matches!(driver, ComputeDriverKind::Kubernetes)
884+
&& config
885+
.gateway_jwt
886+
.as_ref()
887+
.is_some_and(|jwt| jwt.ttl_secs == 0)
888+
}
889+
890+
fn warn_if_kubernetes_sandbox_jwt_expiry_disabled(config: &Config, driver: ComputeDriverKind) {
891+
if kubernetes_sandbox_jwt_expiry_disabled(config, driver) {
892+
warn!(
893+
"Kubernetes gateway configured with non-expiring sandbox JWTs (gateway_jwt.ttl_secs = 0); set ttl_secs > 0 for shared Kubernetes deployments"
894+
);
895+
}
896+
}
897+
881898
#[cfg(test)]
882899
mod tests {
883900
use super::{
884901
ConnectionProtocol, MultiplexService, ServerState, TlsAcceptor,
885902
allow_plaintext_service_http, classify_initial_bytes, configured_compute_driver,
886903
gateway_listener_addresses, is_benign_tls_handshake_failure,
887-
kubernetes_config_for_k8s_sa_bootstrap, serve_gateway_listener,
904+
kubernetes_config_for_k8s_sa_bootstrap, kubernetes_sandbox_jwt_expiry_disabled,
905+
serve_gateway_listener,
888906
};
889907
use openshell_core::{
890908
ComputeDriverKind, Config,
@@ -1288,6 +1306,38 @@ mod tests {
12881306
);
12891307
}
12901308

1309+
#[test]
1310+
fn kubernetes_sandbox_jwt_expiry_disabled_warns_only_for_kubernetes_zero_ttl() {
1311+
fn config_with_jwt_ttl(ttl_secs: u64) -> Config {
1312+
let mut config = Config::new(None);
1313+
config.gateway_jwt = Some(openshell_core::GatewayJwtConfig {
1314+
signing_key_path: "/tmp/signing.pem".into(),
1315+
public_key_path: "/tmp/public.pem".into(),
1316+
kid_path: "/tmp/kid".into(),
1317+
gateway_id: "openshell".to_string(),
1318+
ttl_secs,
1319+
});
1320+
config
1321+
}
1322+
1323+
assert!(kubernetes_sandbox_jwt_expiry_disabled(
1324+
&config_with_jwt_ttl(0),
1325+
ComputeDriverKind::Kubernetes
1326+
));
1327+
assert!(!kubernetes_sandbox_jwt_expiry_disabled(
1328+
&config_with_jwt_ttl(3600),
1329+
ComputeDriverKind::Kubernetes
1330+
));
1331+
assert!(!kubernetes_sandbox_jwt_expiry_disabled(
1332+
&config_with_jwt_ttl(0),
1333+
ComputeDriverKind::Docker
1334+
));
1335+
assert!(!kubernetes_sandbox_jwt_expiry_disabled(
1336+
&Config::new(None),
1337+
ComputeDriverKind::Kubernetes
1338+
));
1339+
}
1340+
12911341
#[test]
12921342
fn k8s_sa_bootstrap_rejects_missing_kubernetes_driver_config() {
12931343
let err = kubernetes_config_for_k8s_sa_bootstrap(None).unwrap_err();

0 commit comments

Comments
 (0)