From d3c2ca138888627be0889b6633b389595c775dd2 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 19 Mar 2026 01:47:19 +0000 Subject: [PATCH 1/5] kms: add bootstrap and onboard KMS auth checks --- kms/src/main_service.rs | 31 +------ kms/src/main_service/upgrade_authority.rs | 34 ++++++++ kms/src/onboard_service.rs | 98 +++++++++++++++++++++-- 3 files changed, 129 insertions(+), 34 deletions(-) diff --git a/kms/src/main_service.rs b/kms/src/main_service.rs index 52573d4b..9c04a824 100644 --- a/kms/src/main_service.rs +++ b/kms/src/main_service.rs @@ -23,14 +23,14 @@ use ra_tls::{ use scale::Decode; use sha2::Digest; use tracing::info; -use upgrade_authority::BootInfo; +use upgrade_authority::{build_boot_info, BootInfo}; use crate::{ config::KmsConfig, crypto::{derive_k256_key, sign_message, sign_message_with_timestamp}, }; -mod upgrade_authority; +pub(crate) mod upgrade_authority; #[derive(Clone)] pub struct KmsState { @@ -169,32 +169,7 @@ impl RpcHandler { use_boottime_mr: bool, vm_config_str: &str, ) -> Result { - let tcb_status; - let advisory_ids; - match att.report.tdx_report() { - Some(report) => { - tcb_status = report.status.clone(); - advisory_ids = report.advisory_ids.clone(); - } - None => { - tcb_status = "".to_string(); - advisory_ids = Vec::new(); - } - }; - let app_info = att.decode_app_info_ex(use_boottime_mr, vm_config_str)?; - let boot_info = BootInfo { - attestation_mode: att.quote.mode(), - mr_aggregated: app_info.mr_aggregated.to_vec(), - os_image_hash: app_info.os_image_hash, - mr_system: app_info.mr_system.to_vec(), - app_id: app_info.app_id, - compose_hash: app_info.compose_hash, - instance_id: app_info.instance_id, - device_id: app_info.device_id, - key_provider_info: app_info.key_provider_info, - tcb_status, - advisory_ids, - }; + let boot_info = build_boot_info(att, use_boottime_mr, vm_config_str)?; let response = self .state .config diff --git a/kms/src/main_service/upgrade_authority.rs b/kms/src/main_service/upgrade_authority.rs index 
b461e8ad..4cdcbb28 100644 --- a/kms/src/main_service/upgrade_authority.rs +++ b/kms/src/main_service/upgrade_authority.rs @@ -5,6 +5,7 @@ use crate::config::AuthApi; use anyhow::{bail, Context, Result}; use ra_tls::attestation::AttestationMode; +use ra_tls::attestation::VerifiedAttestation; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use serde_human_bytes as hex_bytes; @@ -33,6 +34,39 @@ pub(crate) struct BootInfo { pub advisory_ids: Vec, } +pub(crate) fn build_boot_info( + att: &VerifiedAttestation, + use_boottime_mr: bool, + vm_config_str: &str, +) -> Result { + let tcb_status; + let advisory_ids; + match att.report.tdx_report() { + Some(report) => { + tcb_status = report.status.clone(); + advisory_ids = report.advisory_ids.clone(); + } + None => { + tcb_status = "".to_string(); + advisory_ids = Vec::new(); + } + }; + let app_info = att.decode_app_info_ex(use_boottime_mr, vm_config_str)?; + Ok(BootInfo { + attestation_mode: att.quote.mode(), + mr_aggregated: app_info.mr_aggregated.to_vec(), + os_image_hash: app_info.os_image_hash, + mr_system: app_info.mr_system.to_vec(), + app_id: app_info.app_id, + compose_hash: app_info.compose_hash, + instance_id: app_info.instance_id, + device_id: app_info.device_id, + key_provider_info: app_info.key_provider_info, + tcb_status, + advisory_ids, + }) +} + #[derive(Debug, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub(crate) struct BootResponse { diff --git a/kms/src/onboard_service.rs b/kms/src/onboard_service.rs index 4fb77dd2..dab76969 100644 --- a/kms/src/onboard_service.rs +++ b/kms/src/onboard_service.rs @@ -2,7 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 -use anyhow::{Context, Result}; +use std::sync::{Arc, Mutex}; + +use anyhow::{bail, Context, Result}; use dstack_guest_agent_rpc::{ dstack_guest_client::DstackGuestClient, AttestResponse, RawQuoteArgs, }; @@ -15,15 +17,18 @@ use dstack_kms_rpc::{ use fs_err as fs; use http_client::prpc::PrpcClient; use 
k256::ecdsa::SigningKey; -use ra_rpc::{client::RaClient, CallContext, RpcCall}; +use ra_rpc::{ + client::{CertInfo, RaClient, RaClientConfig}, + CallContext, RpcCall, +}; use ra_tls::{ - attestation::{QuoteContentType, VersionedAttestation}, + attestation::{QuoteContentType, VerifiedAttestation, VersionedAttestation}, cert::{CaCert, CertRequest}, rcgen::{Certificate, KeyPair, PKCS_ECDSA_P256_SHA256}, }; use safe_write::safe_write; -use crate::config::KmsConfig; +use crate::{config::KmsConfig, main_service::upgrade_authority::build_boot_info}; #[derive(Clone)] pub struct OnboardState { @@ -53,6 +58,11 @@ impl RpcCall for OnboardHandler { impl OnboardRpc for OnboardHandler { async fn bootstrap(self, request: BootstrapRequest) -> Result { let quote_enabled = self.state.config.onboard.quote_enabled; + if quote_enabled { + ensure_self_kms_allowed(&self.state.config) + .await + .context("KMS is not allowed to bootstrap")?; + } let keys = Keys::generate(&request.domain, quote_enabled) .await .context("Failed to generate keys")?; @@ -85,6 +95,7 @@ impl OnboardRpc for OnboardHandler { format!("{source_url}/prpc") }; let keys = Keys::onboard( + &self.state.config, &source_url, &request.domain, self.state.config.onboard.quote_enabled, @@ -222,13 +233,39 @@ impl Keys { } async fn onboard( + cfg: &KmsConfig, other_kms_url: &str, domain: &str, quote_enabled: bool, pccs_url: Option, ) -> Result { - let kms_client = RaClient::new(other_kms_url.into(), true)?; - let mut kms_client = KmsClient::new(kms_client); + let mut source_attestation_slot = None; + let mut kms_client = if quote_enabled { + let attestation_slot = Arc::new(Mutex::new(None::)); + let attestation_slot_out = attestation_slot.clone(); + let client = RaClientConfig::builder() + .tls_no_check(true) + .remote_uri(other_kms_url.to_string()) + .cert_validator(Box::new(move |info: Option| { + let Some(info) = info else { + bail!("Source KMS did not present a TLS certificate"); + }; + let Some(attestation) = 
info.attestation else { + bail!("Source KMS certificate does not contain attestation"); + }; + *attestation_slot_out + .lock() + .expect("source attestation mutex poisoned") = Some(attestation); + Ok(()) + })) + .maybe_pccs_url(pccs_url.clone()) + .build() + .into_client()?; + source_attestation_slot = Some(attestation_slot); + KmsClient::new(client) + } else { + KmsClient::new(RaClient::new(other_kms_url.into(), true)?) + }; if quote_enabled { let tmp_ca = kms_client.get_temp_ca_cert().await?; @@ -236,6 +273,15 @@ impl Keys { let ra_client = RaClient::new_mtls(other_kms_url.into(), ra_cert, ra_key, pccs_url) .context("Failed to create client")?; kms_client = KmsClient::new(ra_client); + let source_attestation = source_attestation_slot + .expect("source attestation slot missing") + .lock() + .expect("source attestation mutex poisoned") + .clone() + .context("Missing source KMS attestation")?; + ensure_remote_kms_allowed(cfg, &source_attestation) + .await + .context("Source KMS is not allowed for onboarding")?; } let info = dstack_client().info().await.context("Failed to get info")?; @@ -328,6 +374,11 @@ pub(crate) async fn update_certs(cfg: &KmsConfig) -> Result<()> { } pub(crate) async fn bootstrap_keys(cfg: &KmsConfig) -> Result<()> { + if cfg.onboard.quote_enabled { + ensure_self_kms_allowed(cfg) + .await + .context("KMS is not allowed to auto-bootstrap")?; + } let keys = Keys::generate( &cfg.onboard.auto_bootstrap_domain, cfg.onboard.quote_enabled, @@ -348,6 +399,41 @@ async fn app_attest(report_data: Vec) -> Result { dstack_client().attest(RawQuoteArgs { report_data }).await } +async fn ensure_self_kms_allowed(cfg: &KmsConfig) -> Result<()> { + let response = app_attest(pad64([0u8; 32])) + .await + .context("Failed to get local KMS attestation")?; + let attestation = VersionedAttestation::from_scale(&response.attestation) + .context("Failed to decode local KMS attestation")? 
+ .into_inner(); + let verified = attestation + .verify(cfg.pccs_url.as_deref()) + .await + .context("Failed to verify local KMS attestation")?; + ensure_kms_allowed(cfg, &verified).await +} + +async fn ensure_remote_kms_allowed( + cfg: &KmsConfig, + attestation: &VerifiedAttestation, +) -> Result<()> { + ensure_kms_allowed(cfg, attestation).await +} + +async fn ensure_kms_allowed(cfg: &KmsConfig, attestation: &VerifiedAttestation) -> Result<()> { + let boot_info = build_boot_info(attestation, false, "") + .context("Failed to build KMS boot info from attestation")?; + let response = cfg + .auth_api + .is_app_allowed(&boot_info, true) + .await + .context("Failed to call KMS auth check")?; + if !response.is_allowed { + bail!("Boot denied: {}", response.reason); + } + Ok(()) +} + async fn attest_keys(p256_pubkey: &[u8], k256_pubkey: &[u8]) -> Result> { let p256_hex = hex::encode(p256_pubkey); let k256_hex = hex::encode(k256_pubkey); From 06d89a29934d4d1deab352fbf2ae2f66b5d2e360 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 19 Mar 2026 04:24:00 +0000 Subject: [PATCH 2/5] kms: enforce self authorization on trusted RPCs --- kms/src/main_service.rs | 45 ++++++++++++++++++++++- kms/src/main_service/upgrade_authority.rs | 36 ++++++++++++++++++ kms/src/onboard_service.rs | 24 +++++++----- 3 files changed, 94 insertions(+), 11 deletions(-) diff --git a/kms/src/main_service.rs b/kms/src/main_service.rs index 9c04a824..ddf79363 100644 --- a/kms/src/main_service.rs +++ b/kms/src/main_service.rs @@ -22,8 +22,9 @@ use ra_tls::{ }; use scale::Decode; use sha2::Digest; +use tokio::sync::OnceCell; use tracing::info; -use upgrade_authority::{build_boot_info, BootInfo}; +use upgrade_authority::{build_boot_info, local_kms_boot_info, BootInfo}; use crate::{ config::KmsConfig, @@ -52,6 +53,7 @@ pub struct KmsStateInner { temp_ca_cert: String, temp_ca_key: String, verifier: CvmVerifier, + self_boot_info: OnceCell, } impl KmsState { @@ -79,6 +81,7 @@ impl KmsState { temp_ca_cert, 
temp_ca_key, verifier, + self_boot_info: OnceCell::new(), }), }) } @@ -95,6 +98,31 @@ struct BootConfig { } impl RpcHandler { + async fn ensure_self_allowed(&self) -> Result<()> { + if !self.state.config.onboard.quote_enabled { + return Ok(()); + } + let boot_info = self + .state + .self_boot_info + .get_or_try_init(|| async { + local_kms_boot_info(self.state.config.pccs_url.as_deref()).await + }) + .await + .context("Failed to load cached self boot info")?; + let response = self + .state + .config + .auth_api + .is_app_allowed(boot_info, true) + .await + .context("Failed to call self KMS auth check")?; + if !response.is_allowed { + bail!("KMS is not allowed: {}", response.reason); + } + Ok(()) + } + fn ensure_attested(&self) -> Result<&VerifiedAttestation> { let Some(attestation) = &self.attestation else { bail!("No attestation provided"); @@ -214,6 +242,9 @@ impl KmsRpc for RpcHandler { if request.api_version > 1 { bail!("Unsupported API version: {}", request.api_version); } + self.ensure_self_allowed() + .await + .context("KMS self authorization failed")?; let BootConfig { boot_info, gateway_app_id, @@ -254,6 +285,9 @@ impl KmsRpc for RpcHandler { } async fn get_app_env_encrypt_pub_key(self, request: AppId) -> Result { + self.ensure_self_allowed() + .await + .context("KMS self authorization failed")?; let secret = kdf::derive_dh_secret( &self.state.root_ca.key, &[&request.app_id[..], "env-encrypt-key".as_bytes()], @@ -320,6 +354,9 @@ impl KmsRpc for RpcHandler { } async fn get_kms_key(self, request: GetKmsKeyRequest) -> Result { + self.ensure_self_allowed() + .await + .context("KMS self authorization failed")?; if self.state.config.onboard.quote_enabled { let _info = self.ensure_kms_allowed(&request.vm_config).await?; } @@ -333,6 +370,9 @@ impl KmsRpc for RpcHandler { } async fn get_temp_ca_cert(self) -> Result { + self.ensure_self_allowed() + .await + .context("KMS self authorization failed")?; Ok(GetTempCaCertResponse { temp_ca_cert: 
self.state.inner.temp_ca_cert.clone(), temp_ca_key: self.state.inner.temp_ca_key.clone(), @@ -341,6 +381,9 @@ impl KmsRpc for RpcHandler { } async fn sign_cert(self, request: SignCertRequest) -> Result { + self.ensure_self_allowed() + .await + .context("KMS self authorization failed")?; let csr = match request.api_version { 1 => { let csr = CertSigningRequestV1::decode(&mut &request.csr[..]) diff --git a/kms/src/main_service/upgrade_authority.rs b/kms/src/main_service/upgrade_authority.rs index 4cdcbb28..d2b64016 100644 --- a/kms/src/main_service/upgrade_authority.rs +++ b/kms/src/main_service/upgrade_authority.rs @@ -4,8 +4,13 @@ use crate::config::AuthApi; use anyhow::{bail, Context, Result}; +use dstack_guest_agent_rpc::{ + dstack_guest_client::DstackGuestClient, AttestResponse, RawQuoteArgs, +}; +use http_client::prpc::PrpcClient; use ra_tls::attestation::AttestationMode; use ra_tls::attestation::VerifiedAttestation; +use ra_tls::attestation::VersionedAttestation; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use serde_human_bytes as hex_bytes; @@ -67,6 +72,20 @@ pub(crate) fn build_boot_info( }) } +pub(crate) async fn local_kms_boot_info(pccs_url: Option<&str>) -> Result { + let response = app_attest(pad64([0u8; 32])) + .await + .context("Failed to get local KMS attestation")?; + let attestation = VersionedAttestation::from_scale(&response.attestation) + .context("Failed to decode local KMS attestation")? 
+ .into_inner(); + let verified = attestation + .verify(pccs_url) + .await + .context("Failed to verify local KMS attestation")?; + build_boot_info(&verified, false, "") +} + #[derive(Debug, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub(crate) struct BootResponse { @@ -168,3 +187,20 @@ fn url_join(url: &str, path: &str) -> String { url.push_str(path); url } + +fn dstack_client() -> DstackGuestClient { + let address = dstack_types::dstack_agent_address(); + let http_client = PrpcClient::new(address); + DstackGuestClient::new(http_client) +} + +async fn app_attest(report_data: Vec) -> Result { + dstack_client().attest(RawQuoteArgs { report_data }).await +} + +fn pad64(hash: [u8; 32]) -> Vec { + let mut padded = Vec::with_capacity(64); + padded.extend_from_slice(&hash); + padded.resize(64, 0); + padded +} diff --git a/kms/src/onboard_service.rs b/kms/src/onboard_service.rs index dab76969..f6ba8b64 100644 --- a/kms/src/onboard_service.rs +++ b/kms/src/onboard_service.rs @@ -28,7 +28,10 @@ use ra_tls::{ }; use safe_write::safe_write; -use crate::{config::KmsConfig, main_service::upgrade_authority::build_boot_info}; +use crate::{ + config::KmsConfig, + main_service::upgrade_authority::{build_boot_info, local_kms_boot_info}, +}; #[derive(Clone)] pub struct OnboardState { @@ -400,17 +403,18 @@ async fn app_attest(report_data: Vec) -> Result { } async fn ensure_self_kms_allowed(cfg: &KmsConfig) -> Result<()> { - let response = app_attest(pad64([0u8; 32])) + let boot_info = local_kms_boot_info(cfg.pccs_url.as_deref()) .await - .context("Failed to get local KMS attestation")?; - let attestation = VersionedAttestation::from_scale(&response.attestation) - .context("Failed to decode local KMS attestation")? 
- .into_inner(); - let verified = attestation - .verify(cfg.pccs_url.as_deref()) + .context("Failed to build local KMS boot info")?; + let response = cfg + .auth_api + .is_app_allowed(&boot_info, true) .await - .context("Failed to verify local KMS attestation")?; - ensure_kms_allowed(cfg, &verified).await + .context("Failed to call KMS auth check")?; + if !response.is_allowed { + bail!("Boot denied: {}", response.reason); + } + Ok(()) } async fn ensure_remote_kms_allowed( From 157ad4ba0310ba9e6f058cf5d38509459bcc769a Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 19 Mar 2026 04:29:11 +0000 Subject: [PATCH 3/5] kms: avoid expect in onboarding auth checks --- kms/src/onboard_service.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kms/src/onboard_service.rs b/kms/src/onboard_service.rs index f6ba8b64..2a5d945f 100644 --- a/kms/src/onboard_service.rs +++ b/kms/src/onboard_service.rs @@ -256,9 +256,10 @@ impl Keys { let Some(attestation) = info.attestation else { bail!("Source KMS certificate does not contain attestation"); }; - *attestation_slot_out + let mut slot = attestation_slot_out .lock() - .expect("source attestation mutex poisoned") = Some(attestation); + .map_err(|_| anyhow::anyhow!("source attestation mutex poisoned"))?; + *slot = Some(attestation); Ok(()) })) .maybe_pccs_url(pccs_url.clone()) @@ -277,9 +278,9 @@ impl Keys { .context("Failed to create client")?; kms_client = KmsClient::new(ra_client); let source_attestation = source_attestation_slot - .expect("source attestation slot missing") + .context("source attestation slot missing")? .lock() - .expect("source attestation mutex poisoned") + .map_err(|_| anyhow::anyhow!("source attestation mutex poisoned"))? 
.clone() .context("Missing source KMS attestation")?; ensure_remote_kms_allowed(cfg, &source_attestation) From c2b2c7ea31a9793c5fdb43d2d2d1d5b3bf2be8fb Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 19 Mar 2026 08:58:40 +0000 Subject: [PATCH 4/5] kms: simplify self boot info cache init --- kms/src/main_service.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kms/src/main_service.rs b/kms/src/main_service.rs index ddf79363..22d86521 100644 --- a/kms/src/main_service.rs +++ b/kms/src/main_service.rs @@ -105,9 +105,7 @@ impl RpcHandler { let boot_info = self .state .self_boot_info - .get_or_try_init(|| async { - local_kms_boot_info(self.state.config.pccs_url.as_deref()).await - }) + .get_or_try_init(|| local_kms_boot_info(self.state.config.pccs_url.as_deref())) .await .context("Failed to load cached self boot info")?; let response = self From ad924cb553fdad2a1fa8ddf55929fb0a5e761f81 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 19 Mar 2026 08:58:44 +0000 Subject: [PATCH 5/5] tests: document kms self authorization manual flow --- tests/docs/kms-self-authrization.md | 745 ++++++++++++++++++++++++++++ 1 file changed, 745 insertions(+) create mode 100644 tests/docs/kms-self-authrization.md diff --git a/tests/docs/kms-self-authrization.md b/tests/docs/kms-self-authrization.md new file mode 100644 index 00000000..eaaa85d9 --- /dev/null +++ b/tests/docs/kms-self-authrization.md @@ -0,0 +1,745 @@ +# KMS Self-Authorization Manual Integration Test Guide + +This document describes a manual, AI-executable integration test flow for the KMS self-authorization changes introduced in PR #573. + +The goal is to validate the following behaviors without depending on `kms/e2e/` from PR #538: + +1. **Bootstrap self-check**: a KMS with `quote_enabled = true` must call the auth API and verify that **itself** is allowed before bootstrap succeeds. +2. 
**Onboard receiver-side source check**: a new KMS with `quote_enabled = true` must reject onboarding if the **source KMS** is not allowed by the receiver's auth policy. +3. **Trusted RPC self-check**: trusted KMS RPCs such as `GetTempCaCert`, `GetKmsKey`, `GetAppKey`, and `SignCert` must fail when the running KMS is no longer allowed by its auth policy. +4. **Compatibility**: when `quote_enabled = false`, the new bootstrap/onboard self-authorization checks should be skipped. + +This guide is written as a deployment-and-test runbook so an AI agent can follow it end-to-end. + +> **Execution notes from a real run on teepod2 (2026-03-19):** +> +> 1. Do **not** assume a host-local `auth-simple` instance is reachable from a CVM. In practice, the auth API must be: +> - publicly reachable by the CVM, or +> - deployed as a sidecar/internal service inside the same test environment. +> 2. For PR validation, prefer a **prebuilt KMS test image**. The run documented here used `cr.kvin.wang/dstack-kms:kms-auth-checks-157ad4ba`. +> 3. `Boot Progress: done` only means the VM guest boot finished. It does **not** guarantee the KMS onboard endpoint is already ready. +> 4. If you inject helper scripts through `docker-compose.yaml`, prefer inline `configs.content` over `configs.file` unless you have confirmed the extra files are copied into the deployment bundle. +> 5. The onboard completion endpoint is **GET `/finish`**, not POST. +> 6. Do **not** reuse a previously captured `mr_aggregated` across redeploys. In practice, the measured value changed across fresh `kms-noquote` redeploys, so auth policies must be generated from the attestation of the **current** VM under test. +> 7. With `quote_enabled = false`, `Onboard.Bootstrap` skipped the new auth check as expected and returned an empty `attestation` field. +> 8. With `quote_enabled = false`, runtime trusted RPC self-checks were also skipped: `KMS.GetTempCaCert` still succeeded under a deny policy. +> 9. 
End-to-end onboard into a `quote_enabled = false` receiver did **not** complete against a quoted source KMS. The new receiver-side source check was skipped, but the flow later failed on the existing source-side `GetKmsKey` requirement with `No attestation provided`. + +--- + +## Table of Contents + +1. [Why this document exists](#1-why-this-document-exists) +2. [Test strategy](#2-test-strategy) +3. [Topology](#3-topology) +4. [Prerequisites](#4-prerequisites) +5. [Shared setup](#5-shared-setup) +6. [Test case 1: bootstrap is denied when self is not allowed](#6-test-case-1-bootstrap-is-denied-when-self-is-not-allowed) +7. [Test case 2: bootstrap succeeds after self is whitelisted](#7-test-case-2-bootstrap-succeeds-after-self-is-whitelisted) +8. [Test case 3: receiver rejects onboarding from a denied source KMS](#8-test-case-3-receiver-rejects-onboarding-from-a-denied-source-kms) +9. [Test case 4: trusted RPCs fail when the running KMS is no longer allowed](#9-test-case-4-trusted-rpcs-fail-when-the-running-kms-is-no-longer-allowed) +10. [Test case 5: `quote_enabled = false` remains compatible](#10-test-case-5-quote_enabled--false-remains-compatible) +11. [Evidence to capture](#11-evidence-to-capture) +12. [Cleanup](#12-cleanup) + +--- + +## 1. Why this document exists + +PR #538 already proposes a richer `kms/e2e/` framework, but as of **2026-03-19** it is still open/draft and touches overlapping KMS files. To avoid waiting for that PR, this guide uses: + +- existing KMS deploy flows +- `auth-simple` as a controllable auth API +- manual RPC calls via `curl` + +This keeps the test independent from PR #538 while still exercising real deployment paths. + +--- + +## 2. Test strategy + +Use **real KMS CVMs** with `quote_enabled = true` and a hot-reloadable `auth-simple` policy. 
+ +Why `auth-simple`: + +- it implements the same `/bootAuth/kms` webhook contract used by KMS +- its config is re-read on every request +- allow/deny behavior can be changed without restarting the service + +The test intentionally focuses on **authorization decisions**, not on a new Rust test harness. + +--- + +## 3. Topology + +Use the following layout: + +```text +Host / operator machine +├── auth-simple-src (source KMS auth policy) +├── auth-simple-dst (target KMS auth policy) +├── kms-src (bootstrapped, later used as source KMS) +├── kms-dst (fresh KMS used for onboard tests) +└── optional kms-noquote (fresh KMS with quote_enabled = false) +``` + +Policy responsibilities: + +- `auth-simple-src` must authorize: + - `kms-src` itself, for bootstrap and trusted RPC self-checks + - `kms-dst`, when `kms-dst` calls `GetKmsKey` during onboarding +- `auth-simple-dst` decides whether `kms-dst` accepts `kms-src` as an allowed source KMS + +--- + +## 4. Prerequisites + +Before starting, make sure the following are available: + +1. A branch or image containing the PR #573 KMS changes +2. A working `dstack-vmm` or teepod deployment target +3. Two routable KMS onboard URLs +4. `bun` installed on the host, because `kms/auth-simple` runs on Bun +5. 
`jq`, `curl`, and Python 3 on the host + +Recommended references: + +- KMS deployment tutorial: `docs/tutorials/kms-cvm-deployment.md` +- KMS troubleshooting: `docs/tutorials/troubleshooting-kms-deployment.md` +- `auth-simple` usage: `kms/auth-simple/README.md` + +If deploying on teepod/dstack-vmm, the easiest pattern is: + +- deploy KMS in onboard mode +- expose the onboard page through gateway +- call `/prpc/Onboard.*?json` via HTTPS + +Strong recommendation for this manual test: + +- **publish a test KMS image first**, then deploy that image +- avoid `build:` in `docker-compose.yaml` unless you have already confirmed image builds work correctly in your VMM environment + +Using a prebuilt image significantly reduces ambiguity when a failure happens: you can focus on KMS authorization logic rather than image build or registry behavior. + +Teepod/gateway URL convention observed during a real run: + +- **onboard mode:** use the `-8000` style URL +- **runtime TLS KMS RPC after bootstrap/onboard:** use the `-8000s` style URL + +Do not assume the same external URL works before and after onboarding is finished. + +--- + +## 5. Shared setup + +### 5.1 Create a working directory + +```bash +export REPO_ROOT="$(git rev-parse --show-toplevel)" +mkdir -p /tmp/kms-self-auth +cd /tmp/kms-self-auth +``` + +### 5.2 Make the auth API reachable from the test KMS instances + +The original plan was to run two host-local `auth-simple` processes. In practice, this only works if the CVMs can reach that host directly. + +Choose one of these options: + +1. **Preferred:** deploy the auth API as a separate public service or CVM +2. **Also fine:** run the auth API as a sidecar in the same KMS test deployment +3. 
**Only if reachable:** run `auth-simple` on the operator host and point KMS at that reachable host/IP + +If you use the sidecar/public-service pattern, keep the same logical split: + +- source-side auth policy +- destination-side auth policy + +and make sure you still have a way to update allow/deny policy during the test. + +### 5.3 If using host-local `auth-simple`, install and start two instances + +```bash +cd "$REPO_ROOT/kms/auth-simple" +bun install +``` + +Create placeholder configs: + +```bash +cat > /tmp/kms-self-auth/auth-src.json <<'EOF' +{ + "osImages": [], + "gatewayAppId": "any", + "kms": { + "mrAggregated": [], + "devices": [], + "allowAnyDevice": true + }, + "apps": {} +} +EOF + +cat > /tmp/kms-self-auth/auth-dst.json <<'EOF' +{ + "osImages": [], + "gatewayAppId": "any", + "kms": { + "mrAggregated": [], + "devices": [], + "allowAnyDevice": true + }, + "apps": {} +} +EOF +``` + +Start the services: + +```bash +cd "$REPO_ROOT/kms/auth-simple" +AUTH_CONFIG_PATH=/tmp/kms-self-auth/auth-src.json PORT=3101 bun run start \ + >/tmp/kms-self-auth/auth-src.log 2>&1 & +echo $! >/tmp/kms-self-auth/auth-src.pid + +AUTH_CONFIG_PATH=/tmp/kms-self-auth/auth-dst.json PORT=3102 bun run start \ + >/tmp/kms-self-auth/auth-dst.log 2>&1 & +echo $! >/tmp/kms-self-auth/auth-dst.pid +``` + +Health check: + +```bash +curl -sf http://127.0.0.1:3101/ | jq . +curl -sf http://127.0.0.1:3102/ | jq . +``` + +### 5.4 Deploy `kms-src` and `kms-dst` + +Deploy two KMS CVMs using the existing KMS deployment workflow. + +Requirements for **both** VMs: + +- `core.onboard.enabled = true` +- `core.onboard.auto_bootstrap_domain = ""` +- `core.onboard.quote_enabled = true` +- `core.auth_api.type = "webhook"` + +Point them at different auth services or sidecars: + +- `kms-src` → `http://<auth-host>:3101` +- `kms-dst` → `http://<auth-host>:3102` + +If you use sidecars instead of host-local auth servers, replace those URLs with the sidecar/internal service addresses. 
+ +If you need an example deployment template, adapt the flow in: + +- `docs/tutorials/kms-cvm-deployment.md` + +Record these values: + +```bash +export KMS_SRC_ONBOARD='https://<kms-src-onboard-host>/' +export KMS_DST_ONBOARD='https://<kms-dst-onboard-host>/' +``` + +Notes: + +- The onboard endpoint runs in plain onboarding mode, so use the `Onboard.*` RPCs +- The runtime KMS endpoint is available only after bootstrap/onboard and `/finish` + +Wait until the onboard endpoint is actually ready before continuing. A simple probe loop is recommended: + +```bash +until curl -sk -X POST "${KMS_SRC_ONBOARD%/}/prpc/Onboard.GetAttestationInfo?json" \ + -H 'Content-Type: application/json' -d '{}' >/dev/null 2>&1; do + echo "waiting for kms-src onboard endpoint..." + sleep 10 +done + +until curl -sk -X POST "${KMS_DST_ONBOARD%/}/prpc/Onboard.GetAttestationInfo?json" \ + -H 'Content-Type: application/json' -d '{}' >/dev/null 2>&1; do + echo "waiting for kms-dst onboard endpoint..." + sleep 10 +done +``` + +### 5.5 Read attestation info for both KMS instances + +```bash +curl -sf -X POST "${KMS_SRC_ONBOARD%/}/prpc/Onboard.GetAttestationInfo?json" \ + -H 'Content-Type: application/json' \ + -d '{}' | tee /tmp/kms-self-auth/kms-src-attestation.json | jq . + +curl -sf -X POST "${KMS_DST_ONBOARD%/}/prpc/Onboard.GetAttestationInfo?json" \ + -H 'Content-Type: application/json' \ + -d '{}' | tee /tmp/kms-self-auth/kms-dst-attestation.json | jq . 
+``` + +Expected fields: + +- `device_id` +- `mr_aggregated` +- `os_image_hash` +- `attestation_mode` + +Extract values: + +```bash +SRC_OS=$(jq -r '.os_image_hash' /tmp/kms-self-auth/kms-src-attestation.json) +SRC_MR=$(jq -r '.mr_aggregated' /tmp/kms-self-auth/kms-src-attestation.json) +SRC_DEV=$(jq -r '.device_id' /tmp/kms-self-auth/kms-src-attestation.json) + +DST_OS=$(jq -r '.os_image_hash' /tmp/kms-self-auth/kms-dst-attestation.json) +DST_MR=$(jq -r '.mr_aggregated' /tmp/kms-self-auth/kms-dst-attestation.json) +DST_DEV=$(jq -r '.device_id' /tmp/kms-self-auth/kms-dst-attestation.json) +``` + +All three values above are expected to be hex strings **without** the `0x` prefix. When writing `auth-simple` config, prepend `0x`. + +### 5.6 Helper configs + +#### Deny-by-MR config + +Use a wrong `mrAggregated` value while allowing the observed OS image: + +```bash +cat > /tmp/kms-self-auth/deny-by-mr.json <<'EOF' +{ + "osImages": ["0xREPLACE_OS"], + "gatewayAppId": "any", + "kms": { + "mrAggregated": ["0x0000000000000000000000000000000000000000000000000000000000000000"], + "devices": [], + "allowAnyDevice": true + }, + "apps": {} +} +EOF +``` + +#### Allow-single-KMS config + +```bash +cat > /tmp/kms-self-auth/allow-single.json <<'EOF' +{ + "osImages": ["0xREPLACE_OS"], + "gatewayAppId": "any", + "kms": { + "mrAggregated": ["0xREPLACE_MR"], + "devices": [], + "allowAnyDevice": true + }, + "apps": {} +} +EOF +``` + +#### Allow-source-and-target config + +```bash +cat > /tmp/kms-self-auth/allow-src-and-dst.json <<'EOF' +{ + "osImages": ["0xREPLACE_SRC_OS", "0xREPLACE_DST_OS"], + "gatewayAppId": "any", + "kms": { + "mrAggregated": ["0xREPLACE_SRC_MR", "0xREPLACE_DST_MR"], + "devices": [], + "allowAnyDevice": true + }, + "apps": {} +} +EOF +``` + +Create concrete variants: + +```bash +sed "s/REPLACE_OS/$SRC_OS/g; s/REPLACE_MR/$SRC_MR/g" \ + /tmp/kms-self-auth/allow-single.json \ + >/tmp/kms-self-auth/auth-src-allow-self.json + +sed "s/REPLACE_OS/$SRC_OS/g" \ + 
/tmp/kms-self-auth/deny-by-mr.json \ + >/tmp/kms-self-auth/auth-src-deny-self.json + +sed "s/REPLACE_SRC_OS/$SRC_OS/g; s/REPLACE_DST_OS/$DST_OS/g; s/REPLACE_SRC_MR/$SRC_MR/g; s/REPLACE_DST_MR/$DST_MR/g" \ + /tmp/kms-self-auth/allow-src-and-dst.json \ + >/tmp/kms-self-auth/auth-src-allow-both.json + +sed "s/REPLACE_OS/$SRC_OS/g; s/REPLACE_MR/$SRC_MR/g" \ + /tmp/kms-self-auth/allow-single.json \ + >/tmp/kms-self-auth/auth-dst-allow-src.json + +sed "s/REPLACE_OS/$SRC_OS/g" \ + /tmp/kms-self-auth/deny-by-mr.json \ + >/tmp/kms-self-auth/auth-dst-deny-src.json +``` + +Because `auth-simple` hot reloads its config on every request, switching policy is just a file copy: + +```bash +cp /tmp/kms-self-auth/auth-src-deny-self.json /tmp/kms-self-auth/auth-src.json +cp /tmp/kms-self-auth/auth-src-allow-self.json /tmp/kms-self-auth/auth-src.json +cp /tmp/kms-self-auth/auth-src-allow-both.json /tmp/kms-self-auth/auth-src.json +cp /tmp/kms-self-auth/auth-dst-deny-src.json /tmp/kms-self-auth/auth-dst.json +cp /tmp/kms-self-auth/auth-dst-allow-src.json /tmp/kms-self-auth/auth-dst.json +``` + +--- + +## 6. Test case 1: bootstrap is denied when self is not allowed + +### Purpose + +Verify that a KMS with `quote_enabled = true` refuses bootstrap if the auth API denies **its own** measurements. + +### Steps + +1. Make sure `kms-src` is still fresh and not bootstrapped yet. +2. Apply the deny-self policy to `auth-simple-src`: + +```bash +cp /tmp/kms-self-auth/auth-src-deny-self.json /tmp/kms-self-auth/auth-src.json +``` + +3. Call bootstrap: + +```bash +curl -sf -X POST "${KMS_SRC_ONBOARD%/}/prpc/Onboard.Bootstrap?json" \ + -H 'Content-Type: application/json' \ + -d '{"domain":"kms-src.example.test"}' \ + | tee /tmp/kms-self-auth/bootstrap-src-denied.json | jq . 
+``` + +### Expected result + +- the response contains `.error` +- the error should indicate bootstrap was denied because the KMS is not allowed + +Acceptable examples: + +- `KMS is not allowed to bootstrap` +- `Boot denied: ...` + +### Failure interpretation + +If bootstrap succeeds under the deny policy, the self-check is not working. + +--- + +## 7. Test case 2: bootstrap succeeds after self is whitelisted + +### Purpose + +Verify that bootstrap succeeds once the same KMS is explicitly allowed. + +### Steps + +1. Switch `auth-simple-src` to allow `kms-src`: + +```bash +cp /tmp/kms-self-auth/auth-src-allow-self.json /tmp/kms-self-auth/auth-src.json +``` + +2. Retry bootstrap: + +```bash +curl -sf -X POST "${KMS_SRC_ONBOARD%/}/prpc/Onboard.Bootstrap?json" \ + -H 'Content-Type: application/json' \ + -d '{"domain":"kms-src.example.test"}' \ + | tee /tmp/kms-self-auth/bootstrap-src-allowed.json | jq . +``` + +3. Finish onboarding mode so the process can restart into normal TLS KMS mode: + +```bash +curl -sf "${KMS_SRC_ONBOARD%/}/finish" +``` + +4. Wait for the runtime KMS endpoint to become available and record it (replace the placeholder with the actual runtime host) as: + +```bash +export KMS_SRC_RUNTIME='https://REPLACE_KMS_SRC_RUNTIME_HOST' +``` + +On teepod-style deployments, this is often the `-8000s` URL rather than the original onboard `-8000` URL. + +5. Probe runtime metadata: + +```bash +curl -sk "${KMS_SRC_RUNTIME%/}/prpc/KMS.GetMeta?json" \ + | tee /tmp/kms-self-auth/kms-src-meta.json | jq . +``` + +### Expected result + +- bootstrap returns `ca_pubkey`, `k256_pubkey`, and `attestation` +- `/finish` returns `OK` +- `KMS.GetMeta` succeeds after restart + +--- + +## 8. Test case 3: receiver rejects onboarding from a denied source KMS + +### Purpose + +Verify that the onboarding receiver rejects a source KMS whose attestation is denied by the receiver's auth API.
+ +### Important setup note + +For this scenario to reach the receiver-side check: + +- `auth-simple-src` must allow **both** `kms-src` and `kms-dst` + - `kms-src` must allow itself, because trusted RPC self-checks run on the source + - `kms-src` must also allow `kms-dst`, because `GetKmsKey` verifies the caller KMS +- `auth-simple-dst` must initially **deny** `kms-src` + +### Steps + +1. Apply source policy that allows both KMS instances: + +```bash +cp /tmp/kms-self-auth/auth-src-allow-both.json /tmp/kms-self-auth/auth-src.json +``` + +2. Apply receiver policy that denies `kms-src`: + +```bash +cp /tmp/kms-self-auth/auth-dst-deny-src.json /tmp/kms-self-auth/auth-dst.json +``` + +3. Attempt onboarding from `kms-dst`: + +```bash +curl -sf -X POST "${KMS_DST_ONBOARD%/}/prpc/Onboard.Onboard?json" \ + -H 'Content-Type: application/json' \ + -d "{\"source_url\":\"${KMS_SRC_RUNTIME%/}/prpc\",\"domain\":\"kms-dst.example.test\"}" \ + | tee /tmp/kms-self-auth/onboard-dst-denied.json | jq . +``` + +### Expected result + +- the response contains `.error` +- the error should indicate the source KMS is not allowed, or onboarding failed because source authorization was denied + +### Then verify the positive path + +4. Switch receiver policy to allow `kms-src`: + +```bash +cp /tmp/kms-self-auth/auth-dst-allow-src.json /tmp/kms-self-auth/auth-dst.json +``` + +5. Retry onboarding: + +```bash +curl -sf -X POST "${KMS_DST_ONBOARD%/}/prpc/Onboard.Onboard?json" \ + -H 'Content-Type: application/json' \ + -d "{\"source_url\":\"${KMS_SRC_RUNTIME%/}/prpc\",\"domain\":\"kms-dst.example.test\"}" \ + | tee /tmp/kms-self-auth/onboard-dst-allowed.json | jq . +``` + +6. Finish onboarding mode on `kms-dst`: + +```bash +curl -sf "${KMS_DST_ONBOARD%/}/finish" +``` + +7. Wait for the runtime endpoint and record it (replace the placeholder with the actual runtime host): + +```bash +export KMS_DST_RUNTIME='https://REPLACE_KMS_DST_RUNTIME_HOST' +``` + +Again, when TLS passthrough is used, prefer the `-8000s` URL for runtime KMS RPCs. + +8.
Probe runtime metadata: + +```bash +curl -sk "${KMS_DST_RUNTIME%/}/prpc/KMS.GetMeta?json" \ + | tee /tmp/kms-self-auth/kms-dst-meta.json | jq . +``` + +### Expected result + +- first onboard attempt is rejected +- second onboard attempt succeeds +- `kms-dst` starts normally after `/finish` + +--- + +## 9. Test case 4: trusted RPCs fail when the running KMS is no longer allowed + +### Purpose + +Verify that a running KMS re-checks its own authorization on trusted RPCs. + +### Recommended canary RPC + +Use `GetTempCaCert` first. It is simpler than `GetAppKey` because it does not require preparing an attested app client, but it still exercises the new runtime self-check. + +### Steps + +1. While `kms-src` is healthy, confirm the canary RPC works: + +```bash +curl -sk "${KMS_SRC_RUNTIME%/}/prpc/KMS.GetTempCaCert?json" \ + | tee /tmp/kms-self-auth/get-temp-ca-allowed.json | jq . +``` + +2. Flip `auth-simple-src` to deny `kms-src` itself: + +```bash +cp /tmp/kms-self-auth/auth-src-deny-self.json /tmp/kms-self-auth/auth-src.json +``` + +3. Retry the same RPC: + +```bash +curl -sk "${KMS_SRC_RUNTIME%/}/prpc/KMS.GetTempCaCert?json" \ + | tee /tmp/kms-self-auth/get-temp-ca-denied.json | jq . +``` + +### Expected result + +- before the policy flip: `GetTempCaCert` succeeds +- after the policy flip: the response contains `.error` +- the error should indicate KMS self-authorization failed, or that the KMS is not allowed + +### Optional deeper checks + +If you already have tooling for attested app/KMS clients, also verify: + +- `GetKmsKey` fails when source KMS denies itself +- `GetAppKey` fails when KMS denies itself +- `SignCert` fails when KMS denies itself + +The important part is that the running KMS must not rely only on bootstrap-time authorization. + +--- + +## 10. Test case 5: `quote_enabled = false` remains compatible + +### Purpose + +Verify that the new checks are skipped when `quote_enabled = false`. 
+ +### Suggested minimal coverage + +Deploy an extra KMS named `kms-noquote` with: + +```toml +[core.onboard] +enabled = true +auto_bootstrap_domain = "" +quote_enabled = false +``` + +Point it to an auth policy that would otherwise deny it. + +### Check A: bootstrap compatibility + +1. Deploy `kms-noquote` with a deny policy. +2. Call: + +```bash +curl -sf -X POST "${KMS_NOQUOTE_ONBOARD%/}/prpc/Onboard.Bootstrap?json" \ + -H 'Content-Type: application/json' \ + -d '{"domain":"kms-noquote.example.test"}' \ + | tee /tmp/kms-self-auth/bootstrap-noquote.json | jq . +``` + +### Expected result + +- bootstrap succeeds even though the auth policy would deny a quoted KMS +- the response's `attestation` field is empty + +### Optional runtime compatibility check + +After bootstrap and `GET /finish`, probe a trusted RPC while the auth policy still denies the KMS: + +```bash +curl -sk "${KMS_NOQUOTE_RUNTIME%/}/prpc/KMS.GetTempCaCert?json" \ + | tee /tmp/kms-self-auth/get-temp-ca-noquote-deny.json | jq . +``` + +Expected result: + +- `GetTempCaCert` still succeeds, because the new runtime self-check is skipped when `quote_enabled = false` + +### Check B: noquote receiver still cannot onboard from a quoted source + +If you want to test the onboard path too: + +1. keep `kms-src` allowed on the source side +2. deploy `kms-noquote` as a fresh onboarding target +3. keep the receiver-side policy in deny mode +4. 
call `Onboard.Onboard` + +Expected result: + +- the new receiver-side source authorization check is skipped +- but end-to-end onboarding still fails later with a source-side error similar to: + +```json +{ + "error": "Failed to onboard: Request failed with status=400 Bad Request, error={\"error\":\"No attestation provided\"}" +} +``` + +Reason: + +- this failure is **not** from the new receiver-side check added in PR #573 +- it comes from the existing source-side `GetKmsKey` path, which still expects attestation from the onboarding target +- therefore this failure is **correct** when the target KMS has `quote_enabled = false` but the source KMS still requires attested callers +- so `quote_enabled = false` compatibility is intentionally limited to: + - bootstrap + - skipping the new receiver-side source check + - skipping the new runtime self-check +- it does **not** mean end-to-end noquote onboarding into a quoted source KMS should succeed + +--- + +## 11. Evidence to capture + +For each run, save: + +1. `Onboard.GetAttestationInfo` output for every KMS +2. auth config snapshots used for each step +3. bootstrap/onboard RPC responses +4. `KMS.GetMeta` output after successful boot +5. `GetTempCaCert` allow/deny responses +6. relevant CVM logs if a step fails unexpectedly + +Recommended archive: + +```bash +tar czf /tmp/kms-self-auth-results.tar.gz /tmp/kms-self-auth +``` + +--- + +## 12. Cleanup + +Stop local auth services: + +```bash +kill "$(cat /tmp/kms-self-auth/auth-src.pid)" || true +kill "$(cat /tmp/kms-self-auth/auth-dst.pid)" || true +``` + +Then remove test CVMs using your normal `vmm-cli.py remove` or teepod cleanup flow. + +--- + +## Success criteria summary + +The change is considered validated if all of the following are true: + +1. bootstrap fails under deny policy when `quote_enabled = true` +2. bootstrap succeeds after self allowlisting +3. onboarding rejects a denied source KMS on the receiver side +4. 
runtime trusted RPCs stop working after the source KMS is removed from the allowlist +5. with `quote_enabled = false`, bootstrap and runtime trusted RPCs skip the new checks +6. with `quote_enabled = false`, receiver-side onboarding does not fail on the **new** source-authorization check, but it still correctly fails against a quoted source KMS that requires attested callers