From 1814dbf01d7b2793c62a33f81a021709f0a4ac13 Mon Sep 17 00:00:00 2001 From: Junyi Ou Date: Tue, 23 Jun 2026 15:28:06 -0400 Subject: [PATCH 1/2] chore(agent): track post-enroll QUIC tunnel connectivity probe gap --- devolutions-agent/src/enrollment.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/devolutions-agent/src/enrollment.rs b/devolutions-agent/src/enrollment.rs index c3c915cc9..5a6ba5080 100644 --- a/devolutions-agent/src/enrollment.rs +++ b/devolutions-agent/src/enrollment.rs @@ -109,6 +109,14 @@ pub async fn enroll_agent( let (key_pem, csr_pem) = generate_key_and_csr(&agent_name)?; let enroll_response = request_enrollment(gateway_url, enrollment_token, &csr_pem).await?; + + // TODO(agent-tunnel): enrollment success here only means the HTTPS cert exchange on + // POST /jet/tunnel/enroll succeeded and the config was persisted — it does NOT verify the + // QUIC tunnel (UDP, quic_endpoint) can actually be established. The installer's + // EnrollAgentTunnel custom action reports "success" on this return, so a blocked QUIC port + // (e.g. firewall on UDP 4433) yields a green install while the agent never comes online and + // silently auto-reconnects forever. Add a short post-enroll QUIC connectivity probe to + // enroll_response.quic_endpoint and surface a clear warning/failure when it can't connect. persist_enrollment_response(agent_name, advertise_subnets, enroll_response, &key_pem) } From 7f038413177b9fe9f3ff2456bd73352b6a0ecf72 Mon Sep 17 00:00:00 2001 From: Junyi Ou Date: Tue, 23 Jun 2026 16:28:24 -0400 Subject: [PATCH 2/2] fix(agent): verify QUIC tunnel connectivity during install enrollment The MSI EnrollAgentTunnel step reported success once enrollment (HTTPS, 7171) completed, even when the QUIC data tunnel (UDP, 4433) could not be established, leaving the agent silently offline. The installer now runs `devolutions-agent probe-tunnel`, which opens the mTLS/QUIC tunnel and performs a Heartbeat round-trip, and fails the step with an actionable message when connectivity can't be established. --- devolutions-agent/src/enrollment.rs | 8 - devolutions-agent/src/main.rs | 30 +++ devolutions-agent/src/tunnel.rs | 198 ++++++++++++++++-- .../Actions/CustomActions.cs | 58 +++++ 4 files changed, 264 insertions(+), 30 deletions(-) diff --git a/devolutions-agent/src/enrollment.rs b/devolutions-agent/src/enrollment.rs index 5a6ba5080..c3c915cc9 100644 --- a/devolutions-agent/src/enrollment.rs +++ b/devolutions-agent/src/enrollment.rs @@ -109,14 +109,6 @@ pub async fn enroll_agent( let (key_pem, csr_pem) = generate_key_and_csr(&agent_name)?; let enroll_response = request_enrollment(gateway_url, enrollment_token, &csr_pem).await?; - - // TODO(agent-tunnel): enrollment success here only means the HTTPS cert exchange on - // POST /jet/tunnel/enroll succeeded and the config was persisted — it does NOT verify the - // QUIC tunnel (UDP, quic_endpoint) can actually be established. The installer's - // EnrollAgentTunnel custom action reports "success" on this return, so a blocked QUIC port - // (e.g. firewall on UDP 4433) yields a green install while the agent never comes online and - // silently auto-reconnects forever. Add a short post-enroll QUIC connectivity probe to - // enroll_response.quic_endpoint and surface a clear warning/failure when it can't connect. persist_enrollment_response(agent_name, advertise_subnets, enroll_response, &key_pem) } diff --git a/devolutions-agent/src/main.rs b/devolutions-agent/src/main.rs index 485c6db2e..0ad61f0c6 100644 --- a/devolutions-agent/src/main.rs +++ b/devolutions-agent/src/main.rs @@ -38,6 +38,7 @@ mod service; use std::env; use std::io::{self, BufRead}; use std::sync::mpsc; +use std::time::Duration; use anyhow::{Context as _, Result, bail}; use ceviche::Service; @@ -285,6 +286,35 @@ fn main() { std::process::exit(1); } } + "probe-tunnel" => { + let timeout_secs = match env::args().nth(2) { + Some(value) => match value.parse() { + Ok(timeout_secs) => timeout_secs, + Err(error) => { + eprintln!("[ERROR] Invalid probe timeout: {error}"); + std::process::exit(1); + } + }, + None => 15, + }; + + let conf_handle = match ConfHandle::init() { + Ok(conf_handle) => conf_handle, + Err(error) => { + eprintln!("[ERROR] Failed to load agent configuration: {error:#}"); + std::process::exit(1); + } + }; + let conf = conf_handle.get_conf(); + let rt = tokio::runtime::Runtime::new().expect("failed to create tokio runtime"); + if let Err(error) = rt.block_on(devolutions_agent::tunnel::probe_tunnel_connectivity( + &conf.tunnel, + Duration::from_secs(timeout_secs), + )) { + eprintln!("[ERROR] Tunnel connectivity probe failed: {error:#}"); + std::process::exit(1); + } + } _ => { eprintln!("[ERROR] Invalid command: {cmd}"); } diff --git a/devolutions-agent/src/tunnel.rs b/devolutions-agent/src/tunnel.rs index c48d865b5..efecff3e3 100644 --- a/devolutions-agent/src/tunnel.rs +++ b/devolutions-agent/src/tunnel.rs @@ -190,25 +190,10 @@ enum ConnectionOutcome { CertRenewed, } -/// Run a single QUIC tunnel connection lifetime: config → connect → event loop. -/// -/// - `Ok(Shutdown)`: graceful shutdown, exit the task. -/// - `Ok(CertRenewed)`: certificate renewed; caller should reconnect immediately. -/// - `Err(...)`: connection lost or handshake failed — caller should retry with backoff. -async fn run_single_connection( - conf_handle: &ConfHandle, - shutdown_signal: &mut ShutdownSignal, -) -> anyhow::Result { - // Ensure rustls crypto provider is installed (ring). - let _ = rustls::crypto::ring::default_provider().install_default(); - - let agent_conf = conf_handle.get_conf(); - let tunnel_conf = &agent_conf.tunnel; - - let cert_path = &tunnel_conf.client_cert_path; - let key_path = &tunnel_conf.client_key_path; - let ca_path = &tunnel_conf.gateway_ca_cert_path; - +/// Build the route advertisement payload from the current tunnel configuration. +fn route_advertisements( + tunnel_conf: &crate::config::TunnelConf, +) -> anyhow::Result<(Vec, Vec)> { let advertise_subnets: Vec = tunnel_conf .advertise_subnets .iter() @@ -260,23 +245,33 @@ async fn run_single_connection( "Advertising subnets and domains" ); + Ok((advertise_subnets, advertise_domains)) +} + +/// Build the mTLS client config, resolve the gateway endpoint, and perform the +/// QUIC handshake, returning the live endpoint and connection. +async fn connect_to_gateway( + tunnel_conf: &crate::config::TunnelConf, +) -> anyhow::Result<(quinn::Endpoint, quinn::Connection)> { + // Ensure rustls crypto provider is installed (ring). + let _ = rustls::crypto::ring::default_provider().install_default(); // -- Build rustls ClientConfig -- let certs: Vec> = rustls_pemfile::certs(&mut std::io::BufReader::new( - std::fs::File::open(cert_path.as_str()).context("open client cert file")?, + std::fs::File::open(tunnel_conf.client_cert_path.as_str()).context("open client cert file")?, )) .collect::, _>>() .context("parse client certificates")?; let key = rustls_pemfile::private_key(&mut std::io::BufReader::new( - std::fs::File::open(key_path.as_str()).context("open client key file")?, + std::fs::File::open(tunnel_conf.client_key_path.as_str()).context("open client key file")?, )) .context("parse private key file")? .context("no private key found in file")?; let mut roots = rustls::RootCertStore::empty(); let ca_certs: Vec> = rustls_pemfile::certs(&mut std::io::BufReader::new( - std::fs::File::open(ca_path.as_str()).context("open CA cert file")?, + std::fs::File::open(tunnel_conf.gateway_ca_cert_path.as_str()).context("open CA cert file")?, )) .collect::, _>>() .context("parse CA certificates")?; @@ -363,6 +358,84 @@ async fn run_single_connection( info!("QUIC connection established"); + Ok((endpoint, connection)) +} + +/// Probe end-to-end QUIC tunnel connectivity to the Gateway, bounded by `timeout`. +/// +/// Connects (mTLS over QUIC) and performs a single `Heartbeat` → `HeartbeatAck` +/// round-trip, proving UDP reachability, the QUIC + mTLS handshake, and a working +/// bidirectional control stream. Returns an error if the tunnel is disabled, the +/// handshake or round-trip fails, or `timeout` elapses first. +pub async fn probe_tunnel_connectivity( + tunnel_conf: &crate::config::TunnelConf, + timeout: Duration, +) -> anyhow::Result<()> { + if !tunnel_conf.enabled { + bail!("agent tunnel is not enabled"); + } + + // Bound only the connectivity check itself — connect plus the Heartbeat round-trip. + // Once the HeartbeatAck proves the tunnel works, teardown must never be able to turn + // that proven success into a timeout failure. + let (endpoint, connection) = tokio::time::timeout(timeout, async { + let (endpoint, connection) = connect_to_gateway(tunnel_conf).await?; + + let mut ctrl: ControlStream<_, _> = connection.open_bi().await.context("open control stream")?.into(); + + // A bare Heartbeat is enough to prove connectivity: the gateway acks any + // Heartbeat unconditionally with the echoed timestamp. We deliberately do NOT + // send a RouteAdvertise here — that would mutate the gateway's route registry, + // and a connectivity probe must be side-effect-free. + let timestamp_ms = current_time_millis(); + ctrl.send(&ControlMessage::heartbeat(timestamp_ms, 0)) + .await + .context("send probe Heartbeat")?; + + match ctrl.recv().await.context("receive probe HeartbeatAck")? { + ControlMessage::HeartbeatAck { + timestamp_ms: echoed_timestamp_ms, + .. + } if echoed_timestamp_ms == timestamp_ms => Ok((endpoint, connection)), + ControlMessage::HeartbeatAck { .. } => bail!("probe HeartbeatAck timestamp mismatch"), + unexpected => bail!("unexpected probe response: {unexpected:?}"), + } + }) + .await + .context("agent tunnel connectivity probe timed out")??; + + info!("Agent tunnel connectivity probe succeeded"); + + // Connectivity is already proven, so teardown is best-effort and must never fail the + // probe. Drain (bounded, result ignored) so the queued CONNECTION_CLOSE is flushed + // before the runtime and endpoint are dropped — otherwise the gateway keeps this + // probe's same-agent_id connection registered until its idle timeout, and that late + // cleanup can unregister the real agent. + connection.close(0u32.into(), b"probe-complete"); + let _ = tokio::time::timeout(Duration::from_secs(3), endpoint.wait_idle()).await; + + Ok(()) +} + +/// Run a single QUIC tunnel connection lifetime: config → connect → event loop. +/// +/// - `Ok(Shutdown)`: graceful shutdown, exit the task. +/// - `Ok(CertRenewed)`: certificate renewed; caller should reconnect immediately. +/// - `Err(_)`: connection lost or handshake failed — caller should retry with backoff. +async fn run_single_connection( + conf_handle: &ConfHandle, + shutdown_signal: &mut ShutdownSignal, +) -> anyhow::Result { + let agent_conf = conf_handle.get_conf(); + let tunnel_conf = &agent_conf.tunnel; + + let cert_path = &tunnel_conf.client_cert_path; + let key_path = &tunnel_conf.client_key_path; + let ca_path = &tunnel_conf.gateway_ca_cert_path; + + let (advertise_subnets, advertise_domains) = route_advertisements(tunnel_conf)?; + let (_endpoint, connection) = connect_to_gateway(tunnel_conf).await?; + // -- Open control stream -- let mut ctrl: ControlStream<_, _> = connection.open_bi().await.context("open control stream")?.into(); @@ -638,3 +711,84 @@ async fn run_session_proxy(advertise_subnets: Vec, send: quinn::Sen .await .inspect_err(|e| error!(%e, "Session proxy failed")); } + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use camino::Utf8PathBuf; + + use super::probe_tunnel_connectivity; + use crate::config::TunnelConf; + + /// A minimal enabled tunnel config; individual tests override what they need. + fn tunnel_conf_template() -> TunnelConf { + TunnelConf { + enabled: true, + gateway_endpoint: String::new(), + client_cert_path: Utf8PathBuf::new(), + client_key_path: Utf8PathBuf::new(), + gateway_ca_cert_path: Utf8PathBuf::new(), + advertise_subnets: Vec::new(), + advertise_domains: Vec::new(), + auto_detect_domain: false, + heartbeat_interval_secs: 15, + route_advertise_interval_secs: 60, + server_spki_sha256: None, + } + } + + #[tokio::test] + async fn probe_fails_fast_when_tunnel_disabled() { + let mut conf = tunnel_conf_template(); + conf.enabled = false; + + let error = probe_tunnel_connectivity(&conf, Duration::from_secs(5)) + .await + .expect_err("probe must fail when the tunnel is disabled"); + + assert!( + format!("{error:#}").contains("not enabled"), + "unexpected error: {error:#}" + ); + } + + #[tokio::test] + async fn probe_times_out_when_gateway_unreachable() { + // Throwaway PEMs so the pre-connect file reads succeed. The QUIC handshake never + // completes because nothing listens on the target port, so the probe must hit its + // own timeout rather than hang. + let cert_key = + rcgen::generate_simple_self_signed(vec!["localhost".to_owned()]).expect("generate self-signed cert"); + let cert_pem = cert_key.cert.pem(); + let key_pem = cert_key.key_pair.serialize_pem(); + + let dir = std::env::temp_dir().join(format!("dgw-probe-test-{}", uuid::Uuid::new_v4())); + std::fs::create_dir_all(&dir).expect("create temp dir"); + let cert_path = dir.join("client.crt"); + let key_path = dir.join("client.key"); + let ca_path = dir.join("ca.crt"); + std::fs::write(&cert_path, &cert_pem).expect("write client cert"); + std::fs::write(&key_path, &key_pem).expect("write client key"); + std::fs::write(&ca_path, &cert_pem).expect("write ca cert"); + + let mut conf = tunnel_conf_template(); + // 127.0.0.1:1 is reserved and unbound; the QUIC handshake cannot complete. + conf.gateway_endpoint = "127.0.0.1:1".to_owned(); + conf.client_cert_path = Utf8PathBuf::from_path_buf(cert_path).expect("utf8 cert path"); + conf.client_key_path = Utf8PathBuf::from_path_buf(key_path).expect("utf8 key path"); + conf.gateway_ca_cert_path = Utf8PathBuf::from_path_buf(ca_path).expect("utf8 ca path"); + + let started = std::time::Instant::now(); + let result = probe_tunnel_connectivity(&conf, Duration::from_secs(2)).await; + let elapsed = started.elapsed(); + + let _ = std::fs::remove_dir_all(&dir); + + assert!(result.is_err(), "probe must fail when the gateway is unreachable"); + assert!( + elapsed < Duration::from_secs(15), + "probe must fail fast, took {elapsed:?}" + ); + } +} diff --git a/package/AgentWindowsManaged/Actions/CustomActions.cs b/package/AgentWindowsManaged/Actions/CustomActions.cs index 6ebe4b33f..96964c42a 100644 --- a/package/AgentWindowsManaged/Actions/CustomActions.cs +++ b/package/AgentWindowsManaged/Actions/CustomActions.cs @@ -612,6 +612,64 @@ ActionResult Fail(string msg) WriteTunnelAdvertisementsToConfig(session, subnetsArg, domainsArg); } + string probeArguments = "probe-tunnel 15"; + session.Log($"Running tunnel connectivity probe: {exePath} {probeArguments}"); + + ProcessStartInfo probeStartInfo = new(exePath, probeArguments) + { + UseShellExecute = false, + RedirectStandardOutput = true, + RedirectStandardError = true, + CreateNoWindow = true, + WorkingDirectory = ProgramDataDirectory, + }; + + using Process probeProcess = Process.Start(probeStartInfo); + Task probeStdoutTask = probeProcess.StandardOutput.ReadToEndAsync(); + Task probeStderrTask = probeProcess.StandardError.ReadToEndAsync(); + + if (!probeProcess.WaitForExit(30_000)) + { + try + { + probeProcess.Kill(); + } + catch + { + // Already exited between WaitForExit timing out and Kill firing. + } + + try + { + Task.WaitAll(new Task[] { probeStdoutTask, probeStderrTask }, 5000); + } + catch + { + // Observed. + } + + return Fail("Agent tunnel connectivity probe timed out. Verify UDP connectivity to the Gateway QUIC endpoint (typically UDP 4433) and firewall/NAT rules."); + } + + probeProcess.WaitForExit(); + string probeStdout = probeStdoutTask.GetAwaiter().GetResult(); + string probeStderr = probeStderrTask.GetAwaiter().GetResult(); + + if (!string.IsNullOrEmpty(probeStdout)) + { + session.Log($"tunnel probe stdout: {Redact(probeStdout)}"); + } + if (!string.IsNullOrEmpty(probeStderr)) + { + session.Log($"tunnel probe stderr: {Redact(probeStderr)}"); + } + + if (probeProcess.ExitCode != 0) + { + string detail = !string.IsNullOrWhiteSpace(probeStderr) ? Redact(probeStderr).Trim() : $"exit code {probeProcess.ExitCode}"; + return Fail($"Agent tunnel connectivity probe failed: {detail}. Verify UDP connectivity to the Gateway QUIC endpoint (typically UDP 4433) and firewall/NAT rules."); + } + session.Log("Agent tunnel enrollment completed successfully"); return ActionResult.Success; }