diff --git a/.github/workflows/midstream-container-build.yml b/.github/workflows/midstream-container-build.yml index a68a8275..bfd6b2f8 100644 --- a/.github/workflows/midstream-container-build.yml +++ b/.github/workflows/midstream-container-build.yml @@ -135,6 +135,7 @@ jobs: docker buildx imagetools create \ -t "${{ env.IMAGE_REGISTRY }}/gateway:${{ github.sha }}" \ -t "${{ env.IMAGE_REGISTRY }}/gateway:midstream" \ + -t "${{ env.IMAGE_REGISTRY }}/gateway:dev" \ "${{ env.IMAGE_REGISTRY }}/gateway:${{ github.sha }}-amd64" \ "${{ env.IMAGE_REGISTRY }}/gateway:${{ github.sha }}-arm64" @@ -159,5 +160,6 @@ jobs: docker buildx imagetools create \ -t "${{ env.IMAGE_REGISTRY }}/cluster:${{ github.sha }}" \ -t "${{ env.IMAGE_REGISTRY }}/cluster:midstream" \ + -t "${{ env.IMAGE_REGISTRY }}/cluster:dev" \ "${{ env.IMAGE_REGISTRY }}/cluster:${{ github.sha }}-amd64" \ "${{ env.IMAGE_REGISTRY }}/cluster:${{ github.sha }}-arm64" diff --git a/crates/openshell-bootstrap/src/build.rs b/crates/openshell-bootstrap/src/build.rs index eaa22131..19f3ea35 100644 --- a/crates/openshell-bootstrap/src/build.rs +++ b/crates/openshell-bootstrap/src/build.rs @@ -10,7 +10,6 @@ use std::collections::HashMap; use std::path::Path; -use bollard::Docker; use bollard::query_parameters::BuildImageOptionsBuilder; use futures::StreamExt; use miette::{IntoDiagnostic, Result, WrapErr}; @@ -46,9 +45,8 @@ pub async fn build_and_push_image( on_log(format!( "Pushing image {tag} into gateway \"{gateway_name}\"" )); - let local_docker = Docker::connect_with_local_defaults() - .into_diagnostic() - .wrap_err("failed to connect to local Docker daemon")?; + let local_docker = crate::docker::connect_local_auto() + .wrap_err("failed to connect to local container runtime")?; let container = container_name(gateway_name); let images: Vec<&str> = vec![tag]; push_local_images(&local_docker, &local_docker, &container, &images, on_log).await?; @@ -68,9 +66,8 @@ async fn build_image( build_args: &HashMap<String, String>, on_log: &mut impl
FnMut(String), ) -> Result<()> { - let docker = Docker::connect_with_local_defaults() - .into_diagnostic() - .wrap_err("failed to connect to local Docker daemon")?; + let docker = crate::docker::connect_local_auto() + .wrap_err("failed to connect to local container runtime")?; // Compute the relative path of the Dockerfile within the context. let dockerfile_relative = dockerfile_path diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index ff29bbd5..bccf60de 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -263,6 +263,33 @@ pub(crate) fn connect_local(runtime: ContainerRuntime) -> Result<Docker> { } } +/// Connect to the local container runtime with auto-detection. +/// +/// This is a convenience wrapper for code paths that need a Docker client +/// but don't have a `ContainerRuntime` value available. It auto-detects +/// the runtime (Podman preferred) and connects via `connect_local`. +pub(crate) fn connect_local_auto() -> Result<Docker> { + let runtime = crate::container_runtime::detect_runtime(None)?; + connect_local(runtime) +} + +/// Connect to the local container runtime for an existing gateway. +/// +/// Resolution order: +/// 1. Stored runtime from gateway metadata (if metadata exists) +/// 2. Auto-detect runtime via `detect_runtime` (propagates error on failure) +/// +/// This is used by code paths that have a gateway `name` but no `runtime` +/// in scope. Unlike `connect_local_auto()`, this checks metadata first so +/// that gateways deployed with a specific runtime reconnect to the same one. +pub(crate) fn connect_for_gateway(name: &str) -> Result<Docker> { + let runtime = match crate::metadata::get_gateway_metadata(name) { + Some(m) => m.container_runtime, + None => crate::container_runtime::detect_runtime(None)?, + }; + connect_local(runtime) +} + /// Build a rich, user-friendly error when a container runtime is not reachable.
fn runtime_not_reachable_error( runtime: ContainerRuntime, @@ -851,6 +878,11 @@ pub async fn ensure_container( env_vars.push("GPU_ENABLED=true".to_string()); } + // Pass the container runtime to the entrypoint so it can select the + // appropriate networking stack (nftables kube-proxy for Podman, iptables + // DNS proxy for Docker, etc.). + env_vars.push(format!("CONTAINER_RUNTIME={}", runtime.binary_name())); + let env = Some(env_vars); // Set the health check explicitly on the container config so it works diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 3eb78ab8..e7290c04 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -544,7 +544,7 @@ where .collect(); if !images.is_empty() { log("[status] Deploying components".to_string()); - let local_docker = Docker::connect_with_local_defaults().into_diagnostic()?; + let local_docker = docker::connect_local(runtime)?; let container = container_name(&name); let on_log_ref = Arc::clone(&on_log); let mut push_log = move |msg: String| { @@ -669,7 +669,7 @@ pub async fn extract_and_store_pki( ) -> Result<()> { let docker = match remote { Some(r) => create_ssh_docker_client(r).await?, - None => Docker::connect_with_local_defaults().into_diagnostic()?, + None => docker::connect_for_gateway(name)?, }; let cname = docker::find_gateway_container(&docker, port).await?; let bundle = load_existing_pki_bundle(&docker, &cname, constants::KUBECONFIG_PATH) @@ -684,7 +684,7 @@ pub async fn ensure_gateway_image( registry_username: Option<&str>, registry_token: Option<&str>, ) -> Result<String> { - let docker = Docker::connect_with_local_defaults().into_diagnostic()?; + let docker = docker::connect_local_auto()?; let image_ref = format!("{}:{version}", image::DEFAULT_GATEWAY_IMAGE); ensure_image(&docker, &image_ref, registry_username, registry_token).await?; Ok(image_ref) } @@ -712,7 +712,7 @@ pub async fn gateway_container_logs( let docker = match remote {
Some(remote_opts) => create_ssh_docker_client(remote_opts).await?, - None => Docker::connect_with_local_defaults().into_diagnostic()?, + None => docker::connect_for_gateway(name)?, }; let container = container_name(name); @@ -765,7 +765,7 @@ pub async fn gateway_container_logs( /// Returns an empty string on any Docker/connection error so callers don't /// need to worry about error handling. pub async fn fetch_gateway_logs(name: &str, n: usize) -> String { - let docker = match Docker::connect_with_local_defaults() { + let docker = match docker::connect_local_auto() { Ok(d) => d, Err(_) => return String::new(), }; diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 837f4fb9..05149765 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -233,6 +233,7 @@ RUN dnf install -y fedora-repos && \ dnf install -y \ ca-certificates \ iptables \ + nftables \ util-linux \ bind-utils \ && dnf clean all diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 14f661ba..769a2945 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -34,22 +34,13 @@ yaml_quote() { printf "'%s'" "$(printf '%s' "$1" | sed "s/'/''/g")" } -# --------------------------------------------------------------------------- -# Select iptables backend -# --------------------------------------------------------------------------- -# Some kernels (e.g. Jetson Linux 5.15-tegra) have the nf_tables subsystem -# but lack the nft_compat bridge that allows flannel and kube-proxy to use -# xt extension modules (xt_comment, xt_conntrack). Detect this by probing -# whether xt_comment is usable via the current iptables backend. If the -# probe fails, switch to iptables-legacy. Set USE_IPTABLES_LEGACY=1 -# externally to skip the probe and force the legacy backend. 
# --------------------------------------------------------------------------- # Check br_netfilter kernel module # --------------------------------------------------------------------------- # br_netfilter makes the kernel pass bridge (pod-to-pod) traffic through -# iptables. Without it, kube-proxy's DNAT rules for ClusterIP services are -# never applied to pod traffic, so pods cannot reach services such as -# kube-dns (10.43.0.10), breaking all in-cluster DNS resolution. +# netfilter (iptables or nftables). Without it, kube-proxy's DNAT rules for +# ClusterIP services are never applied to pod traffic, so pods cannot reach +# services such as kube-dns (10.43.0.10), breaking all in-cluster DNS. # # The module must be loaded on the HOST before the container starts — # containers cannot load kernel modules themselves. If it is missing, log a @@ -65,25 +56,37 @@ if [ ! -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then echo " echo br_netfilter | sudo tee /etc/modules-load.d/br_netfilter.conf" >&2 fi -if [ -z "${USE_IPTABLES_LEGACY:-}" ]; then - if iptables -t filter -N _xt_probe 2>/dev/null; then - _probe_rc=0 - iptables -t filter -A _xt_probe -m comment --comment "probe" -j ACCEPT \ - 2>/dev/null || _probe_rc=$? - iptables -t filter -D _xt_probe -m comment --comment "probe" -j ACCEPT \ - 2>/dev/null || true - iptables -t filter -X _xt_probe 2>/dev/null || true - [ "$_probe_rc" -ne 0 ] && USE_IPTABLES_LEGACY=1 +# --------------------------------------------------------------------------- +# Select iptables backend (Docker only) +# --------------------------------------------------------------------------- +# Under Podman with nftables kube-proxy mode, the iptables backend probe is +# unnecessary — kube-proxy uses nft directly. Flannel still uses the iptables +# binary but through the nft compat shim which doesn't need the xt probe. +# +# Under Docker (or unset runtime), probe whether xt_comment is usable. Some +# kernels (e.g. 
Jetson Linux 5.15-tegra) have nf_tables but lack the +# nft_compat bridge. If the probe fails, switch to iptables-legacy. +if [ "${CONTAINER_RUNTIME:-}" != "podman" ]; then + if [ -z "${USE_IPTABLES_LEGACY:-}" ]; then + if iptables -t filter -N _xt_probe 2>/dev/null; then + _probe_rc=0 + iptables -t filter -A _xt_probe -m comment --comment "probe" -j ACCEPT \ + 2>/dev/null || _probe_rc=$? + iptables -t filter -D _xt_probe -m comment --comment "probe" -j ACCEPT \ + 2>/dev/null || true + iptables -t filter -X _xt_probe 2>/dev/null || true + [ "$_probe_rc" -ne 0 ] && USE_IPTABLES_LEGACY=1 + fi fi -fi -if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then - echo "iptables nf_tables xt extension bridge unavailable — switching to iptables-legacy" - if update-alternatives --set iptables /usr/sbin/iptables-legacy 2>/dev/null && - update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 2>/dev/null; then - echo "Now using iptables-legacy mode" - else - echo "Warning: could not switch to iptables-legacy — cluster networking may fail" + if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then + echo "iptables nf_tables xt extension bridge unavailable — switching to iptables-legacy" + if update-alternatives --set iptables /usr/sbin/iptables-legacy 2>/dev/null && + update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 2>/dev/null; then + echo "Now using iptables-legacy mode" + else + echo "Warning: could not switch to iptables-legacy — cluster networking may fail" + fi fi fi @@ -174,13 +177,20 @@ setup_dns_proxy() { echo "Configured k3s DNS to use ${CONTAINER_IP} (proxied to Docker DNS)" } -if ! setup_dns_proxy; then - echo "DNS proxy setup failed, falling back to public DNS servers" - echo "Note: this may not work on Docker Desktop (Mac/Windows)" - cat >"$RESOLV_CONF" <"$RESOLV_CONF" < %{buildroot}%{_modulesloaddir}/%{name}.conf << 'EOF' -# Load legacy iptables kernel modules required by k3s flannel CNI. 
-# Modern kernels use nf_tables by default; these modules provide the -# legacy iptables interface that k3s's bundled iptables-legacy needs. -ip_tables -iptable_nat -iptable_filter -iptable_mangle +# Load br_netfilter for K3s bridge networking. +# Required so kube-proxy DNAT rules (iptables or nftables) apply to +# bridged pod-to-pod traffic for ClusterIP service resolution. +br_netfilter +EOF + +# Install sysctl.d config for bridge netfilter settings required by K3s. +install -d %{buildroot}%{_sysctldir} +cat > %{buildroot}%{_sysctldir}/99-%{name}.conf << 'EOF' +# Enable bridge netfilter call chains for K3s pod-to-service networking. +# Required after br_netfilter is loaded so kube-proxy DNAT rules apply +# to bridged pod traffic. +net.bridge.bridge-nf-call-iptables = 1 +net.bridge.bridge-nf-call-ip6tables = 1 EOF # Install Python SDK modules (test files are intentionally excluded) @@ -138,6 +148,12 @@ echo "rpm" > %{buildroot}%{python3_sitelib}/%{name}-%{version}.dist-info/INSTALL # RECORD can be empty for RPM-managed installs touch %{buildroot}%{python3_sitelib}/%{name}-%{version}.dist-info/RECORD +%post +# Load br_netfilter immediately so a reboot is not required after install. +# The modules-load.d config handles subsequent boots. +modprobe br_netfilter > /dev/null 2>&1 || : +%sysctl_apply 99-%{name}.conf + %check # Smoke-test the CLI binary %{buildroot}%{_bindir}/%{name} --version @@ -153,6 +169,7 @@ PYTHONPATH=%{buildroot}%{python3_sitelib} %{python3} -c "from importlib.metadata %doc README.md %{_bindir}/%{name} %{_modulesloaddir}/%{name}.conf +%{_sysctldir}/99-%{name}.conf %files -n python3-%{name} %license LICENSE