From 7a1ef3636b2605a563e329405fdd644985b5f113 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Wed, 20 May 2026 15:10:10 +0200 Subject: [PATCH 01/20] Add TLS fixes for konflux run --- .../lightspeed/e2e-mock-tls-inference.yaml | 104 +++++++++++ .../lightspeed/llama-stack-openai.yaml | 8 + tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 163 +++++++++++++++++- .../server-mode/lightspeed-stack-tls.yaml | 2 +- tests/e2e/features/environment.py | 3 + tests/e2e/features/steps/proxy.py | 2 + tests/e2e/features/steps/tls.py | 116 +++++++++++-- tests/e2e/features/tls.feature | 5 +- tests/e2e/mock_tls_inference_server/server.py | 25 ++- tests/e2e/test_list.txt | 27 --- 10 files changed, 405 insertions(+), 50 deletions(-) create mode 100644 tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml new file mode 100644 index 000000000..6797de24a --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml @@ -0,0 +1,104 @@ +# Mock HTTPS OpenAI API for tls.feature (Konflux / Prow; no Docker Compose). +# Llama Stack run.yaml uses https://e2e-mock-tls-inference..svc.cluster.local:8443|8444|8445/v1 +apiVersion: v1 +kind: Pod +metadata: + name: e2e-mock-tls-inference + labels: + app: e2e-mock-tls-inference +spec: + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + - name: e2e-mock-tls-inference + image: python:3.12-slim + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: PYTHONPATH + value: /app:/tmp/pydeps + command: + - /bin/sh + - -c + - | + set -e + pip install --quiet --no-cache-dir --target /tmp/pydeps 'trustme>=1.2.1' 'cryptography>=42.0.0' + NS="${POD_NAMESPACE:-default}" + export TLS_CERT_DNS_NAMES="mock-tls-inference,localhost,127.0.0.1,e2e-mock-tls-inference,e2e-mock-tls-inference.${NS}.svc.cluster.local" + exec python /app/server.py + ports: + - containerPort: 8443 + name: tls + - containerPort: 8444 + name: mtls + - containerPort: 8445 + name: mismatch + volumeMounts: + - name: server-script + mountPath: /app/server.py + subPath: server.py + readOnly: true + - name: certs-work + mountPath: /certs + readinessProbe: + exec: + command: + - python3 + - -c + - | + import ssl, urllib.request + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + urllib.request.urlopen("https://localhost:8443/health", context=ctx) + initialDelaySeconds: 8 + periodSeconds: 5 + livenessProbe: + exec: + command: + - python3 + - -c + - | + import ssl, urllib.request + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + urllib.request.urlopen("https://localhost:8443/health", context=ctx) + initialDelaySeconds: 15 + periodSeconds: 20 + volumes: + - name: server-script + configMap: + name: e2e-mock-tls-inference-script + - name: certs-work + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: e2e-mock-tls-inference +spec: + selector: + app: e2e-mock-tls-inference + ports: + - name: tls + port: 8443 + targetPort: tls + - name: mtls + port: 8444 + targetPort: mtls + - name: mismatch + port: 8445 + targetPort: mismatch diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml index 3f2a6583c..b182a2463 100644 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml @@ -206,6 +206,10 @@ spec: mountPath: /tmp/interception-proxy-ca.pem subPath: ca.pem readOnly: true + # tls.feature: client/CA PEMs from Secret e2e-mock-tls-certs (optional). + - name: mock-tls-certs + mountPath: /certs + readOnly: true volumes: - name: app-root emptyDir: {} @@ -222,3 +226,7 @@ spec: secret: secretName: e2e-interception-proxy-ca optional: true + - name: mock-tls-certs + secret: + secretName: e2e-mock-tls-certs + optional: true diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 7f7b3d9a4..332a429c2 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -25,6 +25,8 @@ # disrupt-llama-stack - Delete llama-stack pod to disrupt connection # deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy (proxy.feature step) # deploy-e2e-interception-proxy - Deploy in-cluster interception proxy (proxy.feature step) +# deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature step) +# sync-mock-tls-certs-secret - Publish /certs PEMs to Secret for llama-stack mount set -e @@ -331,7 +333,55 @@ cmd_restart_lightspeed() { echo "✓ Lightspeed restart complete" } +cmd_reload_llama_stack_config() { + local llama_pod_name="llama-stack-service" + local tmp + + echo "===== Reloading llama-stack run.yaml (container restart, no pod recreate) =====" + tmp=$(mktemp) + if ! oc get configmap llama-stack-config -n "$NAMESPACE" \ + -o jsonpath='{.data.run\.yaml}' >"$tmp"; then + rm -f "$tmp" + echo "ERROR: failed to read llama-stack-config run.yaml" >&2 + return 1 + fi + if [[ ! -s "$tmp" ]]; then + rm -f "$tmp" + echo "ERROR: llama-stack-config run.yaml is empty" >&2 + return 1 + fi + if ! oc cp "$tmp" "$NAMESPACE/$llama_pod_name:/opt/app-root/run.yaml" \ + -c llama-stack-container; then + rm -f "$tmp" + echo "ERROR: failed to copy run.yaml into llama-stack pod" >&2 + return 1 + fi + rm -f "$tmp" + echo "Restarting llama-stack-container to pick up run.yaml..." + oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- bash -c 'kill 1' \ + 2>/dev/null || true + wait_for_pod "$llama_pod_name" 45 + if ! wait_for_llama_stack_http_health 35; then + echo "===== Llama-stack reload FAILED (HTTP not healthy) =====" + return 1 + fi + if ! cmd_restart_llama_port_forward; then + echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed" + return 1 + fi + echo "===== Llama-stack config reload complete =====" +} + cmd_restart_llama_stack() { + if [[ "${E2E_KONFLUX_E2E:-0}" == "1" && "${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}" == "1" ]]; then + if oc get pod llama-stack-service -n "$NAMESPACE" &>/dev/null; then + if cmd_reload_llama_stack_config; then + return 0 + fi + echo "WARN: llama config reload failed; falling back to full pod restart" >&2 + fi + fi + echo "===== Restoring llama-stack service =====" # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env. echo "Deleting llama-stack pod (if any) before apply..." @@ -350,6 +400,14 @@ cmd_restart_llama_stack() { exit 1 fi fi + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" \ + && "${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}" != "1" ]]; then + echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..." + if ! cmd_sync_mock_tls_certs_secret; then + echo "===== Llama-stack restore FAILED (mock TLS certs secret sync) =====" + exit 1 + fi + fi _LLAMA_SVC_FQDN="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local" oc create secret generic llama-stack-ip-secret \ --from-literal=key="$_LLAMA_SVC_FQDN" \ @@ -365,7 +423,17 @@ cmd_restart_llama_stack() { exit 1 fi fi - if ! wait_for_llama_stack_http_health 50; then + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + if ! _verify_mock_tls_certs_mounted_in_llama; then + echo "===== Llama-stack restore FAILED (mock TLS certs not mounted) =====" + exit 1 + fi + fi + local llama_health_attempts=50 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + llama_health_attempts=75 + fi + if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then echo "===== Llama-stack restore FAILED (HTTP not healthy) =====" exit 1 fi @@ -709,6 +777,69 @@ cmd_copy_interception_proxy_ca_to_llama() { cmd_sync_interception_proxy_ca_secret } +_MOCK_TLS_CERT_FILES=( + ca.crt + client.crt + client.key + untrusted-ca.crt + expired-ca.crt + untrusted-client.crt + untrusted-client.key + expired-client.crt +) + +cmd_sync_mock_tls_certs_secret() { + local mock_pod_name tmpdir f + mock_pod_name=$(oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || mock_pod_name="" + + if [[ -z "$mock_pod_name" ]]; then + echo "ERROR: no e2e-mock-tls-inference pod in namespace $NAMESPACE" >&2 + echo " Run: e2e-ops.sh deploy-e2e-mock-tls-inference" >&2 + return 1 + fi + + tmpdir=$(mktemp -d) + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + if ! oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \ + cat "/certs/$f" >"$tmpdir/$f"; then + echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2 + rm -rf "$tmpdir" + return 1 + fi + if [[ ! -s "$tmpdir/$f" ]]; then + echo "ERROR: /certs/$f is empty in e2e-mock-tls-inference pod" >&2 + rm -rf "$tmpdir" + return 1 + fi + done + + if ! oc create secret generic e2e-mock-tls-certs \ + --from-file="$tmpdir" \ + -n "$NAMESPACE" \ + --dry-run=client -o yaml | oc apply -f -; then + echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2 + rm -rf "$tmpdir" + return 1 + fi + rm -rf "$tmpdir" + echo "✓ Secret e2e-mock-tls-certs updated (${#_MOCK_TLS_CERT_FILES[@]} files)" +} + +_verify_mock_tls_certs_mounted_in_llama() { + local llama_pod_name="llama-stack-service" + if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ + sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key'; then + echo "✓ mock TLS certs present under /certs in llama-stack" + return 0 + fi + echo "ERROR: /certs missing or incomplete in llama-stack pod" >&2 + oc get secret e2e-mock-tls-certs -n "$NAMESPACE" 2>&1 || true + oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ + ls -la /certs 2>&1 || true + return 1 +} + _e2e_repo_root() { cd "$SCRIPT_DIR/../../../.." && pwd } @@ -745,6 +876,28 @@ cmd_deploy_e2e_interception_proxy() { echo "✓ e2e-interception-proxy ready at http://e2e-interception-proxy.${NAMESPACE}.svc.cluster.local:8889" } +cmd_deploy_e2e_mock_tls_inference() { + local repo_root + repo_root="$(_e2e_repo_root)" + echo "Deploying e2e-mock-tls-inference in namespace $NAMESPACE..." + oc create configmap e2e-mock-tls-inference-script -n "$NAMESPACE" \ + --from-file=server.py="$repo_root/tests/e2e/mock_tls_inference_server/server.py" \ + --dry-run=client -o yaml | oc apply -f - + oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true + oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml" + if ! oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then + echo "ERROR: e2e-mock-tls-inference failed to become ready" >&2 + oc describe pod/e2e-mock-tls-inference -n "$NAMESPACE" 2>/dev/null | tail -30 || true + oc logs e2e-mock-tls-inference -n "$NAMESPACE" --tail=40 2>&1 || true + return 1 + fi + echo "✓ e2e-mock-tls-inference ready at https://e2e-mock-tls-inference.${NAMESPACE}.svc.cluster.local:8443" + if ! cmd_sync_mock_tls_certs_secret; then + echo "WARNING: mock TLS server is up but e2e-mock-tls-certs secret sync failed" >&2 + return 1 + fi +} + cmd_disrupt_llama_stack() { local pod_name="llama-stack-service" @@ -815,6 +968,12 @@ case "$COMMAND" in deploy-e2e-interception-proxy) cmd_deploy_e2e_interception_proxy ;; + deploy-e2e-mock-tls-inference) + cmd_deploy_e2e_mock_tls_inference + ;; + sync-mock-tls-certs-secret) + cmd_sync_mock_tls_certs_secret + ;; *) echo "Usage: $0 [args...]" echo "" @@ -833,6 +992,8 @@ case "$COMMAND" in echo " sync-interception-proxy-ca-secret - Publish trustme CA to Secret for llama mount" echo " deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy pod" echo " deploy-e2e-interception-proxy - Deploy in-cluster interception proxy pod" + echo " deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature)" + echo " sync-mock-tls-certs-secret - Publish mock TLS /certs PEMs to Secret for llama" exit 1 ;; esac diff --git a/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml index babdc2b99..fd45ea744 100644 --- a/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml +++ b/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml @@ -8,7 +8,7 @@ service: access_log: true llama_stack: use_as_library_client: false - url: http://llama-stack:8321 + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 api_key: xyzzy user_data_collection: feedback_enabled: true diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index fdca1247c..e97f993a5 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -26,6 +26,7 @@ reset_llama_stack_disrupt_once_tracking, reset_llama_stack_was_running, ) +from tests.e2e.features.steps.tls import reset_tls_prow_restart_optimization_state from tests.e2e.utils.llama_stack_utils import register_shield from tests.e2e.utils.prow_utils import ( restart_pod, @@ -451,6 +452,8 @@ def before_feature(context: Context, feature: Feature) -> None: context.active_lightspeed_stack_config_basename = None # One real Llama disruption per feature (module-level flag; survives context resets) reset_llama_stack_disrupt_once_tracking() + if feature.filename and "tls.feature" in feature.filename: + reset_tls_prow_restart_optimization_state() try: max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS)) diff --git a/tests/e2e/features/steps/proxy.py b/tests/e2e/features/steps/proxy.py index 1511250dd..597204d92 100644 --- a/tests/e2e/features/steps/proxy.py +++ b/tests/e2e/features/steps/proxy.py @@ -295,6 +295,8 @@ def restore_if_modified(context: Context) -> None: _stop_proxy(context, "tunnel_proxy", "proxy_loop") _stop_proxy(context, "interception_proxy", "interception_proxy_loop") os.environ.pop("E2E_COPY_INTERCEPTION_CA_TO_LLAMA", None) + os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) + os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) if hasattr(context, "needs_interception_ca_on_llama"): delattr(context, "needs_interception_ca_on_llama") diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index 66d56adcc..0bf7b6905 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -9,6 +9,7 @@ """ import copy +import os from typing import Any, Optional from behave import given # pyright: ignore[reportAttributeAccessIssue] @@ -19,16 +20,12 @@ load_llama_config, write_llama_config, ) +from tests.e2e.utils.prow_utils import get_namespace, run_e2e_ops +from tests.e2e.utils.utils import is_prow_environment -_TLS_PROVIDER_BASE: dict[str, Any] = { - "provider_id": "tls-openai", - "provider_type": "remote::openai", - "config": { - "api_key": "test-key", - "base_url": "https://mock-tls-inference:8443/v1", - "allowed_models": ["mock-tls-model"], - }, -} +_MOCK_TLS_PORT_TLS = 8443 +_MOCK_TLS_PORT_MTLS = 8444 +_MOCK_TLS_PORT_HOSTNAME_MISMATCH = 8445 _TLS_MODEL_RESOURCE: dict[str, str] = { "model_id": "mock-tls-model", @@ -36,6 +33,74 @@ "provider_model_id": "mock-tls-model", } +_mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False} +_tls_llama_warm_in_prow: dict[str, bool] = {"done": False} + + +def reset_tls_prow_restart_optimization_state() -> None: + """Reset per-feature Prow restart optimizations (call from ``before_feature``).""" + _tls_llama_warm_in_prow["done"] = False + os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) + + +def _prepare_tls_prow_llama_restart_env() -> None: + """Set env vars so e2e-ops can reload run.yaml instead of recreating the pod.""" + os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1" + if _tls_llama_warm_in_prow["done"]: + os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1" + else: + os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) + + +def _cluster_mock_tls_inference_host() -> str: + """DNS name of the in-cluster mock TLS inference server (Konflux / Prow).""" + explicit = os.getenv("E2E_MOCK_TLS_INFERENCE_HOST", "").strip() + if explicit: + return explicit + return f"e2e-mock-tls-inference.{get_namespace()}.svc.cluster.local" + + +def _mock_tls_base_url(port: int) -> str: + """OpenAI-compatible base URL for the mock TLS inference server.""" + if is_prow_environment(): + host = _cluster_mock_tls_inference_host() + else: + host = "mock-tls-inference" + return f"https://{host}:{port}/v1" + + +def _tls_provider_base() -> dict[str, Any]: + """Default tls-openai provider dict with environment-appropriate base_url.""" + return { + "provider_id": "tls-openai", + "provider_type": "remote::openai", + "config": { + "api_key": "test-key", + "base_url": _mock_tls_base_url(_MOCK_TLS_PORT_TLS), + "allowed_models": ["mock-tls-model"], + }, + } + + +def _deploy_cluster_mock_tls_inference() -> None: + """Deploy the in-cluster mock TLS inference pod (Konflux / Prow).""" + if _mock_tls_cluster_deploy_state["done"]: + print("Using existing e2e-mock-tls-inference deployment") + return + + result = run_e2e_ops("deploy-e2e-mock-tls-inference", timeout=300) + print(result.stdout, end="") + if result.returncode != 0: + raise AssertionError( + "Failed to deploy e2e-mock-tls-inference: " + f"{result.stderr or result.stdout}" + ) + os.environ.setdefault( + "E2E_MOCK_TLS_INFERENCE_HOST", + _cluster_mock_tls_inference_host(), + ) + _mock_tls_cluster_deploy_state["done"] = True + def _ensure_tls_provider(config: dict[str, Any]) -> dict[str, Any]: """Find or create the tls-openai inference provider in the config. @@ -59,7 +124,7 @@ def _ensure_tls_provider(config: dict[str, Any]) -> dict[str, Any]: return provider # Provider not found — add it - provider = copy.deepcopy(_TLS_PROVIDER_BASE) + provider = copy.deepcopy(_tls_provider_base()) inference.append(provider) # Also register the model resource @@ -85,8 +150,14 @@ def _configure_tls(tls_config: dict[str, Any], base_url: Optional[str] = None) - provider.setdefault("config", {}).setdefault("network", {}) if base_url is not None: provider["config"]["base_url"] = base_url + else: + provider["config"]["base_url"] = _mock_tls_base_url(_MOCK_TLS_PORT_TLS) provider["config"]["network"]["tls"] = tls_config write_llama_config(config) + if is_prow_environment(): + _prepare_tls_prow_llama_restart_env() + if not _tls_llama_warm_in_prow["done"]: + _tls_llama_warm_in_prow["done"] = True # --- Background Steps --- @@ -94,6 +165,15 @@ def _configure_tls(tls_config: dict[str, Any], base_url: Optional[str] = None) - # run.yaml (see proxy.py). Restart steps are listed in tls.feature / proxy.feature. +@given("The mock TLS inference server is deployed") +def deploy_mock_tls_inference_server(context: Context) -> None: + """Ensure mock TLS inference is reachable (Compose locally, pod in Prow).""" + if is_prow_environment(): + _deploy_cluster_mock_tls_inference() + return + print("Using docker-compose mock-tls-inference service") + + # --- TLS Configuration Steps --- @@ -124,7 +204,7 @@ def configure_tls_mtls(context: Context) -> None: "client_cert": "/certs/client.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @@ -139,7 +219,7 @@ def configure_mtls_no_client_cert(context: Context) -> None: """Configure run.yaml for mTLS port without client cert (should fail).""" _configure_tls( {"verify": "/certs/ca.crt"}, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @@ -152,7 +232,7 @@ def configure_mtls_wrong_client_cert(context: Context) -> None: "client_cert": "/certs/ca.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @@ -165,7 +245,7 @@ def configure_mtls_untrusted_client_cert(context: Context) -> None: "client_cert": "/certs/untrusted-client.crt", "client_key": "/certs/untrusted-client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @@ -178,7 +258,7 @@ def configure_mtls_expired_client_cert(context: Context) -> None: "client_cert": "/certs/expired-client.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @@ -187,7 +267,7 @@ def configure_tls_hostname_mismatch(context: Context) -> None: """Configure run.yaml to connect to hostname-mismatch server (should fail).""" _configure_tls( {"verify": "/certs/ca.crt"}, - base_url="https://mock-tls-inference:8445/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH), ) @@ -200,7 +280,7 @@ def configure_mtls_hostname_mismatch(context: Context) -> None: "client_cert": "/certs/client.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8445/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH), ) @@ -211,7 +291,7 @@ def configure_tls_min_version_hostname_mismatch(context: Context, version: str) """Configure run.yaml with TLS min version against hostname-mismatch server.""" _configure_tls( {"verify": "/certs/ca.crt", "min_version": version}, - base_url="https://mock-tls-inference:8445/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH), ) diff --git a/tests/e2e/features/tls.feature b/tests/e2e/features/tls.feature index a900b1c0f..15215408e 100644 --- a/tests/e2e/features/tls.feature +++ b/tests/e2e/features/tls.feature @@ -1,8 +1,10 @@ -@e2e_group_1 @skip-in-library-mode @skip-in-prow +@e2e_group_1 @skip-in-library-mode Feature: TLS configuration for remote inference providers Validate that Llama Stack's NetworkConfig.tls settings are applied correctly when connecting to a remote inference provider over HTTPS. + # Only Llama run.yaml changes per scenario; LCS uses lightspeed-stack-tls.yaml throughout. + Background: Given The service is started locally And The system is in default state @@ -10,6 +12,7 @@ Feature: TLS configuration for remote inference providers And the Lightspeed stack configuration directory is "tests/e2e/configuration" And The service uses the lightspeed-stack-tls.yaml configuration And The service is restarted + And The mock TLS inference server is deployed And The original Llama Stack config is restored if modified Scenario: Inference succeeds with TLS verification disabled diff --git a/tests/e2e/mock_tls_inference_server/server.py b/tests/e2e/mock_tls_inference_server/server.py index bfb4cbae5..25bd23a0c 100644 --- a/tests/e2e/mock_tls_inference_server/server.py +++ b/tests/e2e/mock_tls_inference_server/server.py @@ -13,6 +13,7 @@ import datetime import json +import os import ssl import threading import time @@ -29,6 +30,25 @@ MTLS_PORT = 8444 HOSTNAME_MISMATCH_PORT = 8445 +_DEFAULT_SERVER_CERT_DNS_NAMES: tuple[str, ...] = ( + "mock-tls-inference", + "localhost", + "127.0.0.1", +) + + +def _server_cert_dns_names() -> tuple[str, ...]: + """Return DNS identities for the main server certificate. + + Reads comma-separated ``TLS_CERT_DNS_NAMES`` (set in Konflux/Prow manifest). + Falls back to Docker Compose defaults when unset. + """ + raw = os.environ.get("TLS_CERT_DNS_NAMES", "").strip() + if not raw: + return _DEFAULT_SERVER_CERT_DNS_NAMES + names = tuple(name.strip() for name in raw.split(",") if name.strip()) + return names or _DEFAULT_SERVER_CERT_DNS_NAMES + class OpenAIHandler(BaseHTTPRequestHandler): """Handles OpenAI-compatible API requests over HTTPS.""" @@ -221,8 +241,9 @@ def main() -> None: # Generate CA and certificates ca = trustme.CA() - # Server cert with SANs for Docker service name and localhost - server_cert = ca.issue_cert("mock-tls-inference", "localhost", "127.0.0.1") + server_dns_names = _server_cert_dns_names() + print(f" Server cert DNS names: {', '.join(server_dns_names)}") + server_cert = ca.issue_cert(*server_dns_names) # Client cert for mTLS testing (use a simple hostname without spaces) client_cert = ca.issue_cert("tls-e2e-test-client") diff --git a/tests/e2e/test_list.txt b/tests/e2e/test_list.txt index 34e1b8647..857021536 100644 --- a/tests/e2e/test_list.txt +++ b/tests/e2e/test_list.txt @@ -1,29 +1,2 @@ -features/authorized_noop.feature -features/health.feature -features/info.feature -features/models.feature -features/rest_api.feature -features/smoketests.feature -features/authorized_noop_token.feature -features/conversation_cache_v2.feature -features/conversations.feature -features/faiss.feature -features/inline_rag.feature -features/feedback.feature -features/query.feature -features/responses.feature -features/responses_streaming.feature -features/rlsapi_v1.feature -features/streaming_query.feature -features/http_401_unauthorized.feature -features/authorized_rh_identity.feature -features/rbac.feature -features/rlsapi_v1_errors.feature -features/llama_stack_disrupted.feature -features/mcp.feature -features/mcp_servers_api.feature -features/mcp_servers_api_auth.feature -features/mcp_servers_api_no_config.feature -features/proxy.feature features/tls.feature features/opentelemetry.feature From 99b1b315a66a09fb0f1418b2c67f0ef7d3720e05 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Wed, 20 May 2026 19:10:11 +0200 Subject: [PATCH 02/20] add all tests --- tests/e2e/test_list.txt | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/e2e/test_list.txt b/tests/e2e/test_list.txt index 857021536..26926a81f 100644 --- a/tests/e2e/test_list.txt +++ b/tests/e2e/test_list.txt @@ -1,2 +1,29 @@ +features/authorized_noop.feature +features/health.feature +features/info.feature +features/models.feature +features/rest_api.feature +features/smoketests.feature +features/authorized_noop_token.feature +features/conversation_cache_v2.feature +features/conversations.feature +features/faiss.feature +features/inline_rag.feature +features/feedback.feature +features/query.feature +features/responses.feature +features/responses_streaming.feature +features/rlsapi_v1.feature +features/streaming_query.feature +features/http_401_unauthorized.feature +features/authorized_rh_identity.feature +features/rbac.feature +features/rlsapi_v1_errors.feature +features/llama_stack_disrupted.feature +features/mcp.feature +features/mcp_servers_api.feature +features/mcp_servers_api_auth.feature +features/mcp_servers_api_no_config.feature +features/proxy.feature features/tls.feature -features/opentelemetry.feature +features/opentelemetry.feature \ No newline at end of file From 61f0d3699fbe5d2a40e4fe4d472f85b739b0ffac Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Thu, 21 May 2026 06:53:31 +0200 Subject: [PATCH 03/20] set proxy tests to skipped --- tests/e2e/features/proxy.feature | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/features/proxy.feature b/tests/e2e/features/proxy.feature index 907c4317d..00fde258a 100644 --- a/tests/e2e/features/proxy.feature +++ b/tests/e2e/features/proxy.feature @@ -1,4 +1,4 @@ -@e2e_group_3 @skip-in-library-mode +@e2e_group_3 @skip-in-library-mode @skip-in-prow Feature: Proxy and TLS networking tests for Llama Stack providers Verify that the Lightspeed Stack works correctly when Llama Stack's From c2c47c385656dd7b81c0af7e6ebfaf3b552f68b6 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Thu, 21 May 2026 11:20:21 +0200 Subject: [PATCH 04/20] remove optimizations for restarts --- tests/e2e/features/steps/tls.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index 0bf7b6905..3d7e7fdb4 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -34,22 +34,23 @@ } _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False} -_tls_llama_warm_in_prow: dict[str, bool] = {"done": False} def reset_tls_prow_restart_optimization_state() -> None: - """Reset per-feature Prow restart optimizations (call from ``before_feature``).""" - _tls_llama_warm_in_prow["done"] = False + """Reset per-feature Prow state (call from ``before_feature``).""" + _mock_tls_cluster_deploy_state["done"] = False os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) + os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) def _prepare_tls_prow_llama_restart_env() -> None: - """Set env vars so e2e-ops can reload run.yaml instead of recreating the pod.""" + """Set env vars so e2e-ops always recreates the llama pod (no config-only reload). + + TLS scenarios change run.yaml and rely on /certs volume mounts; full pod + restarts are slower but more reliable than ``kill 1`` reload on Konflux. + """ os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1" - if _tls_llama_warm_in_prow["done"]: - os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1" - else: - os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) + os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) def _cluster_mock_tls_inference_host() -> str: @@ -156,8 +157,6 @@ def _configure_tls(tls_config: dict[str, Any], base_url: Optional[str] = None) - write_llama_config(config) if is_prow_environment(): _prepare_tls_prow_llama_restart_env() - if not _tls_llama_warm_in_prow["done"]: - _tls_llama_warm_in_prow["done"] = True # --- Background Steps --- From b5fec6dd5e80c2569a03192e88e85fc9e3a84353 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Fri, 22 May 2026 08:31:36 +0200 Subject: [PATCH 05/20] fix failing tests --- .../lightspeed-stack-integration-test.yaml | 2 + tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 84 ++++++++++++++----- tests/e2e/features/steps/tls.py | 1 + tests/e2e/features/tls.feature | 2 +- tests/e2e/utils/prow_utils.py | 51 +++++++---- 5 files changed, 102 insertions(+), 38 deletions(-) diff --git a/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml b/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml index 3110ccea7..266ea1b96 100644 --- a/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml +++ b/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml @@ -167,6 +167,8 @@ spec: echo "========== End parameters ==========" - name: lightspeed-stack-integration-tests description: Task to run integration tests from lightspeed-stack repository + # Full Behave suite (proxy + tls) can exceed 2h; needs PipelineRun timeouts >= this value. + timeout: 3h params: - name: SNAPSHOT value: $(params.SNAPSHOT) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 332a429c2..c82fbc53c 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -414,7 +414,17 @@ cmd_restart_llama_stack() { -n "$NAMESPACE" \ --dry-run=client -o yaml | oc apply -f - oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml" - wait_for_pod "llama-stack-service" 90 + local llama_pod_wait=90 + local llama_health_attempts=50 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + llama_pod_wait=120 + llama_health_attempts=100 + fi + if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then + echo "===== Llama-stack restore FAILED (pod not ready) =====" + oc describe pod llama-stack-service -n "$NAMESPACE" 2>/dev/null | tail -40 || true + exit 1 + fi echo "Labeling pod for service..." oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then @@ -429,12 +439,9 @@ cmd_restart_llama_stack() { exit 1 fi fi - local llama_health_attempts=50 - if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then - llama_health_attempts=75 - fi if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then echo "===== Llama-stack restore FAILED (HTTP not healthy) =====" + oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true exit 1 fi else @@ -542,6 +549,9 @@ cmd_restart_llama_port_forward() { local local_port="${LOCAL_LLAMA_PORT:-8321}" local remote_port="${REMOTE_LLAMA_PORT:-8321}" local max_attempts=6 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + max_attempts=10 + fi local pf_pid local pf_resource local llama_pf_log="/tmp/port-forward-llama.log" @@ -788,8 +798,23 @@ _MOCK_TLS_CERT_FILES=( expired-client.crt ) +_mock_tls_certs_secret_is_complete() { + local f data + if ! oc get secret e2e-mock-tls-certs -n "$NAMESPACE" &>/dev/null; then + return 1 + fi + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + data=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \ + -o "go-template={{index .data \"${f}\"}}" 2>/dev/null) || return 1 + if [[ -z "$data" ]]; then + return 1 + fi + done + return 0 +} + cmd_sync_mock_tls_certs_secret() { - local mock_pod_name tmpdir f + local mock_pod_name tmpdir f attempt mock_pod_name=$(oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \ -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || mock_pod_name="" @@ -799,19 +824,29 @@ cmd_sync_mock_tls_certs_secret() { return 1 fi + if _mock_tls_certs_secret_is_complete; then + echo "✓ e2e-mock-tls-certs secret already complete, skipping sync" + return 0 + fi + + if ! oc wait pod/"$mock_pod_name" -n "$NAMESPACE" --for=condition=Ready --timeout=60s 2>/dev/null; then + echo "WARNING: e2e-mock-tls-inference not Ready before cert sync" >&2 + fi + tmpdir=$(mktemp -d) for f in "${_MOCK_TLS_CERT_FILES[@]}"; do - if ! oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \ - cat "/certs/$f" >"$tmpdir/$f"; then - echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2 - rm -rf "$tmpdir" - return 1 - fi - if [[ ! -s "$tmpdir/$f" ]]; then - echo "ERROR: /certs/$f is empty in e2e-mock-tls-inference pod" >&2 - rm -rf "$tmpdir" - return 1 - fi + for attempt in 1 2 3; do + if oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \ + cat "/certs/$f" >"$tmpdir/$f" 2>/dev/null && [[ -s "$tmpdir/$f" ]]; then + break + fi + if [[ $attempt -eq 3 ]]; then + echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2 + rm -rf "$tmpdir" + return 1 + fi + sleep 3 + done done if ! oc create secret generic e2e-mock-tls-certs \ @@ -828,11 +863,16 @@ cmd_sync_mock_tls_certs_secret() { _verify_mock_tls_certs_mounted_in_llama() { local llama_pod_name="llama-stack-service" - if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ - sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key'; then - echo "✓ mock TLS certs present under /certs in llama-stack" - return 0 - fi + local attempt + for attempt in 1 2 3 4 5 6; do + if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ + sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key' \ + 2>/dev/null; then + echo "✓ mock TLS certs present under /certs in llama-stack" + return 0 + fi + sleep 3 + done echo "ERROR: /certs missing or incomplete in llama-stack pod" >&2 oc get secret e2e-mock-tls-certs -n "$NAMESPACE" 2>&1 || true oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index 3d7e7fdb4..4ff8978ba 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -96,6 +96,7 @@ def _deploy_cluster_mock_tls_inference() -> None: "Failed to deploy e2e-mock-tls-inference: " f"{result.stderr or result.stdout}" ) + _prepare_tls_prow_llama_restart_env() os.environ.setdefault( "E2E_MOCK_TLS_INFERENCE_HOST", _cluster_mock_tls_inference_host(), diff --git a/tests/e2e/features/tls.feature b/tests/e2e/features/tls.feature index 15215408e..412e5b04f 100644 --- a/tests/e2e/features/tls.feature +++ b/tests/e2e/features/tls.feature @@ -12,8 +12,8 @@ Feature: TLS configuration for remote inference providers And the Lightspeed stack configuration directory is "tests/e2e/configuration" And The service uses the lightspeed-stack-tls.yaml configuration And The service is restarted - And The mock TLS inference server is deployed And The original Llama Stack config is restored if modified + And The mock TLS inference server is deployed Scenario: Inference succeeds with TLS verification disabled Given Llama Stack is configured with TLS verification disabled diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index ff771904b..a6a594973 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -7,6 +7,7 @@ import os import subprocess import tempfile +import time from typing import Optional @@ -93,7 +94,12 @@ def restart_pod(container_name: str) -> None: """ if container_name in _LLAMA_RESTART_NAMES: op = "restart-llama-stack" - timeout = 420 + # TLS feature: full pod recreate + cert sync + health can exceed 7 min on Konflux. + timeout = ( + 900 + if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" + else 420 + ) elif container_name in _LIGHTSPEED_RESTART_NAMES: op = "restart-lightspeed" # Pod wait (up to ~120s) + port-forward retries + slow Konflux/Prow clusters. @@ -105,20 +111,35 @@ def restart_pod(container_name: str) -> None: ) op = "restart-lightspeed" timeout = 200 - try: - result = run_e2e_ops(op, timeout=timeout) - print(result.stdout, end="") - if result.returncode != 0: - print(result.stderr, end="") - detail = (result.stderr or result.stdout or "").strip() - raise subprocess.CalledProcessError( - result.returncode, - op, - detail or None, - ) - except subprocess.TimeoutExpired as e: - print(f"Failed to restart pod {container_name}: {e}") - raise + max_attempts = 2 if op == "restart-llama-stack" else 1 + last_error: subprocess.CalledProcessError | subprocess.TimeoutExpired | None = ( + None + ) + for attempt in range(1, max_attempts + 1): + try: + result = run_e2e_ops(op, timeout=timeout) + print(result.stdout, end="") + if result.returncode != 0: + print(result.stderr, end="") + detail = (result.stderr or result.stdout or "").strip() + raise subprocess.CalledProcessError( + result.returncode, + op, + detail or None, + ) + return + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as err: + last_error = err + if attempt < max_attempts: + print( + f"⚠️ {op} failed (attempt {attempt}/{max_attempts}), " + "retrying after 20s..." + ) + time.sleep(20) + if last_error is not None: + if isinstance(last_error, subprocess.TimeoutExpired): + print(f"Failed to restart pod {container_name}: {last_error}") + raise last_error def restore_llama_stack_pod() -> None: From 3cd258085cee81fa81c9ee5810f061cf2c5c40a7 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Fri, 22 May 2026 13:00:19 +0200 Subject: [PATCH 06/20] fix failing tests --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 11 ++++- tests/e2e/features/steps/proxy.py | 8 ++++ tests/e2e/features/steps/tls.py | 59 ++++++++++++++++++++++--- tests/e2e/utils/prow_utils.py | 21 +++++++-- 4 files changed, 89 insertions(+), 10 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index c82fbc53c..0f206c0e3 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -360,9 +360,16 @@ cmd_reload_llama_stack_config() { echo "Restarting llama-stack-container to pick up run.yaml..." oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- bash -c 'kill 1' \ 2>/dev/null || true - wait_for_pod "$llama_pod_name" 45 - if ! wait_for_llama_stack_http_health 35; then + local reload_pod_wait=45 + local reload_health_attempts=35 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + reload_pod_wait=60 + reload_health_attempts=50 + fi + wait_for_pod "$llama_pod_name" "$reload_pod_wait" + if ! wait_for_llama_stack_http_health "$reload_health_attempts"; then echo "===== Llama-stack reload FAILED (HTTP not healthy) =====" + oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true return 1 fi if ! cmd_restart_llama_port_forward; then diff --git a/tests/e2e/features/steps/proxy.py b/tests/e2e/features/steps/proxy.py index 2c2b19b5a..374218478 100644 --- a/tests/e2e/features/steps/proxy.py +++ b/tests/e2e/features/steps/proxy.py @@ -320,6 +320,14 @@ def restore_if_modified(context: Context) -> None: @given("Llama Stack is restarted") def restart_llama_stack(context: Context) -> None: """Restart the Llama Stack container.""" + from tests.e2e.features.steps.tls import ( + is_tls_configuration_feature, + restart_llama_for_tls_feature, + ) + + if is_tls_configuration_feature(context): + restart_llama_for_tls_feature(context) + return restart_container("llama-stack") diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index 4ff8978ba..5f5da53be 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -34,25 +34,74 @@ } _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False} +_tls_prow_restart_state: dict[str, bool] = {"full_restart_done": False} def reset_tls_prow_restart_optimization_state() -> None: """Reset per-feature Prow state (call from ``before_feature``).""" _mock_tls_cluster_deploy_state["done"] = False + _tls_prow_restart_state["full_restart_done"] = False os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) -def _prepare_tls_prow_llama_restart_env() -> None: - """Set env vars so e2e-ops always recreates the llama pod (no config-only reload). +def is_tls_configuration_feature(context: Context) -> bool: + """Return True when the active Behave feature is ``tls.feature``.""" + feature = getattr(context, "feature", None) + if feature is None: + return False + name = getattr(feature, "name", "") or "" + return "TLS configuration" in name - TLS scenarios change run.yaml and rely on /certs volume mounts; full pod - restarts are slower but more reliable than ``kill 1`` reload on Konflux. - """ + +def _prepare_tls_prow_llama_full_restart_env() -> None: + """Env for a full llama pod recreate (first TLS scenario / recovery).""" os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1" os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) +def _prepare_tls_prow_llama_reload_env() -> None: + """Env for config-only reload (run.yaml already on pod with /certs mounted).""" + os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1" + os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1" + + +def restart_llama_for_tls_feature(context: Context) -> None: + """Restart Llama for TLS tests: one full recreate per feature, then reload. + + Full pod delete+apply after every scenario (~16×) is flaky on Konflux + (cert sync, mount races, port-forward). Later scenarios only change run.yaml + in the ConfigMap; ``oc cp`` + container restart is enough. + """ + from tests.e2e.utils.utils import restart_container + + if not is_prow_environment(): + restart_container("llama-stack") + return + + if _tls_prow_restart_state["full_restart_done"]: + _prepare_tls_prow_llama_reload_env() + else: + _prepare_tls_prow_llama_full_restart_env() + + try: + restart_container("llama-stack") + except Exception: + _tls_prow_restart_state["full_restart_done"] = False + raise + + _tls_prow_restart_state["full_restart_done"] = True + + +def _prepare_tls_prow_llama_restart_env() -> None: + """Set env before writing run.yaml (used by ``_configure_tls``).""" + os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1" + if _tls_prow_restart_state["full_restart_done"]: + os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1" + else: + os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) + + def _cluster_mock_tls_inference_host() -> str: """DNS name of the in-cluster mock TLS inference server (Konflux / Prow).""" explicit = os.getenv("E2E_MOCK_TLS_INFERENCE_HOST", "").strip() diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index a6a594973..20b9f3d7b 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -111,7 +111,15 @@ def restart_pod(container_name: str) -> None: ) op = "restart-lightspeed" timeout = 200 - max_attempts = 2 if op == "restart-llama-stack" else 1 + max_attempts = ( + 3 + if op == "restart-llama-stack" + and os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" + and os.environ.get("E2E_LLAMA_RELOAD_CONFIG_ONLY") != "1" + else 2 + if op == "restart-llama-stack" + else 1 + ) last_error: subprocess.CalledProcessError | subprocess.TimeoutExpired | None = ( None ) @@ -131,11 +139,18 @@ def restart_pod(container_name: str) -> None: except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as err: last_error = err if attempt < max_attempts: + retry_delay = 30 if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" else 20 print( f"⚠️ {op} failed (attempt {attempt}/{max_attempts}), " - "retrying after 20s..." + f"retrying after {retry_delay}s..." ) - time.sleep(20) + time.sleep(retry_delay) + if ( + op == "restart-llama-stack" + and os.environ.get("E2E_LLAMA_RELOAD_CONFIG_ONLY") == "1" + ): + # Reload failed; next attempt does full pod recreate. + os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) if last_error is not None: if isinstance(last_error, subprocess.TimeoutExpired): print(f"Failed to restart pod {container_name}: {last_error}") From 70b69c9b79d3a9a6e66915d26eee750ecbe6c82a Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Mon, 25 May 2026 12:11:02 +0200 Subject: [PATCH 07/20] add logging --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 104 ++++++++++++++++++++++-- tests/e2e/features/steps/tls.py | 8 ++ tests/e2e/utils/prow_utils.py | 51 +++++++++--- 3 files changed, 144 insertions(+), 19 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 0f206c0e3..62167198d 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -38,6 +38,37 @@ E2E_LSC_PORT_FORWARD_PID_FILE="${E2E_LSC_PORT_FORWARD_PID_FILE:-/tmp/e2e-lightsp E2E_LLAMA_PORT_FORWARD_PID_FILE="${E2E_LLAMA_PORT_FORWARD_PID_FILE:-/tmp/e2e-llama-port-forward.pid}" E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-port-forward.pid}" +# Llama restart exit codes (grep Konflux logs for E2E_LLAMA_RESTART_FAILED_PHASE=): +# 10 reload: run.yaml copy failed +# 11 reload: pod not Ready within wait +# 12 reload: in-pod /v1/health failed +# 13 reload: localhost:8321 port-forward failed +# 20 full restart: pod not Ready +# 21 full restart: in-pod /v1/health failed +# 22 full restart: localhost:8321 port-forward failed + +E2E_OPS_CURRENT_PHASE="" +E2E_OPS_PHASE_START=0 + +_e2e_ops_phase() { + if [[ -n "${E2E_OPS_CURRENT_PHASE:-}" && "${E2E_OPS_PHASE_START:-0}" -gt 0 ]]; then + local elapsed=$(( $(date +%s) - E2E_OPS_PHASE_START )) + echo "[e2e-ops] <<< ${E2E_OPS_CURRENT_PHASE} (${elapsed}s)" + fi + E2E_OPS_CURRENT_PHASE="$1" + E2E_OPS_PHASE_START=$(date +%s) + echo "E2E_OPS_PHASE=$1" + echo "[e2e-ops] >>> $1" +} + +_e2e_ops_llama_restart_fail() { + local phase="$1" + local code="$2" + echo "E2E_LLAMA_RESTART_FAILED_PHASE=$phase" + echo "E2E_LLAMA_RESTART_EXIT_CODE=$code" + exit "$code" +} + # ============================================================================ # Helper functions # ============================================================================ @@ -337,26 +368,30 @@ cmd_reload_llama_stack_config() { local llama_pod_name="llama-stack-service" local tmp + echo "E2E_LLAMA_RESTART_MODE=reload" echo "===== Reloading llama-stack run.yaml (container restart, no pod recreate) =====" + _e2e_ops_phase "reload_read_configmap" tmp=$(mktemp) if ! oc get configmap llama-stack-config -n "$NAMESPACE" \ -o jsonpath='{.data.run\.yaml}' >"$tmp"; then rm -f "$tmp" echo "ERROR: failed to read llama-stack-config run.yaml" >&2 - return 1 + _e2e_ops_llama_restart_fail "reload_configmap_read" 10 fi if [[ ! -s "$tmp" ]]; then rm -f "$tmp" echo "ERROR: llama-stack-config run.yaml is empty" >&2 - return 1 + _e2e_ops_llama_restart_fail "reload_configmap_empty" 10 fi + _e2e_ops_phase "reload_oc_cp_run_yaml" if ! oc cp "$tmp" "$NAMESPACE/$llama_pod_name:/opt/app-root/run.yaml" \ -c llama-stack-container; then rm -f "$tmp" echo "ERROR: failed to copy run.yaml into llama-stack pod" >&2 - return 1 + _e2e_ops_llama_restart_fail "reload_oc_cp" 10 fi rm -f "$tmp" + _e2e_ops_phase "reload_kill_container" echo "Restarting llama-stack-container to pick up run.yaml..." oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- bash -c 'kill 1' \ 2>/dev/null || true @@ -366,16 +401,24 @@ cmd_reload_llama_stack_config() { reload_pod_wait=60 reload_health_attempts=50 fi - wait_for_pod "$llama_pod_name" "$reload_pod_wait" + _e2e_ops_phase "reload_wait_pod_ready" + if ! wait_for_pod "$llama_pod_name" "$reload_pod_wait"; then + echo "===== Llama-stack reload FAILED (pod not ready) =====" + oc describe pod llama-stack-service -n "$NAMESPACE" 2>/dev/null | tail -40 || true + _e2e_ops_llama_restart_fail "reload_pod_not_ready" 11 + fi + _e2e_ops_phase "reload_wait_in_pod_health" if ! wait_for_llama_stack_http_health "$reload_health_attempts"; then echo "===== Llama-stack reload FAILED (HTTP not healthy) =====" oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true - return 1 + _e2e_ops_llama_restart_fail "reload_in_pod_health" 12 fi + _e2e_ops_phase "reload_port_forward" if ! cmd_restart_llama_port_forward; then echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed" - return 1 + _e2e_ops_llama_restart_fail "reload_port_forward" 13 fi + _e2e_ops_phase "reload_done" echo "===== Llama-stack config reload complete =====" } @@ -386,10 +429,13 @@ cmd_restart_llama_stack() { return 0 fi echo "WARN: llama config reload failed; falling back to full pod restart" >&2 + echo "E2E_LLAMA_RESTART_FALLBACK=reload_to_full" fi fi + echo "E2E_LLAMA_RESTART_MODE=full" echo "===== Restoring llama-stack service =====" + _e2e_ops_phase "full_delete_pod" # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env. echo "Deleting llama-stack pod (if any) before apply..." timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { @@ -399,6 +445,7 @@ cmd_restart_llama_stack() { echo "Applying pod manifest..." if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then + _e2e_ops_phase "full_apply_manifest" # Interception-proxy e2e: refresh Secret before pod recreate so the volume mount is populated. if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then echo "[e2e-ops] Syncing e2e-interception-proxy-ca secret before llama-stack apply..." @@ -427,10 +474,11 @@ cmd_restart_llama_stack() { llama_pod_wait=120 llama_health_attempts=100 fi + _e2e_ops_phase "full_wait_pod_ready" if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then echo "===== Llama-stack restore FAILED (pod not ready) =====" oc describe pod llama-stack-service -n "$NAMESPACE" 2>/dev/null | tail -40 || true - exit 1 + _e2e_ops_llama_restart_fail "full_pod_not_ready" 20 fi echo "Labeling pod for service..." oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite @@ -446,12 +494,14 @@ cmd_restart_llama_stack() { exit 1 fi fi + _e2e_ops_phase "full_wait_in_pod_health" if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then echo "===== Llama-stack restore FAILED (HTTP not healthy) =====" oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true - exit 1 + _e2e_ops_llama_restart_fail "full_in_pod_health" 21 fi else + _e2e_ops_phase "full_apply_prow_manifest" # Prow: vLLM Llama Stack image (matches pipeline.sh / pipeline-services.sh) # Use sed instead of envsubst to avoid blanking $VAR references in embedded bash scripts sed "s|\${LLAMA_STACK_IMAGE}|${LLAMA_STACK_IMAGE:-}|g" "$MANIFEST_DIR/llama-stack-prow.yaml" | @@ -461,14 +511,46 @@ cmd_restart_llama_stack() { oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite fi + _e2e_ops_phase "full_port_forward" if ! cmd_restart_llama_port_forward; then echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed" - exit 1 + _e2e_ops_llama_restart_fail "full_port_forward" 22 fi + _e2e_ops_phase "full_done" echo "===== Llama-stack restore complete =====" } +cmd_diagnose_llama_restart() { + echo "===== Llama-stack restart diagnostics (namespace=$NAMESPACE) =====" + echo "E2E_KONFLUX_E2E=${E2E_KONFLUX_E2E:-0} E2E_LLAMA_RELOAD_CONFIG_ONLY=${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}" + oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true + echo "--- container restarts / state ---" + oc get pod llama-stack-service -n "$NAMESPACE" \ + -o jsonpath='{.status.containerStatuses[0].restartCount} restarts ready={.status.containerStatuses[0].ready}{"\n"}' 2>&1 || true + echo "--- in-pod GET /v1/health ---" + if _llama_stack_http_health_once; then + echo "OK" + else + echo "FAIL" + fi + echo "--- localhost:${LOCAL_LLAMA_PORT:-8321}/v1/health via port-forward ---" + if verify_llama_local_forward 5; then + echo "OK" + else + echo "FAIL (pipeline PID file: ${E2E_LLAMA_PORT_FORWARD_PID_FILE:-unset})" + if [[ -f "${E2E_LLAMA_PORT_FORWARD_PID_FILE:-}" ]]; then + read -r pf_pid <"${E2E_LLAMA_PORT_FORWARD_PID_FILE}" 2>/dev/null || true + echo "saved_pf_pid=${pf_pid:-} alive=$(kill -0 "$pf_pid" 2>/dev/null && echo yes || echo no)" + fi + fi + echo "--- tls-openai in llama-stack-config (grep) ---" + oc get configmap llama-stack-config -n "$NAMESPACE" -o jsonpath='{.data.run\.yaml}' 2>/dev/null \ + | grep -E 'tls-openai|network:|client_cert|verify:' | head -20 || true + echo "--- llama container log tail ---" + oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=40 2>&1 || true +} + cmd_restart_port_forward() { local local_port="${LOCAL_PORT:-8080}" local remote_port="${REMOTE_PORT:-8080}" @@ -1021,6 +1103,9 @@ case "$COMMAND" in sync-mock-tls-certs-secret) cmd_sync_mock_tls_certs_secret ;; + diagnose-llama-restart) + cmd_diagnose_llama_restart + ;; *) echo "Usage: $0 [args...]" echo "" @@ -1041,6 +1126,7 @@ case "$COMMAND" in echo " deploy-e2e-interception-proxy - Deploy in-cluster interception proxy pod" echo " deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature)" echo " sync-mock-tls-certs-secret - Publish mock TLS /certs PEMs to Secret for llama" + echo " diagnose-llama-restart - Snapshot pod/health/forward/config for debugging" exit 1 ;; esac diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index 5f5da53be..329886fd3 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -81,8 +81,16 @@ def restart_llama_for_tls_feature(context: Context) -> None: if _tls_prow_restart_state["full_restart_done"]: _prepare_tls_prow_llama_reload_env() + mode = "reload" else: _prepare_tls_prow_llama_full_restart_env() + mode = "full" + + scenario = getattr(getattr(context, "scenario", None), "name", "") or "?" + print( + f"[tls.feature] Llama Stack restart: mode={mode} scenario={scenario!r}", + flush=True, + ) try: restart_container("llama-stack") diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index 20b9f3d7b..3dc97d7ac 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -80,6 +80,28 @@ def wait_for_pod_health(pod_name: str, max_attempts: int = 12) -> None: _LIGHTSPEED_RESTART_NAMES = frozenset({"lightspeed-stack", "lightspeed-stack-service"}) +def _print_llama_restart_diagnostics_from_output(output: str) -> None: + """Extract e2e-ops phase markers from restart-llama-stack stdout.""" + markers = ( + "E2E_LLAMA_RESTART_MODE=", + "E2E_LLAMA_RESTART_FAILED_PHASE=", + "E2E_LLAMA_RESTART_FALLBACK=", + "E2E_LLAMA_RESTART_EXIT_CODE=", + "E2E_OPS_PHASE=", + ) + printed = False + for line in output.splitlines(): + if any(line.startswith(m) for m in markers) or "[e2e-ops] <<<" in line: + print(line, flush=True) + printed = True + if printed: + print( + "See docs/e2e_testing.md § Konflux Llama restart diagnostics " + "for phase meanings and fixes.", + flush=True, + ) + + def restart_pod(container_name: str) -> None: """Restart Llama Stack or Lightspeed pod in OpenShift/Prow (not Docker). @@ -96,9 +118,7 @@ def restart_pod(container_name: str) -> None: op = "restart-llama-stack" # TLS feature: full pod recreate + cert sync + health can exceed 7 min on Konflux. timeout = ( - 900 - if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" - else 420 + 900 if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" else 420 ) elif container_name in _LIGHTSPEED_RESTART_NAMES: op = "restart-lightspeed" @@ -116,13 +136,9 @@ def restart_pod(container_name: str) -> None: if op == "restart-llama-stack" and os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" and os.environ.get("E2E_LLAMA_RELOAD_CONFIG_ONLY") != "1" - else 2 - if op == "restart-llama-stack" - else 1 - ) - last_error: subprocess.CalledProcessError | subprocess.TimeoutExpired | None = ( - None + else 2 if op == "restart-llama-stack" else 1 ) + last_error: subprocess.CalledProcessError | subprocess.TimeoutExpired | None = None for attempt in range(1, max_attempts + 1): try: result = run_e2e_ops(op, timeout=timeout) @@ -138,8 +154,23 @@ def restart_pod(container_name: str) -> None: return except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as err: last_error = err + if op == "restart-llama-stack" and isinstance( + err, subprocess.CalledProcessError + ): + _print_llama_restart_diagnostics_from_output( + (err.stdout or "") + (err.stderr or "") + ) + try: + diag = run_e2e_ops("diagnose-llama-restart", timeout=90) + print(diag.stdout, end="") + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + pass if attempt < max_attempts: - retry_delay = 30 if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" else 20 + retry_delay = ( + 30 + if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" + else 20 + ) print( f"⚠️ {op} failed (attempt {attempt}/{max_attempts}), " f"retrying after {retry_delay}s..." From 1fbca1afc962be191790ac08845c42a338a98029 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Mon, 25 May 2026 12:16:45 +0200 Subject: [PATCH 08/20] fix --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 3 ++- tests/e2e/features/steps/tls.py | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 62167198d..fcdeb864b 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -399,7 +399,8 @@ cmd_reload_llama_stack_config() { local reload_health_attempts=35 if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then reload_pod_wait=60 - reload_health_attempts=50 + # Fail reload faster when stack stays unhealthy (then full recreate runs once). + reload_health_attempts=24 fi _e2e_ops_phase "reload_wait_pod_ready" if ! wait_for_pod "$llama_pod_name" "$reload_pod_wait"; then diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index 329886fd3..f1f1693a6 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -34,13 +34,17 @@ } _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False} -_tls_prow_restart_state: dict[str, bool] = {"full_restart_done": False} +_tls_prow_restart_state: dict[str, bool] = { + "full_restart_done": False, + "force_full_restart": False, +} def reset_tls_prow_restart_optimization_state() -> None: """Reset per-feature Prow state (call from ``before_feature``).""" _mock_tls_cluster_deploy_state["done"] = False _tls_prow_restart_state["full_restart_done"] = False + _tls_prow_restart_state["force_full_restart"] = False os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) @@ -79,7 +83,10 @@ def restart_llama_for_tls_feature(context: Context) -> None: restart_container("llama-stack") return - if _tls_prow_restart_state["full_restart_done"]: + if _tls_prow_restart_state.pop("force_full_restart", False): + _prepare_tls_prow_llama_full_restart_env() + mode = "full_forced" + elif _tls_prow_restart_state["full_restart_done"]: _prepare_tls_prow_llama_reload_env() mode = "reload" else: @@ -136,6 +143,8 @@ def _tls_provider_base() -> dict[str, Any]: "api_key": "test-key", "base_url": _mock_tls_base_url(_MOCK_TLS_PORT_TLS), "allowed_models": ["mock-tls-model"], + # Avoid hitting the mock on every container restart (Konflux reload path). + "refresh_models": False, }, } @@ -211,6 +220,7 @@ def _configure_tls(tls_config: dict[str, Any], base_url: Optional[str] = None) - provider["config"]["base_url"] = base_url else: provider["config"]["base_url"] = _mock_tls_base_url(_MOCK_TLS_PORT_TLS) + provider.setdefault("config", {})["refresh_models"] = False provider["config"]["network"]["tls"] = tls_config write_llama_config(config) if is_prow_environment(): @@ -291,6 +301,10 @@ def configure_mtls_wrong_client_cert(context: Context) -> None: }, base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) + # Konflux: reload after this config often never becomes healthy until timeout, + # then falls back to full recreate (~7 min). Skip reload for this scenario. + if is_prow_environment(): + _tls_prow_restart_state["force_full_restart"] = True @given("Llama Stack is configured for mTLS with untrusted client certificate") From 9b30f01517d379bd9dfdeae798fc0432d4c3a3ed Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Mon, 25 May 2026 16:16:25 +0200 Subject: [PATCH 09/20] fix for tls restarts --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 45 +++++++++++++++++++++---- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index fcdeb864b..0850bb75f 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -289,6 +289,30 @@ _llama_stack_http_health_once() { return 1 } +# Before recreating lightspeed-stack, Llama must answer /v1/health (reload can be Ready but not listening yet). +_wait_for_llama_before_lightspeed_restart() { + local max_attempts="${1:-25}" + local attempt + + echo "Waiting for Llama Stack before lightspeed-stack restart..." + for ((attempt=1; attempt<=max_attempts; attempt++)); do + if _llama_stack_http_health_once; then + echo "✓ Llama Stack healthy before LCS restart (attempt $attempt/$max_attempts)" + return 0 + fi + if [[ $attempt -lt $max_attempts ]]; then + sleep 2 + fi + done + echo "⚠️ Llama Stack not healthy after $((max_attempts * 2))s — restoring before LCS restart..." + # Use full recreate (reload may have left the process still starting or wedged). + if ! E2E_LLAMA_RELOAD_CONFIG_ONLY=0 cmd_restart_llama_stack; then + echo "⚠️ Llama Stack restore failed; LCS may be slow to start" + return 1 + fi + return 0 +} + # After the pod is Ready, confirm the process is actually serving HTTP (not only kubelet probes). wait_for_llama_stack_http_health() { local max_attempts="${1:-35}" @@ -319,12 +343,9 @@ cmd_restart_lightspeed() { echo "Restarting lightspeed-stack service..." # LCS hangs at startup if Llama Stack is unreachable (blocks Llama handshake, - # never opens port 8080, readiness probe never passes). Ensure Llama Stack - # is healthy before recreating the LCS pod. - if ! _llama_stack_http_health_once 2>/dev/null; then - echo "⚠️ Llama Stack not healthy — restoring before LCS restart..." - cmd_restart_llama_stack || echo "⚠️ Llama Stack restore failed; LCS may be slow to start" - fi + # never opens port 8080, readiness probe never passes). After a Konflux + # config reload, the pod can be Ready before /v1/health responds — poll first. + _wait_for_llama_before_lightspeed_restart 25 # Delete existing pod (short wait so hook stays within timeout; force if needed) timeout 20 oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { @@ -354,13 +375,23 @@ cmd_restart_lightspeed() { oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite # Re-establish port-forwards (may succeed even if readiness was slow) - cmd_restart_port_forward + local forward_ok=true + if ! cmd_restart_port_forward; then + forward_ok=false + echo "⚠️ Lightspeed port-forward on :${LOCAL_PORT:-8080} failed" + e2e_ops_diagnose_forward_failure + fi cmd_restart_jwks_port_forward || echo "⚠️ Mock JWKS port-forward failed (RBAC tests may fail)" if [[ "$pod_ready" == "false" ]]; then + echo "E2E_LIGHTSPEED_RESTART_FAILED_PHASE=pod_not_ready" echo "⚠️ Lightspeed restart completed but pod was slow to become ready" return 1 fi + if [[ "$forward_ok" == "false" ]]; then + echo "E2E_LIGHTSPEED_RESTART_FAILED_PHASE=port_forward" + return 1 + fi echo "✓ Lightspeed restart complete" } From bdb29dc46321c8bfb248c10ef922aaa15d5f43a4 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Mon, 25 May 2026 22:14:24 +0200 Subject: [PATCH 10/20] fix --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 29 +++++++++++++++++-------- tests/e2e/utils/prow_utils.py | 4 ++-- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 0850bb75f..3cbf125ac 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -361,14 +361,23 @@ cmd_restart_lightspeed() { oc apply -n "$NAMESPACE" -f - # Wait for pod to be ready (TCP probe passes when app listens on 8080). - # Don't let a timeout here abort the function — still attempt port-forward - # and diagnostics so later scenarios have a chance to recover. + # Manifest readiness: initialDelay 20 + failureThreshold 30 * period 5 = up to ~170s. + local lcs_pod_wait=40 + if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then + lcs_pod_wait=65 + fi local pod_ready=true - if ! wait_for_pod "lightspeed-stack-service" 40; then - pod_ready=false - echo "⚠️ Pod not ready within 120s — dumping diagnostics:" - oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -30 || true - oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=40 2>&1 || true + if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then + echo "⚠️ Pod not ready within $((lcs_pod_wait * 3))s — extended wait (Konflux LCS startup)..." + if ! wait_for_pod "lightspeed-stack-service" 25; then + pod_ready=false + echo "⚠️ Pod still not ready — dumping diagnostics:" + oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true + oc logs lightspeed-stack-service -n "$NAMESPACE" \ + -c lightspeed-stack-container --tail=80 2>&1 || true + else + echo "✓ Pod became ready during extended wait" + fi fi # Re-label pod for service discovery @@ -597,8 +606,10 @@ cmd_restart_port_forward() { # Let the kernel release LISTEN sockets after pkill (avoids immediate "address already in use") sleep 3 - # Service can lag endpoints after pod recreate; pod-direct forward is more reliable. - if [[ $attempt -le 2 ]]; then + # Service forward waits for endpoints; after LCS recreate use pod-direct sooner on Konflux. + if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then + pf_resource="pod/lightspeed-stack-service" + elif [[ $attempt -le 2 ]]; then pf_resource="svc/lightspeed-stack-service-svc" else pf_resource="pod/lightspeed-stack-service" diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index 3dc97d7ac..d1df3bd40 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -122,8 +122,8 @@ def restart_pod(container_name: str) -> None: ) elif container_name in _LIGHTSPEED_RESTART_NAMES: op = "restart-lightspeed" - # Pod wait (up to ~120s) + port-forward retries + slow Konflux/Prow clusters. - timeout = 320 + # Konflux LCS: up to ~195s pod wait + extended wait + port-forward retries. + timeout = 420 if os.environ.get("E2E_KONFLUX_E2E") == "1" else 320 else: print( f"Warning: restart_pod({container_name!r}) unknown; " From dcffd894bcf6257608dc65a03885b98201712347 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Tue, 26 May 2026 06:47:14 +0200 Subject: [PATCH 11/20] fix --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 315 ++++-------------------- tests/e2e/features/environment.py | 4 +- tests/e2e/features/steps/proxy.py | 1 - tests/e2e/features/steps/tls.py | 63 +---- tests/e2e/utils/prow_utils.py | 91 +------ 5 files changed, 67 insertions(+), 407 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 3cbf125ac..b5741ef02 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -26,7 +26,7 @@ # deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy (proxy.feature step) # deploy-e2e-interception-proxy - Deploy in-cluster interception proxy (proxy.feature step) # deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature step) -# sync-mock-tls-certs-secret - Publish /certs PEMs to Secret for llama-stack mount +# sync-mock-tls-certs-secret - Publish /certs PEMs to Secret for llama-stack mount set -e @@ -38,37 +38,6 @@ E2E_LSC_PORT_FORWARD_PID_FILE="${E2E_LSC_PORT_FORWARD_PID_FILE:-/tmp/e2e-lightsp E2E_LLAMA_PORT_FORWARD_PID_FILE="${E2E_LLAMA_PORT_FORWARD_PID_FILE:-/tmp/e2e-llama-port-forward.pid}" E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-port-forward.pid}" -# Llama restart exit codes (grep Konflux logs for E2E_LLAMA_RESTART_FAILED_PHASE=): -# 10 reload: run.yaml copy failed -# 11 reload: pod not Ready within wait -# 12 reload: in-pod /v1/health failed -# 13 reload: localhost:8321 port-forward failed -# 20 full restart: pod not Ready -# 21 full restart: in-pod /v1/health failed -# 22 full restart: localhost:8321 port-forward failed - -E2E_OPS_CURRENT_PHASE="" -E2E_OPS_PHASE_START=0 - -_e2e_ops_phase() { - if [[ -n "${E2E_OPS_CURRENT_PHASE:-}" && "${E2E_OPS_PHASE_START:-0}" -gt 0 ]]; then - local elapsed=$(( $(date +%s) - E2E_OPS_PHASE_START )) - echo "[e2e-ops] <<< ${E2E_OPS_CURRENT_PHASE} (${elapsed}s)" - fi - E2E_OPS_CURRENT_PHASE="$1" - E2E_OPS_PHASE_START=$(date +%s) - echo "E2E_OPS_PHASE=$1" - echo "[e2e-ops] >>> $1" -} - -_e2e_ops_llama_restart_fail() { - local phase="$1" - local code="$2" - echo "E2E_LLAMA_RESTART_FAILED_PHASE=$phase" - echo "E2E_LLAMA_RESTART_EXIT_CODE=$code" - exit "$code" -} - # ============================================================================ # Helper functions # ============================================================================ @@ -289,30 +258,6 @@ _llama_stack_http_health_once() { return 1 } -# Before recreating lightspeed-stack, Llama must answer /v1/health (reload can be Ready but not listening yet). -_wait_for_llama_before_lightspeed_restart() { - local max_attempts="${1:-25}" - local attempt - - echo "Waiting for Llama Stack before lightspeed-stack restart..." - for ((attempt=1; attempt<=max_attempts; attempt++)); do - if _llama_stack_http_health_once; then - echo "✓ Llama Stack healthy before LCS restart (attempt $attempt/$max_attempts)" - return 0 - fi - if [[ $attempt -lt $max_attempts ]]; then - sleep 2 - fi - done - echo "⚠️ Llama Stack not healthy after $((max_attempts * 2))s — restoring before LCS restart..." - # Use full recreate (reload may have left the process still starting or wedged). - if ! E2E_LLAMA_RELOAD_CONFIG_ONLY=0 cmd_restart_llama_stack; then - echo "⚠️ Llama Stack restore failed; LCS may be slow to start" - return 1 - fi - return 0 -} - # After the pod is Ready, confirm the process is actually serving HTTP (not only kubelet probes). wait_for_llama_stack_http_health() { local max_attempts="${1:-35}" @@ -343,9 +288,12 @@ cmd_restart_lightspeed() { echo "Restarting lightspeed-stack service..." # LCS hangs at startup if Llama Stack is unreachable (blocks Llama handshake, - # never opens port 8080, readiness probe never passes). After a Konflux - # config reload, the pod can be Ready before /v1/health responds — poll first. - _wait_for_llama_before_lightspeed_restart 25 + # never opens port 8080, readiness probe never passes). Ensure Llama Stack + # is healthy before recreating the LCS pod. + if ! _llama_stack_http_health_once 2>/dev/null; then + echo "⚠️ Llama Stack not healthy — restoring before LCS restart..." + cmd_restart_llama_stack || echo "⚠️ Llama Stack restore failed; LCS may be slow to start" + fi # Delete existing pod (short wait so hook stays within timeout; force if needed) timeout 20 oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { @@ -361,122 +309,36 @@ cmd_restart_lightspeed() { oc apply -n "$NAMESPACE" -f - # Wait for pod to be ready (TCP probe passes when app listens on 8080). - # Manifest readiness: initialDelay 20 + failureThreshold 30 * period 5 = up to ~170s. + # Don't let a timeout here abort the function — still attempt port-forward + # and diagnostics so later scenarios have a chance to recover. + local pod_ready=true local lcs_pod_wait=40 if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then lcs_pod_wait=65 fi - local pod_ready=true if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then - echo "⚠️ Pod not ready within $((lcs_pod_wait * 3))s — extended wait (Konflux LCS startup)..." - if ! wait_for_pod "lightspeed-stack-service" 25; then - pod_ready=false - echo "⚠️ Pod still not ready — dumping diagnostics:" - oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true - oc logs lightspeed-stack-service -n "$NAMESPACE" \ - -c lightspeed-stack-container --tail=80 2>&1 || true - else - echo "✓ Pod became ready during extended wait" - fi + pod_ready=false + echo "⚠️ Pod not ready within 120s — dumping diagnostics:" + oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -30 || true + oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=40 2>&1 || true fi # Re-label pod for service discovery oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite # Re-establish port-forwards (may succeed even if readiness was slow) - local forward_ok=true - if ! cmd_restart_port_forward; then - forward_ok=false - echo "⚠️ Lightspeed port-forward on :${LOCAL_PORT:-8080} failed" - e2e_ops_diagnose_forward_failure - fi + cmd_restart_port_forward cmd_restart_jwks_port_forward || echo "⚠️ Mock JWKS port-forward failed (RBAC tests may fail)" if [[ "$pod_ready" == "false" ]]; then - echo "E2E_LIGHTSPEED_RESTART_FAILED_PHASE=pod_not_ready" echo "⚠️ Lightspeed restart completed but pod was slow to become ready" return 1 fi - if [[ "$forward_ok" == "false" ]]; then - echo "E2E_LIGHTSPEED_RESTART_FAILED_PHASE=port_forward" - return 1 - fi echo "✓ Lightspeed restart complete" } -cmd_reload_llama_stack_config() { - local llama_pod_name="llama-stack-service" - local tmp - - echo "E2E_LLAMA_RESTART_MODE=reload" - echo "===== Reloading llama-stack run.yaml (container restart, no pod recreate) =====" - _e2e_ops_phase "reload_read_configmap" - tmp=$(mktemp) - if ! oc get configmap llama-stack-config -n "$NAMESPACE" \ - -o jsonpath='{.data.run\.yaml}' >"$tmp"; then - rm -f "$tmp" - echo "ERROR: failed to read llama-stack-config run.yaml" >&2 - _e2e_ops_llama_restart_fail "reload_configmap_read" 10 - fi - if [[ ! -s "$tmp" ]]; then - rm -f "$tmp" - echo "ERROR: llama-stack-config run.yaml is empty" >&2 - _e2e_ops_llama_restart_fail "reload_configmap_empty" 10 - fi - _e2e_ops_phase "reload_oc_cp_run_yaml" - if ! oc cp "$tmp" "$NAMESPACE/$llama_pod_name:/opt/app-root/run.yaml" \ - -c llama-stack-container; then - rm -f "$tmp" - echo "ERROR: failed to copy run.yaml into llama-stack pod" >&2 - _e2e_ops_llama_restart_fail "reload_oc_cp" 10 - fi - rm -f "$tmp" - _e2e_ops_phase "reload_kill_container" - echo "Restarting llama-stack-container to pick up run.yaml..." - oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- bash -c 'kill 1' \ - 2>/dev/null || true - local reload_pod_wait=45 - local reload_health_attempts=35 - if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then - reload_pod_wait=60 - # Fail reload faster when stack stays unhealthy (then full recreate runs once). - reload_health_attempts=24 - fi - _e2e_ops_phase "reload_wait_pod_ready" - if ! wait_for_pod "$llama_pod_name" "$reload_pod_wait"; then - echo "===== Llama-stack reload FAILED (pod not ready) =====" - oc describe pod llama-stack-service -n "$NAMESPACE" 2>/dev/null | tail -40 || true - _e2e_ops_llama_restart_fail "reload_pod_not_ready" 11 - fi - _e2e_ops_phase "reload_wait_in_pod_health" - if ! wait_for_llama_stack_http_health "$reload_health_attempts"; then - echo "===== Llama-stack reload FAILED (HTTP not healthy) =====" - oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true - _e2e_ops_llama_restart_fail "reload_in_pod_health" 12 - fi - _e2e_ops_phase "reload_port_forward" - if ! cmd_restart_llama_port_forward; then - echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed" - _e2e_ops_llama_restart_fail "reload_port_forward" 13 - fi - _e2e_ops_phase "reload_done" - echo "===== Llama-stack config reload complete =====" -} - cmd_restart_llama_stack() { - if [[ "${E2E_KONFLUX_E2E:-0}" == "1" && "${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}" == "1" ]]; then - if oc get pod llama-stack-service -n "$NAMESPACE" &>/dev/null; then - if cmd_reload_llama_stack_config; then - return 0 - fi - echo "WARN: llama config reload failed; falling back to full pod restart" >&2 - echo "E2E_LLAMA_RESTART_FALLBACK=reload_to_full" - fi - fi - - echo "E2E_LLAMA_RESTART_MODE=full" echo "===== Restoring llama-stack service =====" - _e2e_ops_phase "full_delete_pod" # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env. echo "Deleting llama-stack pod (if any) before apply..." timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { @@ -486,7 +348,6 @@ cmd_restart_llama_stack() { echo "Applying pod manifest..." if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then - _e2e_ops_phase "full_apply_manifest" # Interception-proxy e2e: refresh Secret before pod recreate so the volume mount is populated. if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then echo "[e2e-ops] Syncing e2e-interception-proxy-ca secret before llama-stack apply..." @@ -495,8 +356,7 @@ cmd_restart_llama_stack() { exit 1 fi fi - if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" \ - && "${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}" != "1" ]]; then + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..." if ! cmd_sync_mock_tls_certs_secret; then echo "===== Llama-stack restore FAILED (mock TLS certs secret sync) =====" @@ -509,18 +369,7 @@ cmd_restart_llama_stack() { -n "$NAMESPACE" \ --dry-run=client -o yaml | oc apply -f - oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml" - local llama_pod_wait=90 - local llama_health_attempts=50 - if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then - llama_pod_wait=120 - llama_health_attempts=100 - fi - _e2e_ops_phase "full_wait_pod_ready" - if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then - echo "===== Llama-stack restore FAILED (pod not ready) =====" - oc describe pod llama-stack-service -n "$NAMESPACE" 2>/dev/null | tail -40 || true - _e2e_ops_llama_restart_fail "full_pod_not_ready" 20 - fi + wait_for_pod "llama-stack-service" 90 echo "Labeling pod for service..." oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then @@ -535,14 +384,15 @@ cmd_restart_llama_stack() { exit 1 fi fi - _e2e_ops_phase "full_wait_in_pod_health" + local llama_health_attempts=50 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + llama_health_attempts=75 + fi if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then echo "===== Llama-stack restore FAILED (HTTP not healthy) =====" - oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true - _e2e_ops_llama_restart_fail "full_in_pod_health" 21 + exit 1 fi else - _e2e_ops_phase "full_apply_prow_manifest" # Prow: vLLM Llama Stack image (matches pipeline.sh / pipeline-services.sh) # Use sed instead of envsubst to avoid blanking $VAR references in embedded bash scripts sed "s|\${LLAMA_STACK_IMAGE}|${LLAMA_STACK_IMAGE:-}|g" "$MANIFEST_DIR/llama-stack-prow.yaml" | @@ -552,46 +402,14 @@ cmd_restart_llama_stack() { oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite fi - _e2e_ops_phase "full_port_forward" if ! cmd_restart_llama_port_forward; then echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed" - _e2e_ops_llama_restart_fail "full_port_forward" 22 + exit 1 fi - _e2e_ops_phase "full_done" echo "===== Llama-stack restore complete =====" } -cmd_diagnose_llama_restart() { - echo "===== Llama-stack restart diagnostics (namespace=$NAMESPACE) =====" - echo "E2E_KONFLUX_E2E=${E2E_KONFLUX_E2E:-0} E2E_LLAMA_RELOAD_CONFIG_ONLY=${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}" - oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true - echo "--- container restarts / state ---" - oc get pod llama-stack-service -n "$NAMESPACE" \ - -o jsonpath='{.status.containerStatuses[0].restartCount} restarts ready={.status.containerStatuses[0].ready}{"\n"}' 2>&1 || true - echo "--- in-pod GET /v1/health ---" - if _llama_stack_http_health_once; then - echo "OK" - else - echo "FAIL" - fi - echo "--- localhost:${LOCAL_LLAMA_PORT:-8321}/v1/health via port-forward ---" - if verify_llama_local_forward 5; then - echo "OK" - else - echo "FAIL (pipeline PID file: ${E2E_LLAMA_PORT_FORWARD_PID_FILE:-unset})" - if [[ -f "${E2E_LLAMA_PORT_FORWARD_PID_FILE:-}" ]]; then - read -r pf_pid <"${E2E_LLAMA_PORT_FORWARD_PID_FILE}" 2>/dev/null || true - echo "saved_pf_pid=${pf_pid:-} alive=$(kill -0 "$pf_pid" 2>/dev/null && echo yes || echo no)" - fi - fi - echo "--- tls-openai in llama-stack-config (grep) ---" - oc get configmap llama-stack-config -n "$NAMESPACE" -o jsonpath='{.data.run\.yaml}' 2>/dev/null \ - | grep -E 'tls-openai|network:|client_cert|verify:' | head -20 || true - echo "--- llama container log tail ---" - oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=40 2>&1 || true -} - cmd_restart_port_forward() { local local_port="${LOCAL_PORT:-8080}" local remote_port="${REMOTE_PORT:-8080}" @@ -606,10 +424,8 @@ cmd_restart_port_forward() { # Let the kernel release LISTEN sockets after pkill (avoids immediate "address already in use") sleep 3 - # Service forward waits for endpoints; after LCS recreate use pod-direct sooner on Konflux. - if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then - pf_resource="pod/lightspeed-stack-service" - elif [[ $attempt -le 2 ]]; then + # Service can lag endpoints after pod recreate; pod-direct forward is more reliable. + if [[ $attempt -le 2 ]]; then pf_resource="svc/lightspeed-stack-service-svc" else pf_resource="pod/lightspeed-stack-service" @@ -681,9 +497,6 @@ cmd_restart_llama_port_forward() { local local_port="${LOCAL_LLAMA_PORT:-8321}" local remote_port="${REMOTE_LLAMA_PORT:-8321}" local max_attempts=6 - if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then - max_attempts=10 - fi local pf_pid local pf_resource local llama_pf_log="/tmp/port-forward-llama.log" @@ -694,10 +507,10 @@ cmd_restart_llama_port_forward() { kill_stale_llama_forward "$local_port" sleep 3 - if [[ $attempt -le 2 ]]; then - pf_resource="svc/llama-stack-service-svc" - else + if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]] || [[ $attempt -ge 3 ]]; then pf_resource="pod/llama-stack-service" + else + pf_resource="svc/llama-stack-service-svc" fi echo "Llama port-forward attempt $attempt/$max_attempts -> $pf_resource" @@ -930,23 +743,8 @@ _MOCK_TLS_CERT_FILES=( expired-client.crt ) -_mock_tls_certs_secret_is_complete() { - local f data - if ! oc get secret e2e-mock-tls-certs -n "$NAMESPACE" &>/dev/null; then - return 1 - fi - for f in "${_MOCK_TLS_CERT_FILES[@]}"; do - data=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \ - -o "go-template={{index .data \"${f}\"}}" 2>/dev/null) || return 1 - if [[ -z "$data" ]]; then - return 1 - fi - done - return 0 -} - cmd_sync_mock_tls_certs_secret() { - local mock_pod_name tmpdir f attempt + local mock_pod_name tmpdir f mock_pod_name=$(oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \ -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || mock_pod_name="" @@ -956,29 +754,19 @@ cmd_sync_mock_tls_certs_secret() { return 1 fi - if _mock_tls_certs_secret_is_complete; then - echo "✓ e2e-mock-tls-certs secret already complete, skipping sync" - return 0 - fi - - if ! oc wait pod/"$mock_pod_name" -n "$NAMESPACE" --for=condition=Ready --timeout=60s 2>/dev/null; then - echo "WARNING: e2e-mock-tls-inference not Ready before cert sync" >&2 - fi - tmpdir=$(mktemp -d) for f in "${_MOCK_TLS_CERT_FILES[@]}"; do - for attempt in 1 2 3; do - if oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \ - cat "/certs/$f" >"$tmpdir/$f" 2>/dev/null && [[ -s "$tmpdir/$f" ]]; then - break - fi - if [[ $attempt -eq 3 ]]; then - echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2 - rm -rf "$tmpdir" - return 1 - fi - sleep 3 - done + if ! oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \ + cat "/certs/$f" >"$tmpdir/$f"; then + echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2 + rm -rf "$tmpdir" + return 1 + fi + if [[ ! -s "$tmpdir/$f" ]]; then + echo "ERROR: /certs/$f is empty in e2e-mock-tls-inference pod" >&2 + rm -rf "$tmpdir" + return 1 + fi done if ! oc create secret generic e2e-mock-tls-certs \ @@ -995,16 +783,11 @@ cmd_sync_mock_tls_certs_secret() { _verify_mock_tls_certs_mounted_in_llama() { local llama_pod_name="llama-stack-service" - local attempt - for attempt in 1 2 3 4 5 6; do - if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ - sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key' \ - 2>/dev/null; then - echo "✓ mock TLS certs present under /certs in llama-stack" - return 0 - fi - sleep 3 - done + if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ + sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key'; then + echo "✓ mock TLS certs present under /certs in llama-stack" + return 0 + fi echo "ERROR: /certs missing or incomplete in llama-stack pod" >&2 oc get secret e2e-mock-tls-certs -n "$NAMESPACE" 2>&1 || true oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ @@ -1146,9 +929,6 @@ case "$COMMAND" in sync-mock-tls-certs-secret) cmd_sync_mock_tls_certs_secret ;; - diagnose-llama-restart) - cmd_diagnose_llama_restart - ;; *) echo "Usage: $0 [args...]" echo "" @@ -1167,9 +947,8 @@ case "$COMMAND" in echo " sync-interception-proxy-ca-secret - Publish trustme CA to Secret for llama mount" echo " deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy pod" echo " deploy-e2e-interception-proxy - Deploy in-cluster interception proxy pod" - echo " deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature)" - echo " sync-mock-tls-certs-secret - Publish mock TLS /certs PEMs to Secret for llama" - echo " diagnose-llama-restart - Snapshot pod/health/forward/config for debugging" + echo " deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature)" + echo " sync-mock-tls-certs-secret - Publish mock TLS /certs to Secret for llama mount" exit 1 ;; esac diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index e97f993a5..025c536e4 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -26,7 +26,7 @@ reset_llama_stack_disrupt_once_tracking, reset_llama_stack_was_running, ) -from tests.e2e.features.steps.tls import reset_tls_prow_restart_optimization_state +from tests.e2e.features.steps.tls import reset_tls_prow_state from tests.e2e.utils.llama_stack_utils import register_shield from tests.e2e.utils.prow_utils import ( restart_pod, @@ -453,7 +453,7 @@ def before_feature(context: Context, feature: Feature) -> None: # One real Llama disruption per feature (module-level flag; survives context resets) reset_llama_stack_disrupt_once_tracking() if feature.filename and "tls.feature" in feature.filename: - reset_tls_prow_restart_optimization_state() + reset_tls_prow_state() try: max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS)) diff --git a/tests/e2e/features/steps/proxy.py b/tests/e2e/features/steps/proxy.py index 374218478..7755cca91 100644 --- a/tests/e2e/features/steps/proxy.py +++ b/tests/e2e/features/steps/proxy.py @@ -306,7 +306,6 @@ def restore_if_modified(context: Context) -> None: _stop_proxy(context, "interception_proxy", "interception_proxy_loop") os.environ.pop("E2E_COPY_INTERCEPTION_CA_TO_LLAMA", None) os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) - os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) if hasattr(context, "needs_interception_ca_on_llama"): delattr(context, "needs_interception_ca_on_llama") diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index f1f1693a6..74620c806 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -34,18 +34,11 @@ } _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False} -_tls_prow_restart_state: dict[str, bool] = { - "full_restart_done": False, - "force_full_restart": False, -} -def reset_tls_prow_restart_optimization_state() -> None: +def reset_tls_prow_state() -> None: """Reset per-feature Prow state (call from ``before_feature``).""" _mock_tls_cluster_deploy_state["done"] = False - _tls_prow_restart_state["full_restart_done"] = False - _tls_prow_restart_state["force_full_restart"] = False - os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) @@ -58,63 +51,26 @@ def is_tls_configuration_feature(context: Context) -> bool: return "TLS configuration" in name -def _prepare_tls_prow_llama_full_restart_env() -> None: - """Env for a full llama pod recreate (first TLS scenario / recovery).""" - os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1" - os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) - - -def _prepare_tls_prow_llama_reload_env() -> None: - """Env for config-only reload (run.yaml already on pod with /certs mounted).""" +def _prepare_tls_prow_llama_restart_env() -> None: + """Set env for full llama pod recreate with mock TLS certs mounted.""" os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1" - os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1" def restart_llama_for_tls_feature(context: Context) -> None: - """Restart Llama for TLS tests: one full recreate per feature, then reload. - - Full pod delete+apply after every scenario (~16×) is flaky on Konflux - (cert sync, mount races, port-forward). Later scenarios only change run.yaml - in the ConfigMap; ``oc cp`` + container restart is enough. - """ + """Restart Llama for TLS tests (full pod recreate on Prow/Konflux).""" from tests.e2e.utils.utils import restart_container if not is_prow_environment(): restart_container("llama-stack") return - if _tls_prow_restart_state.pop("force_full_restart", False): - _prepare_tls_prow_llama_full_restart_env() - mode = "full_forced" - elif _tls_prow_restart_state["full_restart_done"]: - _prepare_tls_prow_llama_reload_env() - mode = "reload" - else: - _prepare_tls_prow_llama_full_restart_env() - mode = "full" - + _prepare_tls_prow_llama_restart_env() scenario = getattr(getattr(context, "scenario", None), "name", "") or "?" print( - f"[tls.feature] Llama Stack restart: mode={mode} scenario={scenario!r}", + f"[tls.feature] Llama Stack restart: full recreate scenario={scenario!r}", flush=True, ) - - try: - restart_container("llama-stack") - except Exception: - _tls_prow_restart_state["full_restart_done"] = False - raise - - _tls_prow_restart_state["full_restart_done"] = True - - -def _prepare_tls_prow_llama_restart_env() -> None: - """Set env before writing run.yaml (used by ``_configure_tls``).""" - os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1" - if _tls_prow_restart_state["full_restart_done"]: - os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1" - else: - os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) + restart_container("llama-stack") def _cluster_mock_tls_inference_host() -> str: @@ -143,7 +99,6 @@ def _tls_provider_base() -> dict[str, Any]: "api_key": "test-key", "base_url": _mock_tls_base_url(_MOCK_TLS_PORT_TLS), "allowed_models": ["mock-tls-model"], - # Avoid hitting the mock on every container restart (Konflux reload path). "refresh_models": False, }, } @@ -301,10 +256,6 @@ def configure_mtls_wrong_client_cert(context: Context) -> None: }, base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) - # Konflux: reload after this config often never becomes healthy until timeout, - # then falls back to full recreate (~7 min). Skip reload for this scenario. - if is_prow_environment(): - _tls_prow_restart_state["force_full_restart"] = True @given("Llama Stack is configured for mTLS with untrusted client certificate") diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index d1df3bd40..58a1866cd 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -7,7 +7,6 @@ import os import subprocess import tempfile -import time from typing import Optional @@ -80,28 +79,6 @@ def wait_for_pod_health(pod_name: str, max_attempts: int = 12) -> None: _LIGHTSPEED_RESTART_NAMES = frozenset({"lightspeed-stack", "lightspeed-stack-service"}) -def _print_llama_restart_diagnostics_from_output(output: str) -> None: - """Extract e2e-ops phase markers from restart-llama-stack stdout.""" - markers = ( - "E2E_LLAMA_RESTART_MODE=", - "E2E_LLAMA_RESTART_FAILED_PHASE=", - "E2E_LLAMA_RESTART_FALLBACK=", - "E2E_LLAMA_RESTART_EXIT_CODE=", - "E2E_OPS_PHASE=", - ) - printed = False - for line in output.splitlines(): - if any(line.startswith(m) for m in markers) or "[e2e-ops] <<<" in line: - print(line, flush=True) - printed = True - if printed: - print( - "See docs/e2e_testing.md § Konflux Llama restart diagnostics " - "for phase meanings and fixes.", - flush=True, - ) - - def restart_pod(container_name: str) -> None: """Restart Llama Stack or Lightspeed pod in OpenShift/Prow (not Docker). @@ -116,13 +93,13 @@ def restart_pod(container_name: str) -> None: """ if container_name in _LLAMA_RESTART_NAMES: op = "restart-llama-stack" - # TLS feature: full pod recreate + cert sync + health can exceed 7 min on Konflux. + # TLS: full pod recreate + cert sync + health on Konflux can exceed 7 min. timeout = ( 900 if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" else 420 ) elif container_name in _LIGHTSPEED_RESTART_NAMES: op = "restart-lightspeed" - # Konflux LCS: up to ~195s pod wait + extended wait + port-forward retries. + # Konflux LCS readiness can take ~195s (probe budget in lightspeed-stack.yaml). timeout = 420 if os.environ.get("E2E_KONFLUX_E2E") == "1" else 320 else: print( @@ -131,61 +108,15 @@ def restart_pod(container_name: str) -> None: ) op = "restart-lightspeed" timeout = 200 - max_attempts = ( - 3 - if op == "restart-llama-stack" - and os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" - and os.environ.get("E2E_LLAMA_RELOAD_CONFIG_ONLY") != "1" - else 2 if op == "restart-llama-stack" else 1 - ) - last_error: subprocess.CalledProcessError | subprocess.TimeoutExpired | None = None - for attempt in range(1, max_attempts + 1): - try: - result = run_e2e_ops(op, timeout=timeout) - print(result.stdout, end="") - if result.returncode != 0: - print(result.stderr, end="") - detail = (result.stderr or result.stdout or "").strip() - raise subprocess.CalledProcessError( - result.returncode, - op, - detail or None, - ) - return - except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as err: - last_error = err - if op == "restart-llama-stack" and isinstance( - err, subprocess.CalledProcessError - ): - _print_llama_restart_diagnostics_from_output( - (err.stdout or "") + (err.stderr or "") - ) - try: - diag = run_e2e_ops("diagnose-llama-restart", timeout=90) - print(diag.stdout, end="") - except (subprocess.CalledProcessError, subprocess.TimeoutExpired): - pass - if attempt < max_attempts: - retry_delay = ( - 30 - if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" - else 20 - ) - print( - f"⚠️ {op} failed (attempt {attempt}/{max_attempts}), " - f"retrying after {retry_delay}s..." - ) - time.sleep(retry_delay) - if ( - op == "restart-llama-stack" - and os.environ.get("E2E_LLAMA_RELOAD_CONFIG_ONLY") == "1" - ): - # Reload failed; next attempt does full pod recreate. - os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None) - if last_error is not None: - if isinstance(last_error, subprocess.TimeoutExpired): - print(f"Failed to restart pod {container_name}: {last_error}") - raise last_error + try: + result = run_e2e_ops(op, timeout=timeout) + print(result.stdout, end="") + if result.returncode != 0: + print(result.stderr, end="") + raise subprocess.CalledProcessError(result.returncode, op) + except subprocess.TimeoutExpired as e: + print(f"Failed to restart pod {container_name}: {e}") + raise def restore_llama_stack_pod() -> None: From dd2e3190dfc7883e6a2065f901a835e69494110b Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Tue, 26 May 2026 09:51:37 +0200 Subject: [PATCH 12/20] fix --- tests/e2e/features/environment.py | 6 ++++- tests/e2e/features/steps/tls.py | 33 ++++++++++++++++++++++++++- tests/e2e/features/tls.feature | 4 ++-- tests/e2e/utils/llama_config_utils.py | 20 ++++++++++++++++ tests/e2e/utils/prow_utils.py | 9 +++++++- 5 files changed, 67 insertions(+), 5 deletions(-) diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index 025c536e4..abcabe577 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -26,7 +26,10 @@ reset_llama_stack_disrupt_once_tracking, reset_llama_stack_was_running, ) -from tests.e2e.features.steps.tls import reset_tls_prow_state +from tests.e2e.features.steps.tls import ( + prepare_tls_feature_entry_on_prow, + reset_tls_prow_state, +) from tests.e2e.utils.llama_stack_utils import register_shield from tests.e2e.utils.prow_utils import ( restart_pod, @@ -454,6 +457,7 @@ def before_feature(context: Context, feature: Feature) -> None: reset_llama_stack_disrupt_once_tracking() if feature.filename and "tls.feature" in feature.filename: reset_tls_prow_state() + prepare_tls_feature_entry_on_prow() try: max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS)) diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index 74620c806..67f5fa360 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -17,10 +17,12 @@ from tests.e2e.utils.llama_config_utils import ( backup_llama_config, + clear_llama_config_backup, load_llama_config, + reset_llama_run_config_to_pipeline_default, write_llama_config, ) -from tests.e2e.utils.prow_utils import get_namespace, run_e2e_ops +from tests.e2e.utils.prow_utils import get_namespace, restart_pod, run_e2e_ops from tests.e2e.utils.utils import is_prow_environment _MOCK_TLS_PORT_TLS = 8443 @@ -40,6 +42,35 @@ def reset_tls_prow_state() -> None: """Reset per-feature Prow state (call from ``before_feature``).""" _mock_tls_cluster_deploy_state["done"] = False os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) + clear_llama_config_backup() + + +def prepare_tls_feature_entry_on_prow() -> None: + """Baseline cluster state when tls.feature runs after other features in test_list. + + Earlier features (disrupted, MCP) delete or reconfigure Llama without mock TLS + certs. Isolated tls.feature runs skip that churn, which is why the same Gherkin + passes alone but flakes mid-feature in the full suite. + """ + if not is_prow_environment(): + return + print("[tls.feature] Prow/Konflux entry: reset run.yaml and warm Llama + mock TLS...") + reset_llama_run_config_to_pipeline_default() + result = run_e2e_ops("deploy-e2e-mock-tls-inference", timeout=300) + print(result.stdout, end="") + if result.returncode != 0: + raise RuntimeError( + "tls.feature entry: deploy-e2e-mock-tls-inference failed: " + f"{result.stderr or result.stdout}" + ) + _mock_tls_cluster_deploy_state["done"] = True + _prepare_tls_prow_llama_restart_env() + os.environ.setdefault( + "E2E_MOCK_TLS_INFERENCE_HOST", + _cluster_mock_tls_inference_host(), + ) + restart_pod("llama-stack") + print("[tls.feature] Prow/Konflux entry baseline complete", flush=True) def is_tls_configuration_feature(context: Context) -> bool: diff --git a/tests/e2e/features/tls.feature b/tests/e2e/features/tls.feature index 412e5b04f..97c089067 100644 --- a/tests/e2e/features/tls.feature +++ b/tests/e2e/features/tls.feature @@ -10,10 +10,10 @@ Feature: TLS configuration for remote inference providers And The system is in default state And REST API service prefix is /v1 And the Lightspeed stack configuration directory is "tests/e2e/configuration" - And The service uses the lightspeed-stack-tls.yaml configuration - And The service is restarted And The original Llama Stack config is restored if modified And The mock TLS inference server is deployed + And The service uses the lightspeed-stack-tls.yaml configuration + And The service is restarted Scenario: Inference succeeds with TLS verification disabled Given Llama Stack is configured with TLS verification disabled diff --git a/tests/e2e/utils/llama_config_utils.py b/tests/e2e/utils/llama_config_utils.py index eb5f67b9d..e8fdf4832 100644 --- a/tests/e2e/utils/llama_config_utils.py +++ b/tests/e2e/utils/llama_config_utils.py @@ -3,6 +3,7 @@ import os import shutil import tempfile +from pathlib import Path from typing import Any, Optional import yaml @@ -20,6 +21,25 @@ _llama_config_backup_key: dict[str, Optional[str]] = {"value": None} +def clear_llama_config_backup() -> None: + """Drop in-memory run.yaml backup (e.g. at start of tls.feature).""" + _llama_config_backup_key["value"] = None + + +def reset_llama_run_config_to_pipeline_default() -> None: + """Reset llama-stack-config run.yaml to Konflux/Prow pipeline seed (run-ci.yaml).""" + if not is_prow_environment(): + return + run_ci = ( + Path(__file__).resolve().parents[1] / "configs" / "run-ci.yaml" + ) + if not run_ci.is_file(): + print(f"WARN: pipeline run.yaml seed not found at {run_ci}", flush=True) + return + print(f"Resetting llama-stack-config from {run_ci.name}...", flush=True) + update_llama_run_configmap(str(run_ci)) + + def _local_llama_config_path() -> str: """Return local run.yaml path for Docker/local e2e execution.""" return os.getenv("E2E_LLAMA_CONFIG_PATH", _DEFAULT_LOCAL_LLAMA_CONFIG_PATH) diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index 58a1866cd..7ac79f3ac 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -113,7 +113,14 @@ def restart_pod(container_name: str) -> None: print(result.stdout, end="") if result.returncode != 0: print(result.stderr, end="") - raise subprocess.CalledProcessError(result.returncode, op) + combined = f"{result.stdout or ''}\n{result.stderr or ''}".strip() + tail = "\n".join(combined.splitlines()[-25:]) if combined else "" + detail = tail or f"exit {result.returncode}" + raise subprocess.CalledProcessError( + result.returncode, + op, + detail, + ) except subprocess.TimeoutExpired as e: print(f"Failed to restart pod {container_name}: {e}") raise From da56ae23ce6e1d474eaaf36cafbbba2b6412169b Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Tue, 26 May 2026 13:31:49 +0200 Subject: [PATCH 13/20] extend llama-stack timeout --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 22 +++++++++++++++++++--- tests/e2e/utils/prow_utils.py | 19 +++++++++++++------ 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index b5741ef02..6a0a4371b 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -339,6 +339,7 @@ cmd_restart_lightspeed() { cmd_restart_llama_stack() { echo "===== Restoring llama-stack service =====" + echo "[e2e-ops] restart-llama-stack env: E2E_KONFLUX_E2E=${E2E_KONFLUX_E2E:-0} E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA=${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env. echo "Deleting llama-stack pod (if any) before apply..." timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { @@ -369,7 +370,19 @@ cmd_restart_llama_stack() { -n "$NAMESPACE" \ --dry-run=client -o yaml | oc apply -f - oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml" - wait_for_pod "llama-stack-service" 90 + local llama_pod_wait=90 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + # readinessProbe: 20s + 36*5s = 200s; clone/enrich/RAG on Konflux often needs 400s+ total. + llama_pod_wait=180 + fi + echo "[e2e-ops] Waiting for llama-stack-service Ready (max ${llama_pod_wait} attempts, $((llama_pod_wait * 3))s)..." + if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then + echo "===== Llama-stack restore FAILED (pod not Ready within $((llama_pod_wait * 3))s) =====" + oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true + oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -50 || true + oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true + exit 1 + fi echo "Labeling pod for service..." oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then @@ -386,7 +399,7 @@ cmd_restart_llama_stack() { fi local llama_health_attempts=50 if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then - llama_health_attempts=75 + llama_health_attempts=100 fi if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then echo "===== Llama-stack restore FAILED (HTTP not healthy) =====" @@ -497,11 +510,14 @@ cmd_restart_llama_port_forward() { local local_port="${LOCAL_LLAMA_PORT:-8321}" local remote_port="${REMOTE_LLAMA_PORT:-8321}" local max_attempts=6 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + max_attempts=10 + fi local pf_pid local pf_resource local llama_pf_log="/tmp/port-forward-llama.log" - echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port..." + echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port (max $max_attempts attempts)..." for ((attempt=1; attempt<=max_attempts; attempt++)); do kill_stale_llama_forward "$local_port" diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index 7ac79f3ac..c94e17c13 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -93,10 +93,14 @@ def restart_pod(container_name: str) -> None: """ if container_name in _LLAMA_RESTART_NAMES: op = "restart-llama-stack" - # TLS: full pod recreate + cert sync + health on Konflux can exceed 7 min. - timeout = ( - 900 if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" else 420 - ) + # Subprocess cap must exceed e2e-ops internal waits (pod + in-pod health + port-forward). + # Konflux TLS full recreate: ~6–12 min typical, 15+ min under load (user-reported 400s+). + if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1": + timeout = 1200 + elif os.environ.get("E2E_KONFLUX_E2E") == "1": + timeout = 720 + else: + timeout = 420 elif container_name in _LIGHTSPEED_RESTART_NAMES: op = "restart-lightspeed" # Konflux LCS readiness can take ~195s (probe budget in lightspeed-stack.yaml). @@ -133,8 +137,11 @@ def restore_llama_stack_pod() -> None: subprocess.CalledProcessError: If oc/e2e-ops restore fails. subprocess.TimeoutExpired: If the operation times out. """ - # wait_for_pod (up to ~180s) + in-pod /v1/health polling (~105s) — allow headroom. - result = run_e2e_ops("restart-llama-stack", timeout=420) + if os.environ.get("E2E_KONFLUX_E2E") == "1": + timeout = 720 + else: + timeout = 420 + result = run_e2e_ops("restart-llama-stack", timeout=timeout) print(result.stdout, end="") if result.returncode != 0: print(result.stderr, end="") From 56ca5dcd8344205cc871347602a05deedbbb7825 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Tue, 26 May 2026 19:12:13 +0200 Subject: [PATCH 14/20] fix sync between mock tls and llama-stack --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 125 +++++++++++++++++++----- 1 file changed, 99 insertions(+), 26 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 6a0a4371b..c72eb9cb2 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -759,42 +759,115 @@ _MOCK_TLS_CERT_FILES=( expired-client.crt ) -cmd_sync_mock_tls_certs_secret() { - local mock_pod_name tmpdir f - mock_pod_name=$(oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \ - -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || mock_pod_name="" +_mock_tls_secret_is_complete() { + local f b64 + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + b64=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \ + -o "go-template={{index .data \"${f}\"}}" 2>/dev/null) || return 1 + [[ -n "$b64" ]] || return 1 + done + return 0 +} + +_get_mock_tls_inference_pod_name() { + oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true +} - if [[ -z "$mock_pod_name" ]]; then - echo "ERROR: no e2e-mock-tls-inference pod in namespace $NAMESPACE" >&2 - echo " Run: e2e-ops.sh deploy-e2e-mock-tls-inference" >&2 +_wait_for_mock_tls_inference_pod() { + if ! oc wait pod -l app=e2e-mock-tls-inference -n "$NAMESPACE" \ + --for=condition=Ready --timeout=120s 2>/dev/null; then + echo "ERROR: e2e-mock-tls-inference pod not Ready" >&2 + oc get pods -n "$NAMESPACE" -l app=e2e-mock-tls-inference -o wide 2>&1 || true return 1 fi + return 0 +} - tmpdir=$(mktemp -d) - for f in "${_MOCK_TLS_CERT_FILES[@]}"; do - if ! oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \ - cat "/certs/$f" >"$tmpdir/$f"; then - echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2 - rm -rf "$tmpdir" +_copy_mock_tls_cert_from_pod() { + local mock_pod_name="$1" + local cert_file="$2" + local dest="$3" + local attempt + + for ((attempt=1; attempt<=4; attempt++)); do + if oc exec --request-timeout=90 -n "$NAMESPACE" "$mock_pod_name" \ + -c e2e-mock-tls-inference -- cat "/certs/$cert_file" >"$dest" 2>/dev/null \ + && [[ -s "$dest" ]]; then + return 0 + fi + echo "[e2e-ops] WARN: read /certs/$cert_file from mock pod failed (attempt $attempt/4)" + sleep 5 + done + return 1 +} + +_recycle_mock_tls_inference_pod() { + echo "[e2e-ops] Recycling e2e-mock-tls-inference pod (stale or unresponsive)..." + oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true + sleep 3 + if ! _wait_for_mock_tls_inference_pod; then + return 1 + fi + # Certs are written at container start; allow trustme + pip to finish. + sleep 10 + return 0 +} + +cmd_sync_mock_tls_certs_secret() { + local mock_pod_name tmpdir f recycle_attempt + + if _mock_tls_secret_is_complete; then + echo "✓ Secret e2e-mock-tls-certs already complete (${#_MOCK_TLS_CERT_FILES[@]} keys); skipping sync" + return 0 + fi + + for recycle_attempt in 1 2; do + mock_pod_name=$(_get_mock_tls_inference_pod_name) + if [[ -z "$mock_pod_name" ]]; then + echo "ERROR: no e2e-mock-tls-inference pod in namespace $NAMESPACE" >&2 + echo " Run: e2e-ops.sh deploy-e2e-mock-tls-inference" >&2 return 1 fi - if [[ ! -s "$tmpdir/$f" ]]; then - echo "ERROR: /certs/$f is empty in e2e-mock-tls-inference pod" >&2 - rm -rf "$tmpdir" + + if ! _wait_for_mock_tls_inference_pod; then + [[ $recycle_attempt -lt 2 ]] && _recycle_mock_tls_inference_pod && continue return 1 fi - done - if ! oc create secret generic e2e-mock-tls-certs \ - --from-file="$tmpdir" \ - -n "$NAMESPACE" \ - --dry-run=client -o yaml | oc apply -f -; then - echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2 + tmpdir=$(mktemp -d) + local sync_ok=true + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + if ! _copy_mock_tls_cert_from_pod "$mock_pod_name" "$f" "$tmpdir/$f"; then + echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2 + oc logs "$mock_pod_name" -n "$NAMESPACE" -c e2e-mock-tls-inference --tail=40 2>&1 \ + | sed 's/^/[e2e-ops] /' || true + sync_ok=false + break + fi + done + + if [[ "$sync_ok" == "true" ]]; then + if ! oc create secret generic e2e-mock-tls-certs \ + --from-file="$tmpdir" \ + -n "$NAMESPACE" \ + --dry-run=client -o yaml | oc apply -f -; then + echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2 + rm -rf "$tmpdir" + return 1 + fi + rm -rf "$tmpdir" + echo "✓ Secret e2e-mock-tls-certs updated (${#_MOCK_TLS_CERT_FILES[@]} files)" + return 0 + fi + rm -rf "$tmpdir" - return 1 - fi - rm -rf "$tmpdir" - echo "✓ Secret e2e-mock-tls-certs updated (${#_MOCK_TLS_CERT_FILES[@]} files)" + if [[ $recycle_attempt -lt 2 ]]; then + _recycle_mock_tls_inference_pod || return 1 + fi + done + + return 1 } _verify_mock_tls_certs_mounted_in_llama() { From f1d29a89f9fb9bcab63de35d11cda4d0ab60512c Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Tue, 26 May 2026 22:27:51 +0200 Subject: [PATCH 15/20] fix lightspeed-stack restart time --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 53 ++++++++++++++++++------- tests/e2e/utils/prow_utils.py | 4 +- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index c72eb9cb2..e3c0b918b 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -45,18 +45,30 @@ E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks- wait_for_pod() { local pod_name="$1" local max_attempts="${2:-24}" - + local attempt + local ready + local phase + for ((attempt=1; attempt<=max_attempts; attempt++)); do - local ready - ready=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false") - if [[ "$ready" == "true" ]]; then - echo "✓ Pod $pod_name ready" - return 0 + if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then + phase="Missing" + else + phase=$(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown") + ready=$(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false") + if [[ "$ready" == "true" ]]; then + echo "✓ Pod $pod_name ready (attempt $attempt/$max_attempts)" + return 0 + fi + fi + if [[ $((attempt % 10)) -eq 0 ]]; then + echo "[e2e-ops] $pod_name not ready yet (attempt $attempt/$max_attempts, phase=${phase:-?})..." fi sleep 3 done - - echo "Pod $pod_name not ready after $((max_attempts * 3))s" + + echo "Pod $pod_name not ready after $((max_attempts * 3))s (last phase: ${phase:-unknown})" return 1 } @@ -314,17 +326,30 @@ cmd_restart_lightspeed() { local pod_ready=true local lcs_pod_wait=40 if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then - lcs_pod_wait=65 + # readinessProbe: 20s + 30*5s; LCS + Llama handshake can exceed 195s on Konflux (TLS suite). + lcs_pod_wait=100 fi + echo "[e2e-ops] Waiting for lightspeed-stack-service Ready (max ${lcs_pod_wait} attempts, $((lcs_pod_wait * 3))s)..." if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then pod_ready=false - echo "⚠️ Pod not ready within 120s — dumping diagnostics:" - oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -30 || true - oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=40 2>&1 || true + echo "⚠️ Pod not ready within $((lcs_pod_wait * 3))s — dumping diagnostics:" + if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then + oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true + oc logs lightspeed-stack-service -n "$NAMESPACE" -c lightspeed-stack-container --tail=60 2>&1 || true + else + echo "[e2e-ops] lightspeed-stack-service pod not found in namespace $NAMESPACE" + oc get pods -n "$NAMESPACE" 2>&1 | grep -E 'lightspeed|NAME' || true + oc get events -n "$NAMESPACE" --field-selector involvedObject.name=lightspeed-stack-service \ + --sort-by='.lastTimestamp' 2>&1 | tail -15 || true + fi fi - # Re-label pod for service discovery - oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite + # Re-label pod for service discovery (ignore if pod was deleted / not created yet) + if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then + oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite + else + echo "⚠️ Cannot label lightspeed-stack-service — pod missing" + fi # Re-establish port-forwards (may succeed even if readiness was slow) cmd_restart_port_forward diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index c94e17c13..263b415a3 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -103,8 +103,8 @@ def restart_pod(container_name: str) -> None: timeout = 420 elif container_name in _LIGHTSPEED_RESTART_NAMES: op = "restart-lightspeed" - # Konflux LCS readiness can take ~195s (probe budget in lightspeed-stack.yaml). - timeout = 420 if os.environ.get("E2E_KONFLUX_E2E") == "1" else 320 + # Konflux LCS: TCP readiness + Llama handshake; TLS suite often needs 200–300s. + timeout = 480 if os.environ.get("E2E_KONFLUX_E2E") == "1" else 320 else: print( f"Warning: restart_pod({container_name!r}) unknown; " From 4079a6ab02c46f2fe372bae70047e4b4f48bace1 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Wed, 27 May 2026 13:29:30 +0200 Subject: [PATCH 16/20] print logs --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 336 +++++++----------------- 1 file changed, 94 insertions(+), 242 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index e3c0b918b..7924485e8 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -25,8 +25,6 @@ # disrupt-llama-stack - Delete llama-stack pod to disrupt connection # deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy (proxy.feature step) # deploy-e2e-interception-proxy - Deploy in-cluster interception proxy (proxy.feature step) -# deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature step) -# sync-mock-tls-certs-secret - Publish /certs PEMs to Secret for llama-stack mount set -e @@ -42,33 +40,101 @@ E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks- # Helper functions # ============================================================================ +# On failure, print everything useful to stdout (captured by Behave / CI). +# Tolerates missing pods (uses events) and partial API errors. +e2e_ops_dump_pod_logs() { + local pod_name="${1:?pod name required}" + local preferred_container="${2:-}" + local log_tail="${3:-200}" + local prefix="[e2e-ops] " + local init_ctr ctr restart_count phase + + echo "${prefix}========== failure logs: pod/$pod_name (namespace $NAMESPACE) ==========" + + echo "${prefix}--- events for pod/$pod_name ---" + oc get events -n "$NAMESPACE" --field-selector "involvedObject.name=${pod_name}" \ + --sort-by='.lastTimestamp' 2>&1 | tail -50 | sed "s/^/${prefix}/" \ + || echo "${prefix}(could not list events)" + + if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then + phase="Missing" + echo "${prefix}pod/$pod_name not found in namespace (deleted, failed, or API error)" + echo "${prefix}--- pods in $NAMESPACE ---" + oc get pods -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true + echo "${prefix}========== end failure logs: pod/$pod_name ==========" + return 0 + fi + + phase=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "?") + echo "${prefix}pod phase=$phase" + oc get pod "$pod_name" -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true + + echo "${prefix}--- oc describe pod/$pod_name ---" + oc describe pod "$pod_name" -n "$NAMESPACE" 2>&1 | sed "s/^/${prefix}/" || true + + for init_ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath='{.spec.initContainers[*].name}' 2>/dev/null); do + [[ -n "$init_ctr" ]] || continue + echo "${prefix}--- oc logs pod/$pod_name -c $init_ctr (init, tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$init_ctr" --tail="$log_tail" 2>&1 \ + | sed "s/^/${prefix}/" \ + || echo "${prefix}(no init logs for $init_ctr yet)" + done + + for ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath='{.spec.containers[*].name}' 2>/dev/null); do + [[ -n "$ctr" ]] || continue + echo "${prefix}--- oc logs pod/$pod_name -c $ctr (tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --tail="$log_tail" 2>&1 \ + | sed "s/^/${prefix}/" \ + || echo "${prefix}(no logs for container $ctr)" + restart_count=$(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath="{.status.containerStatuses[?(@.name==\"${ctr}\")].restartCount}" \ + 2>/dev/null) || restart_count="0" + if [[ "${restart_count:-0}" -gt 0 ]]; then + echo "${prefix}--- oc logs pod/$pod_name -c $ctr --previous (tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --previous --tail="$log_tail" 2>&1 \ + | sed "s/^/${prefix}/" \ + || echo "${prefix}(no --previous logs for $ctr)" + fi + done + + if [[ -n "$preferred_container" ]]; then + echo "${prefix}--- oc logs pod/$pod_name -c $preferred_container (preferred, tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$preferred_container" --tail="$log_tail" 2>&1 \ + | sed "s/^/${prefix}/" || true + fi + + echo "${prefix}========== end failure logs: pod/$pod_name ==========" +} + wait_for_pod() { local pod_name="$1" local max_attempts="${2:-24}" - local attempt - local ready - local phase + local attempt phase ready for ((attempt=1; attempt<=max_attempts; attempt++)); do if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then phase="Missing" + ready="false" else phase=$(oc get pod "$pod_name" -n "$NAMESPACE" \ - -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown") + -o jsonpath='{.status.phase}' 2>/dev/null || echo "?") ready=$(oc get pod "$pod_name" -n "$NAMESPACE" \ -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false") - if [[ "$ready" == "true" ]]; then - echo "✓ Pod $pod_name ready (attempt $attempt/$max_attempts)" - return 0 - fi + fi + if [[ "$ready" == "true" ]]; then + echo "✓ Pod $pod_name ready (attempt $attempt/$max_attempts)" + return 0 fi if [[ $((attempt % 10)) -eq 0 ]]; then - echo "[e2e-ops] $pod_name not ready yet (attempt $attempt/$max_attempts, phase=${phase:-?})..." + echo "[e2e-ops] $pod_name not ready (attempt $attempt/$max_attempts, phase=$phase)" fi sleep 3 done echo "Pod $pod_name not ready after $((max_attempts * 3))s (last phase: ${phase:-unknown})" + e2e_ops_dump_pod_logs "$pod_name" "" 250 return 1 } @@ -286,9 +352,7 @@ wait_for_llama_stack_http_health() { fi done echo "ERROR: Llama Stack did not respond on http://127.0.0.1:8321/v1/health inside the pod" - oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true - oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true - oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=120 2>&1 || true + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 250 return 1 } @@ -324,32 +388,13 @@ cmd_restart_lightspeed() { # Don't let a timeout here abort the function — still attempt port-forward # and diagnostics so later scenarios have a chance to recover. local pod_ready=true - local lcs_pod_wait=40 - if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then - # readinessProbe: 20s + 30*5s; LCS + Llama handshake can exceed 195s on Konflux (TLS suite). - lcs_pod_wait=100 - fi - echo "[e2e-ops] Waiting for lightspeed-stack-service Ready (max ${lcs_pod_wait} attempts, $((lcs_pod_wait * 3))s)..." - if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then + if ! wait_for_pod "lightspeed-stack-service" 40; then pod_ready=false - echo "⚠️ Pod not ready within $((lcs_pod_wait * 3))s — dumping diagnostics:" - if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then - oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true - oc logs lightspeed-stack-service -n "$NAMESPACE" -c lightspeed-stack-container --tail=60 2>&1 || true - else - echo "[e2e-ops] lightspeed-stack-service pod not found in namespace $NAMESPACE" - oc get pods -n "$NAMESPACE" 2>&1 | grep -E 'lightspeed|NAME' || true - oc get events -n "$NAMESPACE" --field-selector involvedObject.name=lightspeed-stack-service \ - --sort-by='.lastTimestamp' 2>&1 | tail -15 || true - fi + echo "⚠️ Pod not ready within 120s" fi - # Re-label pod for service discovery (ignore if pod was deleted / not created yet) - if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then - oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite - else - echo "⚠️ Cannot label lightspeed-stack-service — pod missing" - fi + # Re-label pod for service discovery + oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite # Re-establish port-forwards (may succeed even if readiness was slow) cmd_restart_port_forward @@ -357,6 +402,7 @@ cmd_restart_lightspeed() { if [[ "$pod_ready" == "false" ]]; then echo "⚠️ Lightspeed restart completed but pod was slow to become ready" + e2e_ops_dump_pod_logs "lightspeed-stack-service" "lightspeed-stack-container" 200 return 1 fi echo "✓ Lightspeed restart complete" @@ -364,7 +410,6 @@ cmd_restart_lightspeed() { cmd_restart_llama_stack() { echo "===== Restoring llama-stack service =====" - echo "[e2e-ops] restart-llama-stack env: E2E_KONFLUX_E2E=${E2E_KONFLUX_E2E:-0} E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA=${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env. echo "Deleting llama-stack pod (if any) before apply..." timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { @@ -382,52 +427,25 @@ cmd_restart_llama_stack() { exit 1 fi fi - if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then - echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..." - if ! cmd_sync_mock_tls_certs_secret; then - echo "===== Llama-stack restore FAILED (mock TLS certs secret sync) =====" - exit 1 - fi - fi _LLAMA_SVC_FQDN="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local" oc create secret generic llama-stack-ip-secret \ --from-literal=key="$_LLAMA_SVC_FQDN" \ -n "$NAMESPACE" \ --dry-run=client -o yaml | oc apply -f - oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml" - local llama_pod_wait=90 - if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then - # readinessProbe: 20s + 36*5s = 200s; clone/enrich/RAG on Konflux often needs 400s+ total. - llama_pod_wait=180 - fi - echo "[e2e-ops] Waiting for llama-stack-service Ready (max ${llama_pod_wait} attempts, $((llama_pod_wait * 3))s)..." - if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then - echo "===== Llama-stack restore FAILED (pod not Ready within $((llama_pod_wait * 3))s) =====" - oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true - oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -50 || true - oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true - exit 1 - fi + wait_for_pod "llama-stack-service" 90 echo "Labeling pod for service..." oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then if ! _verify_interception_ca_mounted_in_llama; then echo "===== Llama-stack restore FAILED (interception CA not mounted) =====" + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200 exit 1 fi fi - if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then - if ! _verify_mock_tls_certs_mounted_in_llama; then - echo "===== Llama-stack restore FAILED (mock TLS certs not mounted) =====" - exit 1 - fi - fi - local llama_health_attempts=50 - if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then - llama_health_attempts=100 - fi - if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then + if ! wait_for_llama_stack_http_health 50; then echo "===== Llama-stack restore FAILED (HTTP not healthy) =====" + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 250 exit 1 fi else @@ -442,6 +460,7 @@ cmd_restart_llama_stack() { if ! cmd_restart_llama_port_forward; then echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed" + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200 exit 1 fi @@ -535,23 +554,20 @@ cmd_restart_llama_port_forward() { local local_port="${LOCAL_LLAMA_PORT:-8321}" local remote_port="${REMOTE_LLAMA_PORT:-8321}" local max_attempts=6 - if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then - max_attempts=10 - fi local pf_pid local pf_resource local llama_pf_log="/tmp/port-forward-llama.log" - echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port (max $max_attempts attempts)..." + echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port..." for ((attempt=1; attempt<=max_attempts; attempt++)); do kill_stale_llama_forward "$local_port" sleep 3 - if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]] || [[ $attempt -ge 3 ]]; then - pf_resource="pod/llama-stack-service" - else + if [[ $attempt -le 2 ]]; then pf_resource="svc/llama-stack-service-svc" + else + pf_resource="pod/llama-stack-service" fi echo "Llama port-forward attempt $attempt/$max_attempts -> $pf_resource" @@ -589,8 +605,10 @@ cmd_restart_llama_port_forward() { echo "Failed to establish Llama Stack port-forward on :$local_port" if [[ -s "$llama_pf_log" ]]; then + echo "[e2e-ops] $llama_pf_log (tail 30):" tail -30 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true fi + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200 return 1 } @@ -773,142 +791,6 @@ cmd_copy_interception_proxy_ca_to_llama() { cmd_sync_interception_proxy_ca_secret } -_MOCK_TLS_CERT_FILES=( - ca.crt - client.crt - client.key - untrusted-ca.crt - expired-ca.crt - untrusted-client.crt - untrusted-client.key - expired-client.crt -) - -_mock_tls_secret_is_complete() { - local f b64 - for f in "${_MOCK_TLS_CERT_FILES[@]}"; do - b64=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \ - -o "go-template={{index .data \"${f}\"}}" 2>/dev/null) || return 1 - [[ -n "$b64" ]] || return 1 - done - return 0 -} - -_get_mock_tls_inference_pod_name() { - oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \ - -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true -} - -_wait_for_mock_tls_inference_pod() { - if ! oc wait pod -l app=e2e-mock-tls-inference -n "$NAMESPACE" \ - --for=condition=Ready --timeout=120s 2>/dev/null; then - echo "ERROR: e2e-mock-tls-inference pod not Ready" >&2 - oc get pods -n "$NAMESPACE" -l app=e2e-mock-tls-inference -o wide 2>&1 || true - return 1 - fi - return 0 -} - -_copy_mock_tls_cert_from_pod() { - local mock_pod_name="$1" - local cert_file="$2" - local dest="$3" - local attempt - - for ((attempt=1; attempt<=4; attempt++)); do - if oc exec --request-timeout=90 -n "$NAMESPACE" "$mock_pod_name" \ - -c e2e-mock-tls-inference -- cat "/certs/$cert_file" >"$dest" 2>/dev/null \ - && [[ -s "$dest" ]]; then - return 0 - fi - echo "[e2e-ops] WARN: read /certs/$cert_file from mock pod failed (attempt $attempt/4)" - sleep 5 - done - return 1 -} - -_recycle_mock_tls_inference_pod() { - echo "[e2e-ops] Recycling e2e-mock-tls-inference pod (stale or unresponsive)..." - oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true - sleep 3 - if ! _wait_for_mock_tls_inference_pod; then - return 1 - fi - # Certs are written at container start; allow trustme + pip to finish. - sleep 10 - return 0 -} - -cmd_sync_mock_tls_certs_secret() { - local mock_pod_name tmpdir f recycle_attempt - - if _mock_tls_secret_is_complete; then - echo "✓ Secret e2e-mock-tls-certs already complete (${#_MOCK_TLS_CERT_FILES[@]} keys); skipping sync" - return 0 - fi - - for recycle_attempt in 1 2; do - mock_pod_name=$(_get_mock_tls_inference_pod_name) - if [[ -z "$mock_pod_name" ]]; then - echo "ERROR: no e2e-mock-tls-inference pod in namespace $NAMESPACE" >&2 - echo " Run: e2e-ops.sh deploy-e2e-mock-tls-inference" >&2 - return 1 - fi - - if ! _wait_for_mock_tls_inference_pod; then - [[ $recycle_attempt -lt 2 ]] && _recycle_mock_tls_inference_pod && continue - return 1 - fi - - tmpdir=$(mktemp -d) - local sync_ok=true - for f in "${_MOCK_TLS_CERT_FILES[@]}"; do - if ! _copy_mock_tls_cert_from_pod "$mock_pod_name" "$f" "$tmpdir/$f"; then - echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2 - oc logs "$mock_pod_name" -n "$NAMESPACE" -c e2e-mock-tls-inference --tail=40 2>&1 \ - | sed 's/^/[e2e-ops] /' || true - sync_ok=false - break - fi - done - - if [[ "$sync_ok" == "true" ]]; then - if ! oc create secret generic e2e-mock-tls-certs \ - --from-file="$tmpdir" \ - -n "$NAMESPACE" \ - --dry-run=client -o yaml | oc apply -f -; then - echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2 - rm -rf "$tmpdir" - return 1 - fi - rm -rf "$tmpdir" - echo "✓ Secret e2e-mock-tls-certs updated (${#_MOCK_TLS_CERT_FILES[@]} files)" - return 0 - fi - - rm -rf "$tmpdir" - if [[ $recycle_attempt -lt 2 ]]; then - _recycle_mock_tls_inference_pod || return 1 - fi - done - - return 1 -} - -_verify_mock_tls_certs_mounted_in_llama() { - local llama_pod_name="llama-stack-service" - if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ - sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key'; then - echo "✓ mock TLS certs present under /certs in llama-stack" - return 0 - fi - echo "ERROR: /certs missing or incomplete in llama-stack pod" >&2 - oc get secret e2e-mock-tls-certs -n "$NAMESPACE" 2>&1 || true - oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ - ls -la /certs 2>&1 || true - return 1 -} - _e2e_repo_root() { cd "$SCRIPT_DIR/../../../.." && pwd } @@ -945,28 +827,6 @@ cmd_deploy_e2e_interception_proxy() { echo "✓ e2e-interception-proxy ready at http://e2e-interception-proxy.${NAMESPACE}.svc.cluster.local:8889" } -cmd_deploy_e2e_mock_tls_inference() { - local repo_root - repo_root="$(_e2e_repo_root)" - echo "Deploying e2e-mock-tls-inference in namespace $NAMESPACE..." - oc create configmap e2e-mock-tls-inference-script -n "$NAMESPACE" \ - --from-file=server.py="$repo_root/tests/e2e/mock_tls_inference_server/server.py" \ - --dry-run=client -o yaml | oc apply -f - - oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true - oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml" - if ! oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then - echo "ERROR: e2e-mock-tls-inference failed to become ready" >&2 - oc describe pod/e2e-mock-tls-inference -n "$NAMESPACE" 2>/dev/null | tail -30 || true - oc logs e2e-mock-tls-inference -n "$NAMESPACE" --tail=40 2>&1 || true - return 1 - fi - echo "✓ e2e-mock-tls-inference ready at https://e2e-mock-tls-inference.${NAMESPACE}.svc.cluster.local:8443" - if ! cmd_sync_mock_tls_certs_secret; then - echo "WARNING: mock TLS server is up but e2e-mock-tls-certs secret sync failed" >&2 - return 1 - fi -} - cmd_disrupt_llama_stack() { local pod_name="llama-stack-service" @@ -1037,12 +897,6 @@ case "$COMMAND" in deploy-e2e-interception-proxy) cmd_deploy_e2e_interception_proxy ;; - deploy-e2e-mock-tls-inference) - cmd_deploy_e2e_mock_tls_inference - ;; - sync-mock-tls-certs-secret) - cmd_sync_mock_tls_certs_secret - ;; *) echo "Usage: $0 [args...]" echo "" @@ -1061,8 +915,6 @@ case "$COMMAND" in echo " sync-interception-proxy-ca-secret - Publish trustme CA to Secret for llama mount" echo " deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy pod" echo " deploy-e2e-interception-proxy - Deploy in-cluster interception proxy pod" - echo " deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature)" - echo " sync-mock-tls-certs-secret - Publish mock TLS /certs to Secret for llama mount" exit 1 ;; esac From d40101359c3abc839fddc3d6398e92b706a6f4b1 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Wed, 27 May 2026 18:04:38 +0200 Subject: [PATCH 17/20] print logs --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 136 ++++++++++++++++++++++++ tests/e2e/features/environment.py | 6 +- tests/e2e/features/steps/tls.py | 59 +++++----- tests/e2e/utils/prow_utils.py | 2 +- 4 files changed, 165 insertions(+), 38 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 7924485e8..f315bfe12 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -25,6 +25,8 @@ # disrupt-llama-stack - Delete llama-stack pod to disrupt connection # deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy (proxy.feature step) # deploy-e2e-interception-proxy - Deploy in-cluster interception proxy (proxy.feature step) +# deploy-e2e-mock-tls-inference - Deploy in-cluster mock TLS server (tls.feature) +# reload-llama-stack-config - Apply ConfigMap run.yaml in pod and restart main process set -e @@ -356,6 +358,46 @@ wait_for_llama_stack_http_health() { return 1 } +# Copy llama-stack-config run.yaml into the running pod and restart PID 1 (no init rerun). +cmd_reload_llama_stack_config() { + local pod="llama-stack-service" + local ctr="llama-stack-container" + local tmp + + if ! oc get pod "$pod" -n "$NAMESPACE" &>/dev/null; then + echo "ERROR: $pod not found; use restart-llama-stack first" >&2 + return 1 + fi + + tmp=$(mktemp) + if ! oc get configmap llama-stack-config -n "$NAMESPACE" \ + -o "go-template={{index .data \"run.yaml\"}}" >"$tmp" 2>/dev/null \ + || [[ ! -s "$tmp" ]]; then + rm -f "$tmp" + echo "ERROR: could not read run.yaml from llama-stack-config" >&2 + return 1 + fi + + echo "Reloading Llama Stack run.yaml in $pod (container restart, not pod delete)..." + oc cp "$tmp" "$NAMESPACE/$pod:/opt/app-root/run.yaml" -c "$ctr" || { + rm -f "$tmp" + return 1 + } + rm -f "$tmp" + oc exec -n "$NAMESPACE" "$pod" -c "$ctr" -- kill -1 + + if ! wait_for_pod "$pod" 40; then + echo "===== Llama-stack reload FAILED (pod not Ready) =====" + return 1 + fi + if ! wait_for_llama_stack_http_health 40; then + echo "===== Llama-stack reload FAILED (HTTP not healthy) =====" + return 1 + fi + cmd_restart_llama_port_forward || return 1 + echo "===== Llama-stack reload complete =====" +} + # ============================================================================ # Command implementations # ============================================================================ @@ -427,6 +469,12 @@ cmd_restart_llama_stack() { exit 1 fi fi + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + if ! cmd_sync_mock_tls_certs_secret; then + echo "===== Llama-stack restore FAILED (mock TLS cert secret sync) =====" + exit 1 + fi + fi _LLAMA_SVC_FQDN="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local" oc create secret generic llama-stack-ip-secret \ --from-literal=key="$_LLAMA_SVC_FQDN" \ @@ -827,6 +875,86 @@ cmd_deploy_e2e_interception_proxy() { echo "✓ e2e-interception-proxy ready at http://e2e-interception-proxy.${NAMESPACE}.svc.cluster.local:8889" } +_MOCK_TLS_CERT_FILES=( + ca.crt client.crt client.key untrusted-ca.crt expired-ca.crt + untrusted-client.crt untrusted-client.key expired-client.crt +) + +_mock_tls_certs_secret_is_complete() { + local f present + if ! oc get secret e2e-mock-tls-certs -n "$NAMESPACE" &>/dev/null; then + return 1 + fi + present=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \ + -o go-template='{{range $k, $v := .data}}{{$k}} {{end}}' 2>/dev/null) || return 1 + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + [[ " $present " == *" $f "* ]] || return 1 + done + return 0 +} + +cmd_sync_mock_tls_certs_secret() { + local mock_pod="e2e-mock-tls-inference" + local mock_ctr="e2e-mock-tls-inference" + local f tmpdir from_args + + if _mock_tls_certs_secret_is_complete; then + echo "✓ Secret e2e-mock-tls-certs already complete; skipping sync" + return 0 + fi + if ! oc get pod "$mock_pod" -n "$NAMESPACE" &>/dev/null; then + echo "ERROR: $mock_pod not found (deploy-e2e-mock-tls-inference first)" >&2 + return 1 + fi + if ! oc exec -n "$NAMESPACE" "$mock_pod" -c "$mock_ctr" --request-timeout=30s -- \ + test -s /certs/ca.crt 2>/dev/null; then + echo "ERROR: mock TLS /certs not ready yet" >&2 + return 1 + fi + + tmpdir=$(mktemp -d) + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + if ! oc exec -n "$NAMESPACE" "$mock_pod" -c "$mock_ctr" --request-timeout=30s -- \ + cat "/certs/${f}" >"${tmpdir}/${f}" 2>/dev/null \ + || [[ ! -s "${tmpdir}/${f}" ]]; then + rm -rf "$tmpdir" + echo "ERROR: failed to read /certs/${f} from mock pod" >&2 + return 1 + fi + done + from_args=() + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + from_args+=(--from-file="${f}=${tmpdir}/${f}") + done + oc create secret generic e2e-mock-tls-certs \ + "${from_args[@]}" -n "$NAMESPACE" \ + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - + rm -rf "$tmpdir" + echo "✓ Secret e2e-mock-tls-certs updated" +} + +cmd_deploy_e2e_mock_tls_inference() { + local repo_root server_py + repo_root="$(_e2e_repo_root)" + server_py="$repo_root/tests/e2e/mock_tls_inference_server/server.py" + [[ -f "$server_py" ]] || { + echo "ERROR: missing $server_py" >&2 + return 1 + } + echo "Deploying e2e-mock-tls-inference in namespace $NAMESPACE..." + oc create configmap e2e-mock-tls-inference-script -n "$NAMESPACE" \ + --from-file=server.py="$server_py" \ + --dry-run=client -o yaml | oc apply -f - + oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml" + if ! oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then + echo "ERROR: e2e-mock-tls-inference not ready" >&2 + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 80 + return 1 + fi + cmd_sync_mock_tls_certs_secret || return 1 + echo "✓ e2e-mock-tls-inference ready" +} + cmd_disrupt_llama_stack() { local pod_name="llama-stack-service" @@ -897,6 +1025,12 @@ case "$COMMAND" in deploy-e2e-interception-proxy) cmd_deploy_e2e_interception_proxy ;; + deploy-e2e-mock-tls-inference) + cmd_deploy_e2e_mock_tls_inference + ;; + reload-llama-stack-config) + cmd_reload_llama_stack_config + ;; *) echo "Usage: $0 [args...]" echo "" @@ -915,6 +1049,8 @@ case "$COMMAND" in echo " sync-interception-proxy-ca-secret - Publish trustme CA to Secret for llama mount" echo " deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy pod" echo " deploy-e2e-interception-proxy - Deploy in-cluster interception proxy pod" + echo " deploy-e2e-mock-tls-inference - Deploy in-cluster mock TLS inference (tls.feature)" + echo " reload-llama-stack-config - Reload run.yaml without deleting llama pod" exit 1 ;; esac diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index abcabe577..025c536e4 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -26,10 +26,7 @@ reset_llama_stack_disrupt_once_tracking, reset_llama_stack_was_running, ) -from tests.e2e.features.steps.tls import ( - prepare_tls_feature_entry_on_prow, - reset_tls_prow_state, -) +from tests.e2e.features.steps.tls import reset_tls_prow_state from tests.e2e.utils.llama_stack_utils import register_shield from tests.e2e.utils.prow_utils import ( restart_pod, @@ -457,7 +454,6 @@ def before_feature(context: Context, feature: Feature) -> None: reset_llama_stack_disrupt_once_tracking() if feature.filename and "tls.feature" in feature.filename: reset_tls_prow_state() - prepare_tls_feature_entry_on_prow() try: max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS)) diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index 67f5fa360..23a65b581 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -19,7 +19,6 @@ backup_llama_config, clear_llama_config_backup, load_llama_config, - reset_llama_run_config_to_pipeline_default, write_llama_config, ) from tests.e2e.utils.prow_utils import get_namespace, restart_pod, run_e2e_ops @@ -36,43 +35,17 @@ } _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False} +_tls_llama_pod_warmed: dict[str, bool] = {"done": False} def reset_tls_prow_state() -> None: """Reset per-feature Prow state (call from ``before_feature``).""" _mock_tls_cluster_deploy_state["done"] = False + _tls_llama_pod_warmed["done"] = False os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) clear_llama_config_backup() -def prepare_tls_feature_entry_on_prow() -> None: - """Baseline cluster state when tls.feature runs after other features in test_list. - - Earlier features (disrupted, MCP) delete or reconfigure Llama without mock TLS - certs. Isolated tls.feature runs skip that churn, which is why the same Gherkin - passes alone but flakes mid-feature in the full suite. - """ - if not is_prow_environment(): - return - print("[tls.feature] Prow/Konflux entry: reset run.yaml and warm Llama + mock TLS...") - reset_llama_run_config_to_pipeline_default() - result = run_e2e_ops("deploy-e2e-mock-tls-inference", timeout=300) - print(result.stdout, end="") - if result.returncode != 0: - raise RuntimeError( - "tls.feature entry: deploy-e2e-mock-tls-inference failed: " - f"{result.stderr or result.stdout}" - ) - _mock_tls_cluster_deploy_state["done"] = True - _prepare_tls_prow_llama_restart_env() - os.environ.setdefault( - "E2E_MOCK_TLS_INFERENCE_HOST", - _cluster_mock_tls_inference_host(), - ) - restart_pod("llama-stack") - print("[tls.feature] Prow/Konflux entry baseline complete", flush=True) - - def is_tls_configuration_feature(context: Context) -> bool: """Return True when the active Behave feature is ``tls.feature``.""" feature = getattr(context, "feature", None) @@ -88,7 +61,12 @@ def _prepare_tls_prow_llama_restart_env() -> None: def restart_llama_for_tls_feature(context: Context) -> None: - """Restart Llama for TLS tests (full pod recreate on Prow/Konflux).""" + """Restart Llama for TLS tests. + + On Prow/Konflux the first restart per feature recreates the pod (mock TLS cert + Secret volume). Later restarts reload run.yaml in-place (``kill 1``) to avoid + re-running the heavy setup-from-source init on every scenario. + """ from tests.e2e.utils.utils import restart_container if not is_prow_environment(): @@ -97,11 +75,28 @@ def restart_llama_for_tls_feature(context: Context) -> None: _prepare_tls_prow_llama_restart_env() scenario = getattr(getattr(context, "scenario", None), "name", "") or "?" + + if not _tls_llama_pod_warmed["done"]: + print( + f"[tls.feature] Llama Stack restart: pod recreate (once per feature) " + f"scenario={scenario!r}", + flush=True, + ) + restart_pod("llama-stack") + _tls_llama_pod_warmed["done"] = True + return + print( - f"[tls.feature] Llama Stack restart: full recreate scenario={scenario!r}", + f"[tls.feature] Llama Stack restart: reload run.yaml scenario={scenario!r}", flush=True, ) - restart_container("llama-stack") + result = run_e2e_ops("reload-llama-stack-config", timeout=240) + print(result.stdout, end="") + if result.returncode != 0: + raise RuntimeError( + "tls.feature: reload-llama-stack-config failed: " + f"{result.stderr or result.stdout}" + ) def _cluster_mock_tls_inference_host() -> str: diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index 263b415a3..348c918ec 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -96,7 +96,7 @@ def restart_pod(container_name: str) -> None: # Subprocess cap must exceed e2e-ops internal waits (pod + in-pod health + port-forward). # Konflux TLS full recreate: ~6–12 min typical, 15+ min under load (user-reported 400s+). if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1": - timeout = 1200 + timeout = 900 elif os.environ.get("E2E_KONFLUX_E2E") == "1": timeout = 720 else: From 8eacfd43ec9d1314a47759d4fd217577b4b723d3 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Wed, 27 May 2026 21:17:04 +0200 Subject: [PATCH 18/20] print logs --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 20 ++++++++++++++++---- tests/e2e/features/steps/tls.py | 6 ++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index f315bfe12..04853c167 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -379,22 +379,34 @@ cmd_reload_llama_stack_config() { fi echo "Reloading Llama Stack run.yaml in $pod (container restart, not pod delete)..." - oc cp "$tmp" "$NAMESPACE/$pod:/opt/app-root/run.yaml" -c "$ctr" || { + if ! oc cp "$tmp" "$NAMESPACE/$pod:/opt/app-root/run.yaml" -c "$ctr"; then rm -f "$tmp" + echo "ERROR: oc cp run.yaml into $pod failed" >&2 + e2e_ops_dump_pod_logs "$pod" "$ctr" 150 return 1 - } + fi rm -f "$tmp" - oc exec -n "$NAMESPACE" "$pod" -c "$ctr" -- kill -1 + # kill -1 is parsed as "signal -1, no PID" — use kill -HUP 1 (PID 1 = main process). + if ! oc exec -n "$NAMESPACE" "$pod" -c "$ctr" -- kill -HUP 1; then + echo "ERROR: kill -HUP 1 failed in $pod" >&2 + e2e_ops_dump_pod_logs "$pod" "$ctr" 150 + return 1 + fi if ! wait_for_pod "$pod" 40; then echo "===== Llama-stack reload FAILED (pod not Ready) =====" + e2e_ops_dump_pod_logs "$pod" "$ctr" 200 return 1 fi if ! wait_for_llama_stack_http_health 40; then echo "===== Llama-stack reload FAILED (HTTP not healthy) =====" + e2e_ops_dump_pod_logs "$pod" "$ctr" 200 + return 1 + fi + if ! cmd_restart_llama_port_forward; then + e2e_ops_dump_pod_logs "$pod" "$ctr" 120 return 1 fi - cmd_restart_llama_port_forward || return 1 echo "===== Llama-stack reload complete =====" } diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index 23a65b581..ae363d245 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -92,10 +92,12 @@ def restart_llama_for_tls_feature(context: Context) -> None: ) result = run_e2e_ops("reload-llama-stack-config", timeout=240) print(result.stdout, end="") + if result.stderr: + print(result.stderr, end="") if result.returncode != 0: + detail = f"{result.stdout or ''}\n{result.stderr or ''}".strip() raise RuntimeError( - "tls.feature: reload-llama-stack-config failed: " - f"{result.stderr or result.stdout}" + f"tls.feature: reload-llama-stack-config failed:\n{detail or result.returncode}" ) From e9e54bac38c1fa324cc38c85f339cb004385dad8 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Thu, 28 May 2026 08:04:25 +0200 Subject: [PATCH 19/20] clear the logic --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 444 ++++++++++++------------ tests/e2e/features/environment.py | 6 +- tests/e2e/features/steps/tls.py | 61 ++-- tests/e2e/utils/prow_utils.py | 2 +- 4 files changed, 260 insertions(+), 253 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 04853c167..e3c0b918b 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -25,8 +25,8 @@ # disrupt-llama-stack - Delete llama-stack pod to disrupt connection # deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy (proxy.feature step) # deploy-e2e-interception-proxy - Deploy in-cluster interception proxy (proxy.feature step) -# deploy-e2e-mock-tls-inference - Deploy in-cluster mock TLS server (tls.feature) -# reload-llama-stack-config - Apply ConfigMap run.yaml in pod and restart main process +# deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature step) +# sync-mock-tls-certs-secret - Publish /certs PEMs to Secret for llama-stack mount set -e @@ -42,101 +42,33 @@ E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks- # Helper functions # ============================================================================ -# On failure, print everything useful to stdout (captured by Behave / CI). -# Tolerates missing pods (uses events) and partial API errors. -e2e_ops_dump_pod_logs() { - local pod_name="${1:?pod name required}" - local preferred_container="${2:-}" - local log_tail="${3:-200}" - local prefix="[e2e-ops] " - local init_ctr ctr restart_count phase - - echo "${prefix}========== failure logs: pod/$pod_name (namespace $NAMESPACE) ==========" - - echo "${prefix}--- events for pod/$pod_name ---" - oc get events -n "$NAMESPACE" --field-selector "involvedObject.name=${pod_name}" \ - --sort-by='.lastTimestamp' 2>&1 | tail -50 | sed "s/^/${prefix}/" \ - || echo "${prefix}(could not list events)" - - if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then - phase="Missing" - echo "${prefix}pod/$pod_name not found in namespace (deleted, failed, or API error)" - echo "${prefix}--- pods in $NAMESPACE ---" - oc get pods -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true - echo "${prefix}========== end failure logs: pod/$pod_name ==========" - return 0 - fi - - phase=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "?") - echo "${prefix}pod phase=$phase" - oc get pod "$pod_name" -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true - - echo "${prefix}--- oc describe pod/$pod_name ---" - oc describe pod "$pod_name" -n "$NAMESPACE" 2>&1 | sed "s/^/${prefix}/" || true - - for init_ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \ - -o jsonpath='{.spec.initContainers[*].name}' 2>/dev/null); do - [[ -n "$init_ctr" ]] || continue - echo "${prefix}--- oc logs pod/$pod_name -c $init_ctr (init, tail $log_tail) ---" - oc logs "$pod_name" -n "$NAMESPACE" -c "$init_ctr" --tail="$log_tail" 2>&1 \ - | sed "s/^/${prefix}/" \ - || echo "${prefix}(no init logs for $init_ctr yet)" - done - - for ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \ - -o jsonpath='{.spec.containers[*].name}' 2>/dev/null); do - [[ -n "$ctr" ]] || continue - echo "${prefix}--- oc logs pod/$pod_name -c $ctr (tail $log_tail) ---" - oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --tail="$log_tail" 2>&1 \ - | sed "s/^/${prefix}/" \ - || echo "${prefix}(no logs for container $ctr)" - restart_count=$(oc get pod "$pod_name" -n "$NAMESPACE" \ - -o jsonpath="{.status.containerStatuses[?(@.name==\"${ctr}\")].restartCount}" \ - 2>/dev/null) || restart_count="0" - if [[ "${restart_count:-0}" -gt 0 ]]; then - echo "${prefix}--- oc logs pod/$pod_name -c $ctr --previous (tail $log_tail) ---" - oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --previous --tail="$log_tail" 2>&1 \ - | sed "s/^/${prefix}/" \ - || echo "${prefix}(no --previous logs for $ctr)" - fi - done - - if [[ -n "$preferred_container" ]]; then - echo "${prefix}--- oc logs pod/$pod_name -c $preferred_container (preferred, tail $log_tail) ---" - oc logs "$pod_name" -n "$NAMESPACE" -c "$preferred_container" --tail="$log_tail" 2>&1 \ - | sed "s/^/${prefix}/" || true - fi - - echo "${prefix}========== end failure logs: pod/$pod_name ==========" -} - wait_for_pod() { local pod_name="$1" local max_attempts="${2:-24}" - local attempt phase ready + local attempt + local ready + local phase for ((attempt=1; attempt<=max_attempts; attempt++)); do if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then phase="Missing" - ready="false" else phase=$(oc get pod "$pod_name" -n "$NAMESPACE" \ - -o jsonpath='{.status.phase}' 2>/dev/null || echo "?") + -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown") ready=$(oc get pod "$pod_name" -n "$NAMESPACE" \ -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false") - fi - if [[ "$ready" == "true" ]]; then - echo "✓ Pod $pod_name ready (attempt $attempt/$max_attempts)" - return 0 + if [[ "$ready" == "true" ]]; then + echo "✓ Pod $pod_name ready (attempt $attempt/$max_attempts)" + return 0 + fi fi if [[ $((attempt % 10)) -eq 0 ]]; then - echo "[e2e-ops] $pod_name not ready (attempt $attempt/$max_attempts, phase=$phase)" + echo "[e2e-ops] $pod_name not ready yet (attempt $attempt/$max_attempts, phase=${phase:-?})..." fi sleep 3 done echo "Pod $pod_name not ready after $((max_attempts * 3))s (last phase: ${phase:-unknown})" - e2e_ops_dump_pod_logs "$pod_name" "" 250 return 1 } @@ -354,62 +286,12 @@ wait_for_llama_stack_http_health() { fi done echo "ERROR: Llama Stack did not respond on http://127.0.0.1:8321/v1/health inside the pod" - e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 250 + oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true + oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true + oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=120 2>&1 || true return 1 } -# Copy llama-stack-config run.yaml into the running pod and restart PID 1 (no init rerun). -cmd_reload_llama_stack_config() { - local pod="llama-stack-service" - local ctr="llama-stack-container" - local tmp - - if ! oc get pod "$pod" -n "$NAMESPACE" &>/dev/null; then - echo "ERROR: $pod not found; use restart-llama-stack first" >&2 - return 1 - fi - - tmp=$(mktemp) - if ! oc get configmap llama-stack-config -n "$NAMESPACE" \ - -o "go-template={{index .data \"run.yaml\"}}" >"$tmp" 2>/dev/null \ - || [[ ! -s "$tmp" ]]; then - rm -f "$tmp" - echo "ERROR: could not read run.yaml from llama-stack-config" >&2 - return 1 - fi - - echo "Reloading Llama Stack run.yaml in $pod (container restart, not pod delete)..." - if ! oc cp "$tmp" "$NAMESPACE/$pod:/opt/app-root/run.yaml" -c "$ctr"; then - rm -f "$tmp" - echo "ERROR: oc cp run.yaml into $pod failed" >&2 - e2e_ops_dump_pod_logs "$pod" "$ctr" 150 - return 1 - fi - rm -f "$tmp" - # kill -1 is parsed as "signal -1, no PID" — use kill -HUP 1 (PID 1 = main process). - if ! oc exec -n "$NAMESPACE" "$pod" -c "$ctr" -- kill -HUP 1; then - echo "ERROR: kill -HUP 1 failed in $pod" >&2 - e2e_ops_dump_pod_logs "$pod" "$ctr" 150 - return 1 - fi - - if ! wait_for_pod "$pod" 40; then - echo "===== Llama-stack reload FAILED (pod not Ready) =====" - e2e_ops_dump_pod_logs "$pod" "$ctr" 200 - return 1 - fi - if ! wait_for_llama_stack_http_health 40; then - echo "===== Llama-stack reload FAILED (HTTP not healthy) =====" - e2e_ops_dump_pod_logs "$pod" "$ctr" 200 - return 1 - fi - if ! cmd_restart_llama_port_forward; then - e2e_ops_dump_pod_logs "$pod" "$ctr" 120 - return 1 - fi - echo "===== Llama-stack reload complete =====" -} - # ============================================================================ # Command implementations # ============================================================================ @@ -442,13 +324,32 @@ cmd_restart_lightspeed() { # Don't let a timeout here abort the function — still attempt port-forward # and diagnostics so later scenarios have a chance to recover. local pod_ready=true - if ! wait_for_pod "lightspeed-stack-service" 40; then + local lcs_pod_wait=40 + if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then + # readinessProbe: 20s + 30*5s; LCS + Llama handshake can exceed 195s on Konflux (TLS suite). + lcs_pod_wait=100 + fi + echo "[e2e-ops] Waiting for lightspeed-stack-service Ready (max ${lcs_pod_wait} attempts, $((lcs_pod_wait * 3))s)..." + if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then pod_ready=false - echo "⚠️ Pod not ready within 120s" + echo "⚠️ Pod not ready within $((lcs_pod_wait * 3))s — dumping diagnostics:" + if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then + oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true + oc logs lightspeed-stack-service -n "$NAMESPACE" -c lightspeed-stack-container --tail=60 2>&1 || true + else + echo "[e2e-ops] lightspeed-stack-service pod not found in namespace $NAMESPACE" + oc get pods -n "$NAMESPACE" 2>&1 | grep -E 'lightspeed|NAME' || true + oc get events -n "$NAMESPACE" --field-selector involvedObject.name=lightspeed-stack-service \ + --sort-by='.lastTimestamp' 2>&1 | tail -15 || true + fi fi - # Re-label pod for service discovery - oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite + # Re-label pod for service discovery (ignore if pod was deleted / not created yet) + if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then + oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite + else + echo "⚠️ Cannot label lightspeed-stack-service — pod missing" + fi # Re-establish port-forwards (may succeed even if readiness was slow) cmd_restart_port_forward @@ -456,7 +357,6 @@ cmd_restart_lightspeed() { if [[ "$pod_ready" == "false" ]]; then echo "⚠️ Lightspeed restart completed but pod was slow to become ready" - e2e_ops_dump_pod_logs "lightspeed-stack-service" "lightspeed-stack-container" 200 return 1 fi echo "✓ Lightspeed restart complete" @@ -464,6 +364,7 @@ cmd_restart_lightspeed() { cmd_restart_llama_stack() { echo "===== Restoring llama-stack service =====" + echo "[e2e-ops] restart-llama-stack env: E2E_KONFLUX_E2E=${E2E_KONFLUX_E2E:-0} E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA=${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env. echo "Deleting llama-stack pod (if any) before apply..." timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { @@ -482,8 +383,9 @@ cmd_restart_llama_stack() { fi fi if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..." if ! cmd_sync_mock_tls_certs_secret; then - echo "===== Llama-stack restore FAILED (mock TLS cert secret sync) =====" + echo "===== Llama-stack restore FAILED (mock TLS certs secret sync) =====" exit 1 fi fi @@ -493,19 +395,39 @@ cmd_restart_llama_stack() { -n "$NAMESPACE" \ --dry-run=client -o yaml | oc apply -f - oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml" - wait_for_pod "llama-stack-service" 90 + local llama_pod_wait=90 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + # readinessProbe: 20s + 36*5s = 200s; clone/enrich/RAG on Konflux often needs 400s+ total. + llama_pod_wait=180 + fi + echo "[e2e-ops] Waiting for llama-stack-service Ready (max ${llama_pod_wait} attempts, $((llama_pod_wait * 3))s)..." + if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then + echo "===== Llama-stack restore FAILED (pod not Ready within $((llama_pod_wait * 3))s) =====" + oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true + oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -50 || true + oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true + exit 1 + fi echo "Labeling pod for service..." oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then if ! _verify_interception_ca_mounted_in_llama; then echo "===== Llama-stack restore FAILED (interception CA not mounted) =====" - e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200 exit 1 fi fi - if ! wait_for_llama_stack_http_health 50; then + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + if ! _verify_mock_tls_certs_mounted_in_llama; then + echo "===== Llama-stack restore FAILED (mock TLS certs not mounted) =====" + exit 1 + fi + fi + local llama_health_attempts=50 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + llama_health_attempts=100 + fi + if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then echo "===== Llama-stack restore FAILED (HTTP not healthy) =====" - e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 250 exit 1 fi else @@ -520,7 +442,6 @@ cmd_restart_llama_stack() { if ! cmd_restart_llama_port_forward; then echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed" - e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200 exit 1 fi @@ -614,20 +535,23 @@ cmd_restart_llama_port_forward() { local local_port="${LOCAL_LLAMA_PORT:-8321}" local remote_port="${REMOTE_LLAMA_PORT:-8321}" local max_attempts=6 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + max_attempts=10 + fi local pf_pid local pf_resource local llama_pf_log="/tmp/port-forward-llama.log" - echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port..." + echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port (max $max_attempts attempts)..." for ((attempt=1; attempt<=max_attempts; attempt++)); do kill_stale_llama_forward "$local_port" sleep 3 - if [[ $attempt -le 2 ]]; then - pf_resource="svc/llama-stack-service-svc" - else + if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]] || [[ $attempt -ge 3 ]]; then pf_resource="pod/llama-stack-service" + else + pf_resource="svc/llama-stack-service-svc" fi echo "Llama port-forward attempt $attempt/$max_attempts -> $pf_resource" @@ -665,10 +589,8 @@ cmd_restart_llama_port_forward() { echo "Failed to establish Llama Stack port-forward on :$local_port" if [[ -s "$llama_pf_log" ]]; then - echo "[e2e-ops] $llama_pf_log (tail 30):" tail -30 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true fi - e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200 return 1 } @@ -851,6 +773,142 @@ cmd_copy_interception_proxy_ca_to_llama() { cmd_sync_interception_proxy_ca_secret } +_MOCK_TLS_CERT_FILES=( + ca.crt + client.crt + client.key + untrusted-ca.crt + expired-ca.crt + untrusted-client.crt + untrusted-client.key + expired-client.crt +) + +_mock_tls_secret_is_complete() { + local f b64 + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + b64=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \ + -o "go-template={{index .data \"${f}\"}}" 2>/dev/null) || return 1 + [[ -n "$b64" ]] || return 1 + done + return 0 +} + +_get_mock_tls_inference_pod_name() { + oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true +} + +_wait_for_mock_tls_inference_pod() { + if ! oc wait pod -l app=e2e-mock-tls-inference -n "$NAMESPACE" \ + --for=condition=Ready --timeout=120s 2>/dev/null; then + echo "ERROR: e2e-mock-tls-inference pod not Ready" >&2 + oc get pods -n "$NAMESPACE" -l app=e2e-mock-tls-inference -o wide 2>&1 || true + return 1 + fi + return 0 +} + +_copy_mock_tls_cert_from_pod() { + local mock_pod_name="$1" + local cert_file="$2" + local dest="$3" + local attempt + + for ((attempt=1; attempt<=4; attempt++)); do + if oc exec --request-timeout=90 -n "$NAMESPACE" "$mock_pod_name" \ + -c e2e-mock-tls-inference -- cat "/certs/$cert_file" >"$dest" 2>/dev/null \ + && [[ -s "$dest" ]]; then + return 0 + fi + echo "[e2e-ops] WARN: read /certs/$cert_file from mock pod failed (attempt $attempt/4)" + sleep 5 + done + return 1 +} + +_recycle_mock_tls_inference_pod() { + echo "[e2e-ops] Recycling e2e-mock-tls-inference pod (stale or unresponsive)..." + oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true + sleep 3 + if ! _wait_for_mock_tls_inference_pod; then + return 1 + fi + # Certs are written at container start; allow trustme + pip to finish. + sleep 10 + return 0 +} + +cmd_sync_mock_tls_certs_secret() { + local mock_pod_name tmpdir f recycle_attempt + + if _mock_tls_secret_is_complete; then + echo "✓ Secret e2e-mock-tls-certs already complete (${#_MOCK_TLS_CERT_FILES[@]} keys); skipping sync" + return 0 + fi + + for recycle_attempt in 1 2; do + mock_pod_name=$(_get_mock_tls_inference_pod_name) + if [[ -z "$mock_pod_name" ]]; then + echo "ERROR: no e2e-mock-tls-inference pod in namespace $NAMESPACE" >&2 + echo " Run: e2e-ops.sh deploy-e2e-mock-tls-inference" >&2 + return 1 + fi + + if ! _wait_for_mock_tls_inference_pod; then + [[ $recycle_attempt -lt 2 ]] && _recycle_mock_tls_inference_pod && continue + return 1 + fi + + tmpdir=$(mktemp -d) + local sync_ok=true + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + if ! _copy_mock_tls_cert_from_pod "$mock_pod_name" "$f" "$tmpdir/$f"; then + echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2 + oc logs "$mock_pod_name" -n "$NAMESPACE" -c e2e-mock-tls-inference --tail=40 2>&1 \ + | sed 's/^/[e2e-ops] /' || true + sync_ok=false + break + fi + done + + if [[ "$sync_ok" == "true" ]]; then + if ! oc create secret generic e2e-mock-tls-certs \ + --from-file="$tmpdir" \ + -n "$NAMESPACE" \ + --dry-run=client -o yaml | oc apply -f -; then + echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2 + rm -rf "$tmpdir" + return 1 + fi + rm -rf "$tmpdir" + echo "✓ Secret e2e-mock-tls-certs updated (${#_MOCK_TLS_CERT_FILES[@]} files)" + return 0 + fi + + rm -rf "$tmpdir" + if [[ $recycle_attempt -lt 2 ]]; then + _recycle_mock_tls_inference_pod || return 1 + fi + done + + return 1 +} + +_verify_mock_tls_certs_mounted_in_llama() { + local llama_pod_name="llama-stack-service" + if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ + sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key'; then + echo "✓ mock TLS certs present under /certs in llama-stack" + return 0 + fi + echo "ERROR: /certs missing or incomplete in llama-stack pod" >&2 + oc get secret e2e-mock-tls-certs -n "$NAMESPACE" 2>&1 || true + oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ + ls -la /certs 2>&1 || true + return 1 +} + _e2e_repo_root() { cd "$SCRIPT_DIR/../../../.." && pwd } @@ -887,84 +945,26 @@ cmd_deploy_e2e_interception_proxy() { echo "✓ e2e-interception-proxy ready at http://e2e-interception-proxy.${NAMESPACE}.svc.cluster.local:8889" } -_MOCK_TLS_CERT_FILES=( - ca.crt client.crt client.key untrusted-ca.crt expired-ca.crt - untrusted-client.crt untrusted-client.key expired-client.crt -) - -_mock_tls_certs_secret_is_complete() { - local f present - if ! oc get secret e2e-mock-tls-certs -n "$NAMESPACE" &>/dev/null; then - return 1 - fi - present=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \ - -o go-template='{{range $k, $v := .data}}{{$k}} {{end}}' 2>/dev/null) || return 1 - for f in "${_MOCK_TLS_CERT_FILES[@]}"; do - [[ " $present " == *" $f "* ]] || return 1 - done - return 0 -} - -cmd_sync_mock_tls_certs_secret() { - local mock_pod="e2e-mock-tls-inference" - local mock_ctr="e2e-mock-tls-inference" - local f tmpdir from_args - - if _mock_tls_certs_secret_is_complete; then - echo "✓ Secret e2e-mock-tls-certs already complete; skipping sync" - return 0 - fi - if ! oc get pod "$mock_pod" -n "$NAMESPACE" &>/dev/null; then - echo "ERROR: $mock_pod not found (deploy-e2e-mock-tls-inference first)" >&2 - return 1 - fi - if ! oc exec -n "$NAMESPACE" "$mock_pod" -c "$mock_ctr" --request-timeout=30s -- \ - test -s /certs/ca.crt 2>/dev/null; then - echo "ERROR: mock TLS /certs not ready yet" >&2 - return 1 - fi - - tmpdir=$(mktemp -d) - for f in "${_MOCK_TLS_CERT_FILES[@]}"; do - if ! oc exec -n "$NAMESPACE" "$mock_pod" -c "$mock_ctr" --request-timeout=30s -- \ - cat "/certs/${f}" >"${tmpdir}/${f}" 2>/dev/null \ - || [[ ! -s "${tmpdir}/${f}" ]]; then - rm -rf "$tmpdir" - echo "ERROR: failed to read /certs/${f} from mock pod" >&2 - return 1 - fi - done - from_args=() - for f in "${_MOCK_TLS_CERT_FILES[@]}"; do - from_args+=(--from-file="${f}=${tmpdir}/${f}") - done - oc create secret generic e2e-mock-tls-certs \ - "${from_args[@]}" -n "$NAMESPACE" \ - --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - - rm -rf "$tmpdir" - echo "✓ Secret e2e-mock-tls-certs updated" -} - cmd_deploy_e2e_mock_tls_inference() { - local repo_root server_py + local repo_root repo_root="$(_e2e_repo_root)" - server_py="$repo_root/tests/e2e/mock_tls_inference_server/server.py" - [[ -f "$server_py" ]] || { - echo "ERROR: missing $server_py" >&2 - return 1 - } echo "Deploying e2e-mock-tls-inference in namespace $NAMESPACE..." oc create configmap e2e-mock-tls-inference-script -n "$NAMESPACE" \ - --from-file=server.py="$server_py" \ + --from-file=server.py="$repo_root/tests/e2e/mock_tls_inference_server/server.py" \ --dry-run=client -o yaml | oc apply -f - + oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml" if ! oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then - echo "ERROR: e2e-mock-tls-inference not ready" >&2 - e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 80 + echo "ERROR: e2e-mock-tls-inference failed to become ready" >&2 + oc describe pod/e2e-mock-tls-inference -n "$NAMESPACE" 2>/dev/null | tail -30 || true + oc logs e2e-mock-tls-inference -n "$NAMESPACE" --tail=40 2>&1 || true + return 1 + fi + echo "✓ e2e-mock-tls-inference ready at https://e2e-mock-tls-inference.${NAMESPACE}.svc.cluster.local:8443" + if ! cmd_sync_mock_tls_certs_secret; then + echo "WARNING: mock TLS server is up but e2e-mock-tls-certs secret sync failed" >&2 return 1 fi - cmd_sync_mock_tls_certs_secret || return 1 - echo "✓ e2e-mock-tls-inference ready" } cmd_disrupt_llama_stack() { @@ -1040,8 +1040,8 @@ case "$COMMAND" in deploy-e2e-mock-tls-inference) cmd_deploy_e2e_mock_tls_inference ;; - reload-llama-stack-config) - cmd_reload_llama_stack_config + sync-mock-tls-certs-secret) + cmd_sync_mock_tls_certs_secret ;; *) echo "Usage: $0 [args...]" @@ -1061,8 +1061,8 @@ case "$COMMAND" in echo " sync-interception-proxy-ca-secret - Publish trustme CA to Secret for llama mount" echo " deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy pod" echo " deploy-e2e-interception-proxy - Deploy in-cluster interception proxy pod" - echo " deploy-e2e-mock-tls-inference - Deploy in-cluster mock TLS inference (tls.feature)" - echo " reload-llama-stack-config - Reload run.yaml without deleting llama pod" + echo " deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature)" + echo " sync-mock-tls-certs-secret - Publish mock TLS /certs to Secret for llama mount" exit 1 ;; esac diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index 025c536e4..abcabe577 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -26,7 +26,10 @@ reset_llama_stack_disrupt_once_tracking, reset_llama_stack_was_running, ) -from tests.e2e.features.steps.tls import reset_tls_prow_state +from tests.e2e.features.steps.tls import ( + prepare_tls_feature_entry_on_prow, + reset_tls_prow_state, +) from tests.e2e.utils.llama_stack_utils import register_shield from tests.e2e.utils.prow_utils import ( restart_pod, @@ -454,6 +457,7 @@ def before_feature(context: Context, feature: Feature) -> None: reset_llama_stack_disrupt_once_tracking() if feature.filename and "tls.feature" in feature.filename: reset_tls_prow_state() + prepare_tls_feature_entry_on_prow() try: max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS)) diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index ae363d245..67f5fa360 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -19,6 +19,7 @@ backup_llama_config, clear_llama_config_backup, load_llama_config, + reset_llama_run_config_to_pipeline_default, write_llama_config, ) from tests.e2e.utils.prow_utils import get_namespace, restart_pod, run_e2e_ops @@ -35,17 +36,43 @@ } _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False} -_tls_llama_pod_warmed: dict[str, bool] = {"done": False} def reset_tls_prow_state() -> None: """Reset per-feature Prow state (call from ``before_feature``).""" _mock_tls_cluster_deploy_state["done"] = False - _tls_llama_pod_warmed["done"] = False os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) clear_llama_config_backup() +def prepare_tls_feature_entry_on_prow() -> None: + """Baseline cluster state when tls.feature runs after other features in test_list. + + Earlier features (disrupted, MCP) delete or reconfigure Llama without mock TLS + certs. Isolated tls.feature runs skip that churn, which is why the same Gherkin + passes alone but flakes mid-feature in the full suite. + """ + if not is_prow_environment(): + return + print("[tls.feature] Prow/Konflux entry: reset run.yaml and warm Llama + mock TLS...") + reset_llama_run_config_to_pipeline_default() + result = run_e2e_ops("deploy-e2e-mock-tls-inference", timeout=300) + print(result.stdout, end="") + if result.returncode != 0: + raise RuntimeError( + "tls.feature entry: deploy-e2e-mock-tls-inference failed: " + f"{result.stderr or result.stdout}" + ) + _mock_tls_cluster_deploy_state["done"] = True + _prepare_tls_prow_llama_restart_env() + os.environ.setdefault( + "E2E_MOCK_TLS_INFERENCE_HOST", + _cluster_mock_tls_inference_host(), + ) + restart_pod("llama-stack") + print("[tls.feature] Prow/Konflux entry baseline complete", flush=True) + + def is_tls_configuration_feature(context: Context) -> bool: """Return True when the active Behave feature is ``tls.feature``.""" feature = getattr(context, "feature", None) @@ -61,12 +88,7 @@ def _prepare_tls_prow_llama_restart_env() -> None: def restart_llama_for_tls_feature(context: Context) -> None: - """Restart Llama for TLS tests. - - On Prow/Konflux the first restart per feature recreates the pod (mock TLS cert - Secret volume). Later restarts reload run.yaml in-place (``kill 1``) to avoid - re-running the heavy setup-from-source init on every scenario. - """ + """Restart Llama for TLS tests (full pod recreate on Prow/Konflux).""" from tests.e2e.utils.utils import restart_container if not is_prow_environment(): @@ -75,30 +97,11 @@ def restart_llama_for_tls_feature(context: Context) -> None: _prepare_tls_prow_llama_restart_env() scenario = getattr(getattr(context, "scenario", None), "name", "") or "?" - - if not _tls_llama_pod_warmed["done"]: - print( - f"[tls.feature] Llama Stack restart: pod recreate (once per feature) " - f"scenario={scenario!r}", - flush=True, - ) - restart_pod("llama-stack") - _tls_llama_pod_warmed["done"] = True - return - print( - f"[tls.feature] Llama Stack restart: reload run.yaml scenario={scenario!r}", + f"[tls.feature] Llama Stack restart: full recreate scenario={scenario!r}", flush=True, ) - result = run_e2e_ops("reload-llama-stack-config", timeout=240) - print(result.stdout, end="") - if result.stderr: - print(result.stderr, end="") - if result.returncode != 0: - detail = f"{result.stdout or ''}\n{result.stderr or ''}".strip() - raise RuntimeError( - f"tls.feature: reload-llama-stack-config failed:\n{detail or result.returncode}" - ) + restart_container("llama-stack") def _cluster_mock_tls_inference_host() -> str: diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index 348c918ec..263b415a3 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -96,7 +96,7 @@ def restart_pod(container_name: str) -> None: # Subprocess cap must exceed e2e-ops internal waits (pod + in-pod health + port-forward). # Konflux TLS full recreate: ~6–12 min typical, 15+ min under load (user-reported 400s+). if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1": - timeout = 900 + timeout = 1200 elif os.environ.get("E2E_KONFLUX_E2E") == "1": timeout = 720 else: From 67b7e7f39344b77f8c47516abb4e96c6885ba959 Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Thu, 28 May 2026 14:16:18 +0200 Subject: [PATCH 20/20] add logs dump to every failure --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 111 +++++++++++++++++++----- tests/e2e/features/environment.py | 32 +++---- tests/e2e/utils/prow_utils.py | 8 +- 3 files changed, 113 insertions(+), 38 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index e3c0b918b..b80b41914 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -27,6 +27,7 @@ # deploy-e2e-interception-proxy - Deploy in-cluster interception proxy (proxy.feature step) # deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature step) # sync-mock-tls-certs-secret - Publish /certs PEMs to Secret for llama-stack mount +# dump-pod-logs [container] - Print events, describe, init + container logs (on failure) set -e @@ -42,6 +43,66 @@ E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks- # Helper functions # ============================================================================ +# Print diagnostics to stdout (captured by Behave as CAPTURED STDOUT). +e2e_ops_dump_pod_logs() { + local pod_name="${1:?pod name required}" + local preferred_container="${2:-}" + local log_tail="${3:-200}" + local prefix="[e2e-ops] " + local init_ctr ctr restart_count phase + + echo "${prefix}========== failure logs: pod/$pod_name (namespace $NAMESPACE) ==========" + + echo "${prefix}--- events for pod/$pod_name ---" + oc get events -n "$NAMESPACE" --field-selector "involvedObject.name=${pod_name}" \ + --sort-by='.lastTimestamp' 2>&1 | tail -50 | sed "s/^/${prefix}/" \ + || echo "${prefix}(could not list events)" + + if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then + echo "${prefix}pod/$pod_name not found (deleted or never created)" + oc get pods -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true + echo "${prefix}========== end failure logs: pod/$pod_name ==========" + return 0 + fi + + phase=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "?") + echo "${prefix}pod phase=$phase" + oc get pod "$pod_name" -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true + oc describe pod "$pod_name" -n "$NAMESPACE" 2>&1 | sed "s/^/${prefix}/" || true + + for init_ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath='{.spec.initContainers[*].name}' 2>/dev/null); do + [[ -n "$init_ctr" ]] || continue + echo "${prefix}--- logs pod/$pod_name -c $init_ctr (init, tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$init_ctr" --tail="$log_tail" 2>&1 \ + | sed "s/^/${prefix}/" || echo "${prefix}(no init logs for $init_ctr)" + done + + for ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath='{.spec.containers[*].name}' 2>/dev/null); do + [[ -n "$ctr" ]] || continue + echo "${prefix}--- logs pod/$pod_name -c $ctr (tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --tail="$log_tail" 2>&1 \ + | sed "s/^/${prefix}/" || echo "${prefix}(no logs for $ctr)" + restart_count=$(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath="{.status.containerStatuses[?(@.name==\"${ctr}\")].restartCount}" \ + 2>/dev/null) || restart_count="0" + if [[ "${restart_count:-0}" -gt 0 ]]; then + echo "${prefix}--- logs pod/$pod_name -c $ctr --previous (tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --previous --tail="$log_tail" 2>&1 \ + | sed "s/^/${prefix}/" || true + fi + done + + if [[ -n "$preferred_container" ]]; then + echo "${prefix}--- logs pod/$pod_name -c $preferred_container (preferred, tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$preferred_container" --tail="$log_tail" 2>&1 \ + | sed "s/^/${prefix}/" || true + fi + + echo "${prefix}========== end failure logs: pod/$pod_name ==========" +} + wait_for_pod() { local pod_name="$1" local max_attempts="${2:-24}" @@ -69,6 +130,7 @@ wait_for_pod() { done echo "Pod $pod_name not ready after $((max_attempts * 3))s (last phase: ${phase:-unknown})" + e2e_ops_dump_pod_logs "$pod_name" "" 250 return 1 } @@ -286,9 +348,7 @@ wait_for_llama_stack_http_health() { fi done echo "ERROR: Llama Stack did not respond on http://127.0.0.1:8321/v1/health inside the pod" - oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true - oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true - oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=120 2>&1 || true + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 250 return 1 } @@ -332,16 +392,7 @@ cmd_restart_lightspeed() { echo "[e2e-ops] Waiting for lightspeed-stack-service Ready (max ${lcs_pod_wait} attempts, $((lcs_pod_wait * 3))s)..." if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then pod_ready=false - echo "⚠️ Pod not ready within $((lcs_pod_wait * 3))s — dumping diagnostics:" - if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then - oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true - oc logs lightspeed-stack-service -n "$NAMESPACE" -c lightspeed-stack-container --tail=60 2>&1 || true - else - echo "[e2e-ops] lightspeed-stack-service pod not found in namespace $NAMESPACE" - oc get pods -n "$NAMESPACE" 2>&1 | grep -E 'lightspeed|NAME' || true - oc get events -n "$NAMESPACE" --field-selector involvedObject.name=lightspeed-stack-service \ - --sort-by='.lastTimestamp' 2>&1 | tail -15 || true - fi + echo "⚠️ Pod not ready within $((lcs_pod_wait * 3))s" fi # Re-label pod for service discovery (ignore if pod was deleted / not created yet) @@ -357,6 +408,7 @@ cmd_restart_lightspeed() { if [[ "$pod_ready" == "false" ]]; then echo "⚠️ Lightspeed restart completed but pod was slow to become ready" + e2e_ops_dump_pod_logs "lightspeed-stack-service" "lightspeed-stack-container" 150 return 1 fi echo "✓ Lightspeed restart complete" @@ -386,6 +438,8 @@ cmd_restart_llama_stack() { echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..." if ! cmd_sync_mock_tls_certs_secret; then echo "===== Llama-stack restore FAILED (mock TLS certs secret sync) =====" + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 120 + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 120 exit 1 fi fi @@ -403,9 +457,6 @@ cmd_restart_llama_stack() { echo "[e2e-ops] Waiting for llama-stack-service Ready (max ${llama_pod_wait} attempts, $((llama_pod_wait * 3))s)..." if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then echo "===== Llama-stack restore FAILED (pod not Ready within $((llama_pod_wait * 3))s) =====" - oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true - oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -50 || true - oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true exit 1 fi echo "Labeling pod for service..." @@ -413,12 +464,14 @@ cmd_restart_llama_stack() { if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then if ! _verify_interception_ca_mounted_in_llama; then echo "===== Llama-stack restore FAILED (interception CA not mounted) =====" + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150 exit 1 fi fi if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then if ! _verify_mock_tls_certs_mounted_in_llama; then echo "===== Llama-stack restore FAILED (mock TLS certs not mounted) =====" + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150 exit 1 fi fi @@ -442,6 +495,7 @@ cmd_restart_llama_stack() { if ! cmd_restart_llama_port_forward; then echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed" + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150 exit 1 fi @@ -589,8 +643,10 @@ cmd_restart_llama_port_forward() { echo "Failed to establish Llama Stack port-forward on :$local_port" if [[ -s "$llama_pf_log" ]]; then + echo "[e2e-ops] $llama_pf_log (tail 30):" tail -30 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true fi + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150 return 1 } @@ -856,7 +912,10 @@ cmd_sync_mock_tls_certs_secret() { fi if ! _wait_for_mock_tls_inference_pod; then - [[ $recycle_attempt -lt 2 ]] && _recycle_mock_tls_inference_pod && continue + if [[ $recycle_attempt -lt 2 ]] && _recycle_mock_tls_inference_pod; then + continue + fi + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 120 return 1 fi @@ -865,8 +924,7 @@ cmd_sync_mock_tls_certs_secret() { for f in "${_MOCK_TLS_CERT_FILES[@]}"; do if ! _copy_mock_tls_cert_from_pod "$mock_pod_name" "$f" "$tmpdir/$f"; then echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2 - oc logs "$mock_pod_name" -n "$NAMESPACE" -c e2e-mock-tls-inference --tail=40 2>&1 \ - | sed 's/^/[e2e-ops] /' || true + e2e_ops_dump_pod_logs "$mock_pod_name" "e2e-mock-tls-inference" 120 sync_ok=false break fi @@ -892,6 +950,7 @@ cmd_sync_mock_tls_certs_secret() { fi done + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 120 return 1 } @@ -956,17 +1015,23 @@ cmd_deploy_e2e_mock_tls_inference() { oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml" if ! oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then echo "ERROR: e2e-mock-tls-inference failed to become ready" >&2 - oc describe pod/e2e-mock-tls-inference -n "$NAMESPACE" 2>/dev/null | tail -30 || true - oc logs e2e-mock-tls-inference -n "$NAMESPACE" --tail=40 2>&1 || true + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 150 return 1 fi echo "✓ e2e-mock-tls-inference ready at https://e2e-mock-tls-inference.${NAMESPACE}.svc.cluster.local:8443" if ! cmd_sync_mock_tls_certs_secret; then echo "WARNING: mock TLS server is up but e2e-mock-tls-certs secret sync failed" >&2 + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 150 return 1 fi } +cmd_dump_pod_logs() { + local pod_name="${1:?pod name required}" + local container="${2:-}" + e2e_ops_dump_pod_logs "$pod_name" "$container" 200 +} + cmd_disrupt_llama_stack() { local pod_name="llama-stack-service" @@ -1043,6 +1108,9 @@ case "$COMMAND" in sync-mock-tls-certs-secret) cmd_sync_mock_tls_certs_secret ;; + dump-pod-logs) + cmd_dump_pod_logs "$@" + ;; *) echo "Usage: $0 [args...]" echo "" @@ -1063,6 +1131,7 @@ case "$COMMAND" in echo " deploy-e2e-interception-proxy - Deploy in-cluster interception proxy pod" echo " deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature)" echo " sync-mock-tls-certs-secret - Publish mock TLS /certs to Secret for llama mount" + echo " dump-pod-logs [container] - Events, describe, init + container logs" exit 1 ;; esac diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index abcabe577..7ee711494 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -241,24 +241,26 @@ def before_scenario(context: Context, scenario: Scenario) -> None: delattr(context, _attr) -def _dump_pod_logs_on_failure(scenario: Scenario, namespace: str) -> None: - """Dump llama-stack and lightspeed-stack pod logs when a scenario fails in Prow.""" +def _dump_pod_logs_on_failure( + context: Context, scenario: Scenario, namespace: str +) -> None: + """Dump pod diagnostics when a scenario fails in Prow (init + main container logs).""" if scenario.status != "failed": return - for pod in ("llama-stack-service", "lightspeed-stack-service"): - print(f"--- {pod} logs (scenario failed: {scenario.name}) ---") + pods: tuple[str, ...] = ("llama-stack-service", "lightspeed-stack-service") + feature = getattr(context, "feature", None) + feat_file = getattr(feature, "filename", "") or "" if feature else "" + if "tls.feature" in feat_file: + pods = (*pods, "e2e-mock-tls-inference") + print(f"--- scenario failed: {scenario.name!r} — dumping pod logs ---", flush=True) + for pod in pods: try: - r = subprocess.run( - ["oc", "logs", pod, "-n", namespace, "--tail=100"], - capture_output=True, - text=True, - timeout=15, - check=False, - ) - print(r.stdout or r.stderr or "(no output)") + result = run_e2e_ops("dump-pod-logs", [pod], timeout=90) + print(result.stdout, end="") + if result.stderr: + print(result.stderr, end="") except subprocess.TimeoutExpired: - print("(timed out fetching logs)") - print(f"--- end {pod} logs ---") + print(f"(timed out dumping logs for {pod})") def after_scenario(context: Context, scenario: Scenario) -> None: @@ -292,7 +294,7 @@ def after_scenario(context: Context, scenario: Scenario) -> None: """ if is_prow_environment(): _dump_pod_logs_on_failure( - scenario, os.environ.get("NAMESPACE", "e2e-rhoai-dsc") + context, scenario, os.environ.get("NAMESPACE", "e2e-rhoai-dsc") ) if getattr(context, "scenario_lightspeed_override_active", False): diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index 263b415a3..48056b243 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -118,8 +118,12 @@ def restart_pod(container_name: str) -> None: if result.returncode != 0: print(result.stderr, end="") combined = f"{result.stdout or ''}\n{result.stderr or ''}".strip() - tail = "\n".join(combined.splitlines()[-25:]) if combined else "" - detail = tail or f"exit {result.returncode}" + # Prefer full e2e-ops output when diagnostics were printed (TLS/Llama failures). + if "========== failure logs:" in combined: + detail = combined + else: + detail = "\n".join(combined.splitlines()[-40:]) if combined else "" + detail = detail or f"exit {result.returncode}" raise subprocess.CalledProcessError( result.returncode, op,