From 7a1ef3636b2605a563e329405fdd644985b5f113 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Wed, 20 May 2026 15:10:10 +0200
Subject: [PATCH 01/20] Add TLS fixes for Konflux run

---
 .../lightspeed/e2e-mock-tls-inference.yaml    | 104 +++++++++++
 .../lightspeed/llama-stack-openai.yaml        |   8 +
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh       | 163 +++++++++++++++++-
 .../server-mode/lightspeed-stack-tls.yaml     |   2 +-
 tests/e2e/features/environment.py             |   3 +
 tests/e2e/features/steps/proxy.py             |   2 +
 tests/e2e/features/steps/tls.py               | 116 +++++++++++--
 tests/e2e/features/tls.feature                |   5 +-
 tests/e2e/mock_tls_inference_server/server.py |  25 ++-
 tests/e2e/test_list.txt                       |  27 ---
 10 files changed, 405 insertions(+), 50 deletions(-)
 create mode 100644 tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml
diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml
new file mode 100644
index 000000000..6797de24a
--- /dev/null
+++ b/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml
@@ -0,0 +1,104 @@
+# Mock HTTPS OpenAI API for tls.feature (Konflux / Prow; no Docker Compose).
+# Llama Stack run.yaml uses https://e2e-mock-tls-inference.<ns>.svc.cluster.local:8443|8444|8445/v1
+apiVersion: v1
+kind: Pod
+metadata:
+  name: e2e-mock-tls-inference
+  labels:
+    app: e2e-mock-tls-inference
+spec:
+  securityContext:
+    runAsNonRoot: true
+    seccompProfile:
+      type: RuntimeDefault
+  containers:
+    - name: e2e-mock-tls-inference
+      image: python:3.12-slim
+      securityContext:
+        allowPrivilegeEscalation: false
+        capabilities:
+          drop: ["ALL"]
+        runAsNonRoot: true
+        runAsUser: 1000
+        seccompProfile:
+          type: RuntimeDefault
+      env:
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: PYTHONPATH
+          value: /app:/tmp/pydeps
+      command:
+        - /bin/sh
+        - -c
+        - |
+          set -e
+          pip install --quiet --no-cache-dir --target /tmp/pydeps 'trustme>=1.2.1' 'cryptography>=42.0.0'
+          NS="${POD_NAMESPACE:-default}"
+          export TLS_CERT_DNS_NAMES="mock-tls-inference,localhost,127.0.0.1,e2e-mock-tls-inference,e2e-mock-tls-inference.${NS}.svc.cluster.local"
+          exec python /app/server.py
+      ports:
+        - containerPort: 8443
+          name: tls
+        - containerPort: 8444
+          name: mtls
+        - containerPort: 8445
+          name: mismatch
+      volumeMounts:
+        - name: server-script
+          mountPath: /app/server.py
+          subPath: server.py
+          readOnly: true
+        - name: certs-work
+          mountPath: /certs
+      readinessProbe:
+        exec:
+          command:
+            - python3
+            - -c
+            - |
+              import ssl, urllib.request
+              ctx = ssl.create_default_context()
+              ctx.check_hostname = False
+              ctx.verify_mode = ssl.CERT_NONE
+              urllib.request.urlopen("https://localhost:8443/health", context=ctx)
+        initialDelaySeconds: 8
+        periodSeconds: 5
+      livenessProbe:
+        exec:
+          command:
+            - python3
+            - -c
+            - |
+              import ssl, urllib.request
+              ctx = ssl.create_default_context()
+              ctx.check_hostname = False
+              ctx.verify_mode = ssl.CERT_NONE
+              urllib.request.urlopen("https://localhost:8443/health", context=ctx)
+        initialDelaySeconds: 15
+        periodSeconds: 20
+  volumes:
+    - name: server-script
+      configMap:
+        name: e2e-mock-tls-inference-script
+    - name: certs-work
+      emptyDir: {}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: e2e-mock-tls-inference
+spec:
+  selector:
+    app: e2e-mock-tls-inference
+  ports:
+    - name: tls
+      port: 8443
+      targetPort: tls
+    - name: mtls
+      port: 8444
+      targetPort: mtls
+    - name: mismatch
+      port: 8445
+      targetPort: mismatch
diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml
index 3f2a6583c..b182a2463 100644
--- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml
+++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml
@@ -206,6 +206,10 @@ spec:
           mountPath: /tmp/interception-proxy-ca.pem
           subPath: ca.pem
           readOnly: true
+        # tls.feature: client/CA PEMs from Secret e2e-mock-tls-certs (optional).
+        - name: mock-tls-certs
+          mountPath: /certs
+          readOnly: true
   volumes:
     - name: app-root
       emptyDir: {}
@@ -222,3 +226,7 @@ spec:
       secret:
         secretName: e2e-interception-proxy-ca
         optional: true
+    - name: mock-tls-certs
+      secret:
+        secretName: e2e-mock-tls-certs
+        optional: true
diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index 7f7b3d9a4..332a429c2 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -25,6 +25,8 @@
 #   disrupt-llama-stack             - Delete llama-stack pod to disrupt connection
 #   deploy-e2e-tunnel-proxy         - Deploy in-cluster tunnel proxy (proxy.feature step)
 #   deploy-e2e-interception-proxy   - Deploy in-cluster interception proxy (proxy.feature step)
+#   deploy-e2e-mock-tls-inference   - Deploy mock HTTPS inference server (tls.feature step)
+#   sync-mock-tls-certs-secret      - Publish /certs PEMs to Secret for llama-stack mount
 
 set -e
 
@@ -331,7 +333,55 @@ cmd_restart_lightspeed() {
     echo "✓ Lightspeed restart complete"
 }
 
+cmd_reload_llama_stack_config() {  # copy ConfigMap run.yaml into the running pod, then restart PID 1
+    local llama_pod_name="llama-stack-service"
+    local tmp
+    # Returns 1 on any failure; cmd_restart_llama_stack then falls back to a full pod restart.
+    echo "===== Reloading llama-stack run.yaml (container restart, no pod recreate) ====="
+    tmp=$(mktemp)  # scratch copy of run.yaml; removed on every path below
+    if ! oc get configmap llama-stack-config -n "$NAMESPACE" \
+        -o jsonpath='{.data.run\.yaml}' >"$tmp"; then
+        rm -f "$tmp"
+        echo "ERROR: failed to read llama-stack-config run.yaml" >&2
+        return 1
+    fi
+    if [[ ! -s "$tmp" ]]; then  # a successful read that yields an empty file is still an error
+        rm -f "$tmp"
+        echo "ERROR: llama-stack-config run.yaml is empty" >&2
+        return 1
+    fi
+    if ! oc cp "$tmp" "$NAMESPACE/$llama_pod_name:/opt/app-root/run.yaml" \
+        -c llama-stack-container; then
+        rm -f "$tmp"
+        echo "ERROR: failed to copy run.yaml into llama-stack pod" >&2
+        return 1
+    fi
+    rm -f "$tmp"
+    echo "Restarting llama-stack-container to pick up run.yaml..."
+    oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- bash -c 'kill 1' \
+        2>/dev/null || true  # best-effort: any error from the exec is ignored
+    wait_for_pod "$llama_pod_name" 45
+    if ! wait_for_llama_stack_http_health 35; then
+        echo "===== Llama-stack reload FAILED (HTTP not healthy) ====="
+        return 1
+    fi
+    if ! cmd_restart_llama_port_forward; then  # re-establish the local port-forward after the restart
+        echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed"
+        return 1
+    fi
+    echo "===== Llama-stack config reload complete ====="
+}
+
 cmd_restart_llama_stack() {
+    if [[ "${E2E_KONFLUX_E2E:-0}" == "1" && "${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}" == "1" ]]; then
+        if oc get pod llama-stack-service -n "$NAMESPACE" &>/dev/null; then
+            if cmd_reload_llama_stack_config; then
+                return 0
+            fi
+            echo "WARN: llama config reload failed; falling back to full pod restart" >&2
+        fi
+    fi
+
     echo "===== Restoring llama-stack service ====="
     # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env.
     echo "Deleting llama-stack pod (if any) before apply..."
@@ -350,6 +400,14 @@ cmd_restart_llama_stack() {
                 exit 1
             fi
         fi
+        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" \
+            && "${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}" != "1" ]]; then
+            echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..."
+            if ! cmd_sync_mock_tls_certs_secret; then
+                echo "===== Llama-stack restore FAILED (mock TLS certs secret sync) ====="
+                exit 1
+            fi
+        fi
         _LLAMA_SVC_FQDN="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local"
         oc create secret generic llama-stack-ip-secret \
             --from-literal=key="$_LLAMA_SVC_FQDN" \
@@ -365,7 +423,17 @@ cmd_restart_llama_stack() {
                 exit 1
             fi
         fi
-        if ! wait_for_llama_stack_http_health 50; then
+        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+            if ! _verify_mock_tls_certs_mounted_in_llama; then
+                echo "===== Llama-stack restore FAILED (mock TLS certs not mounted) ====="
+                exit 1
+            fi
+        fi
+        local llama_health_attempts=50
+        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+            llama_health_attempts=75
+        fi
+        if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then
             echo "===== Llama-stack restore FAILED (HTTP not healthy) ====="
             exit 1
         fi
@@ -709,6 +777,69 @@ cmd_copy_interception_proxy_ca_to_llama() {
     cmd_sync_interception_proxy_ca_secret
 }
 
+_MOCK_TLS_CERT_FILES=(
+    ca.crt
+    client.crt
+    client.key
+    untrusted-ca.crt
+    expired-ca.crt
+    untrusted-client.crt
+    untrusted-client.key
+    expired-client.crt
+)
+
+cmd_sync_mock_tls_certs_secret() {  # copy /certs PEMs out of the mock pod into Secret e2e-mock-tls-certs
+    local mock_pod_name tmpdir f
+    mock_pod_name=$(oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \
+        -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || mock_pod_name=""
+    # An empty name means the lookup failed or printed nothing for the label selector.
+    if [[ -z "$mock_pod_name" ]]; then
+        echo "ERROR: no e2e-mock-tls-inference pod in namespace $NAMESPACE" >&2
+        echo "  Run: e2e-ops.sh deploy-e2e-mock-tls-inference" >&2
+        return 1
+    fi
+    # Stage every file named in _MOCK_TLS_CERT_FILES; an unreadable or empty one aborts the sync.
+    tmpdir=$(mktemp -d)
+    for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
+        if ! oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \
+            cat "/certs/$f" >"$tmpdir/$f"; then
+            echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2
+            rm -rf "$tmpdir"
+            return 1
+        fi
+        if [[ ! -s "$tmpdir/$f" ]]; then
+            echo "ERROR: /certs/$f is empty in e2e-mock-tls-inference pod" >&2
+            rm -rf "$tmpdir"
+            return 1
+        fi
+    done
+    # Render the Secret client-side and pipe it to apply, so an existing Secret is updated in place.
+    if ! oc create secret generic e2e-mock-tls-certs \
+        --from-file="$tmpdir" \
+        -n "$NAMESPACE" \
+        --dry-run=client -o yaml | oc apply -f -; then
+        echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2
+        rm -rf "$tmpdir"
+        return 1
+    fi
+    rm -rf "$tmpdir"
+    echo "✓ Secret e2e-mock-tls-certs updated (${#_MOCK_TLS_CERT_FILES[@]} files)"
+}
+
+_verify_mock_tls_certs_mounted_in_llama() {  # 0 if ca.crt, client.crt and client.key are non-empty in /certs
+    local llama_pod_name="llama-stack-service"
+    if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
+        sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key'; then
+        echo "✓ mock TLS certs present under /certs in llama-stack"
+        return 0
+    fi
+    echo "ERROR: /certs missing or incomplete in llama-stack pod" >&2
+    oc get secret e2e-mock-tls-certs -n "$NAMESPACE" 2>&1 || true  # diagnostics only
+    oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
+        ls -la /certs 2>&1 || true  # diagnostics only; result does not affect the return code
+    return 1
+}
+
 _e2e_repo_root() {
     cd "$SCRIPT_DIR/../../../.." && pwd
 }
@@ -745,6 +876,28 @@ cmd_deploy_e2e_interception_proxy() {
     echo "✓ e2e-interception-proxy ready at http://e2e-interception-proxy.${NAMESPACE}.svc.cluster.local:8889"
 }
 
+cmd_deploy_e2e_mock_tls_inference() {  # (re)deploy the mock TLS pod, then publish its certs Secret
+    local repo_root
+    repo_root="$(_e2e_repo_root)"
+    echo "Deploying e2e-mock-tls-inference in namespace $NAMESPACE..."
+    oc create configmap e2e-mock-tls-inference-script -n "$NAMESPACE" \
+        --from-file=server.py="$repo_root/tests/e2e/mock_tls_inference_server/server.py" \
+        --dry-run=client -o yaml | oc apply -f -
+    oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true
+    oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml"
+    if ! oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then
+        echo "ERROR: e2e-mock-tls-inference failed to become ready" >&2
+        oc describe pod/e2e-mock-tls-inference -n "$NAMESPACE" 2>/dev/null | tail -30 || true
+        oc logs e2e-mock-tls-inference -n "$NAMESPACE" --tail=40 2>&1 || true
+        return 1
+    fi
+    echo "✓ e2e-mock-tls-inference ready at https://e2e-mock-tls-inference.${NAMESPACE}.svc.cluster.local:8443"
+    if ! cmd_sync_mock_tls_certs_secret; then
+        echo "ERROR: mock TLS server is up but e2e-mock-tls-certs secret sync failed" >&2
+        return 1  # llama-stack mounts /certs from this Secret, so a failed sync fails the deploy
+    fi
+}
+
 cmd_disrupt_llama_stack() {
     local pod_name="llama-stack-service"
 
@@ -815,6 +968,12 @@ case "$COMMAND" in
     deploy-e2e-interception-proxy)
         cmd_deploy_e2e_interception_proxy
         ;;
+    deploy-e2e-mock-tls-inference)
+        cmd_deploy_e2e_mock_tls_inference
+        ;;
+    sync-mock-tls-certs-secret)
+        cmd_sync_mock_tls_certs_secret
+        ;;
     *)
         echo "Usage: $0 <command> [args...]"
         echo ""
@@ -833,6 +992,8 @@ case "$COMMAND" in
         echo "  sync-interception-proxy-ca-secret   - Publish trustme CA to Secret for llama mount"
         echo "  deploy-e2e-tunnel-proxy            - Deploy in-cluster tunnel proxy pod"
         echo "  deploy-e2e-interception-proxy      - Deploy in-cluster interception proxy pod"
+        echo "  deploy-e2e-mock-tls-inference      - Deploy mock HTTPS inference server (tls.feature)"
+        echo "  sync-mock-tls-certs-secret         - Publish mock TLS /certs PEMs to Secret for llama"
         exit 1
         ;;
 esac
diff --git a/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml
index babdc2b99..fd45ea744 100644
--- a/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml
+++ b/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml
@@ -8,7 +8,7 @@ service:
   access_log: true
 llama_stack:
   use_as_library_client: false
-  url: http://llama-stack:8321
+  url: http://${env.E2E_LLAMA_HOSTNAME}:8321
   api_key: xyzzy
 user_data_collection:
   feedback_enabled: true
diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py
index fdca1247c..e97f993a5 100644
--- a/tests/e2e/features/environment.py
+++ b/tests/e2e/features/environment.py
@@ -26,6 +26,7 @@
     reset_llama_stack_disrupt_once_tracking,
     reset_llama_stack_was_running,
 )
+from tests.e2e.features.steps.tls import reset_tls_prow_restart_optimization_state
 from tests.e2e.utils.llama_stack_utils import register_shield
 from tests.e2e.utils.prow_utils import (
     restart_pod,
@@ -451,6 +452,8 @@ def before_feature(context: Context, feature: Feature) -> None:
     context.active_lightspeed_stack_config_basename = None
     # One real Llama disruption per feature (module-level flag; survives context resets)
     reset_llama_stack_disrupt_once_tracking()
+    if feature.filename and "tls.feature" in feature.filename:
+        reset_tls_prow_restart_optimization_state()
 
     try:
         max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS))
diff --git a/tests/e2e/features/steps/proxy.py b/tests/e2e/features/steps/proxy.py
index 1511250dd..597204d92 100644
--- a/tests/e2e/features/steps/proxy.py
+++ b/tests/e2e/features/steps/proxy.py
@@ -295,6 +295,8 @@ def restore_if_modified(context: Context) -> None:
     _stop_proxy(context, "tunnel_proxy", "proxy_loop")
     _stop_proxy(context, "interception_proxy", "interception_proxy_loop")
     os.environ.pop("E2E_COPY_INTERCEPTION_CA_TO_LLAMA", None)
+    os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None)
+    os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
     if hasattr(context, "needs_interception_ca_on_llama"):
         delattr(context, "needs_interception_ca_on_llama")
 
diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py
index 66d56adcc..0bf7b6905 100644
--- a/tests/e2e/features/steps/tls.py
+++ b/tests/e2e/features/steps/tls.py
@@ -9,6 +9,7 @@
 """
 
 import copy
+import os
 from typing import Any, Optional
 
 from behave import given  # pyright: ignore[reportAttributeAccessIssue]
@@ -19,16 +20,12 @@
     load_llama_config,
     write_llama_config,
 )
+from tests.e2e.utils.prow_utils import get_namespace, run_e2e_ops
+from tests.e2e.utils.utils import is_prow_environment
 
-_TLS_PROVIDER_BASE: dict[str, Any] = {
-    "provider_id": "tls-openai",
-    "provider_type": "remote::openai",
-    "config": {
-        "api_key": "test-key",
-        "base_url": "https://mock-tls-inference:8443/v1",
-        "allowed_models": ["mock-tls-model"],
-    },
-}
+_MOCK_TLS_PORT_TLS = 8443
+_MOCK_TLS_PORT_MTLS = 8444
+_MOCK_TLS_PORT_HOSTNAME_MISMATCH = 8445
 
 _TLS_MODEL_RESOURCE: dict[str, str] = {
     "model_id": "mock-tls-model",
@@ -36,6 +33,74 @@
     "provider_model_id": "mock-tls-model",
 }
 
+_mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False}
+_tls_llama_warm_in_prow: dict[str, bool] = {"done": False}
+
+
+def reset_tls_prow_restart_optimization_state() -> None:
+    """Reset per-feature Prow restart optimizations (call from ``before_feature``)."""
+    _tls_llama_warm_in_prow["done"] = False
+    os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
+
+
+def _prepare_tls_prow_llama_restart_env() -> None:
+    """Set env vars so e2e-ops can reload run.yaml instead of recreating the pod."""
+    os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1"
+    if _tls_llama_warm_in_prow["done"]:
+        os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1"
+    else:
+        os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
+
+
+def _cluster_mock_tls_inference_host() -> str:
+    """Return the mock TLS server host: env override, else the Service FQDN."""
+    override = os.getenv("E2E_MOCK_TLS_INFERENCE_HOST", "").strip()
+    if not override:
+        return f"e2e-mock-tls-inference.{get_namespace()}.svc.cluster.local"
+    return override
+
+
+def _mock_tls_base_url(port: int) -> str:
+    """Return the mock TLS server's ``https://<host>:<port>/v1`` base URL."""
+    # Outside Prow the server is reached under its fixed name; in Prow the
+    # host comes from _cluster_mock_tls_inference_host().
+    in_prow = is_prow_environment()
+    host = _cluster_mock_tls_inference_host() if in_prow else "mock-tls-inference"
+    return f"https://{host}:{port}/v1"
+
+
+def _tls_provider_base() -> dict[str, Any]:
+    """Return a new tls-openai provider dict whose base_url targets the TLS port."""
+    return {
+        "provider_id": "tls-openai",
+        "provider_type": "remote::openai",
+        "config": {
+            "api_key": "test-key",
+            "base_url": _mock_tls_base_url(_MOCK_TLS_PORT_TLS),
+            "allowed_models": ["mock-tls-model"],
+        },
+    }
+
+
+def _deploy_cluster_mock_tls_inference() -> None:
+    """Deploy the in-cluster mock TLS pod via e2e-ops unless already deployed."""
+    if _mock_tls_cluster_deploy_state["done"]:
+        print("Using existing e2e-mock-tls-inference deployment")
+        return
+    # Not deployed yet: run the e2e-ops command and raise if it exits non-zero.
+    result = run_e2e_ops("deploy-e2e-mock-tls-inference", timeout=300)
+    print(result.stdout, end="")
+    if result.returncode != 0:
+        raise AssertionError(
+            "Failed to deploy e2e-mock-tls-inference: "
+            f"{result.stderr or result.stdout}"
+        )
+    os.environ.setdefault(  # keeps any value the environment already provides
+        "E2E_MOCK_TLS_INFERENCE_HOST",
+        _cluster_mock_tls_inference_host(),
+    )
+    _mock_tls_cluster_deploy_state["done"] = True
+
 
 def _ensure_tls_provider(config: dict[str, Any]) -> dict[str, Any]:
     """Find or create the tls-openai inference provider in the config.
@@ -59,7 +124,7 @@ def _ensure_tls_provider(config: dict[str, Any]) -> dict[str, Any]:
             return provider
 
     # Provider not found — add it
-    provider = copy.deepcopy(_TLS_PROVIDER_BASE)
+    provider = copy.deepcopy(_tls_provider_base())
     inference.append(provider)
 
     # Also register the model resource
@@ -85,8 +150,14 @@ def _configure_tls(tls_config: dict[str, Any], base_url: Optional[str] = None) -
     provider.setdefault("config", {}).setdefault("network", {})
     if base_url is not None:
         provider["config"]["base_url"] = base_url
+    else:
+        provider["config"]["base_url"] = _mock_tls_base_url(_MOCK_TLS_PORT_TLS)
     provider["config"]["network"]["tls"] = tls_config
     write_llama_config(config)
+    if is_prow_environment():
+        _prepare_tls_prow_llama_restart_env()
+        if not _tls_llama_warm_in_prow["done"]:
+            _tls_llama_warm_in_prow["done"] = True
 
 
 # --- Background Steps ---
@@ -94,6 +165,15 @@ def _configure_tls(tls_config: dict[str, Any], base_url: Optional[str] = None) -
 # run.yaml (see proxy.py). Restart steps are listed in tls.feature / proxy.feature.
 
 
+@given("The mock TLS inference server is deployed")
+def deploy_mock_tls_inference_server(context: Context) -> None:
+    """Deploy the mock TLS pod in Prow; locally just note Compose provides it."""
+    if not is_prow_environment():
+        print("Using docker-compose mock-tls-inference service")
+    else:
+        _deploy_cluster_mock_tls_inference()
+
+
 # --- TLS Configuration Steps ---
 
 
@@ -124,7 +204,7 @@ def configure_tls_mtls(context: Context) -> None:
             "client_cert": "/certs/client.crt",
             "client_key": "/certs/client.key",
         },
-        base_url="https://mock-tls-inference:8444/v1",
+        base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS),
     )
 
 
@@ -139,7 +219,7 @@ def configure_mtls_no_client_cert(context: Context) -> None:
     """Configure run.yaml for mTLS port without client cert (should fail)."""
     _configure_tls(
         {"verify": "/certs/ca.crt"},
-        base_url="https://mock-tls-inference:8444/v1",
+        base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS),
     )
 
 
@@ -152,7 +232,7 @@ def configure_mtls_wrong_client_cert(context: Context) -> None:
             "client_cert": "/certs/ca.crt",
             "client_key": "/certs/client.key",
         },
-        base_url="https://mock-tls-inference:8444/v1",
+        base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS),
     )
 
 
@@ -165,7 +245,7 @@ def configure_mtls_untrusted_client_cert(context: Context) -> None:
             "client_cert": "/certs/untrusted-client.crt",
             "client_key": "/certs/untrusted-client.key",
         },
-        base_url="https://mock-tls-inference:8444/v1",
+        base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS),
     )
 
 
@@ -178,7 +258,7 @@ def configure_mtls_expired_client_cert(context: Context) -> None:
             "client_cert": "/certs/expired-client.crt",
             "client_key": "/certs/client.key",
         },
-        base_url="https://mock-tls-inference:8444/v1",
+        base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS),
     )
 
 
@@ -187,7 +267,7 @@ def configure_tls_hostname_mismatch(context: Context) -> None:
     """Configure run.yaml to connect to hostname-mismatch server (should fail)."""
     _configure_tls(
         {"verify": "/certs/ca.crt"},
-        base_url="https://mock-tls-inference:8445/v1",
+        base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH),
     )
 
 
@@ -200,7 +280,7 @@ def configure_mtls_hostname_mismatch(context: Context) -> None:
             "client_cert": "/certs/client.crt",
             "client_key": "/certs/client.key",
         },
-        base_url="https://mock-tls-inference:8445/v1",
+        base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH),
     )
 
 
@@ -211,7 +291,7 @@ def configure_tls_min_version_hostname_mismatch(context: Context, version: str)
     """Configure run.yaml with TLS min version against hostname-mismatch server."""
     _configure_tls(
         {"verify": "/certs/ca.crt", "min_version": version},
-        base_url="https://mock-tls-inference:8445/v1",
+        base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH),
     )
 
 
diff --git a/tests/e2e/features/tls.feature b/tests/e2e/features/tls.feature
index a900b1c0f..15215408e 100644
--- a/tests/e2e/features/tls.feature
+++ b/tests/e2e/features/tls.feature
@@ -1,8 +1,10 @@
-@e2e_group_1 @skip-in-library-mode @skip-in-prow
+@e2e_group_1 @skip-in-library-mode
 Feature: TLS configuration for remote inference providers
   Validate that Llama Stack's NetworkConfig.tls settings are applied correctly
   when connecting to a remote inference provider over HTTPS.
 
+  # Only Llama run.yaml changes per scenario; LCS uses lightspeed-stack-tls.yaml throughout.
+
   Background:
     Given The service is started locally
       And The system is in default state
@@ -10,6 +12,7 @@ Feature: TLS configuration for remote inference providers
       And the Lightspeed stack configuration directory is "tests/e2e/configuration"
       And The service uses the lightspeed-stack-tls.yaml configuration
       And The service is restarted
+      And The mock TLS inference server is deployed
       And The original Llama Stack config is restored if modified
 
   Scenario: Inference succeeds with TLS verification disabled
diff --git a/tests/e2e/mock_tls_inference_server/server.py b/tests/e2e/mock_tls_inference_server/server.py
index bfb4cbae5..25bd23a0c 100644
--- a/tests/e2e/mock_tls_inference_server/server.py
+++ b/tests/e2e/mock_tls_inference_server/server.py
@@ -13,6 +13,7 @@
 
 import datetime
 import json
+import os
 import ssl
 import threading
 import time
@@ -29,6 +30,25 @@
 MTLS_PORT = 8444
 HOSTNAME_MISMATCH_PORT = 8445
 
+_DEFAULT_SERVER_CERT_DNS_NAMES: tuple[str, ...] = (
+    "mock-tls-inference",
+    "localhost",
+    "127.0.0.1",
+)
+
+
+def _server_cert_dns_names() -> tuple[str, ...]:
+    """Return the DNS identities to embed in the main server certificate.
+
+    Parses the comma-separated ``TLS_CERT_DNS_NAMES`` environment variable and
+    drops blank entries; falls back to the built-in defaults when none remain.
+    """
+    configured = os.environ.get("TLS_CERT_DNS_NAMES", "")
+    # Split first, strip each entry, then discard the empty ones.
+    trimmed = (entry.strip() for entry in configured.split(","))
+    dns_names = tuple(entry for entry in trimmed if entry)
+    return dns_names if dns_names else _DEFAULT_SERVER_CERT_DNS_NAMES
+
 
 class OpenAIHandler(BaseHTTPRequestHandler):
     """Handles OpenAI-compatible API requests over HTTPS."""
@@ -221,8 +241,9 @@ def main() -> None:
 
     # Generate CA and certificates
     ca = trustme.CA()
-    # Server cert with SANs for Docker service name and localhost
-    server_cert = ca.issue_cert("mock-tls-inference", "localhost", "127.0.0.1")
+    server_dns_names = _server_cert_dns_names()
+    print(f"  Server cert DNS names: {', '.join(server_dns_names)}")
+    server_cert = ca.issue_cert(*server_dns_names)
     # Client cert for mTLS testing (use a simple hostname without spaces)
     client_cert = ca.issue_cert("tls-e2e-test-client")
 
diff --git a/tests/e2e/test_list.txt b/tests/e2e/test_list.txt
index 34e1b8647..857021536 100644
--- a/tests/e2e/test_list.txt
+++ b/tests/e2e/test_list.txt
@@ -1,29 +1,2 @@
-features/authorized_noop.feature
-features/health.feature
-features/info.feature
-features/models.feature
-features/rest_api.feature
-features/smoketests.feature
-features/authorized_noop_token.feature
-features/conversation_cache_v2.feature
-features/conversations.feature
-features/faiss.feature
-features/inline_rag.feature
-features/feedback.feature
-features/query.feature
-features/responses.feature
-features/responses_streaming.feature
-features/rlsapi_v1.feature
-features/streaming_query.feature
-features/http_401_unauthorized.feature
-features/authorized_rh_identity.feature
-features/rbac.feature
-features/rlsapi_v1_errors.feature
-features/llama_stack_disrupted.feature
-features/mcp.feature
-features/mcp_servers_api.feature
-features/mcp_servers_api_auth.feature
-features/mcp_servers_api_no_config.feature
-features/proxy.feature
 features/tls.feature
 features/opentelemetry.feature

From 99b1b315a66a09fb0f1418b2c67f0ef7d3720e05 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Wed, 20 May 2026 19:10:11 +0200
Subject: [PATCH 02/20] Restore full e2e test list

---
 tests/e2e/test_list.txt | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/test_list.txt b/tests/e2e/test_list.txt
index 857021536..26926a81f 100644
--- a/tests/e2e/test_list.txt
+++ b/tests/e2e/test_list.txt
@@ -1,2 +1,29 @@
+features/authorized_noop.feature
+features/health.feature
+features/info.feature
+features/models.feature
+features/rest_api.feature
+features/smoketests.feature
+features/authorized_noop_token.feature
+features/conversation_cache_v2.feature
+features/conversations.feature
+features/faiss.feature
+features/inline_rag.feature
+features/feedback.feature
+features/query.feature
+features/responses.feature
+features/responses_streaming.feature
+features/rlsapi_v1.feature
+features/streaming_query.feature
+features/http_401_unauthorized.feature
+features/authorized_rh_identity.feature
+features/rbac.feature
+features/rlsapi_v1_errors.feature
+features/llama_stack_disrupted.feature
+features/mcp.feature
+features/mcp_servers_api.feature
+features/mcp_servers_api_auth.feature
+features/mcp_servers_api_no_config.feature
+features/proxy.feature
 features/tls.feature
-features/opentelemetry.feature
+features/opentelemetry.feature
\ No newline at end of file

From 61f0d3699fbe5d2a40e4fe4d472f85b739b0ffac Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Thu, 21 May 2026 06:53:31 +0200
Subject: [PATCH 03/20] set proxy tests to skipped

---
 tests/e2e/features/proxy.feature | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/features/proxy.feature b/tests/e2e/features/proxy.feature
index 907c4317d..00fde258a 100644
--- a/tests/e2e/features/proxy.feature
+++ b/tests/e2e/features/proxy.feature
@@ -1,4 +1,4 @@
-@e2e_group_3 @skip-in-library-mode
+@e2e_group_3 @skip-in-library-mode @skip-in-prow
 Feature: Proxy and TLS networking tests for Llama Stack providers
 
   Verify that the Lightspeed Stack works correctly when Llama Stack's

From c2c47c385656dd7b81c0af7e6ebfaf3b552f68b6 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Thu, 21 May 2026 11:20:21 +0200
Subject: [PATCH 04/20] remove optimizations for restarts

---
 tests/e2e/features/steps/tls.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py
index 0bf7b6905..3d7e7fdb4 100644
--- a/tests/e2e/features/steps/tls.py
+++ b/tests/e2e/features/steps/tls.py
@@ -34,22 +34,23 @@
 }
 
 _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False}
-_tls_llama_warm_in_prow: dict[str, bool] = {"done": False}
 
 
 def reset_tls_prow_restart_optimization_state() -> None:
-    """Reset per-feature Prow restart optimizations (call from ``before_feature``)."""
-    _tls_llama_warm_in_prow["done"] = False
+    """Reset per-feature Prow state (call from ``before_feature``)."""
+    _mock_tls_cluster_deploy_state["done"] = False
     os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
+    os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None)
 
 
 def _prepare_tls_prow_llama_restart_env() -> None:
-    """Set env vars so e2e-ops can reload run.yaml instead of recreating the pod."""
+    """Set env vars so e2e-ops always recreates the llama pod (no config-only reload).
+
+    TLS scenarios change run.yaml and rely on /certs volume mounts; full pod
+    restarts are slower but more reliable than ``kill 1`` reload on Konflux.
+    """
     os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1"
-    if _tls_llama_warm_in_prow["done"]:
-        os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1"
-    else:
-        os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
+    os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
 
 
 def _cluster_mock_tls_inference_host() -> str:
@@ -156,8 +157,6 @@ def _configure_tls(tls_config: dict[str, Any], base_url: Optional[str] = None) -
     write_llama_config(config)
     if is_prow_environment():
         _prepare_tls_prow_llama_restart_env()
-        if not _tls_llama_warm_in_prow["done"]:
-            _tls_llama_warm_in_prow["done"] = True
 
 
 # --- Background Steps ---

From b5fec6dd5e80c2569a03192e88e85fc9e3a84353 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Fri, 22 May 2026 08:31:36 +0200
Subject: [PATCH 05/20] fix failing tests

---
 .../lightspeed-stack-integration-test.yaml    |  2 +
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh       | 84 ++++++++++++++-----
 tests/e2e/features/steps/tls.py               |  1 +
 tests/e2e/features/tls.feature                |  2 +-
 tests/e2e/utils/prow_utils.py                 | 51 +++++++----
 5 files changed, 102 insertions(+), 38 deletions(-)

diff --git a/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml b/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml
index 3110ccea7..266ea1b96 100644
--- a/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml
+++ b/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml
@@ -167,6 +167,8 @@ spec:
               echo "========== End parameters =========="
     - name: lightspeed-stack-integration-tests
       description: Task to run integration tests from lightspeed-stack repository
+      # Full Behave suite (proxy + tls) can exceed 2h; needs PipelineRun timeouts >= this value.
+      timeout: 3h
       params:
         - name: SNAPSHOT
           value: $(params.SNAPSHOT)
diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index 332a429c2..c82fbc53c 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -414,7 +414,17 @@ cmd_restart_llama_stack() {
             -n "$NAMESPACE" \
             --dry-run=client -o yaml | oc apply -f -
         oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml"
-        wait_for_pod "llama-stack-service" 90
+        local llama_pod_wait=90
+        local llama_health_attempts=50
+        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+            llama_pod_wait=120
+            llama_health_attempts=100
+        fi
+        if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then
+            echo "===== Llama-stack restore FAILED (pod not ready) ====="
+            oc describe pod llama-stack-service -n "$NAMESPACE" 2>/dev/null | tail -40 || true
+            exit 1
+        fi
         echo "Labeling pod for service..."
         oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite
         if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then
@@ -429,12 +439,9 @@ cmd_restart_llama_stack() {
                 exit 1
             fi
         fi
-        local llama_health_attempts=50
-        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
-            llama_health_attempts=75
-        fi
         if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then
             echo "===== Llama-stack restore FAILED (HTTP not healthy) ====="
+            oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true
             exit 1
         fi
     else
@@ -542,6 +549,9 @@ cmd_restart_llama_port_forward() {
     local local_port="${LOCAL_LLAMA_PORT:-8321}"
     local remote_port="${REMOTE_LLAMA_PORT:-8321}"
     local max_attempts=6
+    if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+        max_attempts=10
+    fi
     local pf_pid
     local pf_resource
     local llama_pf_log="/tmp/port-forward-llama.log"
@@ -788,8 +798,23 @@ _MOCK_TLS_CERT_FILES=(
     expired-client.crt
 )
 
+# Return 0 only when the e2e-mock-tls-certs Secret exists and has a non-empty
+# entry for every file in _MOCK_TLS_CERT_FILES; any lookup failure returns 1.
+_mock_tls_certs_secret_is_complete() {
+    local f data
+    oc get secret e2e-mock-tls-certs -n "$NAMESPACE" &>/dev/null || return 1
+    for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
+        data=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \
+            -o "go-template={{index .data \"${f}\"}}" 2>/dev/null) || return 1
+        # go-template renders a missing key as "<no value>" rather than "", so
+        # check for both; otherwise a partial Secret would be reported complete.
+        [[ -n "$data" && "$data" != "<no value>" ]] || return 1
+    done
+    return 0
+}
+
 cmd_sync_mock_tls_certs_secret() {
-    local mock_pod_name tmpdir f
+    local mock_pod_name tmpdir f attempt
     mock_pod_name=$(oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \
         -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || mock_pod_name=""
 
@@ -799,19 +824,29 @@ cmd_sync_mock_tls_certs_secret() {
         return 1
     fi
 
+    if _mock_tls_certs_secret_is_complete; then
+        echo "✓ e2e-mock-tls-certs secret already complete, skipping sync"
+        return 0
+    fi
+
+    if ! oc wait pod/"$mock_pod_name" -n "$NAMESPACE" --for=condition=Ready --timeout=60s 2>/dev/null; then
+        echo "WARNING: e2e-mock-tls-inference not Ready before cert sync" >&2
+    fi
+
     tmpdir=$(mktemp -d)
     for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
-        if ! oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \
-            cat "/certs/$f" >"$tmpdir/$f"; then
-            echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2
-            rm -rf "$tmpdir"
-            return 1
-        fi
-        if [[ ! -s "$tmpdir/$f" ]]; then
-            echo "ERROR: /certs/$f is empty in e2e-mock-tls-inference pod" >&2
-            rm -rf "$tmpdir"
-            return 1
-        fi
+        for attempt in 1 2 3; do
+            if oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \
+                cat "/certs/$f" >"$tmpdir/$f" 2>/dev/null && [[ -s "$tmpdir/$f" ]]; then
+                break
+            fi
+            if [[ $attempt -eq 3 ]]; then
+                echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2
+                rm -rf "$tmpdir"
+                return 1
+            fi
+            sleep 3
+        done
     done
 
     if ! oc create secret generic e2e-mock-tls-certs \
@@ -828,11 +863,16 @@ cmd_sync_mock_tls_certs_secret() {
 
 _verify_mock_tls_certs_mounted_in_llama() {
     local llama_pod_name="llama-stack-service"
-    if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
-        sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key'; then
-        echo "✓ mock TLS certs present under /certs in llama-stack"
-        return 0
-    fi
+    local attempt
+    for attempt in 1 2 3 4 5 6; do
+        if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
+            sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key' \
+            2>/dev/null; then
+            echo "✓ mock TLS certs present under /certs in llama-stack"
+            return 0
+        fi
+        sleep 3
+    done
     echo "ERROR: /certs missing or incomplete in llama-stack pod" >&2
     oc get secret e2e-mock-tls-certs -n "$NAMESPACE" 2>&1 || true
     oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py
index 3d7e7fdb4..4ff8978ba 100644
--- a/tests/e2e/features/steps/tls.py
+++ b/tests/e2e/features/steps/tls.py
@@ -96,6 +96,7 @@ def _deploy_cluster_mock_tls_inference() -> None:
             "Failed to deploy e2e-mock-tls-inference: "
             f"{result.stderr or result.stdout}"
         )
+    _prepare_tls_prow_llama_restart_env()
     os.environ.setdefault(
         "E2E_MOCK_TLS_INFERENCE_HOST",
         _cluster_mock_tls_inference_host(),
diff --git a/tests/e2e/features/tls.feature b/tests/e2e/features/tls.feature
index 15215408e..412e5b04f 100644
--- a/tests/e2e/features/tls.feature
+++ b/tests/e2e/features/tls.feature
@@ -12,8 +12,8 @@ Feature: TLS configuration for remote inference providers
       And the Lightspeed stack configuration directory is "tests/e2e/configuration"
       And The service uses the lightspeed-stack-tls.yaml configuration
       And The service is restarted
-      And The mock TLS inference server is deployed
       And The original Llama Stack config is restored if modified
+      And The mock TLS inference server is deployed
 
   Scenario: Inference succeeds with TLS verification disabled
     Given Llama Stack is configured with TLS verification disabled
diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py
index ff771904b..a6a594973 100644
--- a/tests/e2e/utils/prow_utils.py
+++ b/tests/e2e/utils/prow_utils.py
@@ -7,6 +7,7 @@
 import os
 import subprocess
 import tempfile
+import time
 from typing import Optional
 
 
@@ -93,7 +94,12 @@ def restart_pod(container_name: str) -> None:
     """
     if container_name in _LLAMA_RESTART_NAMES:
         op = "restart-llama-stack"
-        timeout = 420
+        # TLS feature: full pod recreate + cert sync + health can exceed 7 min on Konflux.
+        timeout = (
+            900
+            if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1"
+            else 420
+        )
     elif container_name in _LIGHTSPEED_RESTART_NAMES:
         op = "restart-lightspeed"
         # Pod wait (up to ~120s) + port-forward retries + slow Konflux/Prow clusters.
@@ -105,20 +111,35 @@ def restart_pod(container_name: str) -> None:
         )
         op = "restart-lightspeed"
         timeout = 200
-    try:
-        result = run_e2e_ops(op, timeout=timeout)
-        print(result.stdout, end="")
-        if result.returncode != 0:
-            print(result.stderr, end="")
-            detail = (result.stderr or result.stdout or "").strip()
-            raise subprocess.CalledProcessError(
-                result.returncode,
-                op,
-                detail or None,
-            )
-    except subprocess.TimeoutExpired as e:
-        print(f"Failed to restart pod {container_name}: {e}")
-        raise
+    max_attempts = 2 if op == "restart-llama-stack" else 1
+    last_error: subprocess.CalledProcessError | subprocess.TimeoutExpired | None = (
+        None
+    )
+    for attempt in range(1, max_attempts + 1):
+        try:
+            result = run_e2e_ops(op, timeout=timeout)
+            print(result.stdout, end="")
+            if result.returncode != 0:
+                print(result.stderr, end="")
+                detail = (result.stderr or result.stdout or "").strip()
+                raise subprocess.CalledProcessError(
+                    result.returncode,
+                    op,
+                    detail or None,
+                )
+            return
+        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as err:
+            last_error = err
+            if attempt < max_attempts:
+                print(
+                    f"⚠️  {op} failed (attempt {attempt}/{max_attempts}), "
+                    "retrying after 20s..."
+                )
+                time.sleep(20)
+    if last_error is not None:
+        if isinstance(last_error, subprocess.TimeoutExpired):
+            print(f"Failed to restart pod {container_name}: {last_error}")
+        raise last_error
 
 
 def restore_llama_stack_pod() -> None:

From 3cd258085cee81fa81c9ee5810f061cf2c5c40a7 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Fri, 22 May 2026 13:00:19 +0200
Subject: [PATCH 06/20] fix failing tests

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 11 ++++-
 tests/e2e/features/steps/proxy.py       |  8 ++++
 tests/e2e/features/steps/tls.py         | 59 ++++++++++++++++++++++---
 tests/e2e/utils/prow_utils.py           | 21 +++++++--
 4 files changed, 89 insertions(+), 10 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index c82fbc53c..0f206c0e3 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -360,9 +360,16 @@ cmd_reload_llama_stack_config() {
     echo "Restarting llama-stack-container to pick up run.yaml..."
     oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- bash -c 'kill 1' \
         2>/dev/null || true
-    wait_for_pod "$llama_pod_name" 45
-    if ! wait_for_llama_stack_http_health 35; then
+    local reload_pod_wait=45
+    local reload_health_attempts=35
+    if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+        reload_pod_wait=60
+        reload_health_attempts=50
+    fi
+    wait_for_pod "$llama_pod_name" "$reload_pod_wait"
+    if ! wait_for_llama_stack_http_health "$reload_health_attempts"; then
         echo "===== Llama-stack reload FAILED (HTTP not healthy) ====="
+        oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true
         return 1
     fi
     if ! cmd_restart_llama_port_forward; then
diff --git a/tests/e2e/features/steps/proxy.py b/tests/e2e/features/steps/proxy.py
index 2c2b19b5a..374218478 100644
--- a/tests/e2e/features/steps/proxy.py
+++ b/tests/e2e/features/steps/proxy.py
@@ -320,6 +320,14 @@ def restore_if_modified(context: Context) -> None:
 @given("Llama Stack is restarted")
 def restart_llama_stack(context: Context) -> None:
     """Restart the Llama Stack container."""
+    from tests.e2e.features.steps.tls import (
+        is_tls_configuration_feature,
+        restart_llama_for_tls_feature,
+    )
+
+    if is_tls_configuration_feature(context):
+        restart_llama_for_tls_feature(context)
+        return
     restart_container("llama-stack")
 
 
diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py
index 4ff8978ba..5f5da53be 100644
--- a/tests/e2e/features/steps/tls.py
+++ b/tests/e2e/features/steps/tls.py
@@ -34,25 +34,74 @@
 }
 
 _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False}
+_tls_prow_restart_state: dict[str, bool] = {"full_restart_done": False}
 
 
 def reset_tls_prow_restart_optimization_state() -> None:
     """Reset per-feature Prow state (call from ``before_feature``)."""
     _mock_tls_cluster_deploy_state["done"] = False
+    _tls_prow_restart_state["full_restart_done"] = False
     os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
     os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None)
 
 
-def _prepare_tls_prow_llama_restart_env() -> None:
-    """Set env vars so e2e-ops always recreates the llama pod (no config-only reload).
+def is_tls_configuration_feature(context: Context) -> bool:
+    """Return True when the active feature's name contains "TLS configuration"."""
+    feature = getattr(context, "feature", None)
+    if feature is None:
+        return False
+    name = getattr(feature, "name", "") or ""
+    return "TLS configuration" in name
 
-    TLS scenarios change run.yaml and rely on /certs volume mounts; full pod
-    restarts are slower but more reliable than ``kill 1`` reload on Konflux.
-    """
+
+def _prepare_tls_prow_llama_full_restart_env() -> None:
+    """Env for a full llama pod recreate (first TLS scenario / recovery)."""
     os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1"
     os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
 
 
+def _prepare_tls_prow_llama_reload_env() -> None:
+    """Env for config-only reload (run.yaml already on pod with /certs mounted)."""
+    os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1"
+    os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1"
+
+
+def restart_llama_for_tls_feature(context: Context) -> None:
+    """Restart Llama for TLS tests: one full recreate per feature, then reload.
+
+    Full pod delete+apply after every scenario (~16×) is flaky on Konflux
+    (cert sync, mount races, port-forward). Later scenarios only change run.yaml
+    in the ConfigMap; ``oc cp`` + container restart is enough.
+    """
+    from tests.e2e.utils.utils import restart_container
+
+    if not is_prow_environment():
+        restart_container("llama-stack")
+        return
+
+    if _tls_prow_restart_state["full_restart_done"]:
+        _prepare_tls_prow_llama_reload_env()
+    else:
+        _prepare_tls_prow_llama_full_restart_env()
+
+    try:
+        restart_container("llama-stack")
+    except Exception:
+        _tls_prow_restart_state["full_restart_done"] = False
+        raise
+
+    _tls_prow_restart_state["full_restart_done"] = True
+
+
+def _prepare_tls_prow_llama_restart_env() -> None:
+    """Set env before writing run.yaml (used by ``_configure_tls``)."""
+    os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1"
+    if _tls_prow_restart_state["full_restart_done"]:
+        os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1"
+    else:
+        os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
+
+
 def _cluster_mock_tls_inference_host() -> str:
     """DNS name of the in-cluster mock TLS inference server (Konflux / Prow)."""
     explicit = os.getenv("E2E_MOCK_TLS_INFERENCE_HOST", "").strip()
diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py
index a6a594973..20b9f3d7b 100644
--- a/tests/e2e/utils/prow_utils.py
+++ b/tests/e2e/utils/prow_utils.py
@@ -111,7 +111,15 @@ def restart_pod(container_name: str) -> None:
         )
         op = "restart-lightspeed"
         timeout = 200
-    max_attempts = 2 if op == "restart-llama-stack" else 1
+    max_attempts = (
+        3
+        if op == "restart-llama-stack"
+        and os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1"
+        and os.environ.get("E2E_LLAMA_RELOAD_CONFIG_ONLY") != "1"
+        else 2
+        if op == "restart-llama-stack"
+        else 1
+    )
     last_error: subprocess.CalledProcessError | subprocess.TimeoutExpired | None = (
         None
     )
@@ -131,11 +139,18 @@ def restart_pod(container_name: str) -> None:
         except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as err:
             last_error = err
             if attempt < max_attempts:
+                retry_delay = 30 if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" else 20
                 print(
                     f"⚠️  {op} failed (attempt {attempt}/{max_attempts}), "
-                    "retrying after 20s..."
+                    f"retrying after {retry_delay}s..."
                 )
-                time.sleep(20)
+                time.sleep(retry_delay)
+                if (
+                    op == "restart-llama-stack"
+                    and os.environ.get("E2E_LLAMA_RELOAD_CONFIG_ONLY") == "1"
+                ):
+                    # Reload failed; next attempt does full pod recreate.
+                    os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
     if last_error is not None:
         if isinstance(last_error, subprocess.TimeoutExpired):
             print(f"Failed to restart pod {container_name}: {last_error}")

From 70b69c9b79d3a9a6e66915d26eee750ecbe6c82a Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Mon, 25 May 2026 12:11:02 +0200
Subject: [PATCH 07/20] add logging

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 104 ++++++++++++++++++++++--
 tests/e2e/features/steps/tls.py         |   8 ++
 tests/e2e/utils/prow_utils.py           |  51 +++++++++---
 3 files changed, 144 insertions(+), 19 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index 0f206c0e3..62167198d 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -38,6 +38,37 @@ E2E_LSC_PORT_FORWARD_PID_FILE="${E2E_LSC_PORT_FORWARD_PID_FILE:-/tmp/e2e-lightsp
 E2E_LLAMA_PORT_FORWARD_PID_FILE="${E2E_LLAMA_PORT_FORWARD_PID_FILE:-/tmp/e2e-llama-port-forward.pid}"
 E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-port-forward.pid}"
 
+# Llama restart exit codes (grep Konflux logs for E2E_LLAMA_RESTART_FAILED_PHASE=):
+#   10 reload: run.yaml could not be read from the ConfigMap or copied into the pod
+#   11 reload: pod not Ready within wait
+#   12 reload: in-pod /v1/health failed
+#   13 reload: localhost:8321 port-forward failed
+#   20 full restart: pod not Ready
+#   21 full restart: in-pod /v1/health failed
+#   22 full restart: localhost:8321 port-forward failed
+
+E2E_OPS_CURRENT_PHASE=""
+E2E_OPS_PHASE_START=0
+
+_e2e_ops_phase() {
+    if [[ -n "${E2E_OPS_CURRENT_PHASE:-}" && "${E2E_OPS_PHASE_START:-0}" -gt 0 ]]; then
+        local elapsed=$(( $(date +%s) - E2E_OPS_PHASE_START ))
+        echo "[e2e-ops] <<< ${E2E_OPS_CURRENT_PHASE} (${elapsed}s)"
+    fi
+    E2E_OPS_CURRENT_PHASE="$1"
+    E2E_OPS_PHASE_START=$(date +%s)
+    echo "E2E_OPS_PHASE=$1"
+    echo "[e2e-ops] >>> $1"
+}
+
+_e2e_ops_llama_restart_fail() {
+    local phase="$1"
+    local code="$2"
+    echo "E2E_LLAMA_RESTART_FAILED_PHASE=$phase"
+    echo "E2E_LLAMA_RESTART_EXIT_CODE=$code"
+    exit "$code"
+}
+
 # ============================================================================
 # Helper functions
 # ============================================================================
@@ -337,26 +368,30 @@ cmd_reload_llama_stack_config() {
     local llama_pod_name="llama-stack-service"
     local tmp
 
+    echo "E2E_LLAMA_RESTART_MODE=reload"
     echo "===== Reloading llama-stack run.yaml (container restart, no pod recreate) ====="
+    _e2e_ops_phase "reload_read_configmap"
     tmp=$(mktemp)
     if ! oc get configmap llama-stack-config -n "$NAMESPACE" \
         -o jsonpath='{.data.run\.yaml}' >"$tmp"; then
         rm -f "$tmp"
         echo "ERROR: failed to read llama-stack-config run.yaml" >&2
-        return 1
+        _e2e_ops_llama_restart_fail "reload_configmap_read" 10
     fi
     if [[ ! -s "$tmp" ]]; then
         rm -f "$tmp"
         echo "ERROR: llama-stack-config run.yaml is empty" >&2
-        return 1
+        _e2e_ops_llama_restart_fail "reload_configmap_empty" 10
     fi
+    _e2e_ops_phase "reload_oc_cp_run_yaml"
     if ! oc cp "$tmp" "$NAMESPACE/$llama_pod_name:/opt/app-root/run.yaml" \
         -c llama-stack-container; then
         rm -f "$tmp"
         echo "ERROR: failed to copy run.yaml into llama-stack pod" >&2
-        return 1
+        _e2e_ops_llama_restart_fail "reload_oc_cp" 10
     fi
     rm -f "$tmp"
+    _e2e_ops_phase "reload_kill_container"
     echo "Restarting llama-stack-container to pick up run.yaml..."
     oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- bash -c 'kill 1' \
         2>/dev/null || true
@@ -366,16 +401,24 @@ cmd_reload_llama_stack_config() {
         reload_pod_wait=60
         reload_health_attempts=50
     fi
-    wait_for_pod "$llama_pod_name" "$reload_pod_wait"
+    _e2e_ops_phase "reload_wait_pod_ready"
+    if ! wait_for_pod "$llama_pod_name" "$reload_pod_wait"; then
+        echo "===== Llama-stack reload FAILED (pod not ready) ====="
+        oc describe pod llama-stack-service -n "$NAMESPACE" 2>/dev/null | tail -40 || true
+        _e2e_ops_llama_restart_fail "reload_pod_not_ready" 11
+    fi
+    _e2e_ops_phase "reload_wait_in_pod_health"
     if ! wait_for_llama_stack_http_health "$reload_health_attempts"; then
         echo "===== Llama-stack reload FAILED (HTTP not healthy) ====="
         oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true
-        return 1
+        _e2e_ops_llama_restart_fail "reload_in_pod_health" 12
     fi
+    _e2e_ops_phase "reload_port_forward"
     if ! cmd_restart_llama_port_forward; then
         echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed"
-        return 1
+        _e2e_ops_llama_restart_fail "reload_port_forward" 13
     fi
+    _e2e_ops_phase "reload_done"
     echo "===== Llama-stack config reload complete ====="
 }
 
@@ -386,10 +429,13 @@ cmd_restart_llama_stack() {
                 return 0
             fi
             echo "WARN: llama config reload failed; falling back to full pod restart" >&2
+            echo "E2E_LLAMA_RESTART_FALLBACK=reload_to_full"
         fi
     fi
 
+    echo "E2E_LLAMA_RESTART_MODE=full"
     echo "===== Restoring llama-stack service ====="
+    _e2e_ops_phase "full_delete_pod"
     # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env.
     echo "Deleting llama-stack pod (if any) before apply..."
     timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || {
@@ -399,6 +445,7 @@ cmd_restart_llama_stack() {
 
     echo "Applying pod manifest..."
     if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then
+        _e2e_ops_phase "full_apply_manifest"
         # Interception-proxy e2e: refresh Secret before pod recreate so the volume mount is populated.
         if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then
             echo "[e2e-ops] Syncing e2e-interception-proxy-ca secret before llama-stack apply..."
@@ -427,10 +474,11 @@ cmd_restart_llama_stack() {
             llama_pod_wait=120
             llama_health_attempts=100
         fi
+        _e2e_ops_phase "full_wait_pod_ready"
         if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then
             echo "===== Llama-stack restore FAILED (pod not ready) ====="
             oc describe pod llama-stack-service -n "$NAMESPACE" 2>/dev/null | tail -40 || true
-            exit 1
+            _e2e_ops_llama_restart_fail "full_pod_not_ready" 20
         fi
         echo "Labeling pod for service..."
         oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite
@@ -446,12 +494,14 @@ cmd_restart_llama_stack() {
                 exit 1
             fi
         fi
+        _e2e_ops_phase "full_wait_in_pod_health"
         if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then
             echo "===== Llama-stack restore FAILED (HTTP not healthy) ====="
             oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true
-            exit 1
+            _e2e_ops_llama_restart_fail "full_in_pod_health" 21
         fi
     else
+        _e2e_ops_phase "full_apply_prow_manifest"
         # Prow: vLLM Llama Stack image (matches pipeline.sh / pipeline-services.sh)
         # Use sed instead of envsubst to avoid blanking $VAR references in embedded bash scripts
         sed "s|\${LLAMA_STACK_IMAGE}|${LLAMA_STACK_IMAGE:-}|g" "$MANIFEST_DIR/llama-stack-prow.yaml" |
@@ -461,14 +511,46 @@ cmd_restart_llama_stack() {
         oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite
     fi
 
+    _e2e_ops_phase "full_port_forward"
     if ! cmd_restart_llama_port_forward; then
         echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed"
-        exit 1
+        _e2e_ops_llama_restart_fail "full_port_forward" 22
     fi
 
+    _e2e_ops_phase "full_done"
     echo "===== Llama-stack restore complete ====="
 }
 
+cmd_diagnose_llama_restart() {
+    echo "===== Llama-stack restart diagnostics (namespace=$NAMESPACE) ====="
+    echo "E2E_KONFLUX_E2E=${E2E_KONFLUX_E2E:-0} E2E_LLAMA_RELOAD_CONFIG_ONLY=${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}"
+    oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true
+    echo "--- container restarts / state ---"
+    oc get pod llama-stack-service -n "$NAMESPACE" \
+        -o jsonpath='{.status.containerStatuses[0].restartCount} restarts ready={.status.containerStatuses[0].ready}{"\n"}' 2>&1 || true
+    echo "--- in-pod GET /v1/health ---"
+    if _llama_stack_http_health_once; then
+        echo "OK"
+    else
+        echo "FAIL"
+    fi
+    echo "--- localhost:${LOCAL_LLAMA_PORT:-8321}/v1/health via port-forward ---"
+    if verify_llama_local_forward 5; then
+        echo "OK"
+    else
+        echo "FAIL (pipeline PID file: ${E2E_LLAMA_PORT_FORWARD_PID_FILE:-unset})"
+        if [[ -f "${E2E_LLAMA_PORT_FORWARD_PID_FILE:-}" ]]; then
+            read -r pf_pid <"${E2E_LLAMA_PORT_FORWARD_PID_FILE}" 2>/dev/null || true
+            echo "saved_pf_pid=${pf_pid:-} alive=$(kill -0 "$pf_pid" 2>/dev/null && echo yes || echo no)"
+        fi
+    fi
+    echo "--- tls-openai in llama-stack-config (grep) ---"
+    oc get configmap llama-stack-config -n "$NAMESPACE" -o jsonpath='{.data.run\.yaml}' 2>/dev/null \
+        | grep -E 'tls-openai|network:|client_cert|verify:' | head -20 || true
+    echo "--- llama container log tail ---"
+    oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=40 2>&1 || true
+}
+
 cmd_restart_port_forward() {
     local local_port="${LOCAL_PORT:-8080}"
     local remote_port="${REMOTE_PORT:-8080}"
@@ -1021,6 +1103,9 @@ case "$COMMAND" in
     sync-mock-tls-certs-secret)
         cmd_sync_mock_tls_certs_secret
         ;;
+    diagnose-llama-restart)
+        cmd_diagnose_llama_restart
+        ;;
     *)
         echo "Usage: $0 <command> [args...]"
         echo ""
@@ -1041,6 +1126,7 @@ case "$COMMAND" in
         echo "  deploy-e2e-interception-proxy      - Deploy in-cluster interception proxy pod"
         echo "  deploy-e2e-mock-tls-inference      - Deploy mock HTTPS inference server (tls.feature)"
         echo "  sync-mock-tls-certs-secret         - Publish mock TLS /certs PEMs to Secret for llama"
+        echo "  diagnose-llama-restart             - Snapshot pod/health/forward/config for debugging"
         exit 1
         ;;
 esac
diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py
index 5f5da53be..329886fd3 100644
--- a/tests/e2e/features/steps/tls.py
+++ b/tests/e2e/features/steps/tls.py
@@ -81,8 +81,16 @@ def restart_llama_for_tls_feature(context: Context) -> None:
 
     if _tls_prow_restart_state["full_restart_done"]:
         _prepare_tls_prow_llama_reload_env()
+        mode = "reload"
     else:
         _prepare_tls_prow_llama_full_restart_env()
+        mode = "full"
+
+    scenario = getattr(getattr(context, "scenario", None), "name", "") or "?"
+    print(
+        f"[tls.feature] Llama Stack restart: mode={mode} scenario={scenario!r}",
+        flush=True,
+    )
 
     try:
         restart_container("llama-stack")
diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py
index 20b9f3d7b..3dc97d7ac 100644
--- a/tests/e2e/utils/prow_utils.py
+++ b/tests/e2e/utils/prow_utils.py
@@ -80,6 +80,28 @@ def wait_for_pod_health(pod_name: str, max_attempts: int = 12) -> None:
 _LIGHTSPEED_RESTART_NAMES = frozenset({"lightspeed-stack", "lightspeed-stack-service"})
 
 
+def _print_llama_restart_diagnostics_from_output(output: str) -> None:
+    """Print e2e-ops marker and phase-timing lines found in ``output``."""
+    markers = (
+        "E2E_LLAMA_RESTART_MODE=",
+        "E2E_LLAMA_RESTART_FAILED_PHASE=",
+        "E2E_LLAMA_RESTART_FALLBACK=",
+        "E2E_LLAMA_RESTART_EXIT_CODE=",
+        "E2E_OPS_PHASE=",
+    )
+    printed = False
+    for line in output.splitlines():
+        if any(line.startswith(m) for m in markers) or "[e2e-ops] <<<" in line:
+            print(line, flush=True)
+            printed = True
+    if printed:
+        print(
+            "See docs/e2e_testing.md § Konflux Llama restart diagnostics "
+            "for phase meanings and fixes.",
+            flush=True,
+        )
+
+
 def restart_pod(container_name: str) -> None:
     """Restart Llama Stack or Lightspeed pod in OpenShift/Prow (not Docker).
 
@@ -96,9 +118,7 @@ def restart_pod(container_name: str) -> None:
         op = "restart-llama-stack"
         # TLS feature: full pod recreate + cert sync + health can exceed 7 min on Konflux.
         timeout = (
-            900
-            if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1"
-            else 420
+            900 if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" else 420
         )
     elif container_name in _LIGHTSPEED_RESTART_NAMES:
         op = "restart-lightspeed"
@@ -116,13 +136,9 @@ def restart_pod(container_name: str) -> None:
         if op == "restart-llama-stack"
         and os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1"
         and os.environ.get("E2E_LLAMA_RELOAD_CONFIG_ONLY") != "1"
-        else 2
-        if op == "restart-llama-stack"
-        else 1
-    )
-    last_error: subprocess.CalledProcessError | subprocess.TimeoutExpired | None = (
-        None
+        else 2 if op == "restart-llama-stack" else 1
     )
+    last_error: subprocess.CalledProcessError | subprocess.TimeoutExpired | None = None
     for attempt in range(1, max_attempts + 1):
         try:
             result = run_e2e_ops(op, timeout=timeout)
@@ -138,8 +154,23 @@ def restart_pod(container_name: str) -> None:
             return
         except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as err:
             last_error = err
+            if op == "restart-llama-stack" and isinstance(
+                err, subprocess.CalledProcessError
+            ):
+                _print_llama_restart_diagnostics_from_output(
+                    (err.stdout or "") + (err.stderr or "")
+                )
+                try:
+                    diag = run_e2e_ops("diagnose-llama-restart", timeout=90)
+                    print(diag.stdout, end="")
+                except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
+                    pass
             if attempt < max_attempts:
-                retry_delay = 30 if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" else 20
+                retry_delay = (
+                    30
+                    if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1"
+                    else 20
+                )
                 print(
                     f"⚠️  {op} failed (attempt {attempt}/{max_attempts}), "
                     f"retrying after {retry_delay}s..."

From 1fbca1afc962be191790ac08845c42a338a98029 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Mon, 25 May 2026 12:16:45 +0200
Subject: [PATCH 08/20] TLS e2e: force full Llama restart for mTLS wrong-cert, shorten reload wait

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh |  3 ++-
 tests/e2e/features/steps/tls.py         | 18 ++++++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index 62167198d..fcdeb864b 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -399,7 +399,8 @@ cmd_reload_llama_stack_config() {
     local reload_health_attempts=35
     if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
         reload_pod_wait=60
-        reload_health_attempts=50
+        # Fail reload faster when stack stays unhealthy (then full recreate runs once).
+        reload_health_attempts=24
     fi
     _e2e_ops_phase "reload_wait_pod_ready"
     if ! wait_for_pod "$llama_pod_name" "$reload_pod_wait"; then
diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py
index 329886fd3..f1f1693a6 100644
--- a/tests/e2e/features/steps/tls.py
+++ b/tests/e2e/features/steps/tls.py
@@ -34,13 +34,17 @@
 }
 
 _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False}
-_tls_prow_restart_state: dict[str, bool] = {"full_restart_done": False}
+_tls_prow_restart_state: dict[str, bool] = {
+    "full_restart_done": False,
+    "force_full_restart": False,
+}
 
 
 def reset_tls_prow_restart_optimization_state() -> None:
     """Reset per-feature Prow state (call from ``before_feature``)."""
     _mock_tls_cluster_deploy_state["done"] = False
     _tls_prow_restart_state["full_restart_done"] = False
+    _tls_prow_restart_state["force_full_restart"] = False
     os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
     os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None)
 
@@ -79,7 +83,10 @@ def restart_llama_for_tls_feature(context: Context) -> None:
         restart_container("llama-stack")
         return
 
-    if _tls_prow_restart_state["full_restart_done"]:
+    if _tls_prow_restart_state.pop("force_full_restart", False):
+        _prepare_tls_prow_llama_full_restart_env()
+        mode = "full_forced"
+    elif _tls_prow_restart_state["full_restart_done"]:
         _prepare_tls_prow_llama_reload_env()
         mode = "reload"
     else:
@@ -136,6 +143,8 @@ def _tls_provider_base() -> dict[str, Any]:
             "api_key": "test-key",
             "base_url": _mock_tls_base_url(_MOCK_TLS_PORT_TLS),
             "allowed_models": ["mock-tls-model"],
+            # Avoid hitting the mock on every container restart (Konflux reload path).
+            "refresh_models": False,
         },
     }
 
@@ -211,6 +220,7 @@ def _configure_tls(tls_config: dict[str, Any], base_url: Optional[str] = None) -
         provider["config"]["base_url"] = base_url
     else:
         provider["config"]["base_url"] = _mock_tls_base_url(_MOCK_TLS_PORT_TLS)
+    provider.setdefault("config", {})["refresh_models"] = False
     provider["config"]["network"]["tls"] = tls_config
     write_llama_config(config)
     if is_prow_environment():
@@ -291,6 +301,10 @@ def configure_mtls_wrong_client_cert(context: Context) -> None:
         },
         base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS),
     )
+    # Konflux: reload after this config often never becomes healthy until timeout,
+    # then falls back to full recreate (~7 min). Skip reload for this scenario.
+    if is_prow_environment():
+        _tls_prow_restart_state["force_full_restart"] = True
 
 
 @given("Llama Stack is configured for mTLS with untrusted client certificate")

From 9b30f01517d379bd9dfdeae798fc0432d4c3a3ed Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Mon, 25 May 2026 16:16:25 +0200
Subject: [PATCH 09/20] e2e-ops: poll Llama health before LCS restart; fail on port-forward error

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 45 +++++++++++++++++++++----
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index fcdeb864b..0850bb75f 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -289,6 +289,30 @@ _llama_stack_http_health_once() {
     return 1
 }
 
+# Before recreating lightspeed-stack, Llama must answer /v1/health (reload can be Ready but not listening yet).
+_wait_for_llama_before_lightspeed_restart() {
+    local max_attempts="${1:-25}"
+    local attempt
+
+    echo "Waiting for Llama Stack before lightspeed-stack restart..."
+    for ((attempt=1; attempt<=max_attempts; attempt++)); do
+        if _llama_stack_http_health_once; then
+            echo "✓ Llama Stack healthy before LCS restart (attempt $attempt/$max_attempts)"
+            return 0
+        fi
+        if [[ $attempt -lt $max_attempts ]]; then
+            sleep 2
+        fi
+    done
+    echo "⚠️  Llama Stack not healthy after $((max_attempts * 2))s — restoring before LCS restart..."
+    # Use full recreate (reload may have left the process still starting or wedged).
+    if ! E2E_LLAMA_RELOAD_CONFIG_ONLY=0 cmd_restart_llama_stack; then
+        echo "⚠️  Llama Stack restore failed; LCS may be slow to start"
+        return 1
+    fi
+    return 0
+}
+
 # After the pod is Ready, confirm the process is actually serving HTTP (not only kubelet probes).
 wait_for_llama_stack_http_health() {
     local max_attempts="${1:-35}"
@@ -319,12 +343,9 @@ cmd_restart_lightspeed() {
     echo "Restarting lightspeed-stack service..."
 
     # LCS hangs at startup if Llama Stack is unreachable (blocks Llama handshake,
-    # never opens port 8080, readiness probe never passes).  Ensure Llama Stack
-    # is healthy before recreating the LCS pod.
-    if ! _llama_stack_http_health_once 2>/dev/null; then
-        echo "⚠️  Llama Stack not healthy — restoring before LCS restart..."
-        cmd_restart_llama_stack || echo "⚠️  Llama Stack restore failed; LCS may be slow to start"
-    fi
+    # never opens port 8080, readiness probe never passes).  After a Konflux
+    # config reload, the pod can be Ready before /v1/health responds — poll first.
+    _wait_for_llama_before_lightspeed_restart 25
 
     # Delete existing pod (short wait so hook stays within timeout; force if needed)
     timeout 20 oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || {
@@ -354,13 +375,23 @@ cmd_restart_lightspeed() {
     oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite
 
     # Re-establish port-forwards (may succeed even if readiness was slow)
-    cmd_restart_port_forward
+    local forward_ok=true
+    if ! cmd_restart_port_forward; then
+        forward_ok=false
+        echo "⚠️  Lightspeed port-forward on :${LOCAL_PORT:-8080} failed"
+        e2e_ops_diagnose_forward_failure
+    fi
     cmd_restart_jwks_port_forward || echo "⚠️  Mock JWKS port-forward failed (RBAC tests may fail)"
 
     if [[ "$pod_ready" == "false" ]]; then
+        echo "E2E_LIGHTSPEED_RESTART_FAILED_PHASE=pod_not_ready"
         echo "⚠️  Lightspeed restart completed but pod was slow to become ready"
         return 1
     fi
+    if [[ "$forward_ok" == "false" ]]; then
+        echo "E2E_LIGHTSPEED_RESTART_FAILED_PHASE=port_forward"
+        return 1
+    fi
     echo "✓ Lightspeed restart complete"
 }
 

From bdb29dc46321c8bfb248c10ef922aaa15d5f43a4 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Mon, 25 May 2026 22:14:24 +0200
Subject: [PATCH 10/20] e2e-ops: extend LCS pod wait and use pod-direct port-forward on Konflux

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 29 +++++++++++++++++--------
 tests/e2e/utils/prow_utils.py           |  4 ++--
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index 0850bb75f..3cbf125ac 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -361,14 +361,23 @@ cmd_restart_lightspeed() {
         oc apply -n "$NAMESPACE" -f -
     
     # Wait for pod to be ready (TCP probe passes when app listens on 8080).
-    # Don't let a timeout here abort the function — still attempt port-forward
-    # and diagnostics so later scenarios have a chance to recover.
+    # Manifest readiness: initialDelay 20 + failureThreshold 30 * period 5 = up to ~170s.
+    local lcs_pod_wait=40
+    if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then
+        lcs_pod_wait=65
+    fi
     local pod_ready=true
-    if ! wait_for_pod "lightspeed-stack-service" 40; then
-        pod_ready=false
-        echo "⚠️  Pod not ready within 120s — dumping diagnostics:"
-        oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -30 || true
-        oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=40 2>&1 || true
+    if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then
+        echo "⚠️  Pod not ready within $((lcs_pod_wait * 3))s — extended wait (Konflux LCS startup)..."
+        if ! wait_for_pod "lightspeed-stack-service" 25; then
+            pod_ready=false
+            echo "⚠️  Pod still not ready — dumping diagnostics:"
+            oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true
+            oc logs lightspeed-stack-service -n "$NAMESPACE" \
+                -c lightspeed-stack-container --tail=80 2>&1 || true
+        else
+            echo "✓ Pod became ready during extended wait"
+        fi
     fi
 
     # Re-label pod for service discovery
@@ -597,8 +606,10 @@ cmd_restart_port_forward() {
         # Let the kernel release LISTEN sockets after pkill (avoids immediate "address already in use")
         sleep 3
 
-        # Service can lag endpoints after pod recreate; pod-direct forward is more reliable.
-        if [[ $attempt -le 2 ]]; then
+        # Service forward waits for endpoints; after LCS recreate use pod-direct sooner on Konflux.
+        if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then
+            pf_resource="pod/lightspeed-stack-service"
+        elif [[ $attempt -le 2 ]]; then
             pf_resource="svc/lightspeed-stack-service-svc"
         else
             pf_resource="pod/lightspeed-stack-service"
diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py
index 3dc97d7ac..d1df3bd40 100644
--- a/tests/e2e/utils/prow_utils.py
+++ b/tests/e2e/utils/prow_utils.py
@@ -122,8 +122,8 @@ def restart_pod(container_name: str) -> None:
         )
     elif container_name in _LIGHTSPEED_RESTART_NAMES:
         op = "restart-lightspeed"
-        # Pod wait (up to ~120s) + port-forward retries + slow Konflux/Prow clusters.
-        timeout = 320
+        # Konflux LCS: up to ~195s pod wait + extended wait + port-forward retries.
+        timeout = 420 if os.environ.get("E2E_KONFLUX_E2E") == "1" else 320
     else:
         print(
             f"Warning: restart_pod({container_name!r}) unknown; "

From dcffd894bcf6257608dc65a03885b98201712347 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Tue, 26 May 2026 06:47:14 +0200
Subject: [PATCH 11/20] e2e-ops: remove Llama config-reload path and restart phase diagnostics

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 315 ++++--------------------
 tests/e2e/features/environment.py       |   4 +-
 tests/e2e/features/steps/proxy.py       |   1 -
 tests/e2e/features/steps/tls.py         |  63 +----
 tests/e2e/utils/prow_utils.py           |  91 +------
 5 files changed, 67 insertions(+), 407 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index 3cbf125ac..b5741ef02 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -26,7 +26,7 @@
 #   deploy-e2e-tunnel-proxy         - Deploy in-cluster tunnel proxy (proxy.feature step)
 #   deploy-e2e-interception-proxy   - Deploy in-cluster interception proxy (proxy.feature step)
 #   deploy-e2e-mock-tls-inference   - Deploy mock HTTPS inference server (tls.feature step)
-#   sync-mock-tls-certs-secret        - Publish /certs PEMs to Secret for llama-stack mount
+#   sync-mock-tls-certs-secret      - Publish /certs PEMs to Secret for llama-stack mount
 
 set -e
 
@@ -38,37 +38,6 @@ E2E_LSC_PORT_FORWARD_PID_FILE="${E2E_LSC_PORT_FORWARD_PID_FILE:-/tmp/e2e-lightsp
 E2E_LLAMA_PORT_FORWARD_PID_FILE="${E2E_LLAMA_PORT_FORWARD_PID_FILE:-/tmp/e2e-llama-port-forward.pid}"
 E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-port-forward.pid}"
 
-# Llama restart exit codes (grep Konflux logs for E2E_LLAMA_RESTART_FAILED_PHASE=):
-#   10 reload: run.yaml copy failed
-#   11 reload: pod not Ready within wait
-#   12 reload: in-pod /v1/health failed
-#   13 reload: localhost:8321 port-forward failed
-#   20 full restart: pod not Ready
-#   21 full restart: in-pod /v1/health failed
-#   22 full restart: localhost:8321 port-forward failed
-
-E2E_OPS_CURRENT_PHASE=""
-E2E_OPS_PHASE_START=0
-
-_e2e_ops_phase() {
-    if [[ -n "${E2E_OPS_CURRENT_PHASE:-}" && "${E2E_OPS_PHASE_START:-0}" -gt 0 ]]; then
-        local elapsed=$(( $(date +%s) - E2E_OPS_PHASE_START ))
-        echo "[e2e-ops] <<< ${E2E_OPS_CURRENT_PHASE} (${elapsed}s)"
-    fi
-    E2E_OPS_CURRENT_PHASE="$1"
-    E2E_OPS_PHASE_START=$(date +%s)
-    echo "E2E_OPS_PHASE=$1"
-    echo "[e2e-ops] >>> $1"
-}
-
-_e2e_ops_llama_restart_fail() {
-    local phase="$1"
-    local code="$2"
-    echo "E2E_LLAMA_RESTART_FAILED_PHASE=$phase"
-    echo "E2E_LLAMA_RESTART_EXIT_CODE=$code"
-    exit "$code"
-}
-
 # ============================================================================
 # Helper functions
 # ============================================================================
@@ -289,30 +258,6 @@ _llama_stack_http_health_once() {
     return 1
 }
 
-# Before recreating lightspeed-stack, Llama must answer /v1/health (reload can be Ready but not listening yet).
-_wait_for_llama_before_lightspeed_restart() {
-    local max_attempts="${1:-25}"
-    local attempt
-
-    echo "Waiting for Llama Stack before lightspeed-stack restart..."
-    for ((attempt=1; attempt<=max_attempts; attempt++)); do
-        if _llama_stack_http_health_once; then
-            echo "✓ Llama Stack healthy before LCS restart (attempt $attempt/$max_attempts)"
-            return 0
-        fi
-        if [[ $attempt -lt $max_attempts ]]; then
-            sleep 2
-        fi
-    done
-    echo "⚠️  Llama Stack not healthy after $((max_attempts * 2))s — restoring before LCS restart..."
-    # Use full recreate (reload may have left the process still starting or wedged).
-    if ! E2E_LLAMA_RELOAD_CONFIG_ONLY=0 cmd_restart_llama_stack; then
-        echo "⚠️  Llama Stack restore failed; LCS may be slow to start"
-        return 1
-    fi
-    return 0
-}
-
 # After the pod is Ready, confirm the process is actually serving HTTP (not only kubelet probes).
 wait_for_llama_stack_http_health() {
     local max_attempts="${1:-35}"
@@ -343,9 +288,12 @@ cmd_restart_lightspeed() {
     echo "Restarting lightspeed-stack service..."
 
     # LCS hangs at startup if Llama Stack is unreachable (blocks Llama handshake,
-    # never opens port 8080, readiness probe never passes).  After a Konflux
-    # config reload, the pod can be Ready before /v1/health responds — poll first.
-    _wait_for_llama_before_lightspeed_restart 25
+    # never opens port 8080, readiness probe never passes).  Ensure Llama Stack
+    # is healthy before recreating the LCS pod.
+    if ! _llama_stack_http_health_once 2>/dev/null; then
+        echo "⚠️  Llama Stack not healthy — restoring before LCS restart..."
+        cmd_restart_llama_stack || echo "⚠️  Llama Stack restore failed; LCS may be slow to start"
+    fi
 
     # Delete existing pod (short wait so hook stays within timeout; force if needed)
     timeout 20 oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || {
@@ -361,122 +309,36 @@ cmd_restart_lightspeed() {
         oc apply -n "$NAMESPACE" -f -
     
     # Wait for pod to be ready (TCP probe passes when app listens on 8080).
-    # Manifest readiness: initialDelay 20 + failureThreshold 30 * period 5 = up to ~170s.
+    # Don't let a timeout here abort the function — still attempt port-forward
+    # and diagnostics so later scenarios have a chance to recover.
+    local pod_ready=true
     local lcs_pod_wait=40
     if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then
         lcs_pod_wait=65
     fi
-    local pod_ready=true
     if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then
-        echo "⚠️  Pod not ready within $((lcs_pod_wait * 3))s — extended wait (Konflux LCS startup)..."
-        if ! wait_for_pod "lightspeed-stack-service" 25; then
-            pod_ready=false
-            echo "⚠️  Pod still not ready — dumping diagnostics:"
-            oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true
-            oc logs lightspeed-stack-service -n "$NAMESPACE" \
-                -c lightspeed-stack-container --tail=80 2>&1 || true
-        else
-            echo "✓ Pod became ready during extended wait"
-        fi
+        pod_ready=false
+        echo "⚠️  Pod not ready within 120s — dumping diagnostics:"
+        oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -30 || true
+        oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=40 2>&1 || true
     fi
 
     # Re-label pod for service discovery
     oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite
 
     # Re-establish port-forwards (may succeed even if readiness was slow)
-    local forward_ok=true
-    if ! cmd_restart_port_forward; then
-        forward_ok=false
-        echo "⚠️  Lightspeed port-forward on :${LOCAL_PORT:-8080} failed"
-        e2e_ops_diagnose_forward_failure
-    fi
+    cmd_restart_port_forward
     cmd_restart_jwks_port_forward || echo "⚠️  Mock JWKS port-forward failed (RBAC tests may fail)"
 
     if [[ "$pod_ready" == "false" ]]; then
-        echo "E2E_LIGHTSPEED_RESTART_FAILED_PHASE=pod_not_ready"
         echo "⚠️  Lightspeed restart completed but pod was slow to become ready"
         return 1
     fi
-    if [[ "$forward_ok" == "false" ]]; then
-        echo "E2E_LIGHTSPEED_RESTART_FAILED_PHASE=port_forward"
-        return 1
-    fi
     echo "✓ Lightspeed restart complete"
 }
 
-cmd_reload_llama_stack_config() {
-    local llama_pod_name="llama-stack-service"
-    local tmp
-
-    echo "E2E_LLAMA_RESTART_MODE=reload"
-    echo "===== Reloading llama-stack run.yaml (container restart, no pod recreate) ====="
-    _e2e_ops_phase "reload_read_configmap"
-    tmp=$(mktemp)
-    if ! oc get configmap llama-stack-config -n "$NAMESPACE" \
-        -o jsonpath='{.data.run\.yaml}' >"$tmp"; then
-        rm -f "$tmp"
-        echo "ERROR: failed to read llama-stack-config run.yaml" >&2
-        _e2e_ops_llama_restart_fail "reload_configmap_read" 10
-    fi
-    if [[ ! -s "$tmp" ]]; then
-        rm -f "$tmp"
-        echo "ERROR: llama-stack-config run.yaml is empty" >&2
-        _e2e_ops_llama_restart_fail "reload_configmap_empty" 10
-    fi
-    _e2e_ops_phase "reload_oc_cp_run_yaml"
-    if ! oc cp "$tmp" "$NAMESPACE/$llama_pod_name:/opt/app-root/run.yaml" \
-        -c llama-stack-container; then
-        rm -f "$tmp"
-        echo "ERROR: failed to copy run.yaml into llama-stack pod" >&2
-        _e2e_ops_llama_restart_fail "reload_oc_cp" 10
-    fi
-    rm -f "$tmp"
-    _e2e_ops_phase "reload_kill_container"
-    echo "Restarting llama-stack-container to pick up run.yaml..."
-    oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- bash -c 'kill 1' \
-        2>/dev/null || true
-    local reload_pod_wait=45
-    local reload_health_attempts=35
-    if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
-        reload_pod_wait=60
-        # Fail reload faster when stack stays unhealthy (then full recreate runs once).
-        reload_health_attempts=24
-    fi
-    _e2e_ops_phase "reload_wait_pod_ready"
-    if ! wait_for_pod "$llama_pod_name" "$reload_pod_wait"; then
-        echo "===== Llama-stack reload FAILED (pod not ready) ====="
-        oc describe pod llama-stack-service -n "$NAMESPACE" 2>/dev/null | tail -40 || true
-        _e2e_ops_llama_restart_fail "reload_pod_not_ready" 11
-    fi
-    _e2e_ops_phase "reload_wait_in_pod_health"
-    if ! wait_for_llama_stack_http_health "$reload_health_attempts"; then
-        echo "===== Llama-stack reload FAILED (HTTP not healthy) ====="
-        oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true
-        _e2e_ops_llama_restart_fail "reload_in_pod_health" 12
-    fi
-    _e2e_ops_phase "reload_port_forward"
-    if ! cmd_restart_llama_port_forward; then
-        echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed"
-        _e2e_ops_llama_restart_fail "reload_port_forward" 13
-    fi
-    _e2e_ops_phase "reload_done"
-    echo "===== Llama-stack config reload complete ====="
-}
-
 cmd_restart_llama_stack() {
-    if [[ "${E2E_KONFLUX_E2E:-0}" == "1" && "${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}" == "1" ]]; then
-        if oc get pod llama-stack-service -n "$NAMESPACE" &>/dev/null; then
-            if cmd_reload_llama_stack_config; then
-                return 0
-            fi
-            echo "WARN: llama config reload failed; falling back to full pod restart" >&2
-            echo "E2E_LLAMA_RESTART_FALLBACK=reload_to_full"
-        fi
-    fi
-
-    echo "E2E_LLAMA_RESTART_MODE=full"
     echo "===== Restoring llama-stack service ====="
-    _e2e_ops_phase "full_delete_pod"
     # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env.
     echo "Deleting llama-stack pod (if any) before apply..."
     timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || {
@@ -486,7 +348,6 @@ cmd_restart_llama_stack() {
 
     echo "Applying pod manifest..."
     if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then
-        _e2e_ops_phase "full_apply_manifest"
         # Interception-proxy e2e: refresh Secret before pod recreate so the volume mount is populated.
         if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then
             echo "[e2e-ops] Syncing e2e-interception-proxy-ca secret before llama-stack apply..."
@@ -495,8 +356,7 @@ cmd_restart_llama_stack() {
                 exit 1
             fi
         fi
-        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" \
-            && "${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}" != "1" ]]; then
+        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
             echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..."
             if ! cmd_sync_mock_tls_certs_secret; then
                 echo "===== Llama-stack restore FAILED (mock TLS certs secret sync) ====="
@@ -509,18 +369,7 @@ cmd_restart_llama_stack() {
             -n "$NAMESPACE" \
             --dry-run=client -o yaml | oc apply -f -
         oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml"
-        local llama_pod_wait=90
-        local llama_health_attempts=50
-        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
-            llama_pod_wait=120
-            llama_health_attempts=100
-        fi
-        _e2e_ops_phase "full_wait_pod_ready"
-        if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then
-            echo "===== Llama-stack restore FAILED (pod not ready) ====="
-            oc describe pod llama-stack-service -n "$NAMESPACE" 2>/dev/null | tail -40 || true
-            _e2e_ops_llama_restart_fail "full_pod_not_ready" 20
-        fi
+        wait_for_pod "llama-stack-service" 90
         echo "Labeling pod for service..."
         oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite
         if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then
@@ -535,14 +384,15 @@ cmd_restart_llama_stack() {
                 exit 1
             fi
         fi
-        _e2e_ops_phase "full_wait_in_pod_health"
+        local llama_health_attempts=50
+        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+            llama_health_attempts=75
+        fi
         if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then
             echo "===== Llama-stack restore FAILED (HTTP not healthy) ====="
-            oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true
-            _e2e_ops_llama_restart_fail "full_in_pod_health" 21
+            exit 1
         fi
     else
-        _e2e_ops_phase "full_apply_prow_manifest"
         # Prow: vLLM Llama Stack image (matches pipeline.sh / pipeline-services.sh)
         # Use sed instead of envsubst to avoid blanking $VAR references in embedded bash scripts
         sed "s|\${LLAMA_STACK_IMAGE}|${LLAMA_STACK_IMAGE:-}|g" "$MANIFEST_DIR/llama-stack-prow.yaml" |
@@ -552,46 +402,14 @@ cmd_restart_llama_stack() {
         oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite
     fi
 
-    _e2e_ops_phase "full_port_forward"
     if ! cmd_restart_llama_port_forward; then
         echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed"
-        _e2e_ops_llama_restart_fail "full_port_forward" 22
+        exit 1
     fi
 
-    _e2e_ops_phase "full_done"
     echo "===== Llama-stack restore complete ====="
 }
 
-cmd_diagnose_llama_restart() {
-    echo "===== Llama-stack restart diagnostics (namespace=$NAMESPACE) ====="
-    echo "E2E_KONFLUX_E2E=${E2E_KONFLUX_E2E:-0} E2E_LLAMA_RELOAD_CONFIG_ONLY=${E2E_LLAMA_RELOAD_CONFIG_ONLY:-0}"
-    oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true
-    echo "--- container restarts / state ---"
-    oc get pod llama-stack-service -n "$NAMESPACE" \
-        -o jsonpath='{.status.containerStatuses[0].restartCount} restarts ready={.status.containerStatuses[0].ready}{"\n"}' 2>&1 || true
-    echo "--- in-pod GET /v1/health ---"
-    if _llama_stack_http_health_once; then
-        echo "OK"
-    else
-        echo "FAIL"
-    fi
-    echo "--- localhost:${LOCAL_LLAMA_PORT:-8321}/v1/health via port-forward ---"
-    if verify_llama_local_forward 5; then
-        echo "OK"
-    else
-        echo "FAIL (pipeline PID file: ${E2E_LLAMA_PORT_FORWARD_PID_FILE:-unset})"
-        if [[ -f "${E2E_LLAMA_PORT_FORWARD_PID_FILE:-}" ]]; then
-            read -r pf_pid <"${E2E_LLAMA_PORT_FORWARD_PID_FILE}" 2>/dev/null || true
-            echo "saved_pf_pid=${pf_pid:-} alive=$(kill -0 "$pf_pid" 2>/dev/null && echo yes || echo no)"
-        fi
-    fi
-    echo "--- tls-openai in llama-stack-config (grep) ---"
-    oc get configmap llama-stack-config -n "$NAMESPACE" -o jsonpath='{.data.run\.yaml}' 2>/dev/null \
-        | grep -E 'tls-openai|network:|client_cert|verify:' | head -20 || true
-    echo "--- llama container log tail ---"
-    oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=40 2>&1 || true
-}
-
 cmd_restart_port_forward() {
     local local_port="${LOCAL_PORT:-8080}"
     local remote_port="${REMOTE_PORT:-8080}"
@@ -606,10 +424,8 @@ cmd_restart_port_forward() {
         # Let the kernel release LISTEN sockets after pkill (avoids immediate "address already in use")
         sleep 3
 
-        # Service forward waits for endpoints; after LCS recreate use pod-direct sooner on Konflux.
-        if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then
-            pf_resource="pod/lightspeed-stack-service"
-        elif [[ $attempt -le 2 ]]; then
+        # Service can lag endpoints after pod recreate; pod-direct forward is more reliable.
+        if [[ $attempt -le 2 ]]; then
             pf_resource="svc/lightspeed-stack-service-svc"
         else
             pf_resource="pod/lightspeed-stack-service"
@@ -681,9 +497,6 @@ cmd_restart_llama_port_forward() {
     local local_port="${LOCAL_LLAMA_PORT:-8321}"
     local remote_port="${REMOTE_LLAMA_PORT:-8321}"
     local max_attempts=6
-    if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
-        max_attempts=10
-    fi
     local pf_pid
     local pf_resource
     local llama_pf_log="/tmp/port-forward-llama.log"
@@ -694,10 +507,10 @@ cmd_restart_llama_port_forward() {
         kill_stale_llama_forward "$local_port"
         sleep 3
 
-        if [[ $attempt -le 2 ]]; then
-            pf_resource="svc/llama-stack-service-svc"
-        else
+        if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]] || [[ $attempt -ge 3 ]]; then
             pf_resource="pod/llama-stack-service"
+        else
+            pf_resource="svc/llama-stack-service-svc"
         fi
         echo "Llama port-forward attempt $attempt/$max_attempts -> $pf_resource"
 
@@ -930,23 +743,8 @@ _MOCK_TLS_CERT_FILES=(
     expired-client.crt
 )
 
-_mock_tls_certs_secret_is_complete() {
-    local f data
-    if ! oc get secret e2e-mock-tls-certs -n "$NAMESPACE" &>/dev/null; then
-        return 1
-    fi
-    for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
-        data=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \
-            -o "go-template={{index .data \"${f}\"}}" 2>/dev/null) || return 1
-        if [[ -z "$data" ]]; then
-            return 1
-        fi
-    done
-    return 0
-}
-
 cmd_sync_mock_tls_certs_secret() {
-    local mock_pod_name tmpdir f attempt
+    local mock_pod_name tmpdir f
     mock_pod_name=$(oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \
         -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || mock_pod_name=""
 
@@ -956,29 +754,19 @@ cmd_sync_mock_tls_certs_secret() {
         return 1
     fi
 
-    if _mock_tls_certs_secret_is_complete; then
-        echo "✓ e2e-mock-tls-certs secret already complete, skipping sync"
-        return 0
-    fi
-
-    if ! oc wait pod/"$mock_pod_name" -n "$NAMESPACE" --for=condition=Ready --timeout=60s 2>/dev/null; then
-        echo "WARNING: e2e-mock-tls-inference not Ready before cert sync" >&2
-    fi
-
     tmpdir=$(mktemp -d)
     for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
-        for attempt in 1 2 3; do
-            if oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \
-                cat "/certs/$f" >"$tmpdir/$f" 2>/dev/null && [[ -s "$tmpdir/$f" ]]; then
-                break
-            fi
-            if [[ $attempt -eq 3 ]]; then
-                echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2
-                rm -rf "$tmpdir"
-                return 1
-            fi
-            sleep 3
-        done
+        if ! oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \
+            cat "/certs/$f" >"$tmpdir/$f"; then
+            echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2
+            rm -rf "$tmpdir"
+            return 1
+        fi
+        if [[ ! -s "$tmpdir/$f" ]]; then
+            echo "ERROR: /certs/$f is empty in e2e-mock-tls-inference pod" >&2
+            rm -rf "$tmpdir"
+            return 1
+        fi
     done
 
     if ! oc create secret generic e2e-mock-tls-certs \
@@ -995,16 +783,11 @@ cmd_sync_mock_tls_certs_secret() {
 
 _verify_mock_tls_certs_mounted_in_llama() {
     local llama_pod_name="llama-stack-service"
-    local attempt
-    for attempt in 1 2 3 4 5 6; do
-        if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
-            sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key' \
-            2>/dev/null; then
-            echo "✓ mock TLS certs present under /certs in llama-stack"
-            return 0
-        fi
-        sleep 3
-    done
+    if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
+        sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key'; then
+        echo "✓ mock TLS certs present under /certs in llama-stack"
+        return 0
+    fi
     echo "ERROR: /certs missing or incomplete in llama-stack pod" >&2
     oc get secret e2e-mock-tls-certs -n "$NAMESPACE" 2>&1 || true
     oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
@@ -1146,9 +929,6 @@ case "$COMMAND" in
     sync-mock-tls-certs-secret)
         cmd_sync_mock_tls_certs_secret
         ;;
-    diagnose-llama-restart)
-        cmd_diagnose_llama_restart
-        ;;
     *)
         echo "Usage: $0 <command> [args...]"
         echo ""
@@ -1167,9 +947,8 @@ case "$COMMAND" in
         echo "  sync-interception-proxy-ca-secret   - Publish trustme CA to Secret for llama mount"
         echo "  deploy-e2e-tunnel-proxy            - Deploy in-cluster tunnel proxy pod"
         echo "  deploy-e2e-interception-proxy      - Deploy in-cluster interception proxy pod"
-        echo "  deploy-e2e-mock-tls-inference      - Deploy mock HTTPS inference server (tls.feature)"
-        echo "  sync-mock-tls-certs-secret         - Publish mock TLS /certs PEMs to Secret for llama"
-        echo "  diagnose-llama-restart             - Snapshot pod/health/forward/config for debugging"
+        echo "  deploy-e2e-mock-tls-inference        - Deploy mock HTTPS inference server (tls.feature)"
+        echo "  sync-mock-tls-certs-secret           - Publish mock TLS /certs to Secret for llama mount"
         exit 1
         ;;
 esac
diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py
index e97f993a5..025c536e4 100644
--- a/tests/e2e/features/environment.py
+++ b/tests/e2e/features/environment.py
@@ -26,7 +26,7 @@
     reset_llama_stack_disrupt_once_tracking,
     reset_llama_stack_was_running,
 )
-from tests.e2e.features.steps.tls import reset_tls_prow_restart_optimization_state
+from tests.e2e.features.steps.tls import reset_tls_prow_state
 from tests.e2e.utils.llama_stack_utils import register_shield
 from tests.e2e.utils.prow_utils import (
     restart_pod,
@@ -453,7 +453,7 @@ def before_feature(context: Context, feature: Feature) -> None:
     # One real Llama disruption per feature (module-level flag; survives context resets)
     reset_llama_stack_disrupt_once_tracking()
     if feature.filename and "tls.feature" in feature.filename:
-        reset_tls_prow_restart_optimization_state()
+        reset_tls_prow_state()
 
     try:
         max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS))
diff --git a/tests/e2e/features/steps/proxy.py b/tests/e2e/features/steps/proxy.py
index 374218478..7755cca91 100644
--- a/tests/e2e/features/steps/proxy.py
+++ b/tests/e2e/features/steps/proxy.py
@@ -306,7 +306,6 @@ def restore_if_modified(context: Context) -> None:
     _stop_proxy(context, "interception_proxy", "interception_proxy_loop")
     os.environ.pop("E2E_COPY_INTERCEPTION_CA_TO_LLAMA", None)
     os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None)
-    os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
     if hasattr(context, "needs_interception_ca_on_llama"):
         delattr(context, "needs_interception_ca_on_llama")
 
diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py
index f1f1693a6..74620c806 100644
--- a/tests/e2e/features/steps/tls.py
+++ b/tests/e2e/features/steps/tls.py
@@ -34,18 +34,11 @@
 }
 
 _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False}
-_tls_prow_restart_state: dict[str, bool] = {
-    "full_restart_done": False,
-    "force_full_restart": False,
-}
 
 
-def reset_tls_prow_restart_optimization_state() -> None:
+def reset_tls_prow_state() -> None:
     """Reset per-feature Prow state (call from ``before_feature``)."""
     _mock_tls_cluster_deploy_state["done"] = False
-    _tls_prow_restart_state["full_restart_done"] = False
-    _tls_prow_restart_state["force_full_restart"] = False
-    os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
     os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None)
 
 
@@ -58,63 +51,26 @@ def is_tls_configuration_feature(context: Context) -> bool:
     return "TLS configuration" in name
 
 
-def _prepare_tls_prow_llama_full_restart_env() -> None:
-    """Env for a full llama pod recreate (first TLS scenario / recovery)."""
-    os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1"
-    os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
-
-
-def _prepare_tls_prow_llama_reload_env() -> None:
-    """Env for config-only reload (run.yaml already on pod with /certs mounted)."""
+def _prepare_tls_prow_llama_restart_env() -> None:
+    """Set env for full llama pod recreate with mock TLS certs mounted."""
     os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1"
-    os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1"
 
 
 def restart_llama_for_tls_feature(context: Context) -> None:
-    """Restart Llama for TLS tests: one full recreate per feature, then reload.
-
-    Full pod delete+apply after every scenario (~16×) is flaky on Konflux
-    (cert sync, mount races, port-forward). Later scenarios only change run.yaml
-    in the ConfigMap; ``oc cp`` + container restart is enough.
-    """
+    """Restart Llama for TLS tests (full pod recreate on Prow/Konflux)."""
     from tests.e2e.utils.utils import restart_container
 
     if not is_prow_environment():
         restart_container("llama-stack")
         return
 
-    if _tls_prow_restart_state.pop("force_full_restart", False):
-        _prepare_tls_prow_llama_full_restart_env()
-        mode = "full_forced"
-    elif _tls_prow_restart_state["full_restart_done"]:
-        _prepare_tls_prow_llama_reload_env()
-        mode = "reload"
-    else:
-        _prepare_tls_prow_llama_full_restart_env()
-        mode = "full"
-
+    _prepare_tls_prow_llama_restart_env()
     scenario = getattr(getattr(context, "scenario", None), "name", "") or "?"
     print(
-        f"[tls.feature] Llama Stack restart: mode={mode} scenario={scenario!r}",
+        f"[tls.feature] Llama Stack restart: full recreate scenario={scenario!r}",
         flush=True,
     )
-
-    try:
-        restart_container("llama-stack")
-    except Exception:
-        _tls_prow_restart_state["full_restart_done"] = False
-        raise
-
-    _tls_prow_restart_state["full_restart_done"] = True
-
-
-def _prepare_tls_prow_llama_restart_env() -> None:
-    """Set env before writing run.yaml (used by ``_configure_tls``)."""
-    os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1"
-    if _tls_prow_restart_state["full_restart_done"]:
-        os.environ["E2E_LLAMA_RELOAD_CONFIG_ONLY"] = "1"
-    else:
-        os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
+    restart_container("llama-stack")
 
 
 def _cluster_mock_tls_inference_host() -> str:
@@ -143,7 +99,6 @@ def _tls_provider_base() -> dict[str, Any]:
             "api_key": "test-key",
             "base_url": _mock_tls_base_url(_MOCK_TLS_PORT_TLS),
             "allowed_models": ["mock-tls-model"],
-            # Avoid hitting the mock on every container restart (Konflux reload path).
             "refresh_models": False,
         },
     }
@@ -301,10 +256,6 @@ def configure_mtls_wrong_client_cert(context: Context) -> None:
         },
         base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS),
     )
-    # Konflux: reload after this config often never becomes healthy until timeout,
-    # then falls back to full recreate (~7 min). Skip reload for this scenario.
-    if is_prow_environment():
-        _tls_prow_restart_state["force_full_restart"] = True
 
 
 @given("Llama Stack is configured for mTLS with untrusted client certificate")
diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py
index d1df3bd40..58a1866cd 100644
--- a/tests/e2e/utils/prow_utils.py
+++ b/tests/e2e/utils/prow_utils.py
@@ -7,7 +7,6 @@
 import os
 import subprocess
 import tempfile
-import time
 from typing import Optional
 
 
@@ -80,28 +79,6 @@ def wait_for_pod_health(pod_name: str, max_attempts: int = 12) -> None:
 _LIGHTSPEED_RESTART_NAMES = frozenset({"lightspeed-stack", "lightspeed-stack-service"})
 
 
-def _print_llama_restart_diagnostics_from_output(output: str) -> None:
-    """Extract e2e-ops phase markers from restart-llama-stack stdout."""
-    markers = (
-        "E2E_LLAMA_RESTART_MODE=",
-        "E2E_LLAMA_RESTART_FAILED_PHASE=",
-        "E2E_LLAMA_RESTART_FALLBACK=",
-        "E2E_LLAMA_RESTART_EXIT_CODE=",
-        "E2E_OPS_PHASE=",
-    )
-    printed = False
-    for line in output.splitlines():
-        if any(line.startswith(m) for m in markers) or "[e2e-ops] <<<" in line:
-            print(line, flush=True)
-            printed = True
-    if printed:
-        print(
-            "See docs/e2e_testing.md § Konflux Llama restart diagnostics "
-            "for phase meanings and fixes.",
-            flush=True,
-        )
-
-
 def restart_pod(container_name: str) -> None:
     """Restart Llama Stack or Lightspeed pod in OpenShift/Prow (not Docker).
 
@@ -116,13 +93,13 @@ def restart_pod(container_name: str) -> None:
     """
     if container_name in _LLAMA_RESTART_NAMES:
         op = "restart-llama-stack"
-        # TLS feature: full pod recreate + cert sync + health can exceed 7 min on Konflux.
+        # TLS: full pod recreate + cert sync + health on Konflux can exceed 7 min.
         timeout = (
             900 if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" else 420
         )
     elif container_name in _LIGHTSPEED_RESTART_NAMES:
         op = "restart-lightspeed"
-        # Konflux LCS: up to ~195s pod wait + extended wait + port-forward retries.
+        # Konflux LCS readiness can take ~195s (probe budget in lightspeed-stack.yaml).
         timeout = 420 if os.environ.get("E2E_KONFLUX_E2E") == "1" else 320
     else:
         print(
@@ -131,61 +108,15 @@ def restart_pod(container_name: str) -> None:
         )
         op = "restart-lightspeed"
         timeout = 200
-    max_attempts = (
-        3
-        if op == "restart-llama-stack"
-        and os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1"
-        and os.environ.get("E2E_LLAMA_RELOAD_CONFIG_ONLY") != "1"
-        else 2 if op == "restart-llama-stack" else 1
-    )
-    last_error: subprocess.CalledProcessError | subprocess.TimeoutExpired | None = None
-    for attempt in range(1, max_attempts + 1):
-        try:
-            result = run_e2e_ops(op, timeout=timeout)
-            print(result.stdout, end="")
-            if result.returncode != 0:
-                print(result.stderr, end="")
-                detail = (result.stderr or result.stdout or "").strip()
-                raise subprocess.CalledProcessError(
-                    result.returncode,
-                    op,
-                    detail or None,
-                )
-            return
-        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as err:
-            last_error = err
-            if op == "restart-llama-stack" and isinstance(
-                err, subprocess.CalledProcessError
-            ):
-                _print_llama_restart_diagnostics_from_output(
-                    (err.stdout or "") + (err.stderr or "")
-                )
-                try:
-                    diag = run_e2e_ops("diagnose-llama-restart", timeout=90)
-                    print(diag.stdout, end="")
-                except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
-                    pass
-            if attempt < max_attempts:
-                retry_delay = (
-                    30
-                    if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1"
-                    else 20
-                )
-                print(
-                    f"⚠️  {op} failed (attempt {attempt}/{max_attempts}), "
-                    f"retrying after {retry_delay}s..."
-                )
-                time.sleep(retry_delay)
-                if (
-                    op == "restart-llama-stack"
-                    and os.environ.get("E2E_LLAMA_RELOAD_CONFIG_ONLY") == "1"
-                ):
-                    # Reload failed; next attempt does full pod recreate.
-                    os.environ.pop("E2E_LLAMA_RELOAD_CONFIG_ONLY", None)
-    if last_error is not None:
-        if isinstance(last_error, subprocess.TimeoutExpired):
-            print(f"Failed to restart pod {container_name}: {last_error}")
-        raise last_error
+    try:
+        result = run_e2e_ops(op, timeout=timeout)
+        print(result.stdout, end="")
+        if result.returncode != 0:
+            print(result.stderr, end="")
+            raise subprocess.CalledProcessError(result.returncode, op)
+    except subprocess.TimeoutExpired as e:
+        print(f"Failed to restart pod {container_name}: {e}")
+        raise
 
 
 def restore_llama_stack_pod() -> None:

From dd2e3190dfc7883e6a2065f901a835e69494110b Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Tue, 26 May 2026 09:51:37 +0200
Subject: [PATCH 12/20] fix tls.feature entry state on Prow (reset run.yaml, warm Llama)

---
 tests/e2e/features/environment.py     |  6 ++++-
 tests/e2e/features/steps/tls.py       | 33 ++++++++++++++++++++++++++-
 tests/e2e/features/tls.feature        |  4 ++--
 tests/e2e/utils/llama_config_utils.py | 20 ++++++++++++++++
 tests/e2e/utils/prow_utils.py         |  9 +++++++-
 5 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py
index 025c536e4..abcabe577 100644
--- a/tests/e2e/features/environment.py
+++ b/tests/e2e/features/environment.py
@@ -26,7 +26,10 @@
     reset_llama_stack_disrupt_once_tracking,
     reset_llama_stack_was_running,
 )
-from tests.e2e.features.steps.tls import reset_tls_prow_state
+from tests.e2e.features.steps.tls import (
+    prepare_tls_feature_entry_on_prow,
+    reset_tls_prow_state,
+)
 from tests.e2e.utils.llama_stack_utils import register_shield
 from tests.e2e.utils.prow_utils import (
     restart_pod,
@@ -454,6 +457,7 @@ def before_feature(context: Context, feature: Feature) -> None:
     reset_llama_stack_disrupt_once_tracking()
     if feature.filename and "tls.feature" in feature.filename:
         reset_tls_prow_state()
+        prepare_tls_feature_entry_on_prow()
 
     try:
         max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS))
diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py
index 74620c806..67f5fa360 100644
--- a/tests/e2e/features/steps/tls.py
+++ b/tests/e2e/features/steps/tls.py
@@ -17,10 +17,12 @@
 
 from tests.e2e.utils.llama_config_utils import (
     backup_llama_config,
+    clear_llama_config_backup,
     load_llama_config,
+    reset_llama_run_config_to_pipeline_default,
     write_llama_config,
 )
-from tests.e2e.utils.prow_utils import get_namespace, run_e2e_ops
+from tests.e2e.utils.prow_utils import get_namespace, restart_pod, run_e2e_ops
 from tests.e2e.utils.utils import is_prow_environment
 
 _MOCK_TLS_PORT_TLS = 8443
@@ -40,6 +42,35 @@ def reset_tls_prow_state() -> None:
     """Reset per-feature Prow state (call from ``before_feature``)."""
     _mock_tls_cluster_deploy_state["done"] = False
     os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None)
+    clear_llama_config_backup()
+
+
+def prepare_tls_feature_entry_on_prow() -> None:
+    """Baseline cluster state when tls.feature runs after other features in test_list.
+
+    Earlier features (disrupted, MCP) delete or reconfigure Llama without mock TLS
+    certs. Isolated tls.feature runs skip that churn, which is why the same Gherkin
+    passes alone but flakes mid-feature in the full suite.
+    """
+    if not is_prow_environment():
+        return
+    print("[tls.feature] Prow/Konflux entry: reset run.yaml and warm Llama + mock TLS...")
+    reset_llama_run_config_to_pipeline_default()
+    result = run_e2e_ops("deploy-e2e-mock-tls-inference", timeout=300)
+    print(result.stdout, end="")
+    if result.returncode != 0:
+        raise RuntimeError(
+            "tls.feature entry: deploy-e2e-mock-tls-inference failed: "
+            f"{result.stderr or result.stdout}"
+        )
+    _mock_tls_cluster_deploy_state["done"] = True
+    _prepare_tls_prow_llama_restart_env()
+    os.environ.setdefault(
+        "E2E_MOCK_TLS_INFERENCE_HOST",
+        _cluster_mock_tls_inference_host(),
+    )
+    restart_pod("llama-stack")
+    print("[tls.feature] Prow/Konflux entry baseline complete", flush=True)
 
 
 def is_tls_configuration_feature(context: Context) -> bool:
diff --git a/tests/e2e/features/tls.feature b/tests/e2e/features/tls.feature
index 412e5b04f..97c089067 100644
--- a/tests/e2e/features/tls.feature
+++ b/tests/e2e/features/tls.feature
@@ -10,10 +10,10 @@ Feature: TLS configuration for remote inference providers
       And The system is in default state
       And REST API service prefix is /v1
       And the Lightspeed stack configuration directory is "tests/e2e/configuration"
-      And The service uses the lightspeed-stack-tls.yaml configuration
-      And The service is restarted
       And The original Llama Stack config is restored if modified
       And The mock TLS inference server is deployed
+      And The service uses the lightspeed-stack-tls.yaml configuration
+      And The service is restarted
 
   Scenario: Inference succeeds with TLS verification disabled
     Given Llama Stack is configured with TLS verification disabled
diff --git a/tests/e2e/utils/llama_config_utils.py b/tests/e2e/utils/llama_config_utils.py
index eb5f67b9d..e8fdf4832 100644
--- a/tests/e2e/utils/llama_config_utils.py
+++ b/tests/e2e/utils/llama_config_utils.py
@@ -3,6 +3,7 @@
 import os
 import shutil
 import tempfile
+from pathlib import Path
 from typing import Any, Optional
 
 import yaml
@@ -20,6 +21,25 @@
 _llama_config_backup_key: dict[str, Optional[str]] = {"value": None}
 
 
+def clear_llama_config_backup() -> None:
+    """Drop in-memory run.yaml backup (e.g. at start of tls.feature)."""
+    _llama_config_backup_key["value"] = None
+
+
+def reset_llama_run_config_to_pipeline_default() -> None:
+    """Reset llama-stack-config run.yaml to Konflux/Prow pipeline seed (run-ci.yaml)."""
+    if not is_prow_environment():
+        return
+    run_ci = (
+        Path(__file__).resolve().parents[1] / "configs" / "run-ci.yaml"
+    )
+    if not run_ci.is_file():
+        print(f"WARN: pipeline run.yaml seed not found at {run_ci}", flush=True)
+        return
+    print(f"Resetting llama-stack-config from {run_ci.name}...", flush=True)
+    update_llama_run_configmap(str(run_ci))
+
+
 def _local_llama_config_path() -> str:
     """Return local run.yaml path for Docker/local e2e execution."""
     return os.getenv("E2E_LLAMA_CONFIG_PATH", _DEFAULT_LOCAL_LLAMA_CONFIG_PATH)
diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py
index 58a1866cd..7ac79f3ac 100644
--- a/tests/e2e/utils/prow_utils.py
+++ b/tests/e2e/utils/prow_utils.py
@@ -113,7 +113,14 @@ def restart_pod(container_name: str) -> None:
         print(result.stdout, end="")
         if result.returncode != 0:
             print(result.stderr, end="")
-            raise subprocess.CalledProcessError(result.returncode, op)
+            combined = f"{result.stdout or ''}\n{result.stderr or ''}".strip()
+            tail = "\n".join(combined.splitlines()[-25:]) if combined else ""
+            detail = tail or f"exit {result.returncode}"
+            raise subprocess.CalledProcessError(
+                result.returncode,
+                op,
+                detail,
+            )
     except subprocess.TimeoutExpired as e:
         print(f"Failed to restart pod {container_name}: {e}")
         raise

From da56ae23ce6e1d474eaaf36cafbbba2b6412169b Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Tue, 26 May 2026 13:31:49 +0200
Subject: [PATCH 13/20] extend llama-stack timeout

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 22 +++++++++++++++++++---
 tests/e2e/utils/prow_utils.py           | 19 +++++++++++++------
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index b5741ef02..6a0a4371b 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -339,6 +339,7 @@ cmd_restart_lightspeed() {
 
 cmd_restart_llama_stack() {
     echo "===== Restoring llama-stack service ====="
+    echo "[e2e-ops] restart-llama-stack env: E2E_KONFLUX_E2E=${E2E_KONFLUX_E2E:-0} E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA=${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}"
     # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env.
     echo "Deleting llama-stack pod (if any) before apply..."
     timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || {
@@ -369,7 +370,19 @@ cmd_restart_llama_stack() {
             -n "$NAMESPACE" \
             --dry-run=client -o yaml | oc apply -f -
         oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml"
-        wait_for_pod "llama-stack-service" 90
+        local llama_pod_wait=90
+        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+            # readinessProbe: 20s + 36*5s = 200s; clone/enrich/RAG on Konflux often needs 400s+ total.
+            llama_pod_wait=180
+        fi
+        echo "[e2e-ops] Waiting for llama-stack-service Ready (max ${llama_pod_wait} attempts, $((llama_pod_wait * 3))s)..."
+        if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then
+            echo "===== Llama-stack restore FAILED (pod not Ready within $((llama_pod_wait * 3))s) ====="
+            oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true
+            oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -50 || true
+            oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true
+            exit 1
+        fi
         echo "Labeling pod for service..."
         oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite
         if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then
@@ -386,7 +399,7 @@ cmd_restart_llama_stack() {
         fi
         local llama_health_attempts=50
         if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
-            llama_health_attempts=75
+            llama_health_attempts=100
         fi
         if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then
             echo "===== Llama-stack restore FAILED (HTTP not healthy) ====="
@@ -497,11 +510,14 @@ cmd_restart_llama_port_forward() {
     local local_port="${LOCAL_LLAMA_PORT:-8321}"
     local remote_port="${REMOTE_LLAMA_PORT:-8321}"
     local max_attempts=6
+    if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+        max_attempts=10
+    fi
     local pf_pid
     local pf_resource
     local llama_pf_log="/tmp/port-forward-llama.log"
 
-    echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port..."
+    echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port (max $max_attempts attempts)..."
 
     for ((attempt=1; attempt<=max_attempts; attempt++)); do
         kill_stale_llama_forward "$local_port"
diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py
index 7ac79f3ac..c94e17c13 100644
--- a/tests/e2e/utils/prow_utils.py
+++ b/tests/e2e/utils/prow_utils.py
@@ -93,10 +93,14 @@ def restart_pod(container_name: str) -> None:
     """
     if container_name in _LLAMA_RESTART_NAMES:
         op = "restart-llama-stack"
-        # TLS: full pod recreate + cert sync + health on Konflux can exceed 7 min.
-        timeout = (
-            900 if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1" else 420
-        )
+        # Subprocess cap must exceed e2e-ops internal waits (pod + in-pod health + port-forward).
+        # Konflux TLS full recreate: ~6–12 min typical, 15+ min under load (user-reported 400s+).
+        if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1":
+            timeout = 1200
+        elif os.environ.get("E2E_KONFLUX_E2E") == "1":
+            timeout = 720
+        else:
+            timeout = 420
     elif container_name in _LIGHTSPEED_RESTART_NAMES:
         op = "restart-lightspeed"
         # Konflux LCS readiness can take ~195s (probe budget in lightspeed-stack.yaml).
@@ -133,8 +137,11 @@ def restore_llama_stack_pod() -> None:
         subprocess.CalledProcessError: If oc/e2e-ops restore fails.
         subprocess.TimeoutExpired: If the operation times out.
     """
-    # wait_for_pod (up to ~180s) + in-pod /v1/health polling (~105s) — allow headroom.
-    result = run_e2e_ops("restart-llama-stack", timeout=420)
+    if os.environ.get("E2E_KONFLUX_E2E") == "1":
+        timeout = 720
+    else:
+        timeout = 420
+    result = run_e2e_ops("restart-llama-stack", timeout=timeout)
     print(result.stdout, end="")
     if result.returncode != 0:
         print(result.stderr, end="")

From 56ca5dcd8344205cc871347602a05deedbbb7825 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Tue, 26 May 2026 19:12:13 +0200
Subject: [PATCH 14/20] fix sync between mock tls and llama-stack

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 125 +++++++++++++++++++-----
 1 file changed, 99 insertions(+), 26 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index 6a0a4371b..c72eb9cb2 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -759,42 +759,115 @@ _MOCK_TLS_CERT_FILES=(
     expired-client.crt
 )
 
-cmd_sync_mock_tls_certs_secret() {
-    local mock_pod_name tmpdir f
-    mock_pod_name=$(oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \
-        -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || mock_pod_name=""
+_mock_tls_secret_is_complete() {
+    local f b64
+    for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
+        b64=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \
+            -o "go-template={{index .data \"${f}\"}}" 2>/dev/null) || return 1
+        [[ -n "$b64" ]] || return 1
+    done
+    return 0
+}
+
+_get_mock_tls_inference_pod_name() {
+    oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \
+        -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
+}
 
-    if [[ -z "$mock_pod_name" ]]; then
-        echo "ERROR: no e2e-mock-tls-inference pod in namespace $NAMESPACE" >&2
-        echo "  Run: e2e-ops.sh deploy-e2e-mock-tls-inference" >&2
+_wait_for_mock_tls_inference_pod() {
+    if ! oc wait pod -l app=e2e-mock-tls-inference -n "$NAMESPACE" \
+        --for=condition=Ready --timeout=120s 2>/dev/null; then
+        echo "ERROR: e2e-mock-tls-inference pod not Ready" >&2
+        oc get pods -n "$NAMESPACE" -l app=e2e-mock-tls-inference -o wide 2>&1 || true
         return 1
     fi
+    return 0
+}
 
-    tmpdir=$(mktemp -d)
-    for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
-        if ! oc exec -n "$NAMESPACE" "$mock_pod_name" -c e2e-mock-tls-inference -- \
-            cat "/certs/$f" >"$tmpdir/$f"; then
-            echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2
-            rm -rf "$tmpdir"
+_copy_mock_tls_cert_from_pod() {
+    local mock_pod_name="$1"
+    local cert_file="$2"
+    local dest="$3"
+    local attempt
+
+    for ((attempt=1; attempt<=4; attempt++)); do
+        if oc exec --request-timeout=90 -n "$NAMESPACE" "$mock_pod_name" \
+            -c e2e-mock-tls-inference -- cat "/certs/$cert_file" >"$dest" 2>/dev/null \
+            && [[ -s "$dest" ]]; then
+            return 0
+        fi
+        echo "[e2e-ops] WARN: read /certs/$cert_file from mock pod failed (attempt $attempt/4)"
+        sleep 5
+    done
+    return 1
+}
+
+_recycle_mock_tls_inference_pod() {
+    echo "[e2e-ops] Recycling e2e-mock-tls-inference pod (stale or unresponsive)..."
+    oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true
+    sleep 3
+    if ! _wait_for_mock_tls_inference_pod; then
+        return 1
+    fi
+    # Certs are written at container start; allow trustme + pip to finish.
+    sleep 10
+    return 0
+}
+
+cmd_sync_mock_tls_certs_secret() {
+    local mock_pod_name tmpdir f recycle_attempt
+
+    if _mock_tls_secret_is_complete; then
+        echo "✓ Secret e2e-mock-tls-certs already complete (${#_MOCK_TLS_CERT_FILES[@]} keys); skipping sync"
+        return 0
+    fi
+
+    for recycle_attempt in 1 2; do
+        mock_pod_name=$(_get_mock_tls_inference_pod_name)
+        if [[ -z "$mock_pod_name" ]]; then
+            echo "ERROR: no e2e-mock-tls-inference pod in namespace $NAMESPACE" >&2
+            echo "  Run: e2e-ops.sh deploy-e2e-mock-tls-inference" >&2
             return 1
         fi
-        if [[ ! -s "$tmpdir/$f" ]]; then
-            echo "ERROR: /certs/$f is empty in e2e-mock-tls-inference pod" >&2
-            rm -rf "$tmpdir"
+
+        if ! _wait_for_mock_tls_inference_pod; then
+            [[ $recycle_attempt -lt 2 ]] && _recycle_mock_tls_inference_pod && continue
             return 1
         fi
-    done
 
-    if ! oc create secret generic e2e-mock-tls-certs \
-        --from-file="$tmpdir" \
-        -n "$NAMESPACE" \
-        --dry-run=client -o yaml | oc apply -f -; then
-        echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2
+        tmpdir=$(mktemp -d)
+        local sync_ok=true
+        for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
+            if ! _copy_mock_tls_cert_from_pod "$mock_pod_name" "$f" "$tmpdir/$f"; then
+                echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2
+                oc logs "$mock_pod_name" -n "$NAMESPACE" -c e2e-mock-tls-inference --tail=40 2>&1 \
+                    | sed 's/^/[e2e-ops] /' || true
+                sync_ok=false
+                break
+            fi
+        done
+
+        if [[ "$sync_ok" == "true" ]]; then
+            if ! oc create secret generic e2e-mock-tls-certs \
+                --from-file="$tmpdir" \
+                -n "$NAMESPACE" \
+                --dry-run=client -o yaml | oc apply -f -; then
+                echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2
+                rm -rf "$tmpdir"
+                return 1
+            fi
+            rm -rf "$tmpdir"
+            echo "✓ Secret e2e-mock-tls-certs updated (${#_MOCK_TLS_CERT_FILES[@]} files)"
+            return 0
+        fi
+
         rm -rf "$tmpdir"
-        return 1
-    fi
-    rm -rf "$tmpdir"
-    echo "✓ Secret e2e-mock-tls-certs updated (${#_MOCK_TLS_CERT_FILES[@]} files)"
+        if [[ $recycle_attempt -lt 2 ]]; then
+            _recycle_mock_tls_inference_pod || return 1
+        fi
+    done
+
+    return 1
 }
 
 _verify_mock_tls_certs_mounted_in_llama() {

From f1d29a89f9fb9bcab63de35d11cda4d0ab60512c Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Tue, 26 May 2026 22:27:51 +0200
Subject: [PATCH 15/20] fix lightspeed-stack restart time

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 53 ++++++++++++++++++-------
 tests/e2e/utils/prow_utils.py           |  4 +-
 2 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index c72eb9cb2..e3c0b918b 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -45,18 +45,30 @@ E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-
 wait_for_pod() {
     local pod_name="$1"
     local max_attempts="${2:-24}"
-    
+    local attempt
+    local ready
+    local phase
+
     for ((attempt=1; attempt<=max_attempts; attempt++)); do
-        local ready
-        ready=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
-        if [[ "$ready" == "true" ]]; then
-            echo "✓ Pod $pod_name ready"
-            return 0
+        if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then
+            phase="Missing"
+        else
+            phase=$(oc get pod "$pod_name" -n "$NAMESPACE" \
+                -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
+            ready=$(oc get pod "$pod_name" -n "$NAMESPACE" \
+                -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
+            if [[ "$ready" == "true" ]]; then
+                echo "✓ Pod $pod_name ready (attempt $attempt/$max_attempts)"
+                return 0
+            fi
+        fi
+        if [[ $((attempt % 10)) -eq 0 ]]; then
+            echo "[e2e-ops] $pod_name not ready yet (attempt $attempt/$max_attempts, phase=${phase:-?})..."
         fi
         sleep 3
     done
-    
-    echo "Pod $pod_name not ready after $((max_attempts * 3))s"
+
+    echo "Pod $pod_name not ready after $((max_attempts * 3))s (last phase: ${phase:-unknown})"
     return 1
 }
 
@@ -314,17 +326,30 @@ cmd_restart_lightspeed() {
     local pod_ready=true
     local lcs_pod_wait=40
     if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then
-        lcs_pod_wait=65
+        # readinessProbe: 20s + 30*5s; LCS + Llama handshake can exceed 195s on Konflux (TLS suite).
+        lcs_pod_wait=100
     fi
+    echo "[e2e-ops] Waiting for lightspeed-stack-service Ready (max ${lcs_pod_wait} attempts, $((lcs_pod_wait * 3))s)..."
     if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then
         pod_ready=false
-        echo "⚠️  Pod not ready within 120s — dumping diagnostics:"
-        oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -30 || true
-        oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=40 2>&1 || true
+        echo "⚠️  Pod not ready within $((lcs_pod_wait * 3))s — dumping diagnostics:"
+        if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then
+            oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true
+            oc logs lightspeed-stack-service -n "$NAMESPACE" -c lightspeed-stack-container --tail=60 2>&1 || true
+        else
+            echo "[e2e-ops] lightspeed-stack-service pod not found in namespace $NAMESPACE"
+            oc get pods -n "$NAMESPACE" 2>&1 | grep -E 'lightspeed|NAME' || true
+            oc get events -n "$NAMESPACE" --field-selector involvedObject.name=lightspeed-stack-service \
+                --sort-by='.lastTimestamp' 2>&1 | tail -15 || true
+        fi
     fi
 
-    # Re-label pod for service discovery
-    oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite
+    # Re-label pod for service discovery (ignore if pod was deleted / not created yet)
+    if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then
+        oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite
+    else
+        echo "⚠️  Cannot label lightspeed-stack-service — pod missing"
+    fi
 
     # Re-establish port-forwards (may succeed even if readiness was slow)
     cmd_restart_port_forward
diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py
index c94e17c13..263b415a3 100644
--- a/tests/e2e/utils/prow_utils.py
+++ b/tests/e2e/utils/prow_utils.py
@@ -103,8 +103,8 @@ def restart_pod(container_name: str) -> None:
             timeout = 420
     elif container_name in _LIGHTSPEED_RESTART_NAMES:
         op = "restart-lightspeed"
-        # Konflux LCS readiness can take ~195s (probe budget in lightspeed-stack.yaml).
-        timeout = 420 if os.environ.get("E2E_KONFLUX_E2E") == "1" else 320
+        # Konflux LCS: TCP readiness + Llama handshake; TLS suite often needs 200–300s.
+        timeout = 480 if os.environ.get("E2E_KONFLUX_E2E") == "1" else 320
     else:
         print(
             f"Warning: restart_pod({container_name!r}) unknown; "

From 4079a6ab02c46f2fe372bae70047e4b4f48bace1 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Wed, 27 May 2026 13:29:30 +0200
Subject: [PATCH 16/20] Dump pod failure logs in e2e-ops.sh and drop mock TLS cert sync

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 336 +++++++-----------------
 1 file changed, 94 insertions(+), 242 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index e3c0b918b..7924485e8 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -25,8 +25,6 @@
 #   disrupt-llama-stack             - Delete llama-stack pod to disrupt connection
 #   deploy-e2e-tunnel-proxy         - Deploy in-cluster tunnel proxy (proxy.feature step)
 #   deploy-e2e-interception-proxy   - Deploy in-cluster interception proxy (proxy.feature step)
-#   deploy-e2e-mock-tls-inference   - Deploy mock HTTPS inference server (tls.feature step)
-#   sync-mock-tls-certs-secret      - Publish /certs PEMs to Secret for llama-stack mount
 
 set -e
 
@@ -42,33 +40,101 @@ E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-
 # Helper functions
 # ============================================================================
 
+# On failure, print events, describe output and container logs for a pod to stdout; tolerates a missing pod.
+# Args: $1 pod name (required), $2 preferred container (optional), $3 log tail line count (default 200).
+e2e_ops_dump_pod_logs() {
+    local pod_name="${1:?pod name required}"
+    local preferred_container="${2:-}"
+    local log_tail="${3:-200}"
+    local prefix="[e2e-ops] "  # prepended to every line this function emits
+    local init_ctr ctr restart_count phase
+
+    echo "${prefix}========== failure logs: pod/$pod_name (namespace $NAMESPACE) =========="
+
+    echo "${prefix}--- events for pod/$pod_name ---"
+    oc get events -n "$NAMESPACE" --field-selector "involvedObject.name=${pod_name}" \
+        --sort-by='.lastTimestamp' 2>&1 | tail -50 | sed "s/^/${prefix}/" \
+        || echo "${prefix}(could not list events)"
+
+    if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then
+        phase="Missing"
+        echo "${prefix}pod/$pod_name not found in namespace (deleted, failed, or API error)"
+        echo "${prefix}--- pods in $NAMESPACE ---"
+        oc get pods -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true
+        echo "${prefix}========== end failure logs: pod/$pod_name =========="
+        return 0  # a missing pod is reported, not treated as an error of this helper
+    fi
+
+    phase=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "?")
+    echo "${prefix}pod phase=$phase"
+    oc get pod "$pod_name" -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true
+
+    echo "${prefix}--- oc describe pod/$pod_name ---"
+    oc describe pod "$pod_name" -n "$NAMESPACE" 2>&1 | sed "s/^/${prefix}/" || true
+
+    for init_ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \
+        -o jsonpath='{.spec.initContainers[*].name}' 2>/dev/null); do
+        [[ -n "$init_ctr" ]] || continue
+        echo "${prefix}--- oc logs pod/$pod_name -c $init_ctr (init, tail $log_tail) ---"
+        oc logs "$pod_name" -n "$NAMESPACE" -c "$init_ctr" --tail="$log_tail" 2>&1 \
+            | sed "s/^/${prefix}/" \
+            || echo "${prefix}(no init logs for $init_ctr yet)"
+    done
+
+    for ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \
+        -o jsonpath='{.spec.containers[*].name}' 2>/dev/null); do
+        [[ -n "$ctr" ]] || continue
+        echo "${prefix}--- oc logs pod/$pod_name -c $ctr (tail $log_tail) ---"
+        oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --tail="$log_tail" 2>&1 \
+            | sed "s/^/${prefix}/" \
+            || echo "${prefix}(no logs for container $ctr)"
+        restart_count=$(oc get pod "$pod_name" -n "$NAMESPACE" \
+            -o jsonpath="{.status.containerStatuses[?(@.name==\"${ctr}\")].restartCount}" \
+            2>/dev/null) || restart_count="0"
+        if [[ "${restart_count:-0}" -gt 0 ]]; then  # container restarted: also show the previous instance's logs
+            echo "${prefix}--- oc logs pod/$pod_name -c $ctr --previous (tail $log_tail) ---"
+            oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --previous --tail="$log_tail" 2>&1 \
+                | sed "s/^/${prefix}/" \
+                || echo "${prefix}(no --previous logs for $ctr)"
+        fi
+    done
+
+    if [[ -n "$preferred_container" ]]; then  # printed last, after the per-container loop above
+        echo "${prefix}--- oc logs pod/$pod_name -c $preferred_container (preferred, tail $log_tail) ---"
+        oc logs "$pod_name" -n "$NAMESPACE" -c "$preferred_container" --tail="$log_tail" 2>&1 \
+            | sed "s/^/${prefix}/" || true
+    fi
+
+    echo "${prefix}========== end failure logs: pod/$pod_name =========="
+}
+
 wait_for_pod() {
     local pod_name="$1"
     local max_attempts="${2:-24}"
-    local attempt
-    local ready
-    local phase
+    local attempt phase ready
 
     for ((attempt=1; attempt<=max_attempts; attempt++)); do
         if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then
             phase="Missing"
+            ready="false"
         else
             phase=$(oc get pod "$pod_name" -n "$NAMESPACE" \
-                -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
+                -o jsonpath='{.status.phase}' 2>/dev/null || echo "?")
             ready=$(oc get pod "$pod_name" -n "$NAMESPACE" \
                 -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
-            if [[ "$ready" == "true" ]]; then
-                echo "✓ Pod $pod_name ready (attempt $attempt/$max_attempts)"
-                return 0
-            fi
+        fi
+        if [[ "$ready" == "true" ]]; then
+            echo "✓ Pod $pod_name ready (attempt $attempt/$max_attempts)"
+            return 0
         fi
         if [[ $((attempt % 10)) -eq 0 ]]; then
-            echo "[e2e-ops] $pod_name not ready yet (attempt $attempt/$max_attempts, phase=${phase:-?})..."
+            echo "[e2e-ops] $pod_name not ready (attempt $attempt/$max_attempts, phase=$phase)"
         fi
         sleep 3
     done
 
     echo "Pod $pod_name not ready after $((max_attempts * 3))s (last phase: ${phase:-unknown})"
+    e2e_ops_dump_pod_logs "$pod_name" "" 250
     return 1
 }
 
@@ -286,9 +352,7 @@ wait_for_llama_stack_http_health() {
         fi
     done
     echo "ERROR: Llama Stack did not respond on http://127.0.0.1:8321/v1/health inside the pod"
-    oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true
-    oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true
-    oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=120 2>&1 || true
+    e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 250
     return 1
 }
 
@@ -324,32 +388,13 @@ cmd_restart_lightspeed() {
     # Don't let a timeout here abort the function — still attempt port-forward
     # and diagnostics so later scenarios have a chance to recover.
     local pod_ready=true
-    local lcs_pod_wait=40
-    if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then
-        # readinessProbe: 20s + 30*5s; LCS + Llama handshake can exceed 195s on Konflux (TLS suite).
-        lcs_pod_wait=100
-    fi
-    echo "[e2e-ops] Waiting for lightspeed-stack-service Ready (max ${lcs_pod_wait} attempts, $((lcs_pod_wait * 3))s)..."
-    if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then
+    if ! wait_for_pod "lightspeed-stack-service" 40; then
         pod_ready=false
-        echo "⚠️  Pod not ready within $((lcs_pod_wait * 3))s — dumping diagnostics:"
-        if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then
-            oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true
-            oc logs lightspeed-stack-service -n "$NAMESPACE" -c lightspeed-stack-container --tail=60 2>&1 || true
-        else
-            echo "[e2e-ops] lightspeed-stack-service pod not found in namespace $NAMESPACE"
-            oc get pods -n "$NAMESPACE" 2>&1 | grep -E 'lightspeed|NAME' || true
-            oc get events -n "$NAMESPACE" --field-selector involvedObject.name=lightspeed-stack-service \
-                --sort-by='.lastTimestamp' 2>&1 | tail -15 || true
-        fi
+        echo "⚠️  Pod not ready within 120s"
     fi
 
-    # Re-label pod for service discovery (ignore if pod was deleted / not created yet)
-    if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then
-        oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite
-    else
-        echo "⚠️  Cannot label lightspeed-stack-service — pod missing"
-    fi
+    # Re-label pod for service discovery
+    oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite
 
     # Re-establish port-forwards (may succeed even if readiness was slow)
     cmd_restart_port_forward
@@ -357,6 +402,7 @@ cmd_restart_lightspeed() {
 
     if [[ "$pod_ready" == "false" ]]; then
         echo "⚠️  Lightspeed restart completed but pod was slow to become ready"
+        e2e_ops_dump_pod_logs "lightspeed-stack-service" "lightspeed-stack-container" 200
         return 1
     fi
     echo "✓ Lightspeed restart complete"
@@ -364,7 +410,6 @@ cmd_restart_lightspeed() {
 
 cmd_restart_llama_stack() {
     echo "===== Restoring llama-stack service ====="
-    echo "[e2e-ops] restart-llama-stack env: E2E_KONFLUX_E2E=${E2E_KONFLUX_E2E:-0} E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA=${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}"
     # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env.
     echo "Deleting llama-stack pod (if any) before apply..."
     timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || {
@@ -382,52 +427,25 @@ cmd_restart_llama_stack() {
                 exit 1
             fi
         fi
-        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
-            echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..."
-            if ! cmd_sync_mock_tls_certs_secret; then
-                echo "===== Llama-stack restore FAILED (mock TLS certs secret sync) ====="
-                exit 1
-            fi
-        fi
         _LLAMA_SVC_FQDN="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local"
         oc create secret generic llama-stack-ip-secret \
             --from-literal=key="$_LLAMA_SVC_FQDN" \
             -n "$NAMESPACE" \
             --dry-run=client -o yaml | oc apply -f -
         oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml"
-        local llama_pod_wait=90
-        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
-            # readinessProbe: 20s + 36*5s = 200s; clone/enrich/RAG on Konflux often needs 400s+ total.
-            llama_pod_wait=180
-        fi
-        echo "[e2e-ops] Waiting for llama-stack-service Ready (max ${llama_pod_wait} attempts, $((llama_pod_wait * 3))s)..."
-        if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then
-            echo "===== Llama-stack restore FAILED (pod not Ready within $((llama_pod_wait * 3))s) ====="
-            oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true
-            oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -50 || true
-            oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true
-            exit 1
-        fi
+        wait_for_pod "llama-stack-service" 90
         echo "Labeling pod for service..."
         oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite
         if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then
             if ! _verify_interception_ca_mounted_in_llama; then
                 echo "===== Llama-stack restore FAILED (interception CA not mounted) ====="
+                e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200
                 exit 1
             fi
         fi
-        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
-            if ! _verify_mock_tls_certs_mounted_in_llama; then
-                echo "===== Llama-stack restore FAILED (mock TLS certs not mounted) ====="
-                exit 1
-            fi
-        fi
-        local llama_health_attempts=50
-        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
-            llama_health_attempts=100
-        fi
-        if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then
+        if ! wait_for_llama_stack_http_health 50; then
             echo "===== Llama-stack restore FAILED (HTTP not healthy) ====="
+            e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 250
             exit 1
         fi
     else
@@ -442,6 +460,7 @@ cmd_restart_llama_stack() {
 
     if ! cmd_restart_llama_port_forward; then
         echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed"
+        e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200
         exit 1
     fi
 
@@ -535,23 +554,20 @@ cmd_restart_llama_port_forward() {
     local local_port="${LOCAL_LLAMA_PORT:-8321}"
     local remote_port="${REMOTE_LLAMA_PORT:-8321}"
     local max_attempts=6
-    if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
-        max_attempts=10
-    fi
     local pf_pid
     local pf_resource
     local llama_pf_log="/tmp/port-forward-llama.log"
 
-    echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port (max $max_attempts attempts)..."
+    echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port..."
 
     for ((attempt=1; attempt<=max_attempts; attempt++)); do
         kill_stale_llama_forward "$local_port"
         sleep 3
 
-        if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]] || [[ $attempt -ge 3 ]]; then
-            pf_resource="pod/llama-stack-service"
-        else
+        if [[ $attempt -le 2 ]]; then
             pf_resource="svc/llama-stack-service-svc"
+        else
+            pf_resource="pod/llama-stack-service"
         fi
         echo "Llama port-forward attempt $attempt/$max_attempts -> $pf_resource"
 
@@ -589,8 +605,10 @@ cmd_restart_llama_port_forward() {
 
     echo "Failed to establish Llama Stack port-forward on :$local_port"
     if [[ -s "$llama_pf_log" ]]; then
+        echo "[e2e-ops] $llama_pf_log (tail 30):"
         tail -30 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true
     fi
+    e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200
     return 1
 }
 
@@ -773,142 +791,6 @@ cmd_copy_interception_proxy_ca_to_llama() {
     cmd_sync_interception_proxy_ca_secret
 }
 
-_MOCK_TLS_CERT_FILES=(
-    ca.crt
-    client.crt
-    client.key
-    untrusted-ca.crt
-    expired-ca.crt
-    untrusted-client.crt
-    untrusted-client.key
-    expired-client.crt
-)
-
-_mock_tls_secret_is_complete() {
-    local f b64
-    for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
-        b64=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \
-            -o "go-template={{index .data \"${f}\"}}" 2>/dev/null) || return 1
-        [[ -n "$b64" ]] || return 1
-    done
-    return 0
-}
-
-_get_mock_tls_inference_pod_name() {
-    oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \
-        -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
-}
-
-_wait_for_mock_tls_inference_pod() {
-    if ! oc wait pod -l app=e2e-mock-tls-inference -n "$NAMESPACE" \
-        --for=condition=Ready --timeout=120s 2>/dev/null; then
-        echo "ERROR: e2e-mock-tls-inference pod not Ready" >&2
-        oc get pods -n "$NAMESPACE" -l app=e2e-mock-tls-inference -o wide 2>&1 || true
-        return 1
-    fi
-    return 0
-}
-
-_copy_mock_tls_cert_from_pod() {
-    local mock_pod_name="$1"
-    local cert_file="$2"
-    local dest="$3"
-    local attempt
-
-    for ((attempt=1; attempt<=4; attempt++)); do
-        if oc exec --request-timeout=90 -n "$NAMESPACE" "$mock_pod_name" \
-            -c e2e-mock-tls-inference -- cat "/certs/$cert_file" >"$dest" 2>/dev/null \
-            && [[ -s "$dest" ]]; then
-            return 0
-        fi
-        echo "[e2e-ops] WARN: read /certs/$cert_file from mock pod failed (attempt $attempt/4)"
-        sleep 5
-    done
-    return 1
-}
-
-_recycle_mock_tls_inference_pod() {
-    echo "[e2e-ops] Recycling e2e-mock-tls-inference pod (stale or unresponsive)..."
-    oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true
-    sleep 3
-    if ! _wait_for_mock_tls_inference_pod; then
-        return 1
-    fi
-    # Certs are written at container start; allow trustme + pip to finish.
-    sleep 10
-    return 0
-}
-
-cmd_sync_mock_tls_certs_secret() {
-    local mock_pod_name tmpdir f recycle_attempt
-
-    if _mock_tls_secret_is_complete; then
-        echo "✓ Secret e2e-mock-tls-certs already complete (${#_MOCK_TLS_CERT_FILES[@]} keys); skipping sync"
-        return 0
-    fi
-
-    for recycle_attempt in 1 2; do
-        mock_pod_name=$(_get_mock_tls_inference_pod_name)
-        if [[ -z "$mock_pod_name" ]]; then
-            echo "ERROR: no e2e-mock-tls-inference pod in namespace $NAMESPACE" >&2
-            echo "  Run: e2e-ops.sh deploy-e2e-mock-tls-inference" >&2
-            return 1
-        fi
-
-        if ! _wait_for_mock_tls_inference_pod; then
-            [[ $recycle_attempt -lt 2 ]] && _recycle_mock_tls_inference_pod && continue
-            return 1
-        fi
-
-        tmpdir=$(mktemp -d)
-        local sync_ok=true
-        for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
-            if ! _copy_mock_tls_cert_from_pod "$mock_pod_name" "$f" "$tmpdir/$f"; then
-                echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2
-                oc logs "$mock_pod_name" -n "$NAMESPACE" -c e2e-mock-tls-inference --tail=40 2>&1 \
-                    | sed 's/^/[e2e-ops] /' || true
-                sync_ok=false
-                break
-            fi
-        done
-
-        if [[ "$sync_ok" == "true" ]]; then
-            if ! oc create secret generic e2e-mock-tls-certs \
-                --from-file="$tmpdir" \
-                -n "$NAMESPACE" \
-                --dry-run=client -o yaml | oc apply -f -; then
-                echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2
-                rm -rf "$tmpdir"
-                return 1
-            fi
-            rm -rf "$tmpdir"
-            echo "✓ Secret e2e-mock-tls-certs updated (${#_MOCK_TLS_CERT_FILES[@]} files)"
-            return 0
-        fi
-
-        rm -rf "$tmpdir"
-        if [[ $recycle_attempt -lt 2 ]]; then
-            _recycle_mock_tls_inference_pod || return 1
-        fi
-    done
-
-    return 1
-}
-
-_verify_mock_tls_certs_mounted_in_llama() {
-    local llama_pod_name="llama-stack-service"
-    if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
-        sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key'; then
-        echo "✓ mock TLS certs present under /certs in llama-stack"
-        return 0
-    fi
-    echo "ERROR: /certs missing or incomplete in llama-stack pod" >&2
-    oc get secret e2e-mock-tls-certs -n "$NAMESPACE" 2>&1 || true
-    oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
-        ls -la /certs 2>&1 || true
-    return 1
-}
-
 _e2e_repo_root() {
     cd "$SCRIPT_DIR/../../../.." && pwd
 }
@@ -945,28 +827,6 @@ cmd_deploy_e2e_interception_proxy() {
     echo "✓ e2e-interception-proxy ready at http://e2e-interception-proxy.${NAMESPACE}.svc.cluster.local:8889"
 }
 
-cmd_deploy_e2e_mock_tls_inference() {
-    local repo_root
-    repo_root="$(_e2e_repo_root)"
-    echo "Deploying e2e-mock-tls-inference in namespace $NAMESPACE..."
-    oc create configmap e2e-mock-tls-inference-script -n "$NAMESPACE" \
-        --from-file=server.py="$repo_root/tests/e2e/mock_tls_inference_server/server.py" \
-        --dry-run=client -o yaml | oc apply -f -
-    oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true
-    oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml"
-    if ! oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then
-        echo "ERROR: e2e-mock-tls-inference failed to become ready" >&2
-        oc describe pod/e2e-mock-tls-inference -n "$NAMESPACE" 2>/dev/null | tail -30 || true
-        oc logs e2e-mock-tls-inference -n "$NAMESPACE" --tail=40 2>&1 || true
-        return 1
-    fi
-    echo "✓ e2e-mock-tls-inference ready at https://e2e-mock-tls-inference.${NAMESPACE}.svc.cluster.local:8443"
-    if ! cmd_sync_mock_tls_certs_secret; then
-        echo "WARNING: mock TLS server is up but e2e-mock-tls-certs secret sync failed" >&2
-        return 1
-    fi
-}
-
 cmd_disrupt_llama_stack() {
     local pod_name="llama-stack-service"
 
@@ -1037,12 +897,6 @@ case "$COMMAND" in
     deploy-e2e-interception-proxy)
         cmd_deploy_e2e_interception_proxy
         ;;
-    deploy-e2e-mock-tls-inference)
-        cmd_deploy_e2e_mock_tls_inference
-        ;;
-    sync-mock-tls-certs-secret)
-        cmd_sync_mock_tls_certs_secret
-        ;;
     *)
         echo "Usage: $0 <command> [args...]"
         echo ""
@@ -1061,8 +915,6 @@ case "$COMMAND" in
         echo "  sync-interception-proxy-ca-secret   - Publish trustme CA to Secret for llama mount"
         echo "  deploy-e2e-tunnel-proxy            - Deploy in-cluster tunnel proxy pod"
         echo "  deploy-e2e-interception-proxy      - Deploy in-cluster interception proxy pod"
-        echo "  deploy-e2e-mock-tls-inference        - Deploy mock HTTPS inference server (tls.feature)"
-        echo "  sync-mock-tls-certs-secret           - Publish mock TLS /certs to Secret for llama mount"
         exit 1
         ;;
 esac

From d40101359c3abc839fddc3d6398e92b706a6f4b1 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Wed, 27 May 2026 18:04:38 +0200
Subject: [PATCH 17/20] print logs

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 136 ++++++++++++++++++++++++
 tests/e2e/features/environment.py       |   6 +-
 tests/e2e/features/steps/tls.py         |  59 +++++-----
 tests/e2e/utils/prow_utils.py           |   2 +-
 4 files changed, 165 insertions(+), 38 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index 7924485e8..f315bfe12 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -25,6 +25,8 @@
 #   disrupt-llama-stack             - Delete llama-stack pod to disrupt connection
 #   deploy-e2e-tunnel-proxy         - Deploy in-cluster tunnel proxy (proxy.feature step)
 #   deploy-e2e-interception-proxy   - Deploy in-cluster interception proxy (proxy.feature step)
+#   deploy-e2e-mock-tls-inference   - Deploy in-cluster mock TLS server (tls.feature)
+#   reload-llama-stack-config       - Apply ConfigMap run.yaml in pod and restart main process
 
 set -e
 
@@ -356,6 +358,46 @@ wait_for_llama_stack_http_health() {
     return 1
 }
 
+# Copy llama-stack-config run.yaml into the running pod and signal PID 1 (no init rerun).
+# Uses global NAMESPACE. Returns 0 once the pod is Ready, HTTP-healthy and port-forwarded; 1 otherwise.
+cmd_reload_llama_stack_config() {
+    local pod="llama-stack-service" ctr="llama-stack-container" tmp
+
+    if ! oc get pod "$pod" -n "$NAMESPACE" &>/dev/null; then
+        echo "ERROR: $pod not found; use restart-llama-stack first" >&2
+        return 1
+    fi
+
+    tmp=$(mktemp)
+    if ! oc get configmap llama-stack-config -n "$NAMESPACE" \
+        -o "go-template={{index .data \"run.yaml\"}}" >"$tmp" 2>/dev/null \
+        || [[ ! -s "$tmp" ]]; then
+        rm -f "$tmp"
+        echo "ERROR: could not read run.yaml from llama-stack-config" >&2
+        return 1
+    fi
+
+    echo "Reloading Llama Stack run.yaml in $pod (container restart, not pod delete)..."
+    oc cp "$tmp" "$NAMESPACE/$pod:/opt/app-root/run.yaml" -c "$ctr" || {
+        rm -f "$tmp"
+        return 1
+    }
+    rm -f "$tmp"
+    # A bare "kill -1" names no PID and is a usage error; target PID 1. NOTE(review): confirm PID 1 handles SIGHUP.
+    oc exec -n "$NAMESPACE" "$pod" -c "$ctr" -- kill -HUP 1
+
+    if ! wait_for_pod "$pod" 40; then
+        echo "===== Llama-stack reload FAILED (pod not Ready) ====="
+        return 1
+    fi
+    if ! wait_for_llama_stack_http_health 40; then
+        echo "===== Llama-stack reload FAILED (HTTP not healthy) ====="
+        return 1
+    fi
+    cmd_restart_llama_port_forward || return 1
+    echo "===== Llama-stack reload complete ====="
+}
+
 # ============================================================================
 # Command implementations
 # ============================================================================
@@ -427,6 +469,12 @@ cmd_restart_llama_stack() {
                 exit 1
             fi
         fi
+        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+            if ! cmd_sync_mock_tls_certs_secret; then
+                echo "===== Llama-stack restore FAILED (mock TLS cert secret sync) ====="
+                exit 1
+            fi
+        fi
         _LLAMA_SVC_FQDN="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local"
         oc create secret generic llama-stack-ip-secret \
             --from-literal=key="$_LLAMA_SVC_FQDN" \
@@ -827,6 +875,86 @@ cmd_deploy_e2e_interception_proxy() {
     echo "✓ e2e-interception-proxy ready at http://e2e-interception-proxy.${NAMESPACE}.svc.cluster.local:8889"
 }
 
+_MOCK_TLS_CERT_FILES=(
+    ca.crt client.crt client.key untrusted-ca.crt expired-ca.crt
+    untrusted-client.crt untrusted-client.key expired-client.crt
+)
+
+# Succeed only if Secret e2e-mock-tls-certs exists and has a data key for every _MOCK_TLS_CERT_FILES entry.
+_mock_tls_certs_secret_is_complete() {
+    local cert_name key_list
+    oc get secret e2e-mock-tls-certs -n "$NAMESPACE" &>/dev/null || return 1
+    # Space-separated list of the Secret's data keys.
+    key_list=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \
+        -o go-template='{{range $k, $v := .data}}{{$k}} {{end}}' 2>/dev/null) || return 1
+    for cert_name in "${_MOCK_TLS_CERT_FILES[@]}"; do
+        if [[ " $key_list " != *" $cert_name "* ]]; then return 1; fi
+    done
+    return 0
+}
+
+# Publish the mock TLS server's /certs PEM files as Secret e2e-mock-tls-certs in $NAMESPACE.
+# Returns 0 when the Secret is already complete or was applied; 1 on any failure.
+cmd_sync_mock_tls_certs_secret() {
+    local mock_pod="e2e-mock-tls-inference" mock_ctr="e2e-mock-tls-inference"
+    local f tmpdir rc=0 from_args=()
+
+    if _mock_tls_certs_secret_is_complete; then
+        echo "✓ Secret e2e-mock-tls-certs already complete; skipping sync"
+        return 0
+    fi
+    if ! oc get pod "$mock_pod" -n "$NAMESPACE" &>/dev/null; then
+        echo "ERROR: $mock_pod not found (deploy-e2e-mock-tls-inference first)" >&2
+        return 1
+    fi
+    if ! oc exec -n "$NAMESPACE" "$mock_pod" -c "$mock_ctr" --request-timeout=30s -- \
+        test -s /certs/ca.crt 2>/dev/null; then
+        echo "ERROR: mock TLS /certs not ready yet" >&2
+        return 1
+    fi
+
+    tmpdir=$(mktemp -d)
+    for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
+        if ! oc exec -n "$NAMESPACE" "$mock_pod" -c "$mock_ctr" --request-timeout=30s -- \
+            cat "/certs/${f}" >"${tmpdir}/${f}" 2>/dev/null \
+            || [[ ! -s "${tmpdir}/${f}" ]]; then
+            rm -rf "$tmpdir"
+            echo "ERROR: failed to read /certs/${f} from mock pod" >&2
+            return 1
+        fi
+        from_args+=(--from-file="${f}=${tmpdir}/${f}")
+    done
+    # `set -e` is suppressed when callers use `if ! ...` / `|| return 1`, so capture the status by hand.
+    oc create secret generic e2e-mock-tls-certs \
+        "${from_args[@]}" -n "$NAMESPACE" \
+        --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - || rc=$?
+    rm -rf "$tmpdir"
+    [[ "$rc" -eq 0 ]] || { echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2; return 1; }
+    echo "✓ Secret e2e-mock-tls-certs updated"
+}
+
+# Deploy the mock TLS inference pod from server.py + manifest, wait for Ready, then sync its cert Secret.
+cmd_deploy_e2e_mock_tls_inference() {
+    local script_path
+    script_path="$(_e2e_repo_root)/tests/e2e/mock_tls_inference_server/server.py"
+    if [[ ! -f "$script_path" ]]; then
+        echo "ERROR: missing $script_path" >&2
+        return 1
+    fi
+    echo "Deploying e2e-mock-tls-inference in namespace $NAMESPACE..."
+    oc create configmap e2e-mock-tls-inference-script -n "$NAMESPACE" \
+        --from-file=server.py="$script_path" \
+        --dry-run=client -o yaml | oc apply -f -
+    oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml"
+    if ! oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then
+        echo "ERROR: e2e-mock-tls-inference not ready" >&2
+        e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 80
+        return 1
+    fi
+    if ! cmd_sync_mock_tls_certs_secret; then return 1; fi
+    echo "✓ e2e-mock-tls-inference ready"
+}
+
 cmd_disrupt_llama_stack() {
     local pod_name="llama-stack-service"
 
@@ -897,6 +1025,12 @@ case "$COMMAND" in
     deploy-e2e-interception-proxy)
         cmd_deploy_e2e_interception_proxy
         ;;
+    deploy-e2e-mock-tls-inference)
+        cmd_deploy_e2e_mock_tls_inference
+        ;;
+    reload-llama-stack-config)
+        cmd_reload_llama_stack_config
+        ;;
     *)
         echo "Usage: $0 <command> [args...]"
         echo ""
@@ -915,6 +1049,8 @@ case "$COMMAND" in
         echo "  sync-interception-proxy-ca-secret   - Publish trustme CA to Secret for llama mount"
         echo "  deploy-e2e-tunnel-proxy            - Deploy in-cluster tunnel proxy pod"
         echo "  deploy-e2e-interception-proxy      - Deploy in-cluster interception proxy pod"
+        echo "  deploy-e2e-mock-tls-inference        - Deploy in-cluster mock TLS inference (tls.feature)"
+        echo "  reload-llama-stack-config            - Reload run.yaml without deleting llama pod"
         exit 1
         ;;
 esac
diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py
index abcabe577..025c536e4 100644
--- a/tests/e2e/features/environment.py
+++ b/tests/e2e/features/environment.py
@@ -26,10 +26,7 @@
     reset_llama_stack_disrupt_once_tracking,
     reset_llama_stack_was_running,
 )
-from tests.e2e.features.steps.tls import (
-    prepare_tls_feature_entry_on_prow,
-    reset_tls_prow_state,
-)
+from tests.e2e.features.steps.tls import reset_tls_prow_state
 from tests.e2e.utils.llama_stack_utils import register_shield
 from tests.e2e.utils.prow_utils import (
     restart_pod,
@@ -457,7 +454,6 @@ def before_feature(context: Context, feature: Feature) -> None:
     reset_llama_stack_disrupt_once_tracking()
     if feature.filename and "tls.feature" in feature.filename:
         reset_tls_prow_state()
-        prepare_tls_feature_entry_on_prow()
 
     try:
         max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS))
diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py
index 67f5fa360..23a65b581 100644
--- a/tests/e2e/features/steps/tls.py
+++ b/tests/e2e/features/steps/tls.py
@@ -19,7 +19,6 @@
     backup_llama_config,
     clear_llama_config_backup,
     load_llama_config,
-    reset_llama_run_config_to_pipeline_default,
     write_llama_config,
 )
 from tests.e2e.utils.prow_utils import get_namespace, restart_pod, run_e2e_ops
@@ -36,43 +35,17 @@
 }
 
 _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False}
+_tls_llama_pod_warmed: dict[str, bool] = {"done": False}
 
 
 def reset_tls_prow_state() -> None:
     """Reset per-feature Prow state (call from ``before_feature``)."""
     _mock_tls_cluster_deploy_state["done"] = False
+    _tls_llama_pod_warmed["done"] = False
     os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None)
     clear_llama_config_backup()
 
 
-def prepare_tls_feature_entry_on_prow() -> None:
-    """Baseline cluster state when tls.feature runs after other features in test_list.
-
-    Earlier features (disrupted, MCP) delete or reconfigure Llama without mock TLS
-    certs. Isolated tls.feature runs skip that churn, which is why the same Gherkin
-    passes alone but flakes mid-feature in the full suite.
-    """
-    if not is_prow_environment():
-        return
-    print("[tls.feature] Prow/Konflux entry: reset run.yaml and warm Llama + mock TLS...")
-    reset_llama_run_config_to_pipeline_default()
-    result = run_e2e_ops("deploy-e2e-mock-tls-inference", timeout=300)
-    print(result.stdout, end="")
-    if result.returncode != 0:
-        raise RuntimeError(
-            "tls.feature entry: deploy-e2e-mock-tls-inference failed: "
-            f"{result.stderr or result.stdout}"
-        )
-    _mock_tls_cluster_deploy_state["done"] = True
-    _prepare_tls_prow_llama_restart_env()
-    os.environ.setdefault(
-        "E2E_MOCK_TLS_INFERENCE_HOST",
-        _cluster_mock_tls_inference_host(),
-    )
-    restart_pod("llama-stack")
-    print("[tls.feature] Prow/Konflux entry baseline complete", flush=True)
-
-
 def is_tls_configuration_feature(context: Context) -> bool:
     """Return True when the active Behave feature is ``tls.feature``."""
     feature = getattr(context, "feature", None)
@@ -88,7 +61,12 @@ def _prepare_tls_prow_llama_restart_env() -> None:
 
 
 def restart_llama_for_tls_feature(context: Context) -> None:
-    """Restart Llama for TLS tests (full pod recreate on Prow/Konflux)."""
+    """Restart Llama for TLS tests.
+
+    On Prow/Konflux the first restart per feature recreates the pod (mock TLS cert
+    Secret volume). Later restarts reload run.yaml in-place (``kill 1``) to avoid
+    re-running the heavy setup-from-source init on every scenario.
+    """
     from tests.e2e.utils.utils import restart_container
 
     if not is_prow_environment():
@@ -97,11 +75,28 @@ def restart_llama_for_tls_feature(context: Context) -> None:
 
     _prepare_tls_prow_llama_restart_env()
     scenario = getattr(getattr(context, "scenario", None), "name", "") or "?"
+
+    if not _tls_llama_pod_warmed["done"]:
+        print(
+            f"[tls.feature] Llama Stack restart: pod recreate (once per feature) "
+            f"scenario={scenario!r}",
+            flush=True,
+        )
+        restart_pod("llama-stack")
+        _tls_llama_pod_warmed["done"] = True
+        return
+
     print(
-        f"[tls.feature] Llama Stack restart: full recreate scenario={scenario!r}",
+        f"[tls.feature] Llama Stack restart: reload run.yaml scenario={scenario!r}",
         flush=True,
     )
-    restart_container("llama-stack")
+    result = run_e2e_ops("reload-llama-stack-config", timeout=240)
+    print(result.stdout, end="")
+    if result.returncode != 0:
+        raise RuntimeError(
+            "tls.feature: reload-llama-stack-config failed: "
+            f"{result.stderr or result.stdout}"
+        )
 
 
 def _cluster_mock_tls_inference_host() -> str:
diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py
index 263b415a3..348c918ec 100644
--- a/tests/e2e/utils/prow_utils.py
+++ b/tests/e2e/utils/prow_utils.py
@@ -96,7 +96,7 @@ def restart_pod(container_name: str) -> None:
         # Subprocess cap must exceed e2e-ops internal waits (pod + in-pod health + port-forward).
         # Konflux TLS full recreate: ~6–12 min typical, 15+ min under load (user-reported 400s+).
         if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1":
-            timeout = 1200
+            timeout = 900
         elif os.environ.get("E2E_KONFLUX_E2E") == "1":
             timeout = 720
         else:

From 8eacfd43ec9d1314a47759d4fd217577b4b723d3 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Wed, 27 May 2026 21:17:04 +0200
Subject: [PATCH 18/20] print logs

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 20 ++++++++++++++++----
 tests/e2e/features/steps/tls.py         |  6 ++++--
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index f315bfe12..04853c167 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -379,22 +379,34 @@ cmd_reload_llama_stack_config() {
     fi
 
     echo "Reloading Llama Stack run.yaml in $pod (container restart, not pod delete)..."
-    oc cp "$tmp" "$NAMESPACE/$pod:/opt/app-root/run.yaml" -c "$ctr" || {
+    if ! oc cp "$tmp" "$NAMESPACE/$pod:/opt/app-root/run.yaml" -c "$ctr"; then
         rm -f "$tmp"
+        echo "ERROR: oc cp run.yaml into $pod failed" >&2
+        e2e_ops_dump_pod_logs "$pod" "$ctr" 150
         return 1
-    }
+    fi
     rm -f "$tmp"
-    oc exec -n "$NAMESPACE" "$pod" -c "$ctr" -- kill -1
+    # kill -1 is parsed as "signal -1, no PID" — use kill -HUP 1 (PID 1 = main process).
+    if ! oc exec -n "$NAMESPACE" "$pod" -c "$ctr" -- kill -HUP 1; then
+        echo "ERROR: kill -HUP 1 failed in $pod" >&2
+        e2e_ops_dump_pod_logs "$pod" "$ctr" 150
+        return 1
+    fi
 
     if ! wait_for_pod "$pod" 40; then
         echo "===== Llama-stack reload FAILED (pod not Ready) ====="
+        e2e_ops_dump_pod_logs "$pod" "$ctr" 200
         return 1
     fi
     if ! wait_for_llama_stack_http_health 40; then
         echo "===== Llama-stack reload FAILED (HTTP not healthy) ====="
+        e2e_ops_dump_pod_logs "$pod" "$ctr" 200
+        return 1
+    fi
+    if ! cmd_restart_llama_port_forward; then
+        e2e_ops_dump_pod_logs "$pod" "$ctr" 120
         return 1
     fi
-    cmd_restart_llama_port_forward || return 1
     echo "===== Llama-stack reload complete ====="
 }
 
diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py
index 23a65b581..ae363d245 100644
--- a/tests/e2e/features/steps/tls.py
+++ b/tests/e2e/features/steps/tls.py
@@ -92,10 +92,12 @@ def restart_llama_for_tls_feature(context: Context) -> None:
     )
     result = run_e2e_ops("reload-llama-stack-config", timeout=240)
     print(result.stdout, end="")
+    if result.stderr:
+        print(result.stderr, end="")
     if result.returncode != 0:
+        detail = f"{result.stdout or ''}\n{result.stderr or ''}".strip()
         raise RuntimeError(
-            "tls.feature: reload-llama-stack-config failed: "
-            f"{result.stderr or result.stdout}"
+            f"tls.feature: reload-llama-stack-config failed:\n{detail or result.returncode}"
         )
 
 

From e9e54bac38c1fa324cc38c85f339cb004385dad8 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Thu, 28 May 2026 08:04:25 +0200
Subject: [PATCH 19/20] clear the logic

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 444 ++++++++++++------------
 tests/e2e/features/environment.py       |   6 +-
 tests/e2e/features/steps/tls.py         |  61 ++--
 tests/e2e/utils/prow_utils.py           |   2 +-
 4 files changed, 260 insertions(+), 253 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index 04853c167..e3c0b918b 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -25,8 +25,8 @@
 #   disrupt-llama-stack             - Delete llama-stack pod to disrupt connection
 #   deploy-e2e-tunnel-proxy         - Deploy in-cluster tunnel proxy (proxy.feature step)
 #   deploy-e2e-interception-proxy   - Deploy in-cluster interception proxy (proxy.feature step)
-#   deploy-e2e-mock-tls-inference   - Deploy in-cluster mock TLS server (tls.feature)
-#   reload-llama-stack-config       - Apply ConfigMap run.yaml in pod and restart main process
+#   deploy-e2e-mock-tls-inference   - Deploy mock HTTPS inference server (tls.feature step)
+#   sync-mock-tls-certs-secret      - Publish /certs PEMs to Secret for llama-stack mount
 
 set -e
 
@@ -42,101 +42,33 @@ E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-
 # Helper functions
 # ============================================================================
 
-# On failure, print everything useful to stdout (captured by Behave / CI).
-# Tolerates missing pods (uses events) and partial API errors.
-e2e_ops_dump_pod_logs() {
-    local pod_name="${1:?pod name required}"
-    local preferred_container="${2:-}"
-    local log_tail="${3:-200}"
-    local prefix="[e2e-ops] "
-    local init_ctr ctr restart_count phase
-
-    echo "${prefix}========== failure logs: pod/$pod_name (namespace $NAMESPACE) =========="
-
-    echo "${prefix}--- events for pod/$pod_name ---"
-    oc get events -n "$NAMESPACE" --field-selector "involvedObject.name=${pod_name}" \
-        --sort-by='.lastTimestamp' 2>&1 | tail -50 | sed "s/^/${prefix}/" \
-        || echo "${prefix}(could not list events)"
-
-    if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then
-        phase="Missing"
-        echo "${prefix}pod/$pod_name not found in namespace (deleted, failed, or API error)"
-        echo "${prefix}--- pods in $NAMESPACE ---"
-        oc get pods -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true
-        echo "${prefix}========== end failure logs: pod/$pod_name =========="
-        return 0
-    fi
-
-    phase=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "?")
-    echo "${prefix}pod phase=$phase"
-    oc get pod "$pod_name" -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true
-
-    echo "${prefix}--- oc describe pod/$pod_name ---"
-    oc describe pod "$pod_name" -n "$NAMESPACE" 2>&1 | sed "s/^/${prefix}/" || true
-
-    for init_ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \
-        -o jsonpath='{.spec.initContainers[*].name}' 2>/dev/null); do
-        [[ -n "$init_ctr" ]] || continue
-        echo "${prefix}--- oc logs pod/$pod_name -c $init_ctr (init, tail $log_tail) ---"
-        oc logs "$pod_name" -n "$NAMESPACE" -c "$init_ctr" --tail="$log_tail" 2>&1 \
-            | sed "s/^/${prefix}/" \
-            || echo "${prefix}(no init logs for $init_ctr yet)"
-    done
-
-    for ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \
-        -o jsonpath='{.spec.containers[*].name}' 2>/dev/null); do
-        [[ -n "$ctr" ]] || continue
-        echo "${prefix}--- oc logs pod/$pod_name -c $ctr (tail $log_tail) ---"
-        oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --tail="$log_tail" 2>&1 \
-            | sed "s/^/${prefix}/" \
-            || echo "${prefix}(no logs for container $ctr)"
-        restart_count=$(oc get pod "$pod_name" -n "$NAMESPACE" \
-            -o jsonpath="{.status.containerStatuses[?(@.name==\"${ctr}\")].restartCount}" \
-            2>/dev/null) || restart_count="0"
-        if [[ "${restart_count:-0}" -gt 0 ]]; then
-            echo "${prefix}--- oc logs pod/$pod_name -c $ctr --previous (tail $log_tail) ---"
-            oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --previous --tail="$log_tail" 2>&1 \
-                | sed "s/^/${prefix}/" \
-                || echo "${prefix}(no --previous logs for $ctr)"
-        fi
-    done
-
-    if [[ -n "$preferred_container" ]]; then
-        echo "${prefix}--- oc logs pod/$pod_name -c $preferred_container (preferred, tail $log_tail) ---"
-        oc logs "$pod_name" -n "$NAMESPACE" -c "$preferred_container" --tail="$log_tail" 2>&1 \
-            | sed "s/^/${prefix}/" || true
-    fi
-
-    echo "${prefix}========== end failure logs: pod/$pod_name =========="
-}
-
 wait_for_pod() {
     local pod_name="$1"
     local max_attempts="${2:-24}"
-    local attempt phase ready
+    local attempt
+    local ready
+    local phase
 
     for ((attempt=1; attempt<=max_attempts; attempt++)); do
         if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then
             phase="Missing"
-            ready="false"
         else
             phase=$(oc get pod "$pod_name" -n "$NAMESPACE" \
-                -o jsonpath='{.status.phase}' 2>/dev/null || echo "?")
+                -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
             ready=$(oc get pod "$pod_name" -n "$NAMESPACE" \
                 -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
-        fi
-        if [[ "$ready" == "true" ]]; then
-            echo "✓ Pod $pod_name ready (attempt $attempt/$max_attempts)"
-            return 0
+            if [[ "$ready" == "true" ]]; then
+                echo "✓ Pod $pod_name ready (attempt $attempt/$max_attempts)"
+                return 0
+            fi
         fi
         if [[ $((attempt % 10)) -eq 0 ]]; then
-            echo "[e2e-ops] $pod_name not ready (attempt $attempt/$max_attempts, phase=$phase)"
+            echo "[e2e-ops] $pod_name not ready yet (attempt $attempt/$max_attempts, phase=${phase:-?})..."
         fi
         sleep 3
     done
 
     echo "Pod $pod_name not ready after $((max_attempts * 3))s (last phase: ${phase:-unknown})"
-    e2e_ops_dump_pod_logs "$pod_name" "" 250
     return 1
 }
 
@@ -354,62 +286,12 @@ wait_for_llama_stack_http_health() {
         fi
     done
     echo "ERROR: Llama Stack did not respond on http://127.0.0.1:8321/v1/health inside the pod"
-    e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 250
+    oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true
+    oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true
+    oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=120 2>&1 || true
     return 1
 }
 
-# Copy llama-stack-config run.yaml into the running pod and restart PID 1 (no init rerun).
-cmd_reload_llama_stack_config() {
-    local pod="llama-stack-service"
-    local ctr="llama-stack-container"
-    local tmp
-
-    if ! oc get pod "$pod" -n "$NAMESPACE" &>/dev/null; then
-        echo "ERROR: $pod not found; use restart-llama-stack first" >&2
-        return 1
-    fi
-
-    tmp=$(mktemp)
-    if ! oc get configmap llama-stack-config -n "$NAMESPACE" \
-        -o "go-template={{index .data \"run.yaml\"}}" >"$tmp" 2>/dev/null \
-        || [[ ! -s "$tmp" ]]; then
-        rm -f "$tmp"
-        echo "ERROR: could not read run.yaml from llama-stack-config" >&2
-        return 1
-    fi
-
-    echo "Reloading Llama Stack run.yaml in $pod (container restart, not pod delete)..."
-    if ! oc cp "$tmp" "$NAMESPACE/$pod:/opt/app-root/run.yaml" -c "$ctr"; then
-        rm -f "$tmp"
-        echo "ERROR: oc cp run.yaml into $pod failed" >&2
-        e2e_ops_dump_pod_logs "$pod" "$ctr" 150
-        return 1
-    fi
-    rm -f "$tmp"
-    # kill -1 is parsed as "signal -1, no PID" — use kill -HUP 1 (PID 1 = main process).
-    if ! oc exec -n "$NAMESPACE" "$pod" -c "$ctr" -- kill -HUP 1; then
-        echo "ERROR: kill -HUP 1 failed in $pod" >&2
-        e2e_ops_dump_pod_logs "$pod" "$ctr" 150
-        return 1
-    fi
-
-    if ! wait_for_pod "$pod" 40; then
-        echo "===== Llama-stack reload FAILED (pod not Ready) ====="
-        e2e_ops_dump_pod_logs "$pod" "$ctr" 200
-        return 1
-    fi
-    if ! wait_for_llama_stack_http_health 40; then
-        echo "===== Llama-stack reload FAILED (HTTP not healthy) ====="
-        e2e_ops_dump_pod_logs "$pod" "$ctr" 200
-        return 1
-    fi
-    if ! cmd_restart_llama_port_forward; then
-        e2e_ops_dump_pod_logs "$pod" "$ctr" 120
-        return 1
-    fi
-    echo "===== Llama-stack reload complete ====="
-}
-
 # ============================================================================
 # Command implementations
 # ============================================================================
@@ -442,13 +324,32 @@ cmd_restart_lightspeed() {
     # Don't let a timeout here abort the function — still attempt port-forward
     # and diagnostics so later scenarios have a chance to recover.
     local pod_ready=true
-    if ! wait_for_pod "lightspeed-stack-service" 40; then
+    local lcs_pod_wait=40
+    if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then
+        # readinessProbe: 20s + 30*5s; LCS + Llama handshake can exceed 195s on Konflux (TLS suite).
+        lcs_pod_wait=100
+    fi
+    echo "[e2e-ops] Waiting for lightspeed-stack-service Ready (max ${lcs_pod_wait} attempts, $((lcs_pod_wait * 3))s)..."
+    if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then
         pod_ready=false
-        echo "⚠️  Pod not ready within 120s"
+        echo "⚠️  Pod not ready within $((lcs_pod_wait * 3))s — dumping diagnostics:"
+        if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then
+            oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true
+            oc logs lightspeed-stack-service -n "$NAMESPACE" -c lightspeed-stack-container --tail=60 2>&1 || true
+        else
+            echo "[e2e-ops] lightspeed-stack-service pod not found in namespace $NAMESPACE"
+            oc get pods -n "$NAMESPACE" 2>&1 | grep -E 'lightspeed|NAME' || true
+            oc get events -n "$NAMESPACE" --field-selector involvedObject.name=lightspeed-stack-service \
+                --sort-by='.lastTimestamp' 2>&1 | tail -15 || true
+        fi
     fi
 
-    # Re-label pod for service discovery
-    oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite
+    # Re-label pod for service discovery (ignore if pod was deleted / not created yet)
+    if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then
+        oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite
+    else
+        echo "⚠️  Cannot label lightspeed-stack-service — pod missing"
+    fi
 
     # Re-establish port-forwards (may succeed even if readiness was slow)
     cmd_restart_port_forward
@@ -456,7 +357,6 @@ cmd_restart_lightspeed() {
 
     if [[ "$pod_ready" == "false" ]]; then
         echo "⚠️  Lightspeed restart completed but pod was slow to become ready"
-        e2e_ops_dump_pod_logs "lightspeed-stack-service" "lightspeed-stack-container" 200
         return 1
     fi
     echo "✓ Lightspeed restart complete"
@@ -464,6 +364,7 @@ cmd_restart_lightspeed() {
 
 cmd_restart_llama_stack() {
     echo "===== Restoring llama-stack service ====="
+    echo "[e2e-ops] restart-llama-stack env: E2E_KONFLUX_E2E=${E2E_KONFLUX_E2E:-0} E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA=${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}"
     # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env.
     echo "Deleting llama-stack pod (if any) before apply..."
     timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || {
@@ -482,8 +383,9 @@ cmd_restart_llama_stack() {
             fi
         fi
         if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+            echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..."
             if ! cmd_sync_mock_tls_certs_secret; then
-                echo "===== Llama-stack restore FAILED (mock TLS cert secret sync) ====="
+                echo "===== Llama-stack restore FAILED (mock TLS certs secret sync) ====="
                 exit 1
             fi
         fi
@@ -493,19 +395,39 @@ cmd_restart_llama_stack() {
             -n "$NAMESPACE" \
             --dry-run=client -o yaml | oc apply -f -
         oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml"
-        wait_for_pod "llama-stack-service" 90
+        local llama_pod_wait=90
+        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+            # readinessProbe: 20s + 36*5s = 200s; clone/enrich/RAG on Konflux often needs 400s+ total.
+            llama_pod_wait=180
+        fi
+        echo "[e2e-ops] Waiting for llama-stack-service Ready (max ${llama_pod_wait} attempts, $((llama_pod_wait * 3))s)..."
+        if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then
+            echo "===== Llama-stack restore FAILED (pod not Ready within $((llama_pod_wait * 3))s) ====="
+            oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true
+            oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -50 || true
+            oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true
+            exit 1
+        fi
         echo "Labeling pod for service..."
         oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite
         if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then
             if ! _verify_interception_ca_mounted_in_llama; then
                 echo "===== Llama-stack restore FAILED (interception CA not mounted) ====="
-                e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200
                 exit 1
             fi
         fi
-        if ! wait_for_llama_stack_http_health 50; then
+        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+            if ! _verify_mock_tls_certs_mounted_in_llama; then
+                echo "===== Llama-stack restore FAILED (mock TLS certs not mounted) ====="
+                exit 1
+            fi
+        fi
+        local llama_health_attempts=50
+        if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+            llama_health_attempts=100
+        fi
+        if ! wait_for_llama_stack_http_health "$llama_health_attempts"; then
             echo "===== Llama-stack restore FAILED (HTTP not healthy) ====="
-            e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 250
             exit 1
         fi
     else
@@ -520,7 +442,6 @@ cmd_restart_llama_stack() {
 
     if ! cmd_restart_llama_port_forward; then
         echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed"
-        e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200
         exit 1
     fi
 
@@ -614,20 +535,23 @@ cmd_restart_llama_port_forward() {
     local local_port="${LOCAL_LLAMA_PORT:-8321}"
     local remote_port="${REMOTE_LLAMA_PORT:-8321}"
     local max_attempts=6
+    if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
+        max_attempts=10
+    fi
     local pf_pid
     local pf_resource
     local llama_pf_log="/tmp/port-forward-llama.log"
 
-    echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port..."
+    echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port (max $max_attempts attempts)..."
 
     for ((attempt=1; attempt<=max_attempts; attempt++)); do
         kill_stale_llama_forward "$local_port"
         sleep 3
 
-        if [[ $attempt -le 2 ]]; then
-            pf_resource="svc/llama-stack-service-svc"
-        else
+        if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]] || [[ $attempt -ge 3 ]]; then
             pf_resource="pod/llama-stack-service"
+        else
+            pf_resource="svc/llama-stack-service-svc"
         fi
         echo "Llama port-forward attempt $attempt/$max_attempts -> $pf_resource"
 
@@ -665,10 +589,8 @@ cmd_restart_llama_port_forward() {
 
     echo "Failed to establish Llama Stack port-forward on :$local_port"
     if [[ -s "$llama_pf_log" ]]; then
-        echo "[e2e-ops] $llama_pf_log (tail 30):"
         tail -30 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true
     fi
-    e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 200
     return 1
 }
 
@@ -851,6 +773,142 @@ cmd_copy_interception_proxy_ca_to_llama() {
     cmd_sync_interception_proxy_ca_secret
 }
 
+_MOCK_TLS_CERT_FILES=(
+    ca.crt
+    client.crt
+    client.key
+    untrusted-ca.crt
+    expired-ca.crt
+    untrusted-client.crt
+    untrusted-client.key
+    expired-client.crt
+)
+
+_mock_tls_secret_is_complete() {
+    local f b64
+    for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
+        b64=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \
+            -o "go-template={{index .data \"${f}\"}}" 2>/dev/null) || return 1
+        [[ -n "$b64" ]] || return 1
+    done
+    return 0
+}
+
+_get_mock_tls_inference_pod_name() {
+    oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \
+        -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
+}
+
+_wait_for_mock_tls_inference_pod() {
+    if ! oc wait pod -l app=e2e-mock-tls-inference -n "$NAMESPACE" \
+        --for=condition=Ready --timeout=120s 2>/dev/null; then
+        echo "ERROR: e2e-mock-tls-inference pod not Ready" >&2
+        oc get pods -n "$NAMESPACE" -l app=e2e-mock-tls-inference -o wide 2>&1 || true
+        return 1
+    fi
+    return 0
+}
+
+_copy_mock_tls_cert_from_pod() {
+    local mock_pod_name="$1"
+    local cert_file="$2"
+    local dest="$3"
+    local attempt
+
+    for ((attempt=1; attempt<=4; attempt++)); do
+        if oc exec --request-timeout=90 -n "$NAMESPACE" "$mock_pod_name" \
+            -c e2e-mock-tls-inference -- cat "/certs/$cert_file" >"$dest" 2>/dev/null \
+            && [[ -s "$dest" ]]; then
+            return 0
+        fi
+        echo "[e2e-ops] WARN: read /certs/$cert_file from mock pod failed (attempt $attempt/4)"
+        sleep 5
+    done
+    return 1
+}
+
+_recycle_mock_tls_inference_pod() {
+    echo "[e2e-ops] Recycling e2e-mock-tls-inference pod (stale or unresponsive)..."
+    oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true
+    sleep 3
+    if ! _wait_for_mock_tls_inference_pod; then
+        return 1
+    fi
+    # Certs are written at container start; allow trustme + pip to finish.
+    sleep 10
+    return 0
+}
+
+cmd_sync_mock_tls_certs_secret() {
+    local mock_pod_name tmpdir f recycle_attempt
+
+    if _mock_tls_secret_is_complete; then
+        echo "✓ Secret e2e-mock-tls-certs already complete (${#_MOCK_TLS_CERT_FILES[@]} keys); skipping sync"
+        return 0
+    fi
+
+    for recycle_attempt in 1 2; do
+        mock_pod_name=$(_get_mock_tls_inference_pod_name)
+        if [[ -z "$mock_pod_name" ]]; then
+            echo "ERROR: no e2e-mock-tls-inference pod in namespace $NAMESPACE" >&2
+            echo "  Run: e2e-ops.sh deploy-e2e-mock-tls-inference" >&2
+            return 1
+        fi
+
+        if ! _wait_for_mock_tls_inference_pod; then
+            [[ $recycle_attempt -lt 2 ]] && _recycle_mock_tls_inference_pod && continue
+            return 1
+        fi
+
+        tmpdir=$(mktemp -d)
+        local sync_ok=true
+        for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
+            if ! _copy_mock_tls_cert_from_pod "$mock_pod_name" "$f" "$tmpdir/$f"; then
+                echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2
+                oc logs "$mock_pod_name" -n "$NAMESPACE" -c e2e-mock-tls-inference --tail=40 2>&1 \
+                    | sed 's/^/[e2e-ops] /' || true
+                sync_ok=false
+                break
+            fi
+        done
+
+        if [[ "$sync_ok" == "true" ]]; then
+            if ! oc create secret generic e2e-mock-tls-certs \
+                --from-file="$tmpdir" \
+                -n "$NAMESPACE" \
+                --dry-run=client -o yaml | oc apply -f -; then
+                echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2
+                rm -rf "$tmpdir"
+                return 1
+            fi
+            rm -rf "$tmpdir"
+            echo "✓ Secret e2e-mock-tls-certs updated (${#_MOCK_TLS_CERT_FILES[@]} files)"
+            return 0
+        fi
+
+        rm -rf "$tmpdir"
+        if [[ $recycle_attempt -lt 2 ]]; then
+            _recycle_mock_tls_inference_pod || return 1
+        fi
+    done
+
+    return 1
+}
+
+_verify_mock_tls_certs_mounted_in_llama() {
+    local llama_pod_name="llama-stack-service"
+    if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
+        sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key'; then
+        echo "✓ mock TLS certs present under /certs in llama-stack"
+        return 0
+    fi
+    echo "ERROR: /certs missing or incomplete in llama-stack pod" >&2
+    oc get secret e2e-mock-tls-certs -n "$NAMESPACE" 2>&1 || true
+    oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \
+        ls -la /certs 2>&1 || true
+    return 1
+}
+
 _e2e_repo_root() {
     cd "$SCRIPT_DIR/../../../.." && pwd
 }
@@ -887,84 +945,26 @@ cmd_deploy_e2e_interception_proxy() {
     echo "✓ e2e-interception-proxy ready at http://e2e-interception-proxy.${NAMESPACE}.svc.cluster.local:8889"
 }
 
-_MOCK_TLS_CERT_FILES=(
-    ca.crt client.crt client.key untrusted-ca.crt expired-ca.crt
-    untrusted-client.crt untrusted-client.key expired-client.crt
-)
-
-_mock_tls_certs_secret_is_complete() {
-    local f present
-    if ! oc get secret e2e-mock-tls-certs -n "$NAMESPACE" &>/dev/null; then
-        return 1
-    fi
-    present=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \
-        -o go-template='{{range $k, $v := .data}}{{$k}} {{end}}' 2>/dev/null) || return 1
-    for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
-        [[ " $present " == *" $f "* ]] || return 1
-    done
-    return 0
-}
-
-cmd_sync_mock_tls_certs_secret() {
-    local mock_pod="e2e-mock-tls-inference"
-    local mock_ctr="e2e-mock-tls-inference"
-    local f tmpdir from_args
-
-    if _mock_tls_certs_secret_is_complete; then
-        echo "✓ Secret e2e-mock-tls-certs already complete; skipping sync"
-        return 0
-    fi
-    if ! oc get pod "$mock_pod" -n "$NAMESPACE" &>/dev/null; then
-        echo "ERROR: $mock_pod not found (deploy-e2e-mock-tls-inference first)" >&2
-        return 1
-    fi
-    if ! oc exec -n "$NAMESPACE" "$mock_pod" -c "$mock_ctr" --request-timeout=30s -- \
-        test -s /certs/ca.crt 2>/dev/null; then
-        echo "ERROR: mock TLS /certs not ready yet" >&2
-        return 1
-    fi
-
-    tmpdir=$(mktemp -d)
-    for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
-        if ! oc exec -n "$NAMESPACE" "$mock_pod" -c "$mock_ctr" --request-timeout=30s -- \
-            cat "/certs/${f}" >"${tmpdir}/${f}" 2>/dev/null \
-            || [[ ! -s "${tmpdir}/${f}" ]]; then
-            rm -rf "$tmpdir"
-            echo "ERROR: failed to read /certs/${f} from mock pod" >&2
-            return 1
-        fi
-    done
-    from_args=()
-    for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
-        from_args+=(--from-file="${f}=${tmpdir}/${f}")
-    done
-    oc create secret generic e2e-mock-tls-certs \
-        "${from_args[@]}" -n "$NAMESPACE" \
-        --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f -
-    rm -rf "$tmpdir"
-    echo "✓ Secret e2e-mock-tls-certs updated"
-}
-
 cmd_deploy_e2e_mock_tls_inference() {
-    local repo_root server_py
+    local repo_root
     repo_root="$(_e2e_repo_root)"
-    server_py="$repo_root/tests/e2e/mock_tls_inference_server/server.py"
-    [[ -f "$server_py" ]] || {
-        echo "ERROR: missing $server_py" >&2
-        return 1
-    }
     echo "Deploying e2e-mock-tls-inference in namespace $NAMESPACE..."
     oc create configmap e2e-mock-tls-inference-script -n "$NAMESPACE" \
-        --from-file=server.py="$server_py" \
+        --from-file=server.py="$repo_root/tests/e2e/mock_tls_inference_server/server.py" \
         --dry-run=client -o yaml | oc apply -f -
+    oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true
     oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml"
     if ! oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then
-        echo "ERROR: e2e-mock-tls-inference not ready" >&2
-        e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 80
+        echo "ERROR: e2e-mock-tls-inference failed to become ready" >&2
+        oc describe pod/e2e-mock-tls-inference -n "$NAMESPACE" 2>/dev/null | tail -30 || true
+        oc logs e2e-mock-tls-inference -n "$NAMESPACE" --tail=40 2>&1 || true
+        return 1
+    fi
+    echo "✓ e2e-mock-tls-inference ready at https://e2e-mock-tls-inference.${NAMESPACE}.svc.cluster.local:8443"
+    if ! cmd_sync_mock_tls_certs_secret; then
+        echo "WARNING: mock TLS server is up but e2e-mock-tls-certs secret sync failed" >&2
         return 1
     fi
-    cmd_sync_mock_tls_certs_secret || return 1
-    echo "✓ e2e-mock-tls-inference ready"
 }
 
 cmd_disrupt_llama_stack() {
@@ -1040,8 +1040,8 @@ case "$COMMAND" in
     deploy-e2e-mock-tls-inference)
         cmd_deploy_e2e_mock_tls_inference
         ;;
-    reload-llama-stack-config)
-        cmd_reload_llama_stack_config
+    sync-mock-tls-certs-secret)
+        cmd_sync_mock_tls_certs_secret
         ;;
     *)
         echo "Usage: $0 <command> [args...]"
@@ -1061,8 +1061,8 @@ case "$COMMAND" in
         echo "  sync-interception-proxy-ca-secret   - Publish trustme CA to Secret for llama mount"
         echo "  deploy-e2e-tunnel-proxy            - Deploy in-cluster tunnel proxy pod"
         echo "  deploy-e2e-interception-proxy      - Deploy in-cluster interception proxy pod"
-        echo "  deploy-e2e-mock-tls-inference        - Deploy in-cluster mock TLS inference (tls.feature)"
-        echo "  reload-llama-stack-config            - Reload run.yaml without deleting llama pod"
+        echo "  deploy-e2e-mock-tls-inference        - Deploy mock HTTPS inference server (tls.feature)"
+        echo "  sync-mock-tls-certs-secret           - Publish mock TLS /certs to Secret for llama mount"
         exit 1
         ;;
 esac
diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py
index 025c536e4..abcabe577 100644
--- a/tests/e2e/features/environment.py
+++ b/tests/e2e/features/environment.py
@@ -26,7 +26,10 @@
     reset_llama_stack_disrupt_once_tracking,
     reset_llama_stack_was_running,
 )
-from tests.e2e.features.steps.tls import reset_tls_prow_state
+from tests.e2e.features.steps.tls import (
+    prepare_tls_feature_entry_on_prow,
+    reset_tls_prow_state,
+)
 from tests.e2e.utils.llama_stack_utils import register_shield
 from tests.e2e.utils.prow_utils import (
     restart_pod,
@@ -454,6 +457,7 @@ def before_feature(context: Context, feature: Feature) -> None:
     reset_llama_stack_disrupt_once_tracking()
     if feature.filename and "tls.feature" in feature.filename:
         reset_tls_prow_state()
+        prepare_tls_feature_entry_on_prow()
 
     try:
         max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS))
diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py
index ae363d245..67f5fa360 100644
--- a/tests/e2e/features/steps/tls.py
+++ b/tests/e2e/features/steps/tls.py
@@ -19,6 +19,7 @@
     backup_llama_config,
     clear_llama_config_backup,
     load_llama_config,
+    reset_llama_run_config_to_pipeline_default,
     write_llama_config,
 )
 from tests.e2e.utils.prow_utils import get_namespace, restart_pod, run_e2e_ops
@@ -35,17 +36,43 @@
 }
 
 _mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False}
-_tls_llama_pod_warmed: dict[str, bool] = {"done": False}
 
 
 def reset_tls_prow_state() -> None:
     """Reset per-feature Prow state (call from ``before_feature``)."""
     _mock_tls_cluster_deploy_state["done"] = False
-    _tls_llama_pod_warmed["done"] = False
     os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None)
     clear_llama_config_backup()
 
 
+def prepare_tls_feature_entry_on_prow() -> None:
+    """Bring the cluster to a known baseline before ``tls.feature`` on Prow/Konflux.
+
+    No-op outside Prow. Earlier features may leave Llama without the mock TLS certs,
+    so reset run.yaml, redeploy the mock TLS server and restart Llama up front.
+
+    Raises:
+        RuntimeError: if ``deploy-e2e-mock-tls-inference`` exits non-zero.
+    """
+    if not is_prow_environment():
+        return
+    print("[tls.feature] Prow/Konflux entry: reset run.yaml and warm Llama + mock TLS...")
+    reset_llama_run_config_to_pipeline_default()
+    deploy = run_e2e_ops("deploy-e2e-mock-tls-inference", timeout=300)
+    print(deploy.stdout, end="")
+    if deploy.returncode != 0:
+        failure_output = deploy.stderr or deploy.stdout
+        raise RuntimeError(
+            f"tls.feature entry: deploy-e2e-mock-tls-inference failed: {failure_output}"
+        )
+    _mock_tls_cluster_deploy_state["done"] = True
+    _prepare_tls_prow_llama_restart_env()
+    mock_host = _cluster_mock_tls_inference_host()
+    os.environ.setdefault("E2E_MOCK_TLS_INFERENCE_HOST", mock_host)
+    restart_pod("llama-stack")
+    print("[tls.feature] Prow/Konflux entry baseline complete", flush=True)
+
+
 def is_tls_configuration_feature(context: Context) -> bool:
     """Return True when the active Behave feature is ``tls.feature``."""
     feature = getattr(context, "feature", None)
@@ -61,12 +88,7 @@ def _prepare_tls_prow_llama_restart_env() -> None:
 
 
 def restart_llama_for_tls_feature(context: Context) -> None:
-    """Restart Llama for TLS tests.
-
-    On Prow/Konflux the first restart per feature recreates the pod (mock TLS cert
-    Secret volume). Later restarts reload run.yaml in-place (``kill 1``) to avoid
-    re-running the heavy setup-from-source init on every scenario.
-    """
+    """Restart Llama for TLS tests (full pod recreate on Prow/Konflux)."""
     from tests.e2e.utils.utils import restart_container
 
     if not is_prow_environment():
@@ -75,30 +97,11 @@ def restart_llama_for_tls_feature(context: Context) -> None:
 
     _prepare_tls_prow_llama_restart_env()
     scenario = getattr(getattr(context, "scenario", None), "name", "") or "?"
-
-    if not _tls_llama_pod_warmed["done"]:
-        print(
-            f"[tls.feature] Llama Stack restart: pod recreate (once per feature) "
-            f"scenario={scenario!r}",
-            flush=True,
-        )
-        restart_pod("llama-stack")
-        _tls_llama_pod_warmed["done"] = True
-        return
-
     print(
-        f"[tls.feature] Llama Stack restart: reload run.yaml scenario={scenario!r}",
+        f"[tls.feature] Llama Stack restart: full recreate scenario={scenario!r}",
         flush=True,
     )
-    result = run_e2e_ops("reload-llama-stack-config", timeout=240)
-    print(result.stdout, end="")
-    if result.stderr:
-        print(result.stderr, end="")
-    if result.returncode != 0:
-        detail = f"{result.stdout or ''}\n{result.stderr or ''}".strip()
-        raise RuntimeError(
-            f"tls.feature: reload-llama-stack-config failed:\n{detail or result.returncode}"
-        )
+    restart_container("llama-stack")
 
 
 def _cluster_mock_tls_inference_host() -> str:
diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py
index 348c918ec..263b415a3 100644
--- a/tests/e2e/utils/prow_utils.py
+++ b/tests/e2e/utils/prow_utils.py
@@ -96,7 +96,7 @@ def restart_pod(container_name: str) -> None:
         # Subprocess cap must exceed e2e-ops internal waits (pod + in-pod health + port-forward).
         # Konflux TLS full recreate: ~6–12 min typical, 15+ min under load (user-reported 400s+).
         if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1":
-            timeout = 900
+            timeout = 1200
         elif os.environ.get("E2E_KONFLUX_E2E") == "1":
             timeout = 720
         else:

From 67b7e7f39344b77f8c47516abb4e96c6885ba959 Mon Sep 17 00:00:00 2001
From: Radovan Fuchs <rfuchs@rfuchs-thinkpadp1gen7.tpb.csb>
Date: Thu, 28 May 2026 14:16:18 +0200
Subject: [PATCH 20/20] add logs dump to every failure

---
 tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 111 +++++++++++++++++++-----
 tests/e2e/features/environment.py       |  32 +++----
 tests/e2e/utils/prow_utils.py           |   8 +-
 3 files changed, 113 insertions(+), 38 deletions(-)

diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index e3c0b918b..b80b41914 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -27,6 +27,7 @@
 #   deploy-e2e-interception-proxy   - Deploy in-cluster interception proxy (proxy.feature step)
 #   deploy-e2e-mock-tls-inference   - Deploy mock HTTPS inference server (tls.feature step)
 #   sync-mock-tls-certs-secret      - Publish /certs PEMs to Secret for llama-stack mount
+#   dump-pod-logs <pod> [container] - Print events, describe, init + container logs (on failure)
 
 set -e
 
@@ -42,6 +43,66 @@ E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-
 # Helper functions
 # ============================================================================
 
+# Print pod diagnostics to stdout (captured by Behave as CAPTURED STDOUT).
+# Args: <pod> [preferred-container] [tail-lines, default 200].
+e2e_ops_dump_pod_logs() {
+    local pod_name="${1:?pod name required}"
+    local preferred_container="${2:-}" log_tail="${3:-200}" prefix="[e2e-ops] "
+    local init_ctr ctr restart_count phase dumped_ctrs=""
+
+    echo "${prefix}========== failure logs: pod/$pod_name (namespace $NAMESPACE) =========="
+    echo "${prefix}--- events for pod/$pod_name ---"
+    oc get events -n "$NAMESPACE" --field-selector "involvedObject.name=${pod_name}" \
+        --sort-by='.lastTimestamp' 2>&1 | tail -50 | sed "s/^/${prefix}/" \
+        || echo "${prefix}(could not list events)"
+
+    # Pod is gone: list what is left in the namespace instead, then stop.
+    if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then
+        echo "${prefix}pod/$pod_name not found (deleted or never created)"
+        oc get pods -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true
+        echo "${prefix}========== end failure logs: pod/$pod_name =========="
+        return 0
+    fi
+
+    phase=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "?")
+    echo "${prefix}pod phase=$phase"
+    oc get pod "$pod_name" -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true
+    oc describe pod "$pod_name" -n "$NAMESPACE" 2>&1 | sed "s/^/${prefix}/" || true
+
+    for init_ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \
+        -o jsonpath='{.spec.initContainers[*].name}' 2>/dev/null); do
+        [[ -n "$init_ctr" ]] || continue
+        dumped_ctrs+=" $init_ctr"
+        echo "${prefix}--- logs pod/$pod_name -c $init_ctr (init, tail $log_tail) ---"
+        oc logs "$pod_name" -n "$NAMESPACE" -c "$init_ctr" --tail="$log_tail" 2>&1 \
+            | sed "s/^/${prefix}/" || echo "${prefix}(no init logs for $init_ctr)"
+    done
+
+    for ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \
+        -o jsonpath='{.spec.containers[*].name}' 2>/dev/null); do
+        [[ -n "$ctr" ]] || continue
+        dumped_ctrs+=" $ctr"
+        echo "${prefix}--- logs pod/$pod_name -c $ctr (tail $log_tail) ---"
+        oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --tail="$log_tail" 2>&1 \
+            | sed "s/^/${prefix}/" || echo "${prefix}(no logs for $ctr)"
+        restart_count=$(oc get pod "$pod_name" -n "$NAMESPACE" \
+            -o jsonpath="{.status.containerStatuses[?(@.name==\"${ctr}\")].restartCount}" \
+            2>/dev/null) || restart_count="0"
+        if [[ "${restart_count:-0}" -gt 0 ]]; then
+            echo "${prefix}--- logs pod/$pod_name -c $ctr --previous (tail $log_tail) ---"
+            oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --previous --tail="$log_tail" 2>&1 \
+                | sed "s/^/${prefix}/" || true
+        fi
+    done
+    # Fallback: dump the preferred container only if the loops above did not already cover it.
+    if [[ -n "$preferred_container" && " $dumped_ctrs " != *" $preferred_container "* ]]; then
+        echo "${prefix}--- logs pod/$pod_name -c $preferred_container (preferred, tail $log_tail) ---"
+        oc logs "$pod_name" -n "$NAMESPACE" -c "$preferred_container" --tail="$log_tail" 2>&1 \
+            | sed "s/^/${prefix}/" || true
+    fi
+    echo "${prefix}========== end failure logs: pod/$pod_name =========="
+}
+
 wait_for_pod() {
     local pod_name="$1"
     local max_attempts="${2:-24}"
@@ -69,6 +130,7 @@ wait_for_pod() {
     done
 
     echo "Pod $pod_name not ready after $((max_attempts * 3))s (last phase: ${phase:-unknown})"
+    e2e_ops_dump_pod_logs "$pod_name" "" 250
     return 1
 }
 
@@ -286,9 +348,7 @@ wait_for_llama_stack_http_health() {
         fi
     done
     echo "ERROR: Llama Stack did not respond on http://127.0.0.1:8321/v1/health inside the pod"
-    oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true
-    oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true
-    oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=120 2>&1 || true
+    e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 250
     return 1
 }
 
@@ -332,16 +392,7 @@ cmd_restart_lightspeed() {
     echo "[e2e-ops] Waiting for lightspeed-stack-service Ready (max ${lcs_pod_wait} attempts, $((lcs_pod_wait * 3))s)..."
     if ! wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then
         pod_ready=false
-        echo "⚠️  Pod not ready within $((lcs_pod_wait * 3))s — dumping diagnostics:"
-        if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then
-            oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true
-            oc logs lightspeed-stack-service -n "$NAMESPACE" -c lightspeed-stack-container --tail=60 2>&1 || true
-        else
-            echo "[e2e-ops] lightspeed-stack-service pod not found in namespace $NAMESPACE"
-            oc get pods -n "$NAMESPACE" 2>&1 | grep -E 'lightspeed|NAME' || true
-            oc get events -n "$NAMESPACE" --field-selector involvedObject.name=lightspeed-stack-service \
-                --sort-by='.lastTimestamp' 2>&1 | tail -15 || true
-        fi
+        echo "⚠️  Pod not ready within $((lcs_pod_wait * 3))s"
     fi
 
     # Re-label pod for service discovery (ignore if pod was deleted / not created yet)
@@ -357,6 +408,7 @@ cmd_restart_lightspeed() {
 
     if [[ "$pod_ready" == "false" ]]; then
         echo "⚠️  Lightspeed restart completed but pod was slow to become ready"
+        e2e_ops_dump_pod_logs "lightspeed-stack-service" "lightspeed-stack-container" 150
         return 1
     fi
     echo "✓ Lightspeed restart complete"
@@ -386,6 +438,8 @@ cmd_restart_llama_stack() {
             echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..."
             if ! cmd_sync_mock_tls_certs_secret; then
                 echo "===== Llama-stack restore FAILED (mock TLS certs secret sync) ====="
+                e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 120
+                e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 120
                 exit 1
             fi
         fi
@@ -403,9 +457,6 @@ cmd_restart_llama_stack() {
         echo "[e2e-ops] Waiting for llama-stack-service Ready (max ${llama_pod_wait} attempts, $((llama_pod_wait * 3))s)..."
         if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then
             echo "===== Llama-stack restore FAILED (pod not Ready within $((llama_pod_wait * 3))s) ====="
-            oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true
-            oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -50 || true
-            oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=80 2>&1 || true
             exit 1
         fi
         echo "Labeling pod for service..."
@@ -413,12 +464,14 @@ cmd_restart_llama_stack() {
         if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then
             if ! _verify_interception_ca_mounted_in_llama; then
                 echo "===== Llama-stack restore FAILED (interception CA not mounted) ====="
+                e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150
                 exit 1
             fi
         fi
         if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then
             if ! _verify_mock_tls_certs_mounted_in_llama; then
                 echo "===== Llama-stack restore FAILED (mock TLS certs not mounted) ====="
+                e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150
                 exit 1
             fi
         fi
@@ -442,6 +495,7 @@ cmd_restart_llama_stack() {
 
     if ! cmd_restart_llama_port_forward; then
         echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed"
+        e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150
         exit 1
     fi
 
@@ -589,8 +643,10 @@ cmd_restart_llama_port_forward() {
 
     echo "Failed to establish Llama Stack port-forward on :$local_port"
     if [[ -s "$llama_pf_log" ]]; then
+        echo "[e2e-ops] $llama_pf_log (tail 30):"
         tail -30 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true
     fi
+    e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150
     return 1
 }
 
@@ -856,7 +912,10 @@ cmd_sync_mock_tls_certs_secret() {
         fi
 
         if ! _wait_for_mock_tls_inference_pod; then
-            [[ $recycle_attempt -lt 2 ]] && _recycle_mock_tls_inference_pod && continue
+            if [[ $recycle_attempt -lt 2 ]] && _recycle_mock_tls_inference_pod; then
+                continue
+            fi
+            e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 120
             return 1
         fi
 
@@ -865,8 +924,7 @@ cmd_sync_mock_tls_certs_secret() {
         for f in "${_MOCK_TLS_CERT_FILES[@]}"; do
             if ! _copy_mock_tls_cert_from_pod "$mock_pod_name" "$f" "$tmpdir/$f"; then
                 echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2
-                oc logs "$mock_pod_name" -n "$NAMESPACE" -c e2e-mock-tls-inference --tail=40 2>&1 \
-                    | sed 's/^/[e2e-ops] /' || true
+                e2e_ops_dump_pod_logs "$mock_pod_name" "e2e-mock-tls-inference" 120
                 sync_ok=false
                 break
             fi
@@ -892,6 +950,7 @@ cmd_sync_mock_tls_certs_secret() {
         fi
     done
 
+    e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 120
     return 1
 }
 
@@ -956,17 +1015,23 @@ cmd_deploy_e2e_mock_tls_inference() {
     oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml"
     if ! oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then
         echo "ERROR: e2e-mock-tls-inference failed to become ready" >&2
-        oc describe pod/e2e-mock-tls-inference -n "$NAMESPACE" 2>/dev/null | tail -30 || true
-        oc logs e2e-mock-tls-inference -n "$NAMESPACE" --tail=40 2>&1 || true
+        e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 150
         return 1
     fi
     echo "✓ e2e-mock-tls-inference ready at https://e2e-mock-tls-inference.${NAMESPACE}.svc.cluster.local:8443"
     if ! cmd_sync_mock_tls_certs_secret; then
         echo "WARNING: mock TLS server is up but e2e-mock-tls-certs secret sync failed" >&2
+        e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 150
         return 1
     fi
 }
 
+cmd_dump_pod_logs() {
+    # CLI entry point for "dump-pod-logs <pod> [container]"; tails 200 lines per container.
+    local target_pod="${1:?pod name required}" target_ctr="${2:-}"
+    e2e_ops_dump_pod_logs "$target_pod" "$target_ctr" 200
+}
+
 cmd_disrupt_llama_stack() {
     local pod_name="llama-stack-service"
 
@@ -1043,6 +1108,9 @@ case "$COMMAND" in
     sync-mock-tls-certs-secret)
         cmd_sync_mock_tls_certs_secret
         ;;
+    dump-pod-logs)
+        cmd_dump_pod_logs "$@"
+        ;;
     *)
         echo "Usage: $0 <command> [args...]"
         echo ""
@@ -1063,6 +1131,7 @@ case "$COMMAND" in
         echo "  deploy-e2e-interception-proxy      - Deploy in-cluster interception proxy pod"
         echo "  deploy-e2e-mock-tls-inference        - Deploy mock HTTPS inference server (tls.feature)"
         echo "  sync-mock-tls-certs-secret           - Publish mock TLS /certs to Secret for llama mount"
+        echo "  dump-pod-logs <pod> [container]      - Events, describe, init + container logs"
         exit 1
         ;;
 esac
diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py
index abcabe577..7ee711494 100644
--- a/tests/e2e/features/environment.py
+++ b/tests/e2e/features/environment.py
@@ -241,24 +241,26 @@ def before_scenario(context: Context, scenario: Scenario) -> None:
             delattr(context, _attr)
 
 
-def _dump_pod_logs_on_failure(scenario: Scenario, namespace: str) -> None:
-    """Dump llama-stack and lightspeed-stack pod logs when a scenario fails in Prow."""
+def _dump_pod_logs_on_failure(
+    context: Context, scenario: Scenario, namespace: str
+) -> None:
+    """Print e2e-ops pod diagnostics for a failed scenario (no-op unless it failed)."""
     if scenario.status != "failed":
         return
-    for pod in ("llama-stack-service", "lightspeed-stack-service"):
-        print(f"--- {pod} logs (scenario failed: {scenario.name}) ---")
+    # NOTE: ``namespace`` is no longer read here (the dump goes through run_e2e_ops).
+    feature_file = getattr(getattr(context, "feature", None), "filename", None) or ""
+    pods = ["llama-stack-service", "lightspeed-stack-service"]
+    if "tls.feature" in feature_file:
+        pods.append("e2e-mock-tls-inference")
+    print(f"--- scenario failed: {scenario.name!r} — dumping pod logs ---", flush=True)
+    for pod in pods:
         try:
-            r = subprocess.run(
-                ["oc", "logs", pod, "-n", namespace, "--tail=100"],
-                capture_output=True,
-                text=True,
-                timeout=15,
-                check=False,
-            )
-            print(r.stdout or r.stderr or "(no output)")
+            dump = run_e2e_ops("dump-pod-logs", [pod], timeout=90)
+            print(dump.stdout, end="")
+            if dump.stderr:
+                print(dump.stderr, end="")
         except subprocess.TimeoutExpired:
-            print("(timed out fetching logs)")
-        print(f"--- end {pod} logs ---")
+            print(f"(timed out dumping logs for {pod})")
 
 
 def after_scenario(context: Context, scenario: Scenario) -> None:
@@ -292,7 +294,7 @@ def after_scenario(context: Context, scenario: Scenario) -> None:
     """
     if is_prow_environment():
         _dump_pod_logs_on_failure(
-            scenario, os.environ.get("NAMESPACE", "e2e-rhoai-dsc")
+            context, scenario, os.environ.get("NAMESPACE", "e2e-rhoai-dsc")
         )
 
     if getattr(context, "scenario_lightspeed_override_active", False):
diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py
index 263b415a3..48056b243 100644
--- a/tests/e2e/utils/prow_utils.py
+++ b/tests/e2e/utils/prow_utils.py
@@ -118,8 +118,12 @@ def restart_pod(container_name: str) -> None:
         if result.returncode != 0:
             print(result.stderr, end="")
             combined = f"{result.stdout or ''}\n{result.stderr or ''}".strip()
-            tail = "\n".join(combined.splitlines()[-25:]) if combined else ""
-            detail = tail or f"exit {result.returncode}"
+            # Prefer full e2e-ops output when diagnostics were printed (TLS/Llama failures).
+            if "========== failure logs:" in combined:
+                detail = combined
+            else:
+                detail = "\n".join(combined.splitlines()[-40:]) if combined else ""
+            detail = detail or f"exit {result.returncode}"
             raise subprocess.CalledProcessError(
                 result.returncode,
                 op,