Neverdecel · Neverdecel · Jun 17, 2026 · Jun 17, 2026
diff --git a/coderag/api.py b/coderag/api.py
@@ -174,6 +174,15 @@ def delete_path(self, path: Union[str, Path]) -> int:
                 self.vectors.save()
         return len(removed)
 
+    def warm(self) -> None:
+        """Eagerly load the provider, store, vectors, and embedding model.
+
+        Call at server startup so the first query — and the demo UI's search-speed badge —
+        reflects warm performance, not the one-off lazy model load. Errors propagate.
+        """
+        self.status()  # side effect only: builds provider/store/vectors (result discarded)
+        self.provider.embed_query("warm up")  # loads the model + JITs the query path
+
     def status(self) -> dict:
         """Index statistics and provenance."""
         stats = self.store.stats()

diff --git a/coderag/surfaces/mcp_server.py b/coderag/surfaces/mcp_server.py
@@ -212,8 +212,7 @@ def reindex(path: Optional[str] = None, full: bool = False) -> dict:
 def _warm_up(cr: "CodeRAG") -> None:
     """Load the engine + embedding model once at startup, not on the first query."""
     try:
-        cr.status()  # builds provider/store/vectors
-        cr.provider.embed_query("warm up")  # loads the model and JITs the query path
+        cr.warm()  # builds the engine and loads the model; any error is logged below
     except Exception:  # pragma: no cover - warm-up is best-effort
         logger.exception("MCP warm-up failed (continuing).")
 

diff --git a/coderag/surfaces/static/app.css b/coderag/surfaces/static/app.css
@@ -314,6 +314,10 @@ fieldset.field legend { padding: 0 0.3rem; color: var(--ink-3); font-family: var
 /* --- results --- */
 .results-head { display: flex; align-items: center; gap: 0.6rem; margin: 0.4rem 0 1rem; }
 .results-head h2 { font-size: 1.02rem; color: var(--ink-2); font-weight: 650; margin: 0; }
+/* demo-only: show how fast local retrieval was (separate from the AI answer) */
+.speed-badge { margin-left: auto; display: inline-flex; align-items: baseline; gap: 0.5rem; font-family: var(--mono); font-variant-numeric: tabular-nums; }
+.speed-badge .speed-ms { font-size: 0.78rem; font-weight: 700; color: var(--accent-strong); background: var(--accent-soft); border-radius: 999px; padding: 0.1rem 0.55rem; white-space: nowrap; }
+.speed-badge .speed-corpus { font-size: 0.72rem; color: var(--ink-3); }
 .results { list-style: none; margin: 0; padding: 0; display: flex; flex-direction: column; gap: 0.9rem; }
 .hit {
   background: var(--surface); border: 1px solid var(--border); border-radius: var(--radius);

diff --git a/coderag/surfaces/templates/base.html b/coderag/surfaces/templates/base.html
@@ -61,7 +61,7 @@
       {% if demo %}
       <div class="demo-banner">
         <span class="demo-dot" aria-hidden="true"></span>
-        <span><strong>Demo mode</strong> — search is unlimited; AI answers are capped at {{ demo_left }}/{{ demo_max }} this session ({{ demo_cooldown }}s between).</span>
+        <span><strong>Demo mode</strong> — instant local hybrid search, unlimited. AI-generated answers are a separate, optional step, capped at {{ demo_left }}/{{ demo_max }} this session ({{ demo_cooldown }}s between).</span>
       </div>
       {% endif %}
       {% block content %}{% endblock %}

diff --git a/coderag/surfaces/templates/index.html b/coderag/surfaces/templates/index.html
@@ -62,7 +62,16 @@
         <pre id="answer" class="answer" hidden></pre>
       </section>
     {% endif %}
-    <div class="results-head"><h2>{{ hits | length }} result{{ '' if hits | length == 1 else 's' }}</h2></div>
+    <div class="results-head">
+      <h2>{{ hits | length }} result{{ '' if hits | length == 1 else 's' }}</h2>
+      {% if demo and search_ms is defined %}
+      <span class="speed-badge" data-search-ms="{{ '%.0f' | format(search_ms) }}"
+            title="Local hybrid retrieval time — separate from the optional AI answer">
+        <span class="speed-ms">⚡ {{ '%.0f' | format(search_ms) }} ms</span>
+        {% if status %}<span class="speed-corpus">over {{ status.total_chunks }} chunks · {{ status.total_files }} files</span>{% endif %}
+      </span>
+      {% endif %}
+    </div>
     {% include "_results.html" %}
   {% else %}
     <div class="notice">No results for <strong>{{ q }}</strong> — try fewer or broader terms, or relax the filters.</div>

diff --git a/coderag/surfaces/webui.py b/coderag/surfaces/webui.py
@@ -192,16 +192,20 @@ def _run_search(
     langs: List[str],
     kinds: List[str],
     path: Optional[str],
-) -> List[SearchHit]:
+) -> Tuple[List[SearchHit], float]:
     """Search, then post-filter. Fetches extra candidates when filters are active.
 
     ``search`` has no server-side filtering, so to keep filtered results useful we pull a
-    larger candidate set and narrow it down to ``k`` here.
+    larger candidate set and narrow it down to ``k`` here. Also returns the wall-clock
+    retrieval time in milliseconds — timed around ``_searcher_for(...).search()`` only
+    (not filtering or highlighting) — so the demo UI can show how fast the index answers.
     """
     filtering = bool(langs or kinds or path)
     fetch = max(k, 50) if filtering else k
+    t0 = time.perf_counter()
     hits = _searcher_for(cr, dense, lexical).search(query, fetch)
-    return _apply_filters(hits, langs, kinds, path)[:k]
+    elapsed_ms = (time.perf_counter() - t0) * 1000.0
+    return _apply_filters(hits, langs, kinds, path)[:k], elapsed_ms
 
 
 # --- app factory ---
@@ -368,7 +372,7 @@ def home(
             }
         )
         if q and q.strip():
-            hits = _run_search(
+            hits, search_ms = _run_search(
                 cr,
                 q.strip(),
                 k,
@@ -379,6 +383,7 @@ def home(
                 path=path,
             )
             ctx["hits"] = _hit_views(hits)
+            ctx["search_ms"] = search_ms
             ctx["answer_qs"] = urlencode({"q": q.strip(), "k": k})
         resp = templates.TemplateResponse(request, "index.html", ctx)
         if demo:
@@ -498,6 +503,10 @@ def healthz() -> Dict[str, str]:
 def run_ui(cr: "CodeRAG", host: str = "127.0.0.1", port: int = 8501) -> None:
     import uvicorn
 
-    # Warm the index/provider so the first request isn't slow.
-    cr.status()
+    # Warm the index, provider, AND embedding model before serving so the first request and
+    # the demo's search-speed badge skip the one-off lazy model load. Failure is non-fatal.
+    try:
+        cr.warm()
+    except Exception:  # pragma: no cover - warm-up is best-effort
+        logger.exception("UI warm-up failed (continuing).")
     uvicorn.run(create_ui_app(cr), host=host, port=port)
diff --git a/tests/test_webui.py b/tests/test_webui.py
@@ -49,6 +49,7 @@ def test_search_renders_highlighted_hits(ui):
     assert "auth.py" in r.text
     assert 'class="highlight"' in r.text  # Pygments output present
     assert "/file?path=" in r.text  # citation links into the file viewer
+    assert "data-search-ms=" not in r.text  # speed badge is demo-only
 
 
 def test_filters_narrow_results(ui):
@@ -153,6 +154,16 @@ def test_demo_mode_banner_caps_and_hidden_reindex(tmp_path):
     assert cr.store.total_chunks() == n0
 
 
+def test_demo_mode_shows_search_speed_badge(tmp_path):
+    cr, client = _demo_client(tmp_path)
+    r = client.get("/", params={"q": "authenticate"})
+    # Demo mode surfaces retrieval speed and frames it as separate from AI answers.
+    assert "data-search-ms=" in r.text
+    assert "instant local" in r.text.lower()  # reworded demo banner
+    # Without a query no search runs, so the landing page must not render the badge.
+    assert "data-search-ms=" not in client.get("/").text
+
+
 def test_demo_answer_quota_is_enforced(tmp_path):
     # No LLM backend → each allowed answer streams an "unavailable" notice, but it
     # still charges the soft per-session quota (the gate charges on attempt).